Example #1
def test_stratified_shuffle_split_iter_no_indices():
    y = np.asarray([0, 1, 2] * 10)

    sss1 = cval.StratifiedShuffleSplit(y, indices=False, random_state=0)
    train_mask, test_mask = next(iter(sss1))

    sss2 = cval.StratifiedShuffleSplit(y, indices=True, random_state=0)
    train_indices, test_indices = next(iter(sss2))

    assert_array_equal(sorted(test_indices), np.where(test_mask)[0])
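
For reference: in modern scikit-learn (>= 0.18) StratifiedShuffleSplit lives in sklearn.model_selection, always yields index arrays (the indices flag is gone), and takes y in split() rather than in the constructor. A minimal sketch of the equivalent single split:

import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

y = np.asarray([0, 1, 2] * 10)
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=0)
# split() requires an X argument, but only its length is used here.
train_indices, test_indices = next(sss.split(np.zeros(len(y)), y))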
Example #2
def sample_random_n(table,
                    n,
                    stratified=False,
                    replace=False,
                    random_state=None):
    assert n > 0
    n = int(n)
    if replace:
        ind = cross_validation.Bootstrap(len(table),
                                         train_size=n,
                                         random_state=random_state)
    elif stratified and is_discrete(table.domain.class_var):
        train_size = max(len(table.domain.class_var.values), n)
        test_size = max(len(table) - train_size, 0)
        ind = cross_validation.StratifiedShuffleSplit(
            table.Y.ravel(),
            n_iter=1,
            test_size=test_size,
            train_size=train_size,
            random_state=random_state)
    else:
        train_size = max(len(table.domain.class_var.values), n)
        test_size = max(len(table) - train_size, 0)
        ind = cross_validation.ShuffleSplit(len(table),
                                            n_iter=1,
                                            test_size=test_size,
                                            train_size=train_size,
                                            random_state=random_state)
    return next(iter(ind))
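
cross_validation.Bootstrap used in the replace branch was deprecated and later removed from scikit-learn; drawing row indices with replacement is a one-liner in NumPy (a sketch with illustrative sizes):

import numpy as np

rng = np.random.RandomState(42)
n, n_rows = 10, 100
train_idx = rng.randint(0, n_rows, n)   # n indices drawn with replacement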
Example #3
def sample_random_n(table,
                    n,
                    stratified=False,
                    replace=False,
                    random_state=None):
    if replace:
        if random_state is None:
            rgen = np.random
        else:
            rgen = np.random.mtrand.RandomState(random_state)
        sample = rgen.random_integers(0, len(table) - 1, n)
        o = np.ones(len(table))
        o[sample] = 0
        others = np.nonzero(o)[0]
        return others, sample
    if stratified and is_discrete(table.domain.class_var):
        test_size = max(len(table.domain.class_var.values), n)
        ind = skl_cross_validation.StratifiedShuffleSplit(
            table.Y.ravel(),
            n_iter=1,
            test_size=test_size,
            train_size=len(table) - test_size,
            random_state=random_state)
    else:
        ind = skl_cross_validation.ShuffleSplit(len(table),
                                                n_iter=1,
                                                test_size=n,
                                                random_state=random_state)
    return next(iter(ind))
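
The ones/zeros trick in the replace branch computes the complement of the sampled indices; it is equivalent to a set difference on row indices (self-contained check):

import numpy as np

rgen = np.random.RandomState(0)
sample = rgen.randint(0, 10, 4)
o = np.ones(10)
o[sample] = 0
others = np.nonzero(o)[0]
assert np.array_equal(others, np.setdiff1d(np.arange(10), sample))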
Example #4
def testing_cycle(data, classes, gamma=0, C=0, loops=50):
    '''
    Takes data, does a stratified split (70/30), and trains/tests the
    provided classifier `loops` times.
    '''

    #Create random (but balanced) test/train groups
    splits = cval.StratifiedShuffleSplit(classes, n_iter=loops, test_size=0.3)

    scores = []
    for train_indices, test_indices in splits:

        classifier = svm.SVC(gamma=gamma, C=C, kernel='rbf')

        train_data = [data[i] for i in train_indices]
        train_classes = [classes[i] for i in train_indices]
        test_data = [data[i] for i in test_indices]
        test_classes = [classes[i] for i in test_indices]

        #train_data = np.array(train_data)
        #Train model
        classifier.fit(train_data, train_classes)
        #Get predictions
        predictions = classifier.predict(test_data)
        score = sum([1 for i, t in zip(predictions, test_classes) if i == t
                     ]) / float(len(predictions))
        scores.append(score)

    print 'Average score: ', np.mean(scores)
    print 'Best score: ', max(scores)
    print '\n\n\n'
    plt.hist(scores)
    plt.show()
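
The manual accuracy sum at the end of the loop is what the estimator's own score method computes; a self-contained check on toy data:

import numpy as np
from sklearn import svm

X = np.random.RandomState(0).rand(20, 2)
y = np.array([0, 1] * 10)
clf = svm.SVC(kernel='rbf', gamma='scale', C=1.0).fit(X, y)
assert np.isclose(clf.score(X, y), np.mean(clf.predict(X) == y))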
Example #5
def fit_ann(data_X, data_Y):
    ann = MLPClassifier(
        alpha=1,
        hidden_layer_sizes=(12, 12, 12),
        solver='adam',  # alternative: 'sgd'
        learning_rate='adaptive',
        learning_rate_init=0.001,  # learning_rate alternatives: 'constant', 'invscaling'
        momentum=0.4,
        max_iter=500)
    cv_score = 0

    t_start = time()
    ann.fit(data_X, data_Y)
    time_ann = time() - t_start

    cv = cross_validation.StratifiedShuffleSplit(data_Y,
                                                 n_iter=5,
                                                 test_size=0.3,
                                                 random_state=42)

    train_score = round(ann.score(data_X, data_Y), 4) * 100
    cv_score, auc = training_score(ann, data_X, data_Y, cv)

    print "\ntrain {0:.2f}   cv: {1:.2f}  auc: {2:.2f}   time {3:.4f}".format(
        train_score, cv_score, auc, time_ann)

    # run learning curve
    #run_learning_curve(ann, data_X, data_Y, cv)
    return cv_score, train_score, auc, time_ann
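
training_score is not defined in this example; a hypothetical reconstruction consistent with how its two return values are used (the name and exact behaviour are assumptions, shown with the modern cross_val_score):

import numpy as np
from sklearn.model_selection import cross_val_score

def training_score(model, X, y, cv):
    # Assumed behaviour: mean CV accuracy (as a percentage) and mean CV ROC AUC.
    acc = np.mean(cross_val_score(model, X, y, cv=cv))
    auc = np.mean(cross_val_score(model, X, y, cv=cv, scoring='roc_auc'))
    return acc * 100, auc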
Example #6
def cv_select(y, random_state, n_cv, cv, test_size=0.1):
    if isinstance(cv, basestring):
        if cv == 'shuffle':
            return cross_validation.StratifiedShuffleSplit(
                y, n_cv, test_size=test_size, random_state=random_state)
        elif cv == 'loo':
            return cross_validation.LeaveOneOut(n_cv)
        elif cv == 'kfold':
            return cross_validation.StratifiedKFold(y, n_folds=n_cv)
        elif cv == 'boot':
            return cross_validation.Bootstrap(len(y),
                                              n_iter=n_cv,
                                              train_size=(1 - test_size),
                                              random_state=random_state)
        elif cv == 'boot632':
            return bootstrap_632(len(y),
                                 n_iter=n_cv,
                                 random_state=random_state)
        # for regression
        elif cv == '_shuffle':
            return cross_validation.ShuffleSplit(len(y),
                                                 n_iter=n_cv,
                                                 test_size=test_size,
                                                 random_state=random_state)
        elif cv == '_kfold':
            return cross_validation.KFold(len(y), n_folds=n_cv)
        else:
            raise ValueError("bad cv:%s" % cv)
    else:
        return cv
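
A minimal modern analogue of the string dispatch above, covering only two branches (a sketch; cross_validation and Bootstrap no longer exist, and model_selection splitters receive y at split time rather than at construction):

from sklearn import model_selection

def cv_select_modern(n_cv, cv, test_size=0.1, random_state=None):
    if cv == 'shuffle':
        return model_selection.StratifiedShuffleSplit(
            n_splits=n_cv, test_size=test_size, random_state=random_state)
    elif cv == 'kfold':
        return model_selection.StratifiedKFold(n_splits=n_cv)
    raise ValueError("bad cv:%s" % cv)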
Example #7
    def train_on_features(self, clf=None, parameters=None, cv=None):
        """Train a support vector machine classifier on the features and labels
        that have been produced using self.create_features_set.
        """
        if not hasattr(self, "features"):
            raise ValueError(
                "No features present, have you run create_features_set?")
        if not hasattr(self, "labels"):
            raise ValueError(
                "No labels present, have you run create_features_set?")

        if clf is None:
            self.svc = LinearSVC()
        else:
            self.svc = clf

        score_func = f1_score

        if cv is None:
            cv = cross_validation.StratifiedShuffleSplit(self.labels,
                                                         test_size=1 / 2.,
                                                         n_iterations=10)

        if parameters is None:
            parameters = {"dual": [False, False]}

        grid = GridSearchCV(self.svc,
                            parameters,
                            score_func=score_func,
                            cv=cv,
                            verbose=0,
                            n_jobs=1)
        grid.fit(self.features, self.labels)
        self.svc = grid.best_estimator_
        return self
Example #8
def test_stratified_shuffle_split_iter():
    ys = [
        np.array([1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3]),
        np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]),
        np.array([0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2]),
        np.array([1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4]),
        np.array([-1] * 800 + [1] * 50)
    ]

    for y in ys:
        sss = cval.StratifiedShuffleSplit(y,
                                          6,
                                          test_size=0.33,
                                          random_state=0,
                                          indices=True)
        for train, test in sss:
            assert_array_equal(unique(y[train]), unique(y[test]))
            # Check that the folds preserve class proportions
            p_train = (np.bincount(unique(y[train], return_inverse=True)[1]) /
                       float(len(y[train])))
            p_test = (np.bincount(unique(y[test], return_inverse=True)[1]) /
                      float(len(y[test])))
            assert_array_almost_equal(p_train, p_test, 1)
            assert_equal(y[train].size + y[test].size, y.size)
            assert_array_equal(np.lib.arraysetops.intersect1d(train, test), [])
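
The proportion check in the test relies on np.unique with return_inverse=True remapping arbitrary labels to 0..k-1 so that np.bincount can count them; in isolation:

import numpy as np

y = np.array([-1, -1, -1, 1])
codes = np.unique(y, return_inverse=True)[1]   # array([0, 0, 0, 1])
p = np.bincount(codes) / float(len(y))         # array([0.75, 0.25])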
Example #9
def do_cv(clf,
          X,
          y,
          n_samples=1000,
          n_iter=3,
          test_size=0.1,
          quiet=False,
          scoring=None,
          stratified=False,
          fit_params=None,
          reseed_classifier=True,
          n_jobs=-1):
    t0 = time.time()
    if reseed_classifier: reseed(clf)
    if type(n_samples) is float: n_samples = int(n_samples)
    try:
        if (n_samples > X.shape[0]): n_samples = X.shape[0]
    except:
        pass
    if stratified:
        cv = cross_validation.StratifiedShuffleSplit(y, n_iter, train_size=n_samples,
                                                     test_size=test_size,
                                                     random_state=cfg['sys_seed'])
    else:
        cv = cross_validation.ShuffleSplit(n_samples, n_iter=n_iter,
                                           test_size=test_size,
                                           random_state=cfg['sys_seed'])

    test_scores = cross_validation.cross_val_score(clf,
                                                   X,
                                                   y,
                                                   cv=cv,
                                                   scoring=scoring
                                                   or cfg['scoring'],
                                                   fit_params=fit_params,
                                                   n_jobs=n_jobs)
    if not (quiet):
        dbg('%s took: %.2fm' % (mean_score(test_scores),
                                (time.time() - t0) / 60))
    return (np.mean(test_scores), sem(test_scores))
Example #10
def splitDatasetInBlocks(data, labels, trainBlockSizes, testSetPercentage):

    trainDataBlocks = []
    trainLabelBlocks = []
    testDataBlocks = []
    testLabelBlocks = []

    for i in range(len(trainBlockSizes)):
        train = trainBlockSizes[i]
        test = testSetPercentage * trainBlockSizes[i]

        skf = cross_validation.StratifiedShuffleSplit(labels,
                                                      5,
                                                      train_size=train,
                                                      test_size=test)

        a = []
        b = []
        c = []
        d = []

        for trainIndex, testIndex in skf:
            a.append(data[trainIndex])
            b.append(labels[trainIndex])
            c.append(data[testIndex])
            d.append(labels[testIndex])

        trainDataBlocks.append(a)
        trainLabelBlocks.append(b)
        testDataBlocks.append(c)
        testLabelBlocks.append(d)

    return trainDataBlocks, trainLabelBlocks, testDataBlocks, testLabelBlocks
Example #11
def train_and_test_model(data,
                         response,
                         labels,
                         model_type,
                         split_by,
                         c,
                         impute=True,
                         varname=""):
    """ train and test model of users based on given response variable """
    model, type, model_string = models[model_type]
    if type == 'c':
        split = cross_validation.StratifiedShuffleSplit(response, 1, 0.2)
    else:
        #split = cross_validation.KFold(len(response), 5)
        #split = cross_validation.LeavePLabelOut(labels, 3)
        split = cross_validation.LeaveOneLabelOut(labels)
    predict = np.zeros(response.shape)
    for train, test in split:
        model.fit(data[train], response[train])
        predict[test] = model.predict(data[test])
        #print np.corrcoef(np.vstack((response[test], predict[test])))[0,1]
    plot_obs_pred(predict, response, "%s Model Performance" % model_string,
                  varname)
    model.fit(data, response)
    return model
Example #12
def loadData(filename):
	infile = open(filename, 'r')
	data = np.array([[item for item in line.strip().split(',')] for line in infile])		
	
	# extract the target values for the samples and the set of target names.
	# change the target values to integers 
	targets = data[:,-1]
	target_names = list(set(targets))
	for name in target_names:
		targets[targets == name] = target_names.index(name)
	
	targets = targets.astype(int)
	X = data[:,:-1].astype(float)

	# split the data into training and test sets with the ratio 70-30, preserving the percentage of samples for each class
	np.random.seed(0)
	cv = cross_validation.StratifiedShuffleSplit(targets, n_iter=1, test_size=0.3)

	# vectorize the target values, i.e. replace 0 by (1,0,0), 1 by (0,1,0), 2 by (0,0,1) etc.
	y = np.zeros((len(targets), len(target_names)))
	for i in range(len(targets)):
		y[i, targets[i]] = 1	
	
	for train_index, test_index in cv:
		X_train, X_test = X[train_index], X[test_index]
		y_train = y[train_index]
		targets_train, targets_test = targets[train_index], targets[test_index]

	infile.close()
	return X_train, X_test, y_train, targets_train, targets_test
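
The vectorization loop in loadData is a one-hot encoding; the same result in one line with an identity matrix (illustrative targets):

import numpy as np

targets = np.array([0, 2, 1, 0])
y = np.eye(3)[targets]   # rows: (1,0,0), (0,0,1), (0,1,0), (1,0,0)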
Example #13
def test_ionosphere():
    f = open("../results/basic_ionosphere_cv_results.txt", 'w')
    rng = np.random.RandomState()
    params = {
        'window': [5, 10, 15, 20, 25, 30],
        'num_particles': [5, 10, 15, 20]
    }

    X = np.genfromtxt('../data/ionosphere.data', delimiter=',')[:, :-1]
    Y = np.genfromtxt('../data/ionosphere.data',
                      delimiter=',',
                      usecols=[-1],
                      dtype='str')
    le = LabelEncoder()
    y = le.fit_transform(Y)

    for w in params['window']:
        for p in params['num_particles']:
            # do a 5x2 cross val
            sss = cv.StratifiedShuffleSplit(y,
                                            n_iter=5,
                                            test_size=0.5,
                                            random_state=rng)
            mses, accs, evals = [], [], []
            for train_index, test_index in sss:
                mse, acc, ev = xval(
                    BasicOSI(n_hidden=[5],
                             num_particles=p,
                             window=w,
                             random_state=rng,
                             validation_size=0.33,
                             verbose=False), X, y, train_index, test_index)
                mses.append(mse)
                accs.append(acc)
                evals.append(ev)
                mse, acc, ev = xval(
                    BasicOSI(n_hidden=[5],
                             num_particles=p,
                             window=w,
                             random_state=rng,
                             validation_size=0.33,
                             verbose=False), X, y, test_index, train_index)
                mses.append(mse)
                accs.append(acc)
                evals.append(ev)
            print ",".join(
                map(str,
                    [w, p, np.mean(mses),
                     np.mean(accs),
                     np.mean(evals)]))
            f.write("\n" + ",".join(
                map(str,
                    [w, p, np.mean(mses),
                     np.mean(accs),
                     np.mean(evals)])))
            f.write("\n" + ",".join(map(str, mses)))
            f.write("\n" + ",".join(map(str, accs)))
            f.write("\n" + ",".join(map(str, evals)))
            f.flush()
    f.close()
Example #14
def genTrainTest(df,features,queries,ranks,ts=.5):
	#features
	X =df[features]
	X = np.asarray(X)
	#X=np.asarray(X)
	#X=np.asarray(data0['sSvol'],data0['tSvol'])
	#queries
	blocks=np.asarray(list(df[queries]))
	#ranks
	y=np.asarray(list(df[ranks]))
	#split into test and train
	cv = cross_validation.StratifiedShuffleSplit(df[ranks],test_size=ts)
	train, test = iter(cv).next()
	X_train, y_train, b_train = X[train], y[train], blocks[train]
	X_test, y_test, b_test = X[test], y[test], blocks[test]
	#Scale features to range [0,1] in training data
	#Mean/SD scaling doesn't make sense with so many factor variables
	#Each topic would have a different range
	scaler = preprocessing.MinMaxScaler()
	X_train = scaler.fit_transform(X_train)
	#Use same transformation on test data (may not have range from 0 to 1 in test)
	X_test = scaler.transform(X_test)
	#output
	#train = [X_train, y_train, b_train]
	#test = [X_test, y_test, b_test]
	return X_train, y_train, b_train, X_test, y_test, b_test
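
The scaler comments above describe the fit-on-train / transform-on-test pattern; a self-contained illustration of why test values can fall outside [0, 1]:

import numpy as np
from sklearn import preprocessing

X_train = np.array([[0.0], [5.0], [10.0]])
X_test = np.array([[12.0]])
scaler = preprocessing.MinMaxScaler().fit(X_train)
print(scaler.transform(X_test))   # [[1.2]] -- outside the training range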
Example #15
def load_dataset(limit=None, skip=0):
    X, y, ids = db2np(db_trans,limit=limit, skip=skip)
    sss = cross_validation.StratifiedShuffleSplit(y[:,0], n_iter=1, test_size=VALIDATION_SIZE, random_state=SEED)
    for train_index, test_index in sss:
        X_train = X[train_index]
        y_train = y[train_index]
        X_val = X[test_index]
        y_val = y[test_index]
    return X_train, y_train, X_val, y_val, X, y, ids
Example #16
def sample(table, n=0.7, stratified=False, replace=False, random_state=None):
    """
    Samples data instances from a data table. Returns the sample and
    a data set of the instances from the input table that are not in
    the sample. Also uses several sampling functions from
    `scikit-learn <http://scikit-learn.org>`_.

    table : data table
        A data table from which to sample.

    n : float, int (default = 0.7)
        If float, should be between 0.0 and 1.0 and represents
        the proportion of data instances in the resulting sample. If
        int, n is the number of data instances in the resulting sample.

    stratified : bool, optional (default = False)
        If True, sampling considers class values and tries to match
        their distribution in the two returned subsets.

    replace : bool, optional (default = False)
        sample with replacement

    random_state : int or RandomState
        Pseudo-random number generator state used for random sampling.
    """

    if type(n) == float:
        n = int(n * len(table))

    if replace:
        if random_state is None:
            rgen = np.random
        else:
            rgen = np.random.mtrand.RandomState(random_state)
        sample = rgen.randint(0, len(table), n)
        o = np.ones(len(table))
        o[sample] = 0
        others = np.nonzero(o)[0]
        return table[sample], table[others]

    n = len(table) - n
    if stratified and table.domain.has_discrete_class:
        test_size = max(len(table.domain.class_var.values), n)
        ind = skl_cross_validation.StratifiedShuffleSplit(
            table.Y.ravel(),
            n_iter=1,
            test_size=test_size,
            train_size=len(table) - test_size,
            random_state=random_state)
    else:
        ind = skl_cross_validation.ShuffleSplit(len(table),
                                                n_iter=1,
                                                test_size=n,
                                                random_state=random_state)
    ind = next(iter(ind))
    return table[ind[0]], table[ind[1]]
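
The n handling at the top of sample, traced with illustrative numbers (a float is a proportion, an int an absolute count, and the splitter is then built on the complement):

table_len = 150
n = 0.7
if type(n) == float:
    n = int(n * table_len)   # 105 instances requested in the sample
n = table_len - n            # 45: the splitter's test side holds the remainder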
Example #17
def test_iris():
    f = open("../results/basic_iris_cv_results.txt", 'w')
    rng = np.random.RandomState()
    params = {
        'window': [5, 10, 15, 20, 25, 30],
        'num_particles': [5, 10, 15, 20]
    }

    iris = datasets.load_iris()

    for w in params['window']:
        for p in params['num_particles']:
            # do a 5x2 cross val
            sss = cv.StratifiedShuffleSplit(iris.target,
                                            n_iter=5,
                                            test_size=0.5,
                                            random_state=rng)
            mses, accs, evals = [], [], []
            for train_index, test_index in sss:
                mse, acc, ev = xval(
                    BasicOSI(n_hidden=[3],
                             num_particles=p,
                             window=w,
                             random_state=rng,
                             validation_size=0.33,
                             verbose=False), iris.data, iris.target,
                    train_index, test_index)
                mses.append(mse)
                accs.append(acc)
                evals.append(ev)
                mse, acc, ev = xval(
                    BasicOSI(n_hidden=[3],
                             num_particles=p,
                             window=w,
                             random_state=rng,
                             validation_size=0.33,
                             verbose=False), iris.data, iris.target,
                    test_index, train_index)
                mses.append(mse)
                accs.append(acc)
                evals.append(ev)
            print ",".join(
                map(str,
                    [w, p, np.mean(mses),
                     np.mean(accs),
                     np.mean(evals)]))
            f.write("\n" + ",".join(
                map(str,
                    [w, p, np.mean(mses),
                     np.mean(accs),
                     np.mean(evals)])))
            f.write("\n" + ",".join(map(str, mses)))
            f.write("\n" + ",".join(map(str, accs)))
            f.write("\n" + ",".join(map(str, evals)))
            f.flush()
    f.close()
Example #18
def load_dataset(limit=None, skip=0):
    db_trans = pymongo.MongoClient("192.168.0.99:30000")["google"]["trainingset"]
    X, y = db2np(db_trans,limit=limit, skip=skip)
    sss = cross_validation.StratifiedShuffleSplit(y[:,1], n_iter=1, test_size=VALIDATION_SIZE, random_state=SEED)
    for train_index, test_index in sss:
        X_train = X[train_index]
        y_train = y[train_index]
        X_val = X[test_index]
        y_val = y[test_index]
    return X_train, y_train, X_val, y_val, X_val, y_val
Example #19
def setup_indices(self, train_data, test_data):
    if self.stratified and test_data.domain.has_discrete_class:
        self.indices = skl_cross_validation.StratifiedShuffleSplit(
            test_data.Y, n_iter=self.n_resamples, train_size=self.train_size,
            test_size=self.test_size, random_state=self.random_state
        )
    else:
        self.indices = skl_cross_validation.ShuffleSplit(
            len(test_data), n_iter=self.n_resamples, train_size=self.train_size,
            test_size=self.test_size, random_state=self.random_state
        )
Example #20
def split_indices(files, labels, test_size=0.1, random_state=RANDOM_STATE):
    names = get_names(files)
    labels = get_labels(names, per_patient=True)
    # Stratify on the first per-patient label column.
    spl = cross_validation.StratifiedShuffleSplit(labels[:, 0],
                                                  test_size=test_size,
                                                  random_state=random_state,
                                                  n_iter=1)
    tr, te = next(iter(spl))
    # Map patient-level indices to the two consecutive rows each patient
    # contributes to the full data set.
    tr = np.hstack([tr * 2, tr * 2 + 1])
    te = np.hstack([te * 2, te * 2 + 1])
    return tr, te
Example #21
def cv_loop(X, y, model, rseed=42, n_iter=8):
    cv = cross_validation.StratifiedShuffleSplit(y,
                                                 random_state=rseed,
                                                 n_iter=n_iter)
    scores = cross_validation.cross_val_score(model,
                                              X,
                                              y,
                                              scoring='roc_auc',
                                              n_jobs=1,
                                              cv=cv)
    return np.mean(scores)
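
The same helper with the post-0.18 API, for comparison (a sketch; only the splitter construction changes):

import numpy as np
from sklearn import model_selection

def cv_loop_modern(X, y, model, rseed=42, n_iter=8):
    cv = model_selection.StratifiedShuffleSplit(n_splits=n_iter, random_state=rseed)
    scores = model_selection.cross_val_score(model, X, y, scoring='roc_auc', cv=cv)
    return np.mean(scores)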
Example #22
def get_classifier_scores(clf_class, clf_kwargs, features_train, features_test,
                          labels_train, labels_test, feature_list):

    # instantiate classifier with related arguments
    clf = clf_class()

    # set up cross validation
    crossval = cross_validation.StratifiedShuffleSplit(
                                labels_train,
                                50,
                                test_size=gl_test_size,
                                random_state=gl_random_state)

    # perform grid search to find optimal parameter configuration
    grid_search = GridSearchCV(clf, clf_kwargs, cv=crossval, scoring='recall')

    #grid_search = GridSearchCV(clf, clf_kwargs, scoring='recall')

    # train
    grid_search.fit(features_train, labels_train)

    # pick a winner
    best_clf = grid_search.best_estimator_

    # predict for test features
    predictions = best_clf.predict(features_test)

    # calculate accuracy
    scores = dict()
    scores["accuracy"] = accuracy_score(labels_test, predictions)
    scores["precision"] = precision_score(labels_test, predictions)
    scores["recall"] = recall_score(labels_test, predictions)

    # capture as a string, otherwise this would keep pointing at a mutable estimator
    best_configuration = ""
    best_configuration = str(grid_search.best_estimator_)

    # Print the feature ranking
    try:
        # Get importance of features
        importances = best_clf.feature_importances_
        indices = np.argsort(importances)[::-1]

        print("Feature ranking:")
        for f in range(features_train.shape[1]):
            print("%d. feature %s (%f)" % (f+1, feature_list[indices[f]], importances[indices[f]]))

    except:
        print "no importances available for classifier " + str(clf_class)

    # return cm scores and accuracy
    return scores["precision"], scores["recall"], scores["accuracy"], \
           grid_search.best_params_, best_configuration
Example #23
def cv_loop(X, y, model, N, N_JOBS=4, seed=25):
    scores = cross_validation.cross_val_score(
        model,
        X,
        y,
        scoring='roc_auc',
        pre_dispatch=N_JOBS,
        n_jobs=N_JOBS,
        cv=cross_validation.StratifiedShuffleSplit(y,
                                                   random_state=seed,
                                                   n_iter=N))
    return sum(scores) / N
Example #24
def create_test_split(X, y, test_size=0.3):
    # X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size = test_size)
    sss = cross_validation.StratifiedShuffleSplit(y, 1, test_size=test_size)
    for train, test in sss:
        train_indices = train
        test_indices = test
        # print train, test
    X_train = X[(train_indices)]
    y_train = y[(train_indices)]
    X_test = X[(test_indices)]
    y_test = y[(test_indices)]
    return X_train, X_test, y_train, y_test
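
The commented-out train_test_split in create_test_split can itself produce a stratified split via the stratify argument available in newer scikit-learn versions; a self-contained sketch:

import numpy as np
from sklearn.model_selection import train_test_split

X = np.arange(40).reshape(20, 2)
y = np.array([0, 1] * 10)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=0)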
Example #25
def train_final_model(traindata, targets):
    model = linear_model.LogisticRegression(penalty='l2',
                                            dual=True,
                                            C=0.1,
                                            fit_intercept=True)
    cv = cross_validation.StratifiedShuffleSplit(targets, n_iter=4)
    scores = cross_validation.cross_val_score(model, traindata, targets, \
        cv=cv, n_jobs=-1, score_func=metrics.auc_score)
    print "Cross-validation accuracy on the training set for final model:"
    print "%0.3f (+/-%0.03f)" % (scores.mean(), scores.std() / 2)
    model.fit(traindata, targets)
    return model
Example #26
def test_stratified_shuffle_split_overlap_train_test_bug():
    # See https://github.com/scikit-learn/scikit-learn/issues/6121 for
    # the original bug report
    labels = [0, 1, 2, 3] * 3 + [4, 5] * 5

    splits = cval.StratifiedShuffleSplit(labels,
                                         n_iter=1,
                                         test_size=0.5,
                                         random_state=0)
    train, test = next(iter(splits))

    assert_array_equal(np.intersect1d(train, test), [])
Example #27
def createValidation(data, labels, test, train, ids):
    sss = cross_validation.StratifiedShuffleSplit(labels,
                                                  1,
                                                  test_size=test,
                                                  train_size=train,
                                                  random_state=0)
    for train_index, test_index in sss:
        X_train, X_test = data[train_index], data[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        test_ids = ids[test_index]
    # return X_train, X_test, y_train, y_test, test_ids
    return X_train, X_test, y_train, y_test
Example #28
    def _split_shuffle(self, data, all_labels):
        # find out other
        other_indices = []
        ## extract ids of posts classified as "Other"
        if not self.config.remove_other:
            for i, ele in enumerate(data):
                for label in ele["label"]:
                    if label == self.config.other_id:
                        other_indices.append(i)
        all_indices = np.arange(len(data))
        re_indices = list(set(all_indices) - set(other_indices))
        ## extract samples according to distributions of classes
        skf = cross_validation.StratifiedShuffleSplit(all_labels,
                                                      2,
                                                      test_size=0.4,
                                                      random_state=0)
        for train_index, test_index in skf:
            # X_train, X_test = re_indices[train_index], re_indices[test_index]
            data_train = [data[id] for id in train_index]
            data_test = [data[id] for id in test_index]
        exp_data = {"data_train": data_train, "data_test": data_test}
        exp_fw = open(os.path.join(".", "data", "sto", "exp_data.json"), "wb")
        pickle.dump(exp_data, exp_fw)
        exp_fw.close()

        train_dict = {}
        for data_ele in data_train:
            for sub_label in data_ele["label"]:
                if sub_label not in train_dict:
                    train_dict[sub_label] = 0
                train_dict[sub_label] += 1
        print("training data ", train_dict)

        test_dict = {}
        for data_ele in data_test:
            for sub_label in data_ele["label"]:
                if sub_label not in test_dict:
                    test_dict[sub_label] = 0
                test_dict[sub_label] += 1
        print("test data ", test_dict)

        print("Len of training data is ", len(data_train),
              "; Len of test data is ", len(data_test))

        ## random shuffle
        # np.random.shuffle(re_indices)
        # num_test = int(len(re_indices) * 0.2)
        #
        # data_train = [data[id] for id in re_indices[:-num_test]]
        # data_test = [data[id] for id in re_indices[-num_test:] + other_indices]

        return data_train, data_test
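
The two manual label-counting loops in _split_shuffle can be written with collections.Counter (a sketch with illustrative records):

from collections import Counter

data_train = [{"label": [1, 2]}, {"label": [2]}, {"label": [1]}]
train_dict = Counter(l for ele in data_train for l in ele["label"])
print("training data ", dict(train_dict))   # {1: 2, 2: 2}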
Example #29
def split(df, fraction_test):
    sss = cross_validation.StratifiedShuffleSplit(
        y=df.yyyymm,
        n_iter=1,
        test_size=fraction_test,
        train_size=None,
        random_state=control.random_seed,
    )
    assert len(sss) == 1
    for train_index, test_index in sss:
        train = df.iloc[train_index]
        test = df.iloc[test_index]
    return test, train
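
The month-stratified DataFrame split above can also be written with train_test_split and a stratify column (modern API; illustrative frame):

import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.DataFrame({"yyyymm": [201501, 201502] * 10, "x": range(20)})
train, test = train_test_split(df, test_size=0.25, stratify=df.yyyymm,
                               random_state=0)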
Example #30
def load_dataset(limit=None, skip=0):
    #from get_data.build_trainingset import db2np
    import pymongo
    from sklearn import cross_validation
    db_trans = pymongo.MongoClient("192.168.0.99:30000")["google"]["transformedset"]
    X, y = db2np(db_trans,limit=limit, skip=skip)
    sss = cross_validation.StratifiedShuffleSplit(y, n_iter=1, test_size=.2, random_state=3476)
    for train_index, test_index in sss:
        X_train = X[train_index]
        y_train = y[train_index]
        X_val = X[test_index]
        y_val = y[test_index]
    return X_train, y_train, X_val, y_val, X, y