Example #1
 def get_sub_set_with_size(self, data, set_size, random_state=1):
     X, y = data
     ss = ShuffleSplit(X.shape[0], n_iter=1, train_size=set_size,
             test_size=0, random_state=random_state)
     sub_index, other_index = ss.__iter__().next()
     X_sub, y_sub = X[sub_index], y[sub_index]
     return X_sub, y_sub
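Note: the examples on this page use the pre-0.18 sklearn.cross_validation API and Python 2 iteration (ss.__iter__().next()). A minimal modern sketch of the same subsampling, assuming sklearn.model_selection (scikit-learn >= 0.18) and Python 3:

import numpy as np
from sklearn.model_selection import ShuffleSplit

def get_sub_set_with_size(data, set_size, random_state=1):
    X, y = data
    # train_size carries the requested subset size; the complementary
    # "test" indices are simply discarded.
    ss = ShuffleSplit(n_splits=1, train_size=set_size,
                      random_state=random_state)
    sub_index, _ = next(ss.split(X))
    return X[sub_index], y[sub_index]

X, y = np.arange(20).reshape(10, 2), np.arange(10)
X_sub, y_sub = get_sub_set_with_size((X, y), 5)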
Example #2
	def grid_search(self, n_iter=5):
		rs = ShuffleSplit(self.data.shape[0], n_iter=1, test_size=.1, random_state=0)
		train, test = rs.__iter__().next()

		x_train, y_train, w_train = self.data[train], self.labels[train], self.weights[train]
		x_test, y_test = self.data[test], self.labels[test]

		scores = ['precision', 'recall']
		for score in scores:
			print("# Tuning hyper-parameters for %s" % score)

			clf = GridSearchCV(self.classifier, self.param_grid, cv=2, 
					scoring=score, verbose=3, n_jobs=2)
			clf.fit(x_train, y_train)

			print("Best parameters set found on development set:")
			print(clf.best_estimator_)
			print("Grid scores on development set:")
			for params, mean_score, cv_scores in clf.grid_scores_:
				print("%0.3f (+/-%0.03f) for %r"
					  % (mean_score, cv_scores.std() / 2, params))

			print("Detailed classification report:")
			print("The model is trained on the full development set.")
			print("The scores are computed on the full evaluation set.")
			y_true, y_pred = y_test, clf.predict(x_test)
			print(classification_report(y_true, y_pred))

			test_ams, test_threshold = self.calculateAMS(test, clf.best_estimator_)
			train_ams, train_threshold = self.calculateAMS(train, clf.best_estimator_)

			print('Test AMS %f, Train AMS %f' % (test_ams, train_ams))
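grid_scores_ was removed in scikit-learn 0.20 in favor of cv_results_ (available since 0.18). A self-contained sketch of the equivalent per-candidate report; the estimator and grid here are placeholders, not the original code's:

from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)
clf = GridSearchCV(SVC(), {'C': [0.1, 1, 10]}, cv=2, scoring='precision_macro')
clf.fit(X, y)
# cv_results_ exposes the per-candidate means, stds and params directly
for mean, std, params in zip(clf.cv_results_['mean_test_score'],
                             clf.cv_results_['std_test_score'],
                             clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))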
Example #3
    def get_sub_set_with_size(self, data, set_size):
        """
        @param data is [X, y]
        """
        X, y = data

        ss = ShuffleSplit(X.shape[0], n_iter=1, train_size=set_size, test_size=0, random_state=1)
        sub_index, other_index = ss.__iter__().next()

        X_sub = X[sub_index]
        y_sub = y[sub_index]

        return X_sub, y_sub
Example #4
def toNumpyDominant():
    X, y = load_svmlight_file(default_train_file)

    ss = ShuffleSplit(X.shape[0], n_iter=1, test_size=5640, random_state=1)
    train_index, test_index = ss.__iter__().next()

    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    print 'loaded snippet dataset'
    print 'entire training set size', y_train.size
    print 'test set size', y_test.size

    return X_train, y_train, X_test, y_test
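The single train/test split in Example #4 is exactly what train_test_split does in one call; a hedged sketch, with make_classification standing in for the original svmlight file:

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=10000, random_state=1)
# an integer test_size is an absolute sample count, as in the example above
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=5640, random_state=1)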
Example #5
def toNumpy():
    X, y = load_data()
    ss = ShuffleSplit(X.shape[0], n_iter=1, test_size=0.5, random_state=0)
    train_index, test_index = ss.__iter__().next()

    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    sel = SelectKBest(chi2, k=200)
    X_train = sel.fit_transform(X_train, y_train)
    X_test = sel.transform(X_test)

    X_train = X_train.toarray()
    X_test = X_test.toarray()

    return X_train, y_train, X_test, y_test
Example #6
def toNumpy():
    print "News 20 dataset is being loaded"
    X, y = load_libsvm(default_train_file)

    ss = ShuffleSplit(X.shape[0], n_iter=1, test_size=0.5, random_state=0)
    train_index, test_index = ss.__iter__().next()

    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    # Let's perform feature selection to decrease the memory requirements
    f = SelectKBest(chi2, k=2000)
    X_train = f.fit_transform(X_train, y_train)
    X_test = f.transform(X_test)

    return X_train, y_train, X_test, y_test
Example #7
    def get_sub_set_with_size(self, data, set_size):
        '''
        @param data is [X, y]
        '''
        X, y = data

        # random_state is fixed for reproducibility
        ss = ShuffleSplit(X.shape[0], n_iter=1,
                train_size=set_size, test_size=0,
                random_state=1)
        sub_index, other_index = ss.__iter__().next()

        X_sub = X[sub_index]
        y_sub = y[sub_index]

        return X_sub, y_sub
Example #8
    def get_sub_set_with_size(self, data, set_size):
        '''
        @param data is [X, y]
        '''
        X, y = data

        # TODO: you might want to change random_state
        ss = ShuffleSplit(X.shape[0], n_iter=1,
                train_size=set_size, test_size=0,
                random_state=1)
        sub_index, other_index = ss.__iter__().next()

        X_sub = X[sub_index]
        y_sub = y[sub_index]

        return X_sub, y_sub
Example #9
def toNumpy(err = 0.3, n_sample = 10000, n_class = 2, train_err = 0):
    if n_class == 2:
        X, y = generate_binary_class(n_sample, err)
    elif n_class == 4:
        X, y = generate_four_class(n_sample, err)
    else:
        raise ValueError('n_class must be 2 or 4')

    ss = ShuffleSplit(y.size, n_iter=1, test_size=0.8, random_state=0)
    train_index, test_index = ss.__iter__().next()

    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    y_train = inject_noise(y_train, train_err)

    return X_train, y_train, X_test, y_test
Example #10
def grow_single(args):
    rand_int, labeled_data, parent = args
    X, y = labeled_data

    ss = ShuffleSplit(y.size, n_iter=1, test_size=parent.split_r, random_state = rand_int)
    train_index, calib_index = ss.__iter__().next()
    train_set = X_train, y_train = X[train_index], y[train_index]
    calib_set = X_calib, y_calib = X[calib_index], y[calib_index]

    if len(set(y_train)) == 1 or len(set(y_calib)) == 1:  # extreme case.
        return None

    clf = parent.base_clf_class()
    clf.fit(X_train, y_train)

    y_preds_prob = clf.predict_proba(X_calib)[:,1]
    y_trues = y_calib

    return clf, y_trues, y_preds_prob
Example #11
def toNumpy():
    print "Covtype dataset is being loaded"
    X, y = load_libsvm(default_train_file)

    ss = ShuffleSplit(X.shape[0], n_iter=1, test_size=0.5, random_state=0)
    train_index, test_index = ss.__iter__().next()

    train_data = X[train_index], y[train_index]
    test_data = X[test_index], y[test_index]

    X_train, y_train = get_sub_set_with_size(train_data, 10000)
    X_test, y_test = get_sub_set_with_size(test_data, 10000)

    y_train[y_train == 1] = 0
    y_train[y_train == 2] = 1
    y_test[y_test == 1] = 0
    y_test[y_test == 2] = 1

    return X_train, y_train, X_test, y_test
Example #12
    def get_sub_set_with_size(self, data, set_size):
        '''
        @param data is [X, y]
        '''
        X, y = data

        ss = ShuffleSplit(X.shape[0], n_iter=1,
                train_size=set_size, test_size=0,
                random_state=1)
        sub_index, other_index = ss.__iter__().next()

        X_sub = X[sub_index]
        y_sub = y[sub_index]

        from collections import Counter
        print "Class counts"
        print Counter(y_sub)

        return X_sub, y_sub
Example #13
def randomSplit(df, y_var_name, x_var_names, testSize=0.35, seedIn=None):
    """ Scale, impute missing, then split dataset *df* randomly into tuple (test_x, test_y, train_x, train_y)

    :param df: the pandas data frame to split
    :param y_var_name: the name of the variable (column of df) we try to predict.
    :type y_var_name: str
    :param x_var_names: list of the predictor variables.
    :type x_var_names: list(str)
    :param testSize: the fraction (0.0 to 1.0) of the dataset to put in TEST partition.
    :param seedIn: seed for the random number generator, for reproducible results.
    :returns: dict -- {'train_x','train_y','test_x','test_y'}
    """
    scaler = preprocessing.StandardScaler()
    imputer = preprocessing.Imputer(missing_values="NaN", strategy="mean", axis=0)

    # remove rows where y_variable is missing.
    good_inds = np.flatnonzero(~np.isnan(df[y_var_name]))
    d = df.iloc[good_inds]

    if seedIn is not None:
        ss = ShuffleSplit(d.shape[0], n_iter=1, test_size=testSize, random_state=seedIn)
    else:
        ss = ShuffleSplit(d.shape[0], n_iter=1, test_size=testSize)

    training_inds, test_inds = ss.__iter__().next()
    training_rows = d.iloc[training_inds]
    test_rows = d.iloc[test_inds]

    data_tr = training_rows[x_var_names]
    imputer.fit(data_tr)
    scaler.fit(imputer.transform(data_tr))
    data_tr_scaled = scaler.transform(imputer.transform(data_tr))

    data_test = test_rows[x_var_names]
    data_test_scaled = scaler.transform(imputer.transform(data_test))

    return {'test_x': data_test_scaled,
            'test_y': test_rows[y_var_name],
            'train_x': data_tr_scaled,
            'train_y': training_rows[y_var_name],
            }
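preprocessing.Imputer was removed in scikit-learn 0.22. A minimal modern sketch of the same impute-then-scale split, assuming SimpleImputer and train_test_split; the function name and signature are illustrative, not the original API:

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

def random_split(df, y_var_name, x_var_names, test_size=0.35, seed=None):
    d = df.dropna(subset=[y_var_name])  # drop rows with missing y
    train_rows, test_rows = train_test_split(d, test_size=test_size,
                                             random_state=seed)
    # fit the imputer and scaler on the training partition only
    prep = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler())
    return {'train_x': prep.fit_transform(train_rows[x_var_names]),
            'train_y': train_rows[y_var_name],
            'test_x': prep.transform(test_rows[x_var_names]),
            'test_y': test_rows[y_var_name]}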
Example #14
def VisualizeModelLearning(X, y):
    # Calculate performance of several models with varying training data sizes
    # then plot the learning and testing scores for each model
    
    # Create 10 cross-validation sets for training and testing
    cv = ShuffleSplit(X.shape[0], n_iter = 10, test_size = .2, random_state = 0)
    print("ShuffleSplit sets: {}".format(cv))
    
    # Generate the training sets of increasing sizes
    train_sizes = np.rint(np.linspace(1, X.shape[0] * .8 - 1, 9)).astype(int)
    print("Visualize training set sizes: {}".format(train_sizes))
    
    # Create the figure window
    fig = pl.figure(figsize = (10, 8))
    
    # Create three different models based on max_depth
    for k, depth in enumerate([1, 3, 4, 5, 6, 10]):
        # Create a decision tree regressor with a max_depth of depth
        regressor = DecisionTreeRegressor(max_depth = depth)
        
        # Calculate training and testing scores
        print("Evaluating depth {}".format(depth))
        sizes, train_scores, test_scores = curves.learning_curve(regressor, X, y, \
               cv = cv, train_sizes = train_sizes, scoring = 'r2')
        
        # Determine the mean and standard deviation for use in smoothing
        train_std = np.std(train_scores, axis = 1)
        train_mean = np.mean(train_scores, axis = 1)
        test_std = np.std(test_scores, axis = 1)
        test_mean = np.mean(test_scores, axis = 1)
        
        # Subplot the learning curve
        ax = fig.add_subplot(3, 2, k + 1)
        ax.plot(sizes, train_mean, 'o-', color = 'r', label = 'Training Score')
        ax.plot(sizes, test_mean, 'o-', color = 'g', label = 'Testing Score')
        ax.fill_between(sizes, train_mean - train_std, train_mean + train_std, alpha = .15, color = 'r')
        ax.fill_between(sizes, test_mean - test_std, test_mean + test_std, alpha = .15, color = 'g')
        print("Results for depth {}: {}".format(depth, test_mean))
        
        # Labels
        ax.set_title('max_depth = %s'%(depth))
        ax.set_xlabel('Number of Training Points')
        ax.set_ylabel('Score')
        ax.set_xlim([0, X.shape[0] * 0.8])
        ax.set_ylim([-.05, 1.05])
        
    # Aesthetics
    ax.legend(loc = 'best')
    fig.suptitle('Decision Tree Regressor Learning Performances', fontsize = 16, y = 1.03)
    fig.tight_layout()
    fig.show()
Example #15
def ModelLearning(X, y):
    """ Calculates the performance of several models with varying sizes of training data.
        The learning and testing scores for each model are then plotted. """

    # Create 10 cross-validation sets for training and testing
    cv = ShuffleSplit(X.shape[0], n_iter=10, test_size=0.2, random_state=0)

    # Generate nine training set sizes, evenly spaced up to 80% of the data
    train_sizes = np.rint(np.linspace(1, X.shape[0] * 0.8 - 1, 9)).astype(int)

    # Create the figure window
    fig = pl.figure(figsize=(10, 7))

    # Create three different models based on max_depth
    for k, depth in enumerate([1, 3, 6, 10]):

        # Create a Decision tree regressor at max_depth = depth
        regressor = DecisionTreeRegressor(max_depth=depth)

        # Calculate the training and testing scores
        sizes, train_scores, test_scores = curves.learning_curve(regressor, X, y, \
            cv = cv, train_sizes = train_sizes, scoring = 'r2')

        # Find the mean and standard deviation for smoothing
        train_std = np.std(train_scores, axis=1)
        train_mean = np.mean(train_scores, axis=1)
        test_std = np.std(test_scores, axis=1)
        test_mean = np.mean(test_scores, axis=1)

        # Subplot the learning curve
        ax = fig.add_subplot(2, 2, k + 1)
        ax.plot(sizes, train_mean, 'o-', color='r', label='Training Score')
        ax.plot(sizes, test_mean, 'o-', color='g', label='Testing Score')
        ax.fill_between(sizes, train_mean - train_std, \
            train_mean + train_std, alpha = 0.15, color = 'r')
        ax.fill_between(sizes, test_mean - test_std, \
            test_mean + test_std, alpha = 0.15, color = 'g')

        # Labels
        ax.set_title('max_depth = %s' % (depth))
        ax.set_xlabel('Number of Training Points')
        ax.set_ylabel('Score')
        ax.set_xlim([0, X.shape[0] * 0.8])
        ax.set_ylim([-0.05, 1.05])

    # Visual aesthetics
    ax.legend(bbox_to_anchor=(1.05, 2.05), loc='lower left', borderaxespad=0.)
    fig.suptitle(
        'Decision Tree Regressor Learning Performances', fontsize=16, y=1.03)
    fig.tight_layout()
    fig.show()
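Examples #14 and #15 import learning_curve from the pre-0.18 sklearn.learning_curve module (aliased here as `curves`); since 0.18 it lives in sklearn.model_selection. A runnable sketch under that assumption, with synthetic data:

import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import ShuffleSplit, learning_curve
from sklearn.tree import DecisionTreeRegressor

X, y = make_regression(n_samples=400, n_features=5, random_state=0)
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
sizes, train_scores, test_scores = learning_curve(
    DecisionTreeRegressor(max_depth=3), X, y,
    cv=cv, train_sizes=np.linspace(0.1, 1.0, 9), scoring='r2')
print(sizes, train_scores.mean(axis=1), test_scores.mean(axis=1))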
Example #16
def train_model(clf, X, Y, name="NB ngram", plot=False):
    # create it again for plotting
    cv = ShuffleSplit(n=len(X), n_iter=10, test_size=0.3, random_state=0)

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = []
    precisions, recalls, thresholds = [], [], []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf.fit(X_train, y_train)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        scores.append(test_score)
        proba = clf.predict_proba(X_test)

        fpr, tpr, roc_thresholds = roc_curve(y_test, proba[:, 1])
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, proba[:, 1])

        pr_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

    if plot:
        scores_to_sort = pr_scores
        # integer division: under Python 3, len(...) / 2 would be a float index
        median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]

        plot_pr(pr_scores[median],
                name,
                phase,  # `phase` is assumed to be defined at module level
                precisions[median],
                recalls[median],
                label=name)

    summary = (np.mean(scores), np.std(scores), np.mean(pr_scores),
               np.std(pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    return np.mean(train_errors), np.mean(test_errors)
Example #17
def toNumpyBalanced():
    X, y = load_svmlight_file(default_train_file)

    ss = ShuffleSplit(X.shape[0], n_iter=1, test_size=5640, random_state=1)
    train_index, test_index = ss.__iter__().next()

    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    # subsample the training set to make the number of instances for each class
    # equal.
    from collections import Counter
    sample_num = min(Counter(y_train).values())

    X0 = X_train[np.where(y_train==0)[0],:]
    X0 = X0[:sample_num,:]
    X1 = X_train[np.where(y_train==1)[0],:]
    X1 = X1[:sample_num,:]
    X2 = X_train[np.where(y_train==2)[0],:]
    X2 = X2[:sample_num,:]
    X3 = X_train[np.where(y_train==3)[0],:]
    X3 = X3[:sample_num,:]

    XX = sparse.vstack([X0, X1, X2, X3])
    assert(XX.shape[0] == sample_num * 4)
    X_train = XX

    y_train = np.hstack([
        np.zeros(sample_num),
        np.ones(sample_num),
        np.ones(sample_num) * 2,
        np.ones(sample_num) * 3])

    print 'loaded snippet dataset'
    print 'entire training set size', y_train.size
    print 'test set size', y_test.size

    return X_train, y_train, X_test, y_test
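The X0..X3 stacking above repeats the same slice per class; a more compact sketch of the balancing step, assuming classes labeled 0-3 and a scipy sparse X_train (an illustrative helper, not the original code):

from collections import Counter
import numpy as np
from scipy import sparse

def balance_classes(X_train, y_train, classes=(0, 1, 2, 3)):
    sample_num = min(Counter(y_train).values())
    # take the first sample_num rows of each class, then stack them
    parts = [X_train[np.where(y_train == c)[0][:sample_num], :]
             for c in classes]
    y_bal = np.concatenate([np.full(sample_num, c, dtype=float)
                            for c in classes])
    return sparse.vstack(parts), y_bal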
Example #18
0
def prepare_evaluation_splits(tasks_dir, to_dir, folds = 3, test_part = 0.3):
    all_task_fnames = numpy.array([fname
                                   for fname in os.listdir(tasks_dir)
                                   if os.path.isfile(os.path.join(tasks_dir, fname))])
    for fold_i, (train_idx, test_idx) in enumerate(ShuffleSplit(len(all_task_fnames),
                                                                n_iter = folds,
                                                                test_size = test_part)):
        train_dir = os.path.join(to_dir, str(fold_i), TRAIN_DIR)
        ensure_dir_exists(train_dir)
        copy_files(tasks_dir, all_task_fnames[train_idx], train_dir)

        test_dir = os.path.join(to_dir, str(fold_i), TEST_DIR)
        ensure_dir_exists(test_dir)
        copy_files(tasks_dir, all_task_fnames[test_idx], test_dir)
Example #19
def univariate_test(x, y, model, names, score_type):
    scores = []
    X = np.matrix(x)
    for i in range(X.shape[1]):
        score = cross_val_score(model,
                                X[:, i:i + 1],
                                y,
                                scoring=score_type,
                                cv=ShuffleSplit(len(X), 3, .3))
        scores.append(round(np.mean(score), 3))
    maxval = max(scores)
    minval = min(scores)
    dist = maxval - minval
    return list(zip((np.array(scores) - minval) / dist, names))
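The cv=ShuffleSplit(len(X), 3, .3) call relies on the old positional signature (n, n_iter, test_size). A sketch of the same per-feature scoring with the modern splitter, which no longer takes the sample count:

import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import ShuffleSplit, cross_val_score

X, y = make_regression(n_samples=200, n_features=5, random_state=0)
cv = ShuffleSplit(n_splits=3, test_size=0.3)
# score each feature on its own, as in Example #19
scores = [round(np.mean(cross_val_score(
              RandomForestRegressor(n_estimators=10),
              X[:, i:i + 1], y, scoring='r2', cv=cv)), 3)
          for i in range(X.shape[1])]
print(scores)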
Example #20
 def fit(self, X, y):
     train = np.array(X)
     assert len(train.shape) == 2
     assert len(y.shape) == 1
     ss = ShuffleSplit(n=X.shape[0],
                       n_iter=self.n_iter,
                       random_state=self.random_state,
                       test_size=flexible_int(X.shape[0], self.sample_size))
     self.clfs_ = []
     for _, indices in ss:
         tmp_clf = deepcopy(self.clf)
         tmp_clf.fit(train[indices], y[indices])
         self.clfs_.append(tmp_clf)
     return self
Example #21
def binary_cbf(oversampling=(0, 0)):
    """
    :param oversampling: Tuple(Int), double review samples with star classes in range
    :return: None
    """
    t = time()
    with sqlite3.connect(DB_PATH) as conn:
        y = FeatureReformer(conn, 'r_samples',
                            ['rstar']).transform('y2').transpose()[0]
        X = FeatureReformer(conn, 'r_samples', [
            'brcnt',
            'bstar',
            'checkins',
            'compliments',
            'fans',
            'rdate',
            'urcnt',
            'ustar',
            'uvotes',
            'ysince',
        ]).transform()

        # oversampling
        ovsp = over_sampling(y, oversampling)
        y = y[ovsp]
        X = X[ovsp]

        n_samples, n_features = X.shape
        print(X.shape)
        print('Done with collecting & reforming data from database, using ',
              time() - t, 's')
        t = time()
        rec_scorer = RecScorer(n_class=2)
        div = ShuffleSplit(n_samples, n_iter=5, test_size=0.2, random_state=0)
        model = ExtraTreesClassifier(n_estimators=5)
        for train, test in div:
            X_train = X[np.array(train)]
            X_test = X[np.array(test)]
            y_train = y[np.array(train)]
            y_test = y[np.array(test)]
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            # Metrics below
            rec_scorer.record(y_true=y_test, y_pred=y_pred)
            # print(confusion_matrix(y_true=y_test, y_pred=y_pred), '\n', time()-t, 's used >>\n')
            print(time() - t, 's used >>\n')

        print('Done with 5-fold training & cross validating, using ',
              time() - t, 's')
        rec_scorer.finalScores()
Example #22
def get_acc_auc_randomisedCV(clfname,X,Y,iterNo=5,test_percent=0.2):
	acc=[]
	auc_=[]
	precision=[]
	recall=[]
	f1score=[]
	rs = ShuffleSplit(len(Y), iterNo, test_percent)
	for train_index, test_index in rs:

		#print("TRAIN:", train_index, "TEST:", test_index)
		X_train, X_test = X[train_index], X[test_index]
		Y_train, Y_test = Y[train_index], Y[test_index]

		Y_pred = 0
		if clfname=='Logistic Regression':
			Y_pred=logistic_regression_pred(X_train,Y_train,X_test)
		if clfname=='SVM':
			Y_pred=svm_pred(X_train,Y_train,X_test)
		if clfname=='Decision Tree':
			Y_pred=decisionTree_pred(X_train,Y_train,X_test)
		if clfname=='SGDClassifier':
			Y_pred=SGDClassifier_pred(X_train, Y_train, X_test)
		if clfname=='adaboost':
			Y_pred=adaboost(X_train, Y_train, X_test)
		if clfname=='LogisticRegressionCV':
			Y_pred=LRCV(X_train, Y_train, X_test)
		if clfname=='SVC':
			Y_pred=dosvc(X_train, Y_train, X_test)
		if clfname=='RFC':
			Y_pred=RFC(X_train, Y_train, X_test)
		if clfname=='GBC':
			Y_pred=GBC(X_train, Y_train, X_test)
		accvalue, auc_value, precisionvalue, recallvalue, f1scorevalue = classification_metrics(Y_pred,Y_test)

		acc.append(accvalue)
		auc_.append(auc_value)
		precision.append(precisionvalue)
		recall.append(recallvalue)
		f1score.append(f1scorevalue)



	acc_mean=mean(acc)
	auc_mean=mean(auc_)
	precision_mean = mean(precision)
	recall_mean = mean(recall)
	f1score_mean = mean(f1score)


	return acc_mean,auc_mean,precision_mean,recall_mean,f1score_mean
Example #23
 def svc(self, knl='linear'):
     model = Pipeline([('vect',
                        CountVectorizer(tokenizer=self.tokenize,
                                        stop_words=self.stop_words)),
                       ('clf', SVC(kernel=knl))])
     cv = ShuffleSplit(self.len_row, random_state=self.random_state)
     recall_rate = cross_val_score(
         model,
         self.X,
         self.y,
         scoring='recall',
         cv=cv,
         fit_params={'clf__sample_weight': self.weight})
     return recall_rate
Example #24
def train_classifierGS(clf, X_train, y_train, params=None):
    cv_iters = 2
    cv_sets = ShuffleSplit(X_train.shape[0],
                           n_iter=cv_iters,
                           test_size=0.20,
                           random_state=0)
    scoring_fnc = make_scorer(performance_metric)

    # Create the grid search object
    grid = GridSearchCV(clf, params, scoring=scoring_fnc, cv=cv_sets)
    grid.fit(X_train, y_train)
    print "best_params_ for the optimal model are: {}.".format(
        str(grid.best_params_))
    return grid.best_params_, grid.best_estimator_
Example #25
def test2():
    from sklearn.cross_validation import cross_val_score, ShuffleSplit
    X, Y, names = iris.data, iris.target, iris['feature_names']

    rf = RandomForestRegressor()
    scores = []
    for i in range(X.shape[1]):
        score = cross_val_score(rf,
                                X[:, i:i + 1],
                                Y,
                                scoring='r2',
                                cv=ShuffleSplit(len(X), 3, .3))
        scores.append((round(np.mean(score), 3), names[i]))
    print(sorted(scores, reverse=True))
Example #26
    def evaluate(self,K,Y):
        n = len(K)
        f_score = __score_definition__()
        cv = self.cv
        if cv is None:
            cv = ShuffleSplit(n, n_iter=1, test_size=.25)

        score = []
        for train,test in cv:
            clf = self.estimator.fit(K[train][:,train],Y[train])
            y_pred = clf.predict(K[test][:,train])
            score.append(f_score(Y[test],y_pred))
        
        return np.mean(score)
Example #27
def score_models(column):
    """Generates all models and scores the data without storing all the possible models,
    no big tinydb's are used"""

    Ys = get_Ys()
    all_full_input = get_all_lin_model_inp()
    model_obj = LinearRegression(fit_intercept=False)

    Y = Ys[column].dropna().values
    sn_Y = Ys[column].dropna().index
    my_cv = ShuffleSplit(len(Y), n_iter=3, test_size=0.333, random_state=0)

    equip, d_type = column.split(' ')

    top_db = access_db('Top_score_results_' + equip + '_' + d_type, False)

    for i in range(28):
        number_of_terms = i + 1

        done = top_db.contains(Q.n_terms == number_of_terms)

        if done:
            continue

        f_name = 'All_Poss_Mod_{}_Terms'.format(number_of_terms)
        f_obj = access_file(f_name, write=False)
        mcodes = cPickle.load(f_obj)
        f_obj.close()

        top_score = -10000.0
        for mcode in mcodes:
            # Generate X for this candidate model and score it against Y
            X = gen_X(sn_Y, all_full_input, mcode)
            scores = cross_val_score(model_obj, X, Y, cv=my_cv)
            score = mean(scores)

            # keep the best-scoring model code (ties take the latest)
            if score >= top_score:
                top_score = score
                top_mcode = list(mcode)

        entry = {
            'equipment_name': equip,
            'data_type': d_type,
            'n_terms': number_of_terms,
            'top_score': top_score,
            'top_mcode': top_mcode
        }

        top_db.insert(entry)
Example #28
    def _do(matrix, test_ratio=0.0):
        if labels:  # Learning mode

            # Split train & test folds
            shuffle = ShuffleSplit(len(matrix), test_size=test_ratio)
            trainlist, testlist = list(shuffle)[-1]  # take the last split
            X_train = [matrix[i] for i in trainlist]
            Y_train = [labels[i] for i in trainlist]
            X_valid = [matrix[i] for i in testlist]
            Y_valid = [labels[i] for i in testlist]

            # Display what the underlying classifier is
            print(colored(clf[-1], 'yellow'))

            # Display the dimension of the training elements
            print(colored('Trainset:', 'cyan'))
            print(colored('X: {0}'.format(np.shape(X_train)), 'yellow'))
            print(colored('y: {0}'.format(np.shape(Y_train)), 'yellow'))

            # Process trainset
            for opr in clf[:-1]:
                print(colored(opr, 'yellow'))
                X_train = opr.fit_transform(X_train, Y_train)
            # NOTE: The last operation of the CLF is always a clustering algo
            clf[-1].fit(X_train, Y_train)

            # Display the dimension of the training elements
            print(colored('Validation set:', 'cyan'))
            print(colored('X: {0}'.format(np.shape(X_valid)), 'yellow'))
            print(colored('y: {0}'.format(np.shape(Y_valid)), 'yellow'))

            # Process validation set
            for opr in clf[:-1]:
                print(colored(opr, 'yellow'))
                X_valid = opr.transform(X_valid)

            # Return tuple of [actual], [prediction]
            # on the validation set
            return (Y_valid, clf[-1].predict(X_valid))

        else:  # Classification mode
            X = matrix

            # Feature transformations
            for opr in clf[:-1]:
                X = opr.transform(X)

            # NOTE: Predict the clusters with the last operation
            y = clf[-1].predict(X)
            return iter(y)
Example #29
 def __init__(self,
              dataset,
              n_iter=10,
              test_size=0.1,
              train_size=None,
              random_state=None,
              **kwargs):
     n = dataset.X.shape[0]
     cv = ShuffleSplit(n,
                       n_iter=n_iter,
                       test_size=test_size,
                       train_size=train_size,
                       random_state=random_state)
     super(DatasetShuffleSplit, self).__init__(dataset, cv, **kwargs)
Example #30
def fit_model(X, y):
    cv = ShuffleSplit(X.shape[0], n_iter=10, test_size=0.2, random_state=0)

    estimator = DecisionTreeClassifier()

    param_grid = {
        'min_samples_split': list(np.linspace(30, 150, 12).astype(int))
    }

    grid = GridSearchCV(estimator, param_grid, scoring='accuracy', cv=cv)

    grid = grid.fit(X, y)

    return grid.best_estimator_
Example #31
def model_complexity(X, y):
    """ Calculates the performance of the model as model complexity increases.
        The learning and testing errors rates are then plotted. """

    # Create 10 cross-validation sets for training and testing
    cv = ShuffleSplit(X.shape[0], n_iter=10, test_size=0.2, random_state=0)

    # Vary the max_depth parameter from 1 to 10
    max_depth = np.arange(1, 11)

    # Calculate the training and testing scores
    train_scores, test_scores = curves.validation_curve(
        DecisionTreeRegressor(),
        X,
        y,
        param_name="max_depth",
        param_range=max_depth,
        cv=cv,
        scoring='r2')

    # Find the mean and standard deviation for smoothing
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    # Plot the validation curve
    pl.figure(figsize=(7, 5))

    pl.title('Decision Tree Regressor Complexity Performance')
    pl.plot(max_depth, train_mean, 'o-', color='r', label='Training Score')
    pl.plot(max_depth, test_mean, 'o-', color='g', label='Validation Score')
    pl.fill_between(max_depth,
                    train_mean - train_std,
                    train_mean + train_std,
                    alpha=0.15,
                    color='r')
    pl.fill_between(max_depth,
                    test_mean - test_std,
                    test_mean + test_std,
                    alpha=0.15,
                    color='g')

    # Visual aesthetics
    pl.legend(loc='lower right')
    pl.xlabel('Maximum Depth')
    pl.ylabel('Score')
    pl.ylim([-0.05, 1.05])
    pl.show()
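curves.validation_curve likewise refers to the pre-0.18 module; since 0.18 the function is sklearn.model_selection.validation_curve. A minimal sketch with synthetic data:

import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import ShuffleSplit, validation_curve
from sklearn.tree import DecisionTreeRegressor

X, y = make_regression(n_samples=300, n_features=4, random_state=0)
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
train_scores, test_scores = validation_curve(
    DecisionTreeRegressor(), X, y, param_name='max_depth',
    param_range=np.arange(1, 11), cv=cv, scoring='r2')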
Example #32
 def logistic(self, ngram=(1, 1)):
     model = Pipeline([('vect',
                        CountVectorizer(tokenizer=self.tokenize,
                                        stop_words=self.stop_words,
                                        ngram_range=ngram)),
                       ('clf', LogisticRegression())])
     cv = ShuffleSplit(self.len_row, random_state=self.random_state)
     recall_rate = cross_val_score(
         model,
         self.X,
         self.y,
         scoring='recall',
         cv=cv,
         fit_params={'clf__sample_weight': self.weight})
     return recall_rate
Example #33
def fit_model(X, y):
    # Create cross-validation sets from the training data
    cv_sets = ShuffleSplit(X.shape[0], n_iter = 10, test_size = 0.20, random_state = 0)
    # Create a decision tree regressor object
    regressor = DecisionTreeRegressor()
    # Create a dictionary for the parameter 'max_depth' with a range from 1 to 10
    params = {'max_depth':list(range(1,11))} 
    #Transform 'performance_metric' into a scoring function using 'make_scorer' 
    scoring_fnc = make_scorer(performance_metric)
    #Create the grid search object
    grid = GridSearchCV(regressor, param_grid=params, scoring=scoring_fnc, cv=cv_sets)
    # Fit the grid search object to the data to compute the optimal model
    grid = grid.fit(X, y)
    # Return the optimal model after fitting the data
    return grid.best_estimator_
Example #34
def get_acc_auc_randomisedCV(X, Y, iterNo=5, test_percent=0.2):
    #TODO: First get the train indices and test indices for each iteration
    #Then train the classifier accordingly
    #Report the mean accuracy and mean auc of all the iterations
    kf = ShuffleSplit(len(Y),
                      n_iter=iterNo,
                      test_size=test_percent,
                      random_state=545510477)
    acc = []
    auc = []
    for train, test in kf:
        Y_pred = models.logistic_regression_pred(X[train], Y[train], X[test])
        acc.append(accuracy_score(Y[test], Y_pred))
        auc.append(roc_auc_score(Y[test], Y_pred))
    return mean(acc), mean(auc)
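Example #34 hand-rolls per-fold accuracy and AUC; cross_validate (scikit-learn >= 0.19) collects both in one call. Note the built-in 'roc_auc' scorer uses probability scores rather than the hard predictions scored above, so the numbers can differ slightly. A sketch with synthetic data:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import ShuffleSplit, cross_validate

X, y = make_classification(n_samples=500, random_state=0)
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=545510477)
res = cross_validate(LogisticRegression(max_iter=1000), X, y, cv=cv,
                     scoring=['accuracy', 'roc_auc'])
print(res['test_accuracy'].mean(), res['test_roc_auc'].mean())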
Example #35
def logistic_regression_classifier(train_x, train_y):
    from sklearn.linear_model import LogisticRegression
    cv = ShuffleSplit(int(len(train_x)),
                      n_iter=10,
                      random_state=0,
                      test_size=0.2)
    param_grid = {
        'intercept_scaling': list([1, 2, 3]),
        'C': list(range(1, 20))
    }
    model = GridSearchCV(estimator=LogisticRegression(),
                         cv=cv,
                         param_grid=param_grid)
    model.fit(train_x, train_y)
    return model
Example #36
def splitFunc(target, optNum):

    #do k-fold cross val
    if optNum > 1:
        return KFold(len(target), int(optNum), indices=False, shuffle=True)

    #do percent based train/test split
    elif optNum < 1:
        return ShuffleSplit(n=len(target),
                            n_iter=1,
                            test_size=optNum,
                            indices=False)
    else:
        print 'Error, do not set opt num to 1!'
        return 0
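The indices flag was deprecated and later removed; modern splitters always yield integer index arrays, and callers iterate via .split(target). A sketch of the same k-fold / fractional-split switch under that API (the helper name is illustrative):

from sklearn.model_selection import KFold, ShuffleSplit

def split_func(target, opt_num):
    # k-fold cross-validation for opt_num > 1, a single fractional
    # train/test split for opt_num < 1; use splitter.split(target)
    if opt_num > 1:
        return KFold(n_splits=int(opt_num), shuffle=True)
    elif opt_num < 1:
        return ShuffleSplit(n_splits=1, test_size=opt_num)
    raise ValueError('do not set opt_num to 1')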
Example #37
def fit_model2(X, y):
    cv_sets = ShuffleSplit(X.shape[0],
                           n_iter=10,
                           test_size=.20,
                           random_state=0)
    regressor = DecisionTreeRegressor()
    count = range(1, 11)
    params = dict(max_depth=count)
    scoring_func = make_scorer(performance_metric)
    grid = RandomizedSearchCV(regressor,
                              params,
                              cv=cv_sets,
                              scoring=scoring_func)
    grid = grid.fit(X, y)
    return grid.best_estimator_
Example #38
def RFcross_hq(X, y):
    ### RF cross validation =====================
    from sklearn.cross_validation import cross_val_score, ShuffleSplit
    from sklearn.ensemble import RandomForestRegressor
    from math import log
    n_estimators = max(int(log(X.shape[0]))+1, 100)
    max_depth = max(int(log(X.shape[1]))+1, 5)
    rf = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth)
    scores = []
    for i in range(X.shape[1]):
        score = cross_val_score(rf, X[:, i:i+1], y, scoring="r2",
                              cv=ShuffleSplit(len(X), 3, .3))
        #scores.append((round(np.mean(score), 3), names[i]))
        scores.append(round(np.mean(score), 3))
    return scores
Example #39
def grid_search_model(clf_factory, X, Y, stop_words):
    cv = ShuffleSplit(n=len(X), test_size=0.3, random_state=0)
    param_grid = dict(
        vect__min_df = [1, 2],
        vect__smooth_idf = [False, True],
        vect__use_idf = [False, True],
        vect__sublinear_tf = [False, True],
        vect__binary = [False, True],
        clf__alpha = [0, 0.01, 0.05, 0.1, 0.5, 1],
    )
    
    grid_search = GridSearchCV(clf_factory(stop_words), param_grid = param_grid, cv = cv, scoring = make_scorer(f1_score), verbose=1)
    grid_search.fit(X, Y)
    
    return grid_search.best_estimator_
Example #40
def grid_search_model(clf_factory, X, y):
    cv = ShuffleSplit(n=len(X), n_iter=10, test_size=0.3, indices=True, random_state=0)

    param_grid = dict(vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
                      vect__min_df=[1, 2],
                      vect__smooth_idf=[False, True],
                      vect__stop_words=[None, "english"],
                      vect__use_idf=[True, False],
                      vect__sublinear_tf=[True, False],
                      vect__binary=[True, False],
                      clf__alpha=[0, 0.01, 0.03, 0.05, 0.1, 0.3, 0.5, 1], )

    grid_search = GridSearchCV(clf_factory(), param_grid=param_grid, cv=cv, score_func=f1_score, verbose=10)
    grid_search.fit(X, y)
    return grid_search.best_estimator_, grid_search.best_score_, grid_search.best_params_
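Both score_func and indices are gone from the modern API; GridSearchCV takes a scoring argument instead. A self-contained sketch of the same search pattern (the estimator and grid are placeholders, not the original code's):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV, ShuffleSplit

X, y = make_classification(n_samples=300, random_state=0)
cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0)
grid_search = GridSearchCV(LogisticRegression(max_iter=500),
                           param_grid={'C': [0.1, 1.0, 10.0]},
                           cv=cv, scoring=make_scorer(f1_score))
grid_search.fit(X, y)
print(grid_search.best_score_, grid_search.best_params_)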
Example #41
def grow_single(args):
    rand_int, labeled_data, parent = args
    X, y = labeled_data

    # In this portion of the script, we call the set used to train a classifier
    # the 'train set', and the set used to estimate the classification
    # probability the 'calib set'.
    ss = ShuffleSplit(y.size, n_iter=1, test_size=parent.split_r, random_state = rand_int)
    train_index, calib_index = ss.__iter__().next()
    train_set = X_train, y_train = X[train_index], y[train_index]
    calib_set = X_calib, y_calib = X[calib_index], y[calib_index]

    if len(set(y_train)) == 1 or len(set(y_calib)) == 1:  # extreme case.
        return None

    prev_estimator = CC2(parent.base_clf_class)
    prev_estimator.fit(train_set)

    ac_estimator = ac_factory(prev_estimator, calib_set)
    if ac_estimator is None:
        return None
    prev_estimator = ac_estimator

    return prev_estimator
Example #42
def exp23():
    x_enrollment_train, x_normal_enrollment_train, x_enrollment_test, x_normal_enrollment_test, y_train, enrollment_id_df, sample_weight_df = load_data(
    )

    skf = ShuffleSplit(y_train.shape[0], 1, 0.4)

    for train_index, test_index in skf:

        reject_features = []
        # reject_features = list(range(170, 209)) + list(range(287, 303))
        # reject_features = list(range(170, 287)) + list(range(287, 303)) + list(range(303, 367))
        reject_features = list(range(170, 248))
        selected_features = list(set(range(367)) - set(reject_features))

        print('nb_feature:', len(selected_features))

        # import ipdb; ipdb.set_trace()
        y_train2 = np.vstack([1 - y_train[train_index],
                              y_train[train_index]]).T

        model = build_model2()

        model.fit(x_normal_enrollment_train[:, selected_features][train_index],
                  y_train2,
                  nb_epoch=7)

        # model.fit(
        #     x_normal_enrollment_train[:, selected_features][train_index],
        #     y_train[train_index]
        # )

        if hasattr(model, 'predict_proba'):
            predicts_cv = model.predict_proba(
                x_normal_enrollment_train[:, selected_features][test_index])
        else:
            predicts_cv = model.decision_function(
                x_normal_enrollment_train[:, selected_features][test_index])

        if len(predicts_cv.shape) == 2:
            if predicts_cv.shape[1] == 2:
                roc = roc_auc_score(y_train[test_index], predicts_cv[:, 1])
                print('roc_auc_score of cv on test %f' % roc)
            else:
                roc = roc_auc_score(y_train[test_index], predicts_cv[:, 0])
                print('roc_auc_score of cv on test %f' % roc)
        else:
            roc = roc_auc_score(y_train[test_index], predicts_cv)
            print('roc_auc_score of cv on test %f' % roc)
Example #43
def train_model(clf_factory, X, Y, name):
    labels = np.unique(Y)

    cv = ShuffleSplit(n=len(X), n_iter=1, test_size=0.3, random_state=0)

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = defaultdict(list)
    pr_scores_list = np.array([])

    clfs = []  # just to later get the median

    cms = []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]
        clf = clf_factory()
        clf.fit(X_train, y_train)
        clfs.append(clf)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        scores.append(test_score)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        y_pred = clf.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        cms.append(cm)

        for label in labels:
            y_label_test = np.asarray(y_test == label, dtype=int)
            proba = clf.predict_proba(X_test)
            proba_label = proba[:, label]

            precision, recall, pr_thresholds = precision_recall_curve(y_label_test, proba_label)
            auc_result = auc(recall, precision)
            pr_scores[label].append(auc(recall, precision))
            pr_scores_list = np.append(pr_scores_list, auc_result)

    summary = (np.mean(scores), np.std(scores), np.mean(pr_scores_list), np.std(pr_scores_list))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
Example #44
def fit_model(X, y):
	# Generate cross-validation sets for training the data
	cv_sets = ShuffleSplit(X.shape[0]          # total number of samples
	                     , n_iter = 10         # number of shuffle-and-split iterations
	                     , test_size = 0.2
	                     , random_state = 123)
	
	grid = GridSearchCV(DecisionTreeRegressor()
	                  , dict(max_depth = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
	                  , make_scorer(performance_metric)
	                  , cv = cv_sets)
	
	# Find the estimator's best parameters
	grid = grid.fit(X, y)
	
	return grid.best_estimator_
Example #45
    def cross_validation(self):

        self.remove_columns(['institute_latitude', 'institute_longitude'])
        gbr = GradientBoostingRegressor()

        cv = ShuffleSplit(self.X.shape[0],
                          n_iter=3,
                          test_size=0.3,
                          random_state=0)

        self.test_scores = cross_val_score(gbr,
                                           self.X,
                                           self.y,
                                           cv=cv,
                                           scoring=self.rmse_scorer(),
                                           n_jobs=1)  # poor machine
Example #46
    def split_arrays(self, n, test_fraction=0.1):
        # n_iterations is an older spelling of n_iter from early scikit-learn
        shfSplt = ShuffleSplit(n=n, n_iterations=1, test_size=test_fraction)
        train_ix, test_ix = shfSplt.__iter__().next()
        return train_ix, test_ix
Example #47
from pybrain.structure.modules   import SoftmaxLayer, TanhLayer
from pybrain.supervised.trainers import BackpropTrainer
from pybrain.datasets import ClassificationDataSet
from pybrain.utilities import percentError
from lib import dao, viz


#ds = dao.load_ads()
ds = dao.load_credit()
#ds.sanitize(strategy='impute_mean', scale=True)
ds.onehot()
ds.scale_zmuv()
X, y = ds.sample(class_balance=None)

#X, y = ds.select_features(technique='extra_trees')
ssp = ShuffleSplit(X.shape[0], n_iter=1, test_size=0.2, random_state=5557)
# _iter_indices() is a private sklearn helper; iterating ssp directly is the
# public route to the same (train, test) index pair
train_idxs, test_idxs = ssp._iter_indices().next()
# drop one sample if needed so each partition has an even number of rows
train_idxs = train_idxs[:-1] if len(train_idxs) % 2 != 0 else train_idxs
test_idxs = test_idxs[:-1] if len(test_idxs) % 2 != 0 else test_idxs

train = ClassificationDataSet(X.shape[1], 1, nb_classes=2, class_labels=ds._class_names)
for i in train_idxs:
    train.addSample(X[i], [y[i]])

test = ClassificationDataSet(X.shape[1], 1, nb_classes=2, class_labels=ds._class_names)
for i in test_idxs:
    test.addSample(X[i], [y[i]])

train._convertToOneOfMany()
test._convertToOneOfMany()