def Modelcomplexity(x, y):
    """Plot training and testing R^2 of a decision tree against ``max_depth``.

    Scores come from ``curves.validation_curve`` run on a
    ``DecisionTreeRegressor`` for depths 1 through 10, using ten shuffled
    splits with 20% of the rows held out (``random_state=0``). Mean scores
    are drawn as lines with a +/- one standard deviation band around each.

    Args:
        x: Feature matrix; only ``x.shape[0]`` is read here, the rest is
            passed through to ``validation_curve``.
        y: Target values passed through to ``validation_curve``.

    Returns:
        Always ``True``; the figure is displayed with ``plt.show()``.
    """
    cv = ShuffleSplit(x.shape[0], n_iter=10, test_size=0.2, random_state=0)
    max_depth = np.arange(1, 11)
    plt.figure(figsize=(10, 10))
    classifier = DecisionTreeRegressor()
    train_scores, test_scores = curves.validation_curve(
        classifier,
        x,
        y,
        param_name="max_depth",
        param_range=max_depth,
        cv=cv,
        scoring='r2')

    # One row per depth, one column per split: reduce over the splits.
    train_mean = np.mean(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    plt.plot(max_depth, test_mean, 'o-', color='g', label='testing scores')
    plt.plot(max_depth, train_mean, 'o-', color='r', label='training scores')
    plt.fill_between(max_depth,
                     train_mean - train_std,
                     train_mean + train_std,
                     color='r',
                     alpha=0.8)
    plt.fill_between(max_depth,
                     test_mean - test_std,
                     test_mean + test_std,
                     color='g',
                     alpha=0.8)

    plt.xlim([0, 11])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('maximum depth')
    plt.ylabel('scores')

    plt.legend(loc='upper right', borderaxespad=0.)
    # pyplot has no ``subtitle`` function; the figure-level title is
    # ``suptitle`` (the previous call raised AttributeError).
    plt.suptitle('DecisionTreeClassifier', fontsize=16, color='g', y=1.05)
    plt.tight_layout()
    plt.show()
    return True
def Modellearning(x, y):
    """Draw learning curves of a decision-tree regressor at four depths.

    For each ``max_depth`` in (1, 3, 6, 10) the R^2 training and testing
    scores from ``curves.learning_curve`` are plotted in their own panel of
    a 2x2 grid, with a +/- one standard deviation band around each mean.

    Args:
        x: Feature matrix; ``x.shape[0]`` sizes the splits and the x axis.
        y: Target values passed through to ``learning_curve``.

    Returns:
        Always ``True``; the figure is displayed with ``fig.show()``.
    """
    n_rows = x.shape[0]
    cv = ShuffleSplit(n_rows, n_iter=10, test_size=0.2, random_state=0)
    # Nine training-set sizes spread evenly up to just under 80% of the rows.
    train_size = np.rint(np.linspace(1, n_rows * 0.8 - 1, 9)).astype(int)
    fig = plt.figure(figsize=(10, 10))

    for panel, depth in enumerate([1, 3, 6, 10], start=1):
        sizes, train_scores, test_scores = curves.learning_curve(
            DecisionTreeRegressor(max_depth=depth),
            x,
            y,
            train_sizes=train_size,
            cv=cv,
            scoring='r2')
        ax = plt.subplot(2, 2, panel)

        train_mean = np.mean(train_scores, axis=1)
        test_mean = np.mean(test_scores, axis=1)
        train_std = np.std(train_scores, axis=1)
        test_std = np.std(test_scores, axis=1)

        # Lines first (testing, then training), then the shaded bands.
        for centre, colour, label in ((test_mean, 'g', 'testing scores'),
                                      (train_mean, 'r', 'training scores')):
            ax.plot(sizes, centre, 'o-', color=colour, label=label)
        for centre, spread, colour in ((train_mean, train_std, 'r'),
                                       (test_mean, test_std, 'g')):
            ax.fill_between(sizes,
                            centre - spread,
                            centre + spread,
                            color=colour,
                            alpha=0.8)

        ax.set_title('maxdepth= %s' % (depth))
        ax.set_xlim([0, n_rows * 0.8])
        ax.set_ylim([-0.05, 1.05])
        ax.set_xlabel('sizes')
        ax.set_ylabel('scores')

    # The legend is attached to the last panel and anchored outside the grid.
    ax.legend(bbox_to_anchor=(1.05, 2.05), loc='lower left', borderaxespad=0.)
    fig.suptitle('DecisionTreeClassifier', fontsize=16, color='g', y=1.05)
    fig.tight_layout()
    fig.show()
    return True
Exemple #3
0
def data_split(inputfile, reads_feature):
    """Load features and targets with ``hkl`` and return one train/test split.

    Args:
        inputfile: Path handed to ``hkl.load``; the loaded object is indexed
            with the keys ``'mat'`` and ``'kmer'``.
        reads_feature: Path handed to ``hkl.load`` for the read counts.

    Returns:
        ``[X_train, y_train, X_test, y_test]`` with the feature arrays cast
        to ``float32`` and the targets cast to ``int32``.
    """
    data = hkl.load(inputfile)
    reads_count = hkl.load(reads_feature)
    X = data['mat']
    X_kspec = data['kmer']
    # NOTE(review): np.mean without an axis returns a scalar, so len(y) and
    # y[train_idx] below would fail -- presumably an element-wise transform
    # or an axis argument was intended; confirm against the data.
    y = np.mean(reads_count + 1)
    # Single shuffled split with the splitter's default test size.
    rs = ShuffleSplit(len(y), n_iter=1, random_state=1)
    # Reshape the k-mer features to (n, 1024, 4) and append them to ``mat``
    # along axis 1.
    X_kspec = X_kspec.reshape((X_kspec.shape[0], 1024, 4))
    X = np.concatenate((X, X_kspec), axis=1)
    # Insert a singleton axis after the sample axis, then swap the last two
    # axes.
    X = X[:, np.newaxis]
    X = X.transpose((0, 1, 3, 2))
    # n_iter=1, so this loop body runs once and leaves the only split bound.
    for train_idx, test_idx in rs:
        X_train = X[train_idx, :]
        y_train = y[train_idx]
        X_test = X[test_idx, :]
        y_test = y[test_idx]
    X_train = X_train.astype('float32')
    y_train = y_train.astype('int32')
    X_test = X_test.astype('float32')
    y_test = y_test.astype('int32')
    return [X_train, y_train, X_test, y_test]
Exemple #4
0
def get_acc_auc_randomisedCV(X, Y):
    """Return mean accuracy and mean AUC over five shuffled 80/20 splits.

    Each split fits and predicts through
    ``models_partc.logistic_regression_pred`` and is scored with
    ``models_partc.classification_metrics``; only the first two of the five
    returned metrics (accuracy, AUC) are kept.
    """
    splitter = ShuffleSplit(len(Y),
                            n_iter=5,
                            test_size=0.2,
                            random_state=RANDOM_STATE)
    fold_accuracy = []
    fold_auc = []
    for fit_rows, held_rows in splitter:
        predictions = models_partc.logistic_regression_pred(
            X[fit_rows], Y[fit_rows], X[held_rows])
        acc_value, auc_value, _precision, _recall, _f1 = (
            models_partc.classification_metrics(predictions, Y[held_rows]))
        fold_accuracy.append(acc_value)
        fold_auc.append(auc_value)

    return mean(fold_accuracy), mean(fold_auc)
 def fit(self, x, y, validation_proportion=0.1):
     """Train on a shuffled subset of ``x``/``y``, holding out a validation part.

     ``validation_proportion`` of the rows is set aside by a single
     ``ShuffleSplit``; the remainder is fed in mini-batches to ``self.step``.
     When ``self.verbose`` is set, validation accuracy is logged every 40
     steps.
     """
     n_obs, self.n_features = x.shape
     self.n_classes = np.max(y) + 1

     splitter = ShuffleSplit(n_obs, n_iter=1,
                             test_size=validation_proportion,
                             random_state=self.random_state)
     # n_iter=1: the splitter yields exactly one (train, validation) pair.
     train_index, valid_index = next(iter(splitter))

     batches = ProcessBatch(x[train_index], y[train_index])
     self._construct_graph(self.n_features, self.n_classes)
     self.sess.run(tf.initialize_all_variables())

     logger = PrintMess()
     if self.verbose:
         logger.info(header=True, Iter=0, TrnLoss=0, ValScore=0)

     n_steps = int(self.n_iter * n_obs / self.batch_size)
     for step_no in range(n_steps):
         x_batch, y_batch = batches.next_batch(self.batch_size)
         res = self.step(x_batch, y_batch)
         if step_no % 40 != 0 or not self.verbose:
             continue
         predicted = self.predict(x[valid_index])
         score = accuracy_score(y[valid_index], predicted)
         logger.info(header=False, Iter=step_no, TrnLoss=res[0],
                     ValScore=score)
def evaluate(X, args):
    """Score a ``GaussianHMM`` over repeated shuffled splits of ``X``.

    For each split the items are preprocessed with ``preprocess_datasets``,
    a model is fitted on the training part, and the per-item
    log-likelihoods of both parts are pooled across all splits.

    Returns:
        ``(train_mean, train_std, test_mean, test_std)`` as Python floats.
    """
    splitter = ShuffleSplit(len(X),
                            n_iter=args.n_iterations,
                            test_size=args.test_size)
    pooled_train = []
    pooled_test = []
    for fit_rows, held_rows in splitter:
        fit_items = [X[row] for row in fit_rows]
        held_items = [X[row] for row in held_rows]
        fit_items, held_items = preprocess_datasets(fit_items, held_items,
                                                    args)

        hmm = GaussianHMM(n_states=args.n_states,
                          n_training_iterations=args.n_training_iterations,
                          topology=args.topology)
        hmm.fit(fit_items)

        pooled_train += [hmm.loglikelihood(item) for item in fit_items]
        pooled_test += [hmm.loglikelihood(item) for item in held_items]

    train_arr = np.array(pooled_train)
    test_arr = np.array(pooled_test)
    return (float(np.mean(train_arr)),
            float(np.std(train_arr)),
            float(np.mean(test_arr)),
            float(np.std(test_arr)))
def mean_decrease_accuracy_regression(df, Y, black_list=None):
    """Estimate feature importance by permutation (mean decrease in R^2).

    Measures directly how much each feature contributes to model accuracy:
    the values of one feature are shuffled in the test set and the resulting
    change in R^2 is recorded. Shuffling an unimportant feature barely moves
    the score, while shuffling an important one lowers it noticeably.

    Args:
        df: DataFrame of candidate features.
        Y: Targets, indexable by the integer index arrays from the splitter.
        black_list: Column labels to drop from ``df`` before fitting.
            Defaults to dropping nothing.

    Returns:
        A dict mapping the rounded mean relative decrease to the feature
        name.
    """
    # A None sentinel replaces the former mutable ``[]`` default.
    dropped = [] if black_list is None else black_list
    rf = RandomForestRegressor()
    scores = defaultdict(list)
    X_src = df.drop(dropped, axis=1)
    X = X_src.values
    names = X_src.columns
    # Cross-validate the scores on a number of different random splits.
    for train_idx, test_idx in ShuffleSplit(len(X), 100, .3):
        X_train, X_test = X[train_idx], X[test_idx]
        Y_train, Y_test = Y[train_idx], Y[test_idx]
        rf.fit(X_train, Y_train)
        acc = r2_score(Y_test, rf.predict(X_test))
        for i, name in enumerate(names):
            X_t = X_test.copy()
            # Shuffle one column in place through a view of the copy.
            np.random.shuffle(X_t[:, i])
            shuff_acc = r2_score(Y_test, rf.predict(X_t))
            scores[name].append((acc - shuff_acc) / acc)
    # NOTE(review): the rounded score is the dict key, so features whose
    # rounded scores coincide overwrite each other. Kept because callers may
    # depend on this return shape.
    return {round(np.mean(score), 4): feat for feat, score in scores.items()}
Exemple #8
0
    def __init__(self, user_ids, item_ids, n_iter=10, 
                 test_size=0.2, cold_start=False, random_seed=None):
        """
        Options:
        - n_iter: number of shuffled splits to generate.
        - test_size: the fraction of the dataset to be used as the test set.
        - cold_start: if True, test_size of items will be randomly selected to
                      be in the test set and removed from the training set. When
                      False, test_size of all training pairs are moved to the
                      test set.
        - random_seed: forwarded to ShuffleSplit as ``random_state`` so that
                      splits are reproducible; None keeps them unseeded.
        """

        self.user_ids = user_ids
        self.item_ids = item_ids
        self.no_interactions = len(self.user_ids)
        self.n_iter = n_iter
        self.test_size = test_size
        self.cold_start = cold_start

        # random_seed used to be accepted but silently ignored; pass it on.
        self.shuffle_split = ShuffleSplit(self.no_interactions,
                                          n_iter=self.n_iter,
                                          test_size=self.test_size,
                                          random_state=random_seed)
0
def get_acc_auc_randomisedCV(X, Y, iterNo=5, test_percent=0.2):
    """Return mean accuracy and mean AUC over shuffled train/test splits.

    Args:
        X: Feature matrix exposing ``get_shape()`` and row indexing by an
            index array.
        Y: Labels, indexable by an index array.
        iterNo: Number of shuffled splits.
        test_percent: Fraction of rows held out in each split.

    Returns:
        ``(mean_accuracy, mean_auc)`` across all splits.
    """
    accuracy_arr = []
    auc_arr = []

    # iterNo and test_percent were previously ignored in favour of
    # hard-coded 5 and .2 (which match the defaults).
    shuffle_split = ShuffleSplit(n=X.get_shape()[0],
                                 n_iter=iterNo,
                                 test_size=test_percent,
                                 random_state=545510477)
    for train_i, test_i in shuffle_split:
        X_train, X_test = X[train_i], X[test_i]
        Y_train, Y_test = Y[train_i], Y[test_i]
        Y_pred = models.logistic_regression_pred(X_train, Y_train, X_test)
        acc, auc_, _precision, _recall, _f1score = (
            models.classification_metrics(Y_pred, Y_test))
        accuracy_arr.append(acc)
        auc_arr.append(auc_)

    return sum(accuracy_arr) / len(accuracy_arr), sum(auc_arr) / len(auc_arr)
def run_grid_search(estimator, param_grid, metric, X, y, X_test, y_test, seed,
                    profile):
    """Fit a ``NestedGridSearchCV`` and return the fitted search object.

    The outer loop is a shuffled 5-fold ``KFold``; the inner loop builds ten
    shuffled 80/20 splits per outer fold. ``metric == 'cindex'`` selects
    ``score_concordance_index``; any other value selects ``score_time_roc``.
    """
    outer_folds = KFold(X.shape[0],
                        n_folds=5,
                        shuffle=True,
                        random_state=seed)

    def inner_cv_func(zx, zy):
        # zy is accepted to match the expected call shape but is not used.
        return ShuffleSplit(zx.shape[0],
                            n_iter=10,
                            test_size=0.2,
                            random_state=seed)

    scoring_func = (score_concordance_index
                    if metric == 'cindex' else score_time_roc)

    search = NestedGridSearchCV(estimator,
                                param_grid,
                                scoring_func,
                                cv=outer_folds,
                                inner_cv=inner_cv_func,
                                profile=profile)
    search.fit(X, y, X_test=X_test, y_test=y_test)
    return search
Exemple #11
0
def dict_train_test_split(dictionary,
                          train_size,
                          cap_train=None,
                          cap_test=None):
    """Randomly split a dictionary into a training and a testing dictionary.

    Args:
        dictionary: Mapping whose items are partitioned.
        train_size: Fraction of items for the training part, or an int
            greater than 1 giving an absolute number of items.
        cap_train: If given, keep at most this many training items.
        cap_test: If given, keep at most this many testing items.

    Returns:
        ``(train_dict, test_dict)``.
    """
    # ``items()`` and the ``next()`` builtin work on both Python 2 and 3,
    # unlike ``iteritems()`` and the ``.next()`` method.
    d_list = list(dictionary.items())

    # An absolute count is converted to a fraction of the item count.
    if isinstance(train_size, int) and train_size > 1:
        train_size = train_size / float(len(d_list))

    test_size = 1. - train_size
    indices_train, indices_test = next(
        iter(ShuffleSplit(len(d_list), n_iter=1, test_size=test_size)))
    d_train_list = [d_list[index_train] for index_train in indices_train]
    d_test_list = [d_list[index_test] for index_test in indices_test]

    if cap_train is not None:
        d_train_list = d_train_list[0:cap_train]
    if cap_test is not None:
        d_test_list = d_test_list[0:cap_test]

    return dict(d_train_list), dict(d_test_list)
Exemple #12
0
def mean_decrease_accuracy(x, y, model, names, score_type):
    """Rank features by permutation importance, min-max scaled.

    Over 100 shuffled splits (30% test) the model is refitted and, for each
    column, the relative drop in R^2 after shuffling that column in the test
    set is recorded. Mean drops are rounded to four decimals and rescaled by
    their min and max.

    ``score_type`` is accepted but not used here.

    Returns:
        A list of ``(scaled_score, feature_name)`` pairs.
    """
    drops = defaultdict(list)
    features = np.matrix(x)
    targets = np.array(y)

    for fit_rows, held_rows in ShuffleSplit(len(x), 100, .3):
        fit_X, held_X = features[fit_rows], features[held_rows]
        fit_y, held_y = targets[fit_rows], targets[held_rows]
        model.fit(fit_X, fit_y)
        baseline = r2_score(held_y, model.predict(held_X))
        for col in range(features.shape[1]):
            permuted = held_X.copy()
            # In-place shuffle of a single column through a view.
            np.random.shuffle(permuted[:, col])
            degraded = r2_score(held_y, model.predict(permuted))
            drops[names[col]].append((baseline - degraded) / baseline)

    feature_order = list(drops.keys())
    averaged = [round(np.mean(drops[feat]), 4) for feat in feature_order]
    highest = max(averaged)
    lowest = min(averaged)
    spread = highest - lowest
    rescaled = (np.array(averaged) - lowest) / spread
    return list(zip(rescaled, feature_order))
Exemple #13
0
def cross_validation(data, validation_percent):
    """Print per-split and overall MSE of the stacked geo/temporal learner.

    Ten shuffled splits are drawn with ``len(data) * validation_percent``
    rows (truncated to int) held out. For each split the function prints the
    MSE of the two base predictions, then the MSE of ``second_learner``'s
    output; the list of stacked MSEs is printed at the end. Nothing is
    returned.
    """

    def _mse(predicted, actual):
        return np.mean(np.power(np.array(predicted) - np.array(actual), 2))

    held_out_count = int(len(data) * validation_percent)
    splitter = ShuffleSplit(len(data), n_iter=10, test_size=held_out_count)

    stacked_errors = []
    for train_index, test_index in splitter:
        test_data = data.iloc[test_index]
        geo_y = predict_geo_y(data, train_index, test_index)
        temporal_y = predict_temporal_y(data, train_index, test_index)

        # Base predictions become the two feature columns of the stacker.
        base_features = np.array([geo_y, temporal_y]).T
        predicted_y = second_learner(base_features, test_data.y)

        stacked_error = _mse(predicted_y, test_data.y)
        stacked_errors.append(stacked_error)
        print(
            _mse(geo_y, test_data.y),
            _mse(temporal_y, test_data.y))
        print(stacked_error)
    print(stacked_errors)
Exemple #14
0
def AllDataDeal(X_data, X_target):
    """Score each feature on its own with a random forest, best first.

    A model is built on each single feature in turn and cross-validated
    (three shuffled splits, 30% test, R^2 scoring). The absolute mean score
    is formatted to three decimals.

    Args:
        X_data: 2-D array-like with one column per entry of ``names``.
        X_target: Target values.

    Returns:
        A list of ``(formatted_score, feature_name)`` tuples sorted by
        descending numeric score.
    """
    X_data = np.array(X_data)
    X_target = np.array(X_target)
    names = ['BMI', '肺活量', '立定跳远', '坐位体前屈', '仰卧起坐/引体向上', '50米跑', '长跑时间']
    rf = RandomForestRegressor(max_features='sqrt')
    scores = []
    # Model each feature separately and cross-validate it.
    for i, name in enumerate(names):
        score = cross_val_score(rf,
                                X_data[:, i:i + 1],
                                X_target,
                                scoring="r2",
                                cv=ShuffleSplit(len(X_data), 3, .3))
        scores.append((format(np.abs(np.mean(score)), '.3f'), name))
    # Sort on the numeric value: comparing the formatted strings ranks
    # '9.000' above '10.000'.
    return sorted(scores,
                  key=lambda item: (float(item[0]), item[1]),
                  reverse=True)
Exemple #15
0
def quick_cv(clf,
             X,
             y,
             score_func,
             n_iter=3,
             test_size=0.1,
             random_state=None):
    """Return the mean of ``score_func`` over shuffled train/test splits.

    Each split fits and predicts through ``fit_predict`` and is scored as
    ``score_func(y_true, predictions)``.
    """
    splitter = ShuffleSplit(
        y.shape[0],
        n_iter=n_iter,
        test_size=test_size,
        random_state=random_state,
    )
    fold_scores = []
    for fit_rows, held_rows in splitter:
        predictions = fit_predict(clf, X[fit_rows], y[fit_rows], X[held_rows])
        fold_scores.append(score_func(y[held_rows], predictions))
    total = sum(fold_scores)
    return total / float(len(fold_scores))
Exemple #16
0
def main():
    """Print baseline and cross-validated MSE for a two-column target.

    A dummy estimator fitted on all data gives the baseline; the model from
    ``get_nn_clf`` is then evaluated on five shuffled splits, with MSE
    reported separately for target columns 0 and 1.
    """
    # Get the processed data.
    X, y = preprocess_data()

    # Get the dummy clf: very important, it creates a baseline!
    dummy_clf = get_dummy_clf()
    dummy_clf.fit(X, y)
    # Predict from the features; this previously passed the targets ``y``.
    y_hat = dummy_clf.predict(X)

    # Baseline scores for each target column. The single-argument print
    # form emits the same text on Python 2 and Python 3.
    print("Dummy MSE x %s" % (mse(y[:, 0], y_hat[:, 0]),))
    print("Dummy MSE y %s" % (mse(y[:, 1], y_hat[:, 1]),))

    # Create 5 different cross-validation splits.
    ss = ShuffleSplit(len(y), n_iter=5, random_state=0)

    scores_x = []
    scores_y = []
    for train_index, test_index in ss:
        # Choose a model (get_linear_clf() is the alternative).
        clf = get_nn_clf()
        clf.fit(X[train_index], y[train_index])
        y_hat = clf.predict(X[test_index])

        # Save the score for each split.
        scores_x.append(mse(y[test_index, 0], y_hat[:, 0]))
        scores_y.append(mse(y[test_index, 1], y_hat[:, 1]))
        print("%s %s" % (scores_x, scores_y))

    print("MSE CV x %s" % (np.array(scores_x).mean(),))
    print("MSE CV y %s" % (np.array(scores_y).mean(),))
Exemple #17
0
def drawrocada(X, Y):
    """Save one ROC curve image per shuffled split for an AdaBoost model.

    Five shuffled splits (20% test) are drawn; for each, a fresh
    ``AdaBoostClassifier`` is fitted and the ROC curve of its second
    ``predict_proba`` column is written to ``pic10`` ... ``pic14``.

    Args:
        X: Feature matrix, indexable by an index array.
        Y: Labels, indexable by an index array.
    """
    rs = ShuffleSplit(len(Y), 5, 0.2)

    # File numbering starts at 10.
    for i, (train_index, test_index) in enumerate(rs, start=10):
        clf = AdaBoostClassifier()
        clf.fit(X[train_index], Y[train_index])

        # Column 1 of predict_proba is used as the score for the ROC curve.
        y_test_prob = clf.predict_proba(X[test_index])[:, 1]
        fpr, tpr, _ = roc_curve(Y[test_index], y_test_prob)

        # Plot of a ROC curve for a specific class.
        fig = plt.figure()
        plt.plot(fpr, tpr, label='ROC curve')
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('AdaBoost Classifier')
        plt.legend(loc="lower right")
        plt.savefig("pic" + str(i))
        # Close the figure so repeated splits do not accumulate open figures.
        plt.close(fig)
def get_acc_auc_randomisedCV(X, Y, iterNo=5, test_percent=0.2):
    """Return mean accuracy and mean AUC of logistic regression.

    Args:
        X: Feature matrix, indexable by an index array.
        Y: Labels, indexable by an index array.
        iterNo: Number of shuffled splits.
        test_percent: Fraction of rows held out in each split.

    Returns:
        ``(mean_accuracy, mean_auc)`` over all splits.
    """
    ss = ShuffleSplit(len(Y),
                      n_iter=iterNo,
                      test_size=test_percent,
                      random_state=RANDOM_STATE)
    clf_lr_ss = LogisticRegression()
    acc_list = []
    auc_list = []
    for train, test in ss:
        clf_lr_ss.fit(X[train], Y[train])
        predicted = clf_lr_ss.predict(X[test])
        # Both metrics take the true labels first; roc_auc_score is not
        # symmetric in its arguments, so the order matters.
        acc_list.append(accuracy_score(Y[test], predicted))
        auc_list.append(roc_auc_score(Y[test], predicted))

    # Average over every split; this previously averaged only the last
    # split's accuracy.
    acc_k = array(acc_list).mean()
    auc_k = array(auc_list).mean()
    return acc_k, auc_k
def predict_with_one(X, out_file_name):
    """Write a matrix of MAE for predicting column j from column i alone.

    Over three shuffled 80/20 splits, an ``ExtraTreesRegressor`` is fitted
    for every ordered column pair (i, j) using column i as the only feature
    and column j as the target. The MAE summed over splits is divided by the
    number of splits and saved as CSV under ``CODE_PATH``.
    """
    n_samples, n_features = X.shape
    iter_num = 3
    div = ShuffleSplit(n_samples,
                       n_iter=iter_num,
                       test_size=0.2,
                       random_state=0)
    model = ExtraTreesRegressor(n_estimators=5)
    score_matrix = np.zeros((n_features, n_features))

    t = time()
    for round_num, (train, test) in enumerate(div, start=1):
        train_samples = X[np.array(train)]
        test_samples = X[np.array(test)]
        for i in range(n_features):
            # The single-feature inputs depend only on i.
            X_train = train_samples[:, i:i + 1]
            X_test = test_samples[:, i:i + 1]
            for j in range(n_features):
                model.fit(X_train, train_samples[:, j])
                y_pred = model.predict(X_test)
                mae = mean_absolute_error(test_samples[:, j], y_pred)
                score_matrix[i, j] += mae
                print('Round', round_num, '|', i, j, mae, time() - t)

    out_path = os.path.join(CODE_PATH, out_file_name)
    np.savetxt(out_path,
               score_matrix / iter_num,
               fmt='%.3f',
               delimiter=',')
Exemple #20
0
 def fit(X,y):
     '''
         This function actually calculates the importances and accuracy metric
         using cross validation.
         Usage:
           imp,acc = fit(X,y)
         Arguments:
           X: feature vector, numpy array
           y: label vector, numpy array
         Return values:
           imp: feature importance vector
           acc: estimator accuracy metric
     '''
     # NOTE(review): ``self``, ``metric`` and ``i`` are not defined in this
     # block; presumably they come from an enclosing scope, or ``self`` is
     # missing from the signature -- confirm before relying on this code.
     scores = defaultdict(list) # Any unknown element is automatically a list
     rf= copy.deepcopy(self.clf)
     #
     #crossvalidate the scores on a number of different random splits of the data
     outAcc= 0.
     for train_idx, test_idx in ShuffleSplit(len(X), self.nCV, .3):
         X_train, X_test = X[train_idx], X[test_idx]
         y_train, y_test = y[train_idx], y[test_idx]
         # Fit on the labels bound just above; this previously referenced an
         # undefined ``Y_train``.
         rf.fit(X_train, y_train)
         # Get accuracy metric
         if metric is None:
             outAcc= None
         elif metric == 'OOB':
             # NOTE(review): for scikit-learn forests the fitted value is
             # ``oob_score_``; ``oob_score`` is the constructor flag -- verify.
             outAcc += rf.oob_score
         elif metric == 'AUC':
             outAcc += sklearn.metrics.roc_auc_score(y_test, rf.predict_proba(X_test) )
         if self.algorithm == 'gini':
             scores[i].append( self.giniImportance(rf,X_test,y_test) )
         elif self.algorithm == 'permutation':
             scores[i].append( self.permutationImportance(rf,X_test,y_test) )
         elif self.algorithm == 'conditional':
             scores[i].append( self.conditionalPermutationImportance(rf,X_test,y_test) )
     #
     # Return mean importance and metric
     # NOTE(review): when ``metric is None`` outAcc is None and the division
     # below raises TypeError.
     importances= np.array([np.mean(scores[i]) for i in range(X.shape[1])])
     return importances, outAcc / float(self.nCV)
Exemple #21
0
def trainTest(clf, X, y, fold=10.0, classn=2, returnconfusion=False):
    """Evaluate ``clf`` over shuffled splits, printing accuracy and confusion.

    Args:
        clf: Estimator with ``fit`` and ``predict``.
        X: Feature matrix, indexable by an index array.
        y: Labels, indexable by an index array.
        fold: Number of shuffled splits (25% test each); also the divisor
            for the printed mean accuracy.
        classn: Side length of the accumulated confusion matrix.
        returnconfusion: When true, return the summed confusion matrix.

    Returns:
        The summed confusion matrix if ``returnconfusion`` is true,
        otherwise ``None``.
    """
    kf = ShuffleSplit(len(y), n_iter=int(fold), test_size=0.25, random_state=0)
    accuracy = 0.0
    confusion = np.zeros([classn, classn])
    for train_index, test_index in kf:
        X_train = X[train_index]
        y_train = y[train_index]
        X_test = X[test_index]
        expected = y[test_index]
        clf.fit(X_train, y_train)
        predicted = clf.predict(X_test)
        accy_tmp = metrics.accuracy_score(expected, predicted)
        accuracy += accy_tmp
        conf_tmp = metrics.confusion_matrix(expected, predicted)
        confusion += conf_tmp
        # Single-argument print calls behave the same on Python 2 and 3.
        print("predited rate:%f" % accy_tmp)
    print(confusion)
    print(accuracy / fold)
    if returnconfusion:
        return confusion
Exemple #22
0
def train_model(clc_factory, X, Y, testdata):
    """Fit a classifier on nearly all data, then classify typed-in text.

    A single shuffled split with a 0.1% test share is used only to pick the
    training rows. After fitting, the function prompts in a loop, printing
    'Positive' when the prediction equals 1 and 'Negative' otherwise, until
    the user enters ``exit()``.

    Args:
        clc_factory: Zero-argument callable returning a fresh classifier.
        X: Training inputs, indexable by an index array.
        Y: Training labels, indexable by an index array.
        testdata: Accepted for interface compatibility; not used here.

    Returns:
        ``None``.
    """
    print('start train_model...')
    # Fix the random state to get deterministic behaviour.
    # Just for kaggle: train on (almost) all of the data.
    cv = ShuffleSplit(n=len(X),
                      n_iter=1,
                      test_size=0.001,
                      indices=True,
                      random_state=0)

    for train, _ in cv:
        X_train, Y_train = X[train], Y[train]

        clf = clc_factory()
        clf.fit(X_train, Y_train)

        while True:
            text = input('Please input your data: ')
            # Check the exit command first so the sentinel itself is not
            # classified and reported before returning.
            if text == 'exit()':
                return None
            result = clf.predict([text])[0]
            print('The sentiment polarity of your input text is: %s' %
                  ('Positive' if result == 1 else 'Negative'))
Exemple #23
0
def __grid_search_model(clf_factory, X, Y):
    """Grid-search vectorizer and classifier settings; return the best model.

    Uses ten shuffled 70/30 splits and ``f1_score`` as the score function.

    Args:
        clf_factory: Zero-argument callable returning a pipeline whose steps
            are named ``vect`` and ``clf``.
        X: Training inputs.
        Y: Training labels.

    Returns:
        ``grid_search.best_estimator_``.
    """
    cv = ShuffleSplit(n=len(X), n_iter=10, test_size=0.3, random_state=0)

    param_grid = dict(
        vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
        vect__min_df=[1, 2],
        vect__smooth_idf=[False, True],
        vect__use_idf=[False, True],
        vect__sublinear_tf=[False, True],
        vect__binary=[False, True],
        clf__alpha=[0, 0.01, 0.05, 0.1, 0.5, 1],
    )

    grid_search = GridSearchCV(clf_factory(),
                               param_grid=param_grid,
                               cv=cv,
                               score_func=f1_score,
                               verbose=10)
    grid_search.fit(X, Y)
    clf = grid_search.best_estimator_
    # Single-argument print call: same output on Python 2 and Python 3.
    print(clf)
    return clf
def get_best_gb_regressor_model(X, y):
    '''
    Gets best GradientBoost regressor model based on grid search, trained on data [X, y].

    The search uses ten shuffled 80/20 splits (random_state=42) and scores
    candidates with ``make_scorer(perf_metric_r2)``. The best parameters are
    printed and the fitted grid-search object is returned.
    '''
    # Create cross validation sets from training data
    cv_sets = ShuffleSplit(X.shape[0],
                           n_iter=10,
                           test_size=0.20,
                           random_state=42)

    # Create the gradient boost regressor object
    regressor = GradientBoostingRegressor(random_state=42)

    # Params to tune
    gs_params = {
        'n_estimators': [2000, 3000],
        'learning_rate': [0.01, 0.05],
        'max_depth': [3, 4, 5],
        'min_samples_leaf': [20, 26],
        'min_samples_split': [2, 5, 10]
    }

    # Use r2 as scoring function
    gs_scoring_func = make_scorer(perf_metric_r2)

    # Create grid search object and fit the data
    grid_search = GridSearchCV(regressor,
                               param_grid=gs_params,
                               scoring=gs_scoring_func,
                               cv=cv_sets)
    model = grid_search.fit(X, y)

    # Print optimal params (single-argument print calls work on both
    # Python 2 and Python 3)
    print("GradientBoosting")
    print(model.best_params_)

    # return the model
    return model
Exemple #25
0
def gridsearch(X, y, weight):
    """Grid-search an RBF SVC over C and gamma on bag-of-words features.

    Args:
        X: Inputs for the ``CountVectorizer`` step.
        y: Labels.
        weight: Per-sample weights forwarded to the ``clf`` step's ``fit``.

    Prints the best score, the best parameters and all grid scores; returns
    ``None``.
    """
    model = Pipeline([('vect', CountVectorizer(tokenizer=tokenize_filtered)),
                      ('clf', SVC())])
    param_range = np.logspace(-4, 3, 8)
    param_grid = [{
        'clf__C': param_range,
        'clf__gamma': param_range,
        'clf__kernel': ['rbf']
    }]

    # NOTE(review): the sample count is hard-coded; presumably it should be
    # the number of rows in X -- confirm against the dataset.
    cv = ShuffleSplit(6422)
    # The fit parameter must be spelled ``sample_weight``; the misspelled
    # key was forwarded to SVC.fit as an unknown keyword.
    gs = GridSearchCV(estimator=model,
                      param_grid=param_grid,
                      fit_params={'clf__sample_weight': weight},
                      cv=cv,
                      scoring='recall',
                      n_jobs=2)
    gs.fit(X, y)

    # Single-argument print calls give the same text on Python 2 and 3.
    print('best score : %s' % (gs.best_score_,))
    print('best parpams : %s' % (gs.best_params_,))
    print(gs.grid_scores_)
def train_model(clf_factory, X, Y):
    """Collect accuracy and precision/recall statistics over ten splits.

    Ten shuffled 70/30 splits are drawn with a fixed random state for
    deterministic behaviour.

    Returns:
        ``(scores, precision_recall_scores, precisions, recalls, thresholds,
        test_errors, train_errors)``, each a list with one entry per split.
    """
    splitter = ShuffleSplit(n=len(X),
                            n_iter=10,
                            test_size=0.3,
                            indices=True,
                            random_state=0)

    scores = []
    precision_recall_scores = []
    precisions = []
    recalls = []
    thresholds = []
    test_errors = []
    train_errors = []

    for fit_rows, held_rows in splitter:
        X_fit, y_fit = X[fit_rows], Y[fit_rows]
        X_held, y_held = X[held_rows], Y[held_rows]

        # NOTE(review): despite its name, ``clf_factory`` is used directly
        # as the estimator and refitted on every split -- confirm that
        # callers pass an instance rather than a callable.
        model = clf_factory
        model.fit(X_fit, y_fit)

        fit_accuracy = model.score(X_fit, y_fit)
        held_accuracy = model.score(X_held, y_held)
        train_errors.append(1 - fit_accuracy)
        test_errors.append(1 - held_accuracy)
        scores.append(held_accuracy)

        # Column 1 of predict_proba feeds the precision/recall curve.
        positive_proba = model.predict_proba(X_held)[:, 1]
        precision, recall, pr_thresholds = precision_recall_curve(
            y_held, positive_proba)

        precision_recall_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)

    return (scores, precision_recall_scores, precisions, recalls,
            thresholds, test_errors, train_errors)
Exemple #27
0
def split_data(city_data):
    """Shuffle the samples and split them 70% training / 30% testing.

    Args:
        city_data: Object exposing ``data`` (features) and ``target``
            (labels).

    Returns:
        ``(X_train, y_train, X_test, y_test)``.
    """
    # Features and labels of the housing data set.
    features, labels = city_data.data, city_data.target

    from sklearn.cross_validation import ShuffleSplit
    splitter = ShuffleSplit(len(features),
                            n_iter=1,
                            test_size=0.3,
                            random_state=0)

    # n_iter=1: the splitter yields exactly one pair of index arrays.
    train_rows, test_rows = next(iter(splitter))

    return (features[train_rows], labels[train_rows],
            features[test_rows], labels[test_rows])
Exemple #28
0
def ModelLearning(X, y):
    """Plot learning curves of decision-tree regressors at four depths.

    For ``max_depth`` in (1, 3, 6, 10), R^2 training and testing scores at
    nine training-set sizes are computed over ten shuffled 80/20 splits and
    drawn in a 2x2 grid with +/- one standard deviation bands.
    """
    n_rows = X.shape[0]
    cv = ShuffleSplit(n_rows, n_iter=10, test_size=0.2, random_state=0)
    # Nine evenly spaced training-set sizes, up to just under 80% of rows.
    train_sizes = np.rint(np.linspace(1, n_rows * 0.8 - 1, 9)).astype(int)
    fig = pl.figure(figsize=(10, 7))

    for position, depth in enumerate([1, 3, 6, 10], start=1):
        sizes, train_scores, test_scores = curves.learning_curve(
            DecisionTreeRegressor(max_depth=depth),
            X,
            y,
            cv=cv,
            train_sizes=train_sizes,
            scoring='r2')

        # Mean and standard deviation across the splits at each size.
        train_mean = np.mean(train_scores, axis=1)
        train_std = np.std(train_scores, axis=1)
        test_mean = np.mean(test_scores, axis=1)
        test_std = np.std(test_scores, axis=1)

        ax = fig.add_subplot(2, 2, position)
        series = ((train_mean, train_std, 'r', 'Training Score'),
                  (test_mean, test_std, 'g', 'Testing Score'))
        for centre, _, colour, label in series:
            ax.plot(sizes, centre, 'o-', color=colour, label=label)
        for centre, spread, colour, _ in series:
            ax.fill_between(sizes,
                            centre - spread,
                            centre + spread,
                            alpha=0.15,
                            color=colour)

        ax.set_title('max_depth = %s' % (depth))
        ax.set_xlabel('Number of Training Points')
        ax.set_ylabel('Score')
        ax.set_xlim([0, n_rows * 0.8])
        ax.set_ylim([-0.05, 1.05])

    # The legend goes on the last panel, anchored outside the grid.
    ax.legend(bbox_to_anchor=(1.05, 2.05), loc='lower left', borderaxespad=0.)
    fig.suptitle('Decision Tree Regressor Learning Performances',
                 fontsize=16,
                 y=1.03)
    fig.tight_layout()
    fig.show()
Exemple #29
0
def ModelComplexity(X, y, max_depth=np.arange(1, 11), beta=0.5):
    """Plot F-beta training/validation scores of a decision tree vs. depth.

    Scores come from ``curves.validation_curve`` on a
    ``DecisionTreeClassifier`` over ten shuffled 80/20 splits; only the
    per-depth means are drawn.

    Args:
        X: Feature matrix; ``X.shape[0]`` sizes the splits.
        y: Labels.
        max_depth: Depth values to evaluate.
        beta: ``beta`` passed to ``fbeta_score``.
    """
    splitter = ShuffleSplit(X.shape[0],
                            n_iter=10,
                            test_size=0.2,
                            random_state=0)
    scorer = make_scorer(fbeta_score, beta=beta)

    train_scores, test_scores = curves.validation_curve(
        DecisionTreeClassifier(),
        X,
        y,
        param_name="max_depth",
        param_range=max_depth,
        cv=splitter,
        scoring=scorer)

    # Average over the splits for each depth.
    train_mean = np.mean(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)

    pl.figure(figsize=(7, 5))
    pl.title('Decision Tree Classiffier Complexity Performance')
    for centre, colour, label in ((train_mean, 'r', 'Training Score'),
                                  (test_mean, 'g', 'Validation Score')):
        pl.plot(max_depth, centre, 'o-', color=colour, label=label)

    pl.legend(loc='lower right')
    pl.xlabel('Maximum Depth')
    pl.ylabel('Score')
    pl.ylim([-0.05, 1.05])
    pl.show()
def get_corrcoef(X):
    """Compute MIC between feature 0 and features 1-19 on a 5% sample.

    A single shuffled split selects 5% of the rows. ``MINE`` scores are
    written symmetrically into a matrix initialised to ones, which is saved
    as ``feat_sim_pcc_2.csv`` under ``CODE_PATH``.
    """
    splitter = ShuffleSplit(X.shape[0],
                            n_iter=1,
                            test_size=0.05,
                            random_state=0)
    # Only the test part of the first (and only) split is kept.
    _, sample_rows = next(iter(splitter))
    X = X[np.array(sample_rows)]

    # One row per feature from here on.
    X = X.transpose()
    n_features = X.shape[0]
    pcc = np.ones((n_features, n_features))
    m = MINE()

    t = time()
    for i in range(1):
        for j in range(1, 20):
            m.compute_score(X[i], X[j])
            similarity = m.mic()
            pcc[i, j] = similarity
            pcc[j, i] = similarity
            print(i, j, pcc[i, j], time() - t)

    out_path = os.path.join(CODE_PATH, 'feat_sim_pcc_2.csv')
    np.savetxt(out_path,
               pcc,
               fmt='%.3f',
               delimiter=',')
    print('Done with computing PCC,', 'using', time() - t, 's')