def Modelcomplexity(x, y):
    # Validation curve over tree depth, scored with R^2 on 10 shuffle splits.
    cv = ShuffleSplit(x.shape[0], n_iter=10, test_size=0.2, random_state=0)
    max_depth = np.arange(1, 11)
    plt.figure(figsize=(10, 10))
    regressor = DecisionTreeRegressor()
    train_scores, test_scores = curves.validation_curve(
        regressor, x, y, param_name="max_depth", param_range=max_depth,
        cv=cv, scoring='r2')
    train_mean = np.mean(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    plt.plot(max_depth, test_mean, 'o-', color='g', label='testing scores')
    plt.plot(max_depth, train_mean, 'o-', color='r', label='training scores')
    plt.fill_between(max_depth, train_mean - train_std, train_mean + train_std,
                     color='r', alpha=0.8)
    plt.fill_between(max_depth, test_mean - test_std, test_mean + test_std,
                     color='g', alpha=0.8)
    plt.xlim([0, 11])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('maximum depth')
    plt.ylabel('scores')
    plt.legend(loc='upper right', borderaxespad=0.)
    # plt.subtitle does not exist; suptitle is the intended call, and the
    # model plotted is a regressor, not a classifier.
    plt.suptitle('DecisionTreeRegressor', fontsize=16, color='g', y=1.05)
    plt.tight_layout()
    plt.show()
    return True

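# Usage sketch (hypothetical, not from the original source): exercising
# Modelcomplexity on a synthetic regression set. Assumes the legacy-era
# imports these snippets were written against (scikit-learn < 0.18):
#   from sklearn.cross_validation import ShuffleSplit
#   from sklearn import learning_curve as curves
import numpy as np
from sklearn.datasets import make_regression

x_demo, y_demo = make_regression(n_samples=200, n_features=4, noise=10.0,
                                 random_state=0)
Modelcomplexity(x_demo, y_demo)
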
def Modellearning(x, y):
    # Learning curves for four tree depths, scored with R^2 on 10 shuffle splits.
    cv = ShuffleSplit(x.shape[0], n_iter=10, test_size=0.2, random_state=0)
    train_size = np.rint(np.linspace(1, x.shape[0] * 0.8 - 1, 9)).astype(int)
    fig = plt.figure(figsize=(10, 10))
    for k, depth in enumerate([1, 3, 6, 10]):
        regressor = DecisionTreeRegressor(max_depth=depth)
        sizes, train_scores, test_scores = curves.learning_curve(
            regressor, x, y, train_sizes=train_size, cv=cv, scoring='r2')
        ax = plt.subplot(2, 2, k + 1)
        train_mean = np.mean(train_scores, axis=1)
        test_mean = np.mean(test_scores, axis=1)
        train_std = np.std(train_scores, axis=1)
        test_std = np.std(test_scores, axis=1)
        ax.plot(sizes, test_mean, 'o-', color='g', label='testing scores')
        ax.plot(sizes, train_mean, 'o-', color='r', label='training scores')
        ax.fill_between(sizes, train_mean - train_std, train_mean + train_std,
                        color='r', alpha=0.8)
        ax.fill_between(sizes, test_mean - test_std, test_mean + test_std,
                        color='g', alpha=0.8)
        ax.set_title('max_depth = %s' % depth)
        ax.set_xlim([0, x.shape[0] * 0.8])
        ax.set_ylim([-0.05, 1.05])
        ax.set_xlabel('sizes')
        ax.set_ylabel('scores')
    ax.legend(bbox_to_anchor=(1.05, 2.05), loc='lower left', borderaxespad=0.)
    # The plotted model is a regressor, not a classifier.
    fig.suptitle('DecisionTreeRegressor', fontsize=16, color='g', y=1.05)
    fig.tight_layout()
    fig.show()
    return True

def data_split(inputfile, reads_feature):
    data = hkl.load(inputfile)
    reads_count = hkl.load(reads_feature)
    X = data['mat']
    X_kspec = data['kmer']
    # Per-sample mean read count (axis=1 assumed). The original reduced the
    # whole array to a single scalar, which breaks len(y) below.
    y = np.mean(reads_count + 1, axis=1)
    rs = ShuffleSplit(len(y), n_iter=1, random_state=1)
    X_kspec = X_kspec.reshape((X_kspec.shape[0], 1024, 4))
    X = np.concatenate((X, X_kspec), axis=1)
    X = X[:, np.newaxis]
    X = X.transpose((0, 1, 3, 2))
    # Single split: the loop runs exactly once (n_iter=1).
    for train_idx, test_idx in rs:
        X_train = X[train_idx, :]
        y_train = y[train_idx]
        X_test = X[test_idx, :]
        y_test = y[test_idx]
    X_train = X_train.astype('float32')
    y_train = y_train.astype('int32')
    X_test = X_test.astype('float32')
    y_test = y_test.astype('int32')
    return [X_train, y_train, X_test, y_test]

def get_acc_auc_randomisedCV(X, Y):
    # TODO: First get the train indices and test indices for each iteration
    # Then train the classifier accordingly
    # Report the mean accuracy and mean auc of all the iterations
    rs = ShuffleSplit(len(Y), n_iter=5, test_size=0.2,
                      random_state=RANDOM_STATE)
    accuracylist = []
    auclist = []
    for train_index, test_index in rs:
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]
        Y_pred = models_partc.logistic_regression_pred(X_train, Y_train, X_test)
        accuracy, auc, precision, recall, f1score = \
            models_partc.classification_metrics(Y_pred, Y_test)
        accuracylist.append(accuracy)
        auclist.append(auc)
    return mean(accuracylist), mean(auclist)

def fit(self, x, y, validation_proportion=0.1):
    n_obs, self.n_features = x.shape
    self.n_classes = np.max(y) + 1
    rs = ShuffleSplit(n_obs, n_iter=1, test_size=validation_proportion,
                      random_state=self.random_state)
    # Grab the single train/validation split (n_iter=1).
    for train_index, valid_index in rs:
        pass
    df = ProcessBatch(x[train_index], y[train_index])
    self._construct_graph(self.n_features, self.n_classes)
    self.sess.run(tf.initialize_all_variables())
    logger = PrintMess()
    if self.verbose:
        logger.info(header=True, Iter=0, TrnLoss=0, ValScore=0)
    for i in range(int(self.n_iter * n_obs / self.batch_size)):
        x_batch, y_batch = df.next_batch(self.batch_size)
        res = self.step(x_batch, y_batch)
        if (i % 40 == 0) and self.verbose:
            yhat = self.predict(x[valid_index])
            score = accuracy_score(y[valid_index], yhat)
            logger.info(header=False, Iter=i, TrnLoss=res[0], ValScore=score)

def evaluate(X, args):
    enum = ShuffleSplit(len(X), n_iter=args.n_iterations,
                        test_size=args.test_size)
    train_scores = []
    test_scores = []
    for train_index, test_index in enum:
        X_train = [X[idx] for idx in train_index]
        X_test = [X[idx] for idx in test_index]
        X_train, X_test = preprocess_datasets(X_train, X_test, args)
        model = GaussianHMM(n_states=args.n_states,
                            n_training_iterations=args.n_training_iterations,
                            topology=args.topology)
        model.fit(X_train)
        train_scores.extend(
            [model.loglikelihood(X_curr) for X_curr in X_train])
        test_scores.extend(
            [model.loglikelihood(X_curr) for X_curr in X_test])
    train_scores_array = np.array(train_scores)
    train_mean = float(np.mean(train_scores_array))
    train_std = float(np.std(train_scores_array))
    test_scores_array = np.array(test_scores)
    test_mean = float(np.mean(test_scores_array))
    test_std = float(np.std(test_scores_array))
    return train_mean, train_std, test_mean, test_std

def mean_decrease_accuracy_regression(df, Y, black_list=()):
    # Directly measures each feature's impact on model accuracy: shuffle the
    # values of one feature at a time and measure how the reordering changes
    # the model's score. For unimportant features the shuffle barely matters;
    # for important features it clearly degrades the model's accuracy.
    rf = RandomForestRegressor()
    scores = defaultdict(list)
    X_src = df.drop(list(black_list), axis=1)
    X = X_src.values
    names = X_src.columns
    # Cross-validate the scores on a number of different random splits of the data.
    for train_idx, test_idx in ShuffleSplit(len(X), 100, .3):
        X_train, X_test = X[train_idx], X[test_idx]
        Y_train, Y_test = Y[train_idx], Y[test_idx]
        rf.fit(X_train, Y_train)
        acc = r2_score(Y_test, rf.predict(X_test))
        for i in range(X.shape[1]):
            X_t = X_test.copy()
            np.random.shuffle(X_t[:, i])
            shuff_acc = r2_score(Y_test, rf.predict(X_t))
            scores[names[i]].append((acc - shuff_acc) / acc)
    # Map feature -> mean relative score drop. The original swapped keys and
    # values, so two features with equal rounded scores would collide.
    return dict((feat, round(np.mean(score), 4))
                for feat, score in scores.items())

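# Usage sketch (hypothetical data, not from the original source): permutation
# importance on a small synthetic frame. pandas and make_regression are
# stand-ins here; the 100 shuffle splits above make this deliberately slow.
import pandas as pd
from sklearn.datasets import make_regression

X_demo, y_demo = make_regression(n_samples=300, n_features=4, noise=5.0,
                                 random_state=0)
df_demo = pd.DataFrame(X_demo, columns=['f0', 'f1', 'f2', 'f3'])
print(mean_decrease_accuracy_regression(df_demo, y_demo))
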
def __init__(self, user_ids, item_ids, n_iter=10, test_size=0.2,
             cold_start=False, random_seed=None):
    """
    Options:
    - test_size: the fraction of the dataset to be used as the test set.
    - cold_start: if True, test_size of items will be randomly selected to
      be in the test set and removed from the training set. When False,
      test_size of all training pairs are moved to the test set.
    """
    self.user_ids = user_ids
    self.item_ids = item_ids
    self.no_interactions = len(self.user_ids)
    self.n_iter = n_iter
    self.test_size = test_size
    self.cold_start = cold_start
    # Pass random_seed through; the original accepted it but never used it.
    self.shuffle_split = ShuffleSplit(self.no_interactions,
                                      n_iter=self.n_iter,
                                      test_size=self.test_size,
                                      random_state=random_seed)

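# Standalone sketch of the same splitting logic with the legacy API (assumes
# scikit-learn < 0.18; the names here are illustrative, not from the original
# class, whose name is not shown in this snippet):
import numpy as np
from sklearn.cross_validation import ShuffleSplit

user_ids = np.arange(100) % 10
item_ids = np.arange(100) % 7
shuffle_split = ShuffleSplit(len(user_ids), n_iter=3, test_size=0.2)
for train_idx, test_idx in shuffle_split:
    # Each iteration yields disjoint train/test index arrays over interactions.
    train_pairs = list(zip(user_ids[train_idx], item_ids[train_idx]))
    print(len(train_pairs), len(test_idx))
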
def get_acc_auc_randomisedCV(X, Y, iterNo=5, test_percent=0.2):
    # TODO: First get the train indices and test indices for each iteration
    # Then train the classifier accordingly
    # Report the mean accuracy and mean auc of all the iterations
    accuracy_arr = []
    auc_arr = []
    # Use the iterNo / test_percent arguments; the original hard-coded 5 and 0.2.
    shuffle_split = ShuffleSplit(n=X.get_shape()[0], n_iter=iterNo,
                                 test_size=test_percent,
                                 random_state=545510477)
    for train_i, test_i in shuffle_split:
        X_train, X_test = X[train_i], X[test_i]
        Y_train, Y_test = Y[train_i], Y[test_i]
        Y_pred = models.logistic_regression_pred(X_train, Y_train, X_test)
        acc, auc_, precision, recall, f1score = \
            models.classification_metrics(Y_pred, Y_test)
        accuracy_arr.append(acc)
        auc_arr.append(auc_)
    return sum(accuracy_arr) / len(accuracy_arr), sum(auc_arr) / len(auc_arr)

def run_grid_search(estimator, param_grid, metric, X, y, X_test, y_test,
                    seed, profile):
    _train_test_iter = KFold(X.shape[0], n_folds=5, shuffle=True,
                             random_state=seed)
    inner_cv_func = lambda zx, zy: ShuffleSplit(zx.shape[0], n_iter=10,
                                                test_size=0.2,
                                                random_state=seed)
    if metric == 'cindex':
        scoring_func = score_concordance_index
    else:
        scoring_func = score_time_roc
    _grid_search = NestedGridSearchCV(estimator, param_grid, scoring_func,
                                      cv=_train_test_iter,
                                      inner_cv=inner_cv_func,
                                      profile=profile)
    _grid_search.fit(X, y, X_test=X_test, y_test=y_test)
    return _grid_search

def dict_train_test_split(dictionary, train_size, cap_train=None,
                          cap_test=None):
    d_list = list(dictionary.items())
    # An integer train_size > 1 is interpreted as an absolute count.
    if isinstance(train_size, int) and train_size > 1:
        train_size /= float(len(d_list))
    test_size = 1. - train_size
    indices_train, indices_test = next(
        iter(ShuffleSplit(len(d_list), n_iter=1, test_size=test_size)))
    d_train_list = [d_list[index_train] for index_train in indices_train]
    d_test_list = [d_list[index_test] for index_test in indices_test]
    if cap_train is not None:
        d_train_list = d_train_list[0:cap_train]
    if cap_test is not None:
        d_test_list = d_test_list[0:cap_test]
    return dict(d_train_list), dict(d_test_list)

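# Usage sketch (hypothetical input): split a five-entry dict 60/40 by keys.
demo_dict = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5}
d_train, d_test = dict_train_test_split(demo_dict, train_size=0.6)
print(sorted(d_train), sorted(d_test))
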
def mean_decrease_accuracy(x, y, model, names, score_type):
    # Permutation importance: shuffle one column at a time, record the
    # relative R^2 drop, then min-max normalize the per-feature means.
    scores = defaultdict(list)
    X = np.matrix(x)
    Y = np.array(y)
    for train_idx, test_idx in ShuffleSplit(len(x), 100, .3):
        X_train, X_test = X[train_idx], X[test_idx]
        Y_train, Y_test = Y[train_idx], Y[test_idx]
        model.fit(X_train, Y_train)
        acc = r2_score(Y_test, model.predict(X_test))
        for i in range(X.shape[1]):
            X_t = X_test.copy()
            np.random.shuffle(X_t[:, i])
            shuff_acc = r2_score(Y_test, model.predict(X_t))
            scores[names[i]].append((acc - shuff_acc) / acc)
    scored = [round(np.mean(score), 4) for feat, score in scores.items()]
    maxval = max(scored)
    minval = min(scored)
    dist = maxval - minval
    return list(zip((np.array(scored) - minval) / dist,
                    [el[0] for el in scores.items()]))

def cross_validation(data, validation_percent):
    final_results = []
    # Legacy ShuffleSplit accepts an integer test_size as an absolute count.
    rs = ShuffleSplit(len(data), n_iter=10,
                      test_size=int(len(data) * validation_percent))
    for train_index, test_index in rs:
        test_data = data.iloc[test_index]
        geo_y = predict_geo_y(data, train_index, test_index)
        temporal_y = predict_temporal_y(data, train_index, test_index)
        predicted_y = second_learner(np.array([geo_y, temporal_y]).T,
                                     test_data.y)
        current_result = np.mean(
            np.power(np.array(predicted_y) - np.array(test_data.y), 2))
        final_results.append(current_result)
        print(np.mean(np.power(np.array(geo_y) - np.array(test_data.y), 2)),
              np.mean(np.power(np.array(temporal_y) - np.array(test_data.y), 2)))
        print(current_result)
    print(final_results)

def AllDataDeal(X_data, X_target):
    X_data = np.array(X_data)
    X_target = np.array(X_target)
    # Feature names are kept as-is; they are data labels in the source set.
    names = ['BMI', '肺活量', '立定跳远', '坐位体前屈', '仰卧起坐/引体向上',
             '50米跑', '长跑时间']
    rf = RandomForestRegressor(max_features='sqrt')
    scores = []
    score_value = []
    score_name = []
    # Model with each feature individually and cross-validate.
    for i in range(len(names)):
        score = cross_val_score(rf, X_data[:, i:i + 1], X_target,
                                scoring="r2",
                                cv=ShuffleSplit(len(X_data), 3, .3))
        scores.append((format(np.abs(np.mean(score)), '.3f'), names[i]))
        score_value.append(abs(np.mean(score)))
        score_name.append(names[i])
    return sorted(scores, reverse=True)

def quick_cv(clf, X, y, score_func, n_iter=3, test_size=0.1,
             random_state=None):
    """Returns the mean cross-validation score over n_iter shuffle splits."""
    cv = ShuffleSplit(
        y.shape[0],
        n_iter=n_iter,
        test_size=test_size,
        random_state=random_state,
    )
    scores = []
    for train, test in cv:
        X_train, X_test = X[train], X[test]
        y_train, y_test = y[train], y[test]
        preds = fit_predict(clf, X_train, y_train, X_test)
        scores.append(score_func(y_test, preds))
    return sum(scores) / float(len(scores))

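# Usage sketch (hypothetical): quick_cv with a plain classifier and accuracy.
# fit_predict is defined elsewhere in the original module; a minimal stand-in
# is provided here so the sketch runs on its own.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

def fit_predict(clf, X_train, y_train, X_test):  # stand-in, not the original
    return clf.fit(X_train, y_train).predict(X_test)

X_demo, y_demo = make_classification(n_samples=200, random_state=0)
print(quick_cv(LogisticRegression(), X_demo, y_demo, accuracy_score,
               n_iter=5, random_state=0))
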
def main():
    # Get the processed data.
    X, y = preprocess_data()
    # The dummy classifier is very important: it creates a baseline!
    dummy_clf = get_dummy_clf()
    dummy_clf.fit(X, y)
    y_hat = dummy_clf.predict(X)  # the original predicted on y by mistake
    # Baseline predictions for the x and y targets.
    print("Dummy MSE x", mse(y[:, 0], y_hat[:, 0]))
    print("Dummy MSE y", mse(y[:, 1], y_hat[:, 1]))
    # Create 5 different cross-validation folds.
    ss = ShuffleSplit(len(y), n_iter=5, random_state=0)
    scores_x = []
    scores_y = []
    for i, (train_index, test_index) in enumerate(ss):
        # Choose a classifier.
        # clf = get_linear_clf()
        clf = get_nn_clf()
        clf.fit(X[train_index], y[train_index])
        y_hat = clf.predict(X[test_index])
        # Save the score for each fold.
        score_x = mse(y[test_index, 0], y_hat[:, 0])
        score_y = mse(y[test_index, 1], y_hat[:, 1])
        # For the linear classifier you can print coefficients/intercept:
        # print(clf.steps[-1][1].coef_, clf.steps[-1][1].intercept_)
        scores_x.append(score_x)
        scores_y.append(score_y)
    print(scores_x, scores_y)
    print("MSE CV x", np.array(scores_x).mean())
    print("MSE CV y", np.array(scores_y).mean())

def drawrocada(X, Y):
    rs = ShuffleSplit(len(Y), 5, 0.2)
    i = 10
    for train_index, test_index in rs:
        clf = AdaBoostClassifier()
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]
        clf.fit(X_train, Y_train)
        pre = clf.predict_proba(X_test)
        y_test_prob = pre[:, 1]
        y_test = Y_test
        fpr, tpr, _ = roc_curve(y_test, y_test_prob)
        roc_auc = auc(fpr, tpr)
        # Plot the ROC curve for this split (the AUC label was commented out
        # in the original, leaving roc_auc unused).
        plt.figure()
        plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('AdaBoost Classifier')
        plt.legend(loc="lower right")
        plt.savefig("pic" + str(i))
        i = i + 1

def get_acc_auc_randomisedCV(X, Y, iterNo=5, test_percent=0.2):
    # TODO: First get the train indices and test indices for each iteration
    # Then train the classifier accordingly
    # Report the mean accuracy and mean auc of all the iterations
    ss = ShuffleSplit(len(Y), n_iter=iterNo, test_size=test_percent,
                      random_state=RANDOM_STATE)
    clf_lr_ss = LogisticRegression()
    acc_list = []
    auc_list = []
    for train, test in ss:
        clf_lr_ss.fit(X[train], Y[train])
        acc = accuracy_score(Y[test], clf_lr_ss.predict(X[test]))
        acc_list.append(acc)
        # roc_auc_score expects (y_true, y_score); the original reversed them.
        auc_ = roc_auc_score(Y[test], clf_lr_ss.predict(X[test]))
        auc_list.append(auc_)
    acc_k = array(acc_list).mean()  # was array(acc), averaging a lone scalar
    auc_k = array(auc_list).mean()
    return acc_k, auc_k

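# Usage sketch (hypothetical): RANDOM_STATE, LogisticRegression, array,
# accuracy_score and roc_auc_score all come from the surrounding module in
# the original; RANDOM_STATE is stood in here purely for illustration.
from sklearn.datasets import make_classification

RANDOM_STATE = 545510477  # assumption: a module-level constant in the source
X_demo, y_demo = make_classification(n_samples=300, random_state=0)
print(get_acc_auc_randomisedCV(X_demo, y_demo, iterNo=5, test_percent=0.2))
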
def predict_with_one(X, out_file_name):
    # For every (i, j) feature pair, train on column i to predict column j and
    # accumulate the mean absolute error over shuffle-split rounds.
    n_samples, n_features = X.shape
    iter_num = 3
    div = ShuffleSplit(n_samples, n_iter=iter_num, test_size=0.2,
                       random_state=0)
    model = ExtraTreesRegressor(n_estimators=5)
    score_matrix = np.zeros((n_features, n_features))
    t = time()
    round_num = 0
    for train, test in div:
        round_num += 1
        train_samples = X[np.array(train)]
        test_samples = X[np.array(test)]
        for i in range(n_features):
            for j in range(n_features):
                X_train = train_samples[:, i:i + 1]
                X_test = test_samples[:, i:i + 1]
                y_train = train_samples[:, j]
                y_test = test_samples[:, j]
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                mae = mean_absolute_error(y_test, y_pred)
                score_matrix[i, j] += mae
                print('Round', round_num, '|', i, j, mae, time() - t)
    np.savetxt(os.path.join(CODE_PATH, out_file_name),
               score_matrix / iter_num, fmt='%.3f', delimiter=',')

def fit(self, X, y):
    '''
    Calculates the feature importances and an accuracy metric using
    cross validation.

    Usage: imp, acc = fit(X, y)

    Arguments:
    X: feature vector, numpy array
    y: label vector, numpy array

    Return values:
    imp: feature importance vector
    acc: estimator accuracy metric
    '''
    scores = defaultdict(list)  # any unknown key is automatically a list
    rf = copy.deepcopy(self.clf)
    # Cross-validate the scores on a number of different random splits of the data.
    outAcc = 0.
    for train_idx, test_idx in ShuffleSplit(len(X), self.nCV, .3):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        rf.fit(X_train, y_train)  # was Y_train, an undefined name
        # Accuracy metric (assumes the metric choice lives on self, matching
        # self.clf / self.nCV / self.algorithm above).
        if self.metric is None:
            outAcc = None
        elif self.metric == 'OOB':
            outAcc += rf.oob_score_  # fitted attribute, not the constructor flag
        elif self.metric == 'AUC':
            outAcc += sklearn.metrics.roc_auc_score(
                y_test, rf.predict_proba(X_test)[:, 1])
        if self.algorithm == 'gini':
            imp = self.giniImportance(rf, X_test, y_test)
        elif self.algorithm == 'permutation':
            imp = self.permutationImportance(rf, X_test, y_test)
        elif self.algorithm == 'conditional':
            imp = self.conditionalPermutationImportance(rf, X_test, y_test)
        # Accumulate per-feature scores; the original indexed an undefined i.
        for i in range(X.shape[1]):
            scores[i].append(imp[i])
    # Return mean importance and metric.
    importances = np.array([np.mean(scores[i]) for i in range(X.shape[1])])
    if outAcc is None:
        return importances, None
    return importances, outAcc / float(self.nCV)

def trainTest(clf, X, y, fold=10.0, classn=2, returnconfusion=False):
    kf = ShuffleSplit(len(y), n_iter=int(fold), test_size=0.25, random_state=0)
    accuracy = 0.0
    confusion = np.zeros([classn, classn])
    for train_index, test_index in kf:
        X_train = X[train_index]
        y_train = y[train_index]
        X_test = X[test_index]
        expected = y[test_index]
        clf.fit(X_train, y_train)
        predicted = clf.predict(X_test)
        accy_tmp = metrics.accuracy_score(expected, predicted)
        accuracy += accy_tmp
        conf_tmp = metrics.confusion_matrix(expected, predicted)
        confusion += conf_tmp
        print("predicted rate: %f" % accy_tmp)
    print(confusion)
    print(accuracy / fold)
    if returnconfusion:
        return confusion

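# Usage sketch (hypothetical): shuffle-split evaluation of an SVM on a
# synthetic binary problem; assumes `from sklearn import metrics` as above.
from sklearn import svm
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=200, random_state=1)
trainTest(svm.SVC(), X_demo, y_demo, fold=5)
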
def train_model(clc_factory, X, Y, testdata):
    print('start train_model...')
    # Set the random state for deterministic behavior. Just for kaggle:
    # nearly all data goes to training (test_size=0.001).
    cv = ShuffleSplit(n=len(X), n_iter=1, test_size=0.001, indices=True,
                      random_state=0)
    for train, test in cv:
        # Just for kaggle: use all the data.
        X_test = testdata
        X_train, Y_train = X[train], Y[train]
        clf = clc_factory()
        clf.fit(X_train, Y_train)
        while True:
            text = input('Please input your data: ')
            if text == 'exit()':  # check before predicting, unlike the original
                return None
            result = clf.predict([text])[0]
            print('The sentiment polarity of your input text is: %s'
                  % ('Positive' if result == 1 else 'Negative'))

def __grid_search_model(clf_factory, X, Y):
    cv = ShuffleSplit(n=len(X), n_iter=10, test_size=0.3, random_state=0)
    param_grid = dict(
        vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
        vect__min_df=[1, 2],
        vect__smooth_idf=[False, True],
        vect__use_idf=[False, True],
        vect__sublinear_tf=[False, True],
        vect__binary=[False, True],
        clf__alpha=[0, 0.01, 0.05, 0.1, 0.5, 1],
    )
    grid_search = GridSearchCV(clf_factory(),
                               param_grid=param_grid,
                               cv=cv,
                               score_func=f1_score,
                               verbose=10)
    grid_search.fit(X, Y)
    clf = grid_search.best_estimator_
    print(clf)
    return clf

def get_best_gb_regressor_model(X, y):
    '''
    Gets the best GradientBoost regressor model based on grid search,
    trained on data [X, y].
    '''
    # Create cross-validation sets from the training data.
    cv_sets = ShuffleSplit(X.shape[0], n_iter=10, test_size=0.20,
                           random_state=42)
    # Create the gradient boost regressor object.
    regressor = GradientBoostingRegressor(random_state=42)
    # Params to tune.
    gs_params = {
        'n_estimators': [2000, 3000],
        'learning_rate': [0.01, 0.05],
        'max_depth': [3, 4, 5],
        'min_samples_leaf': [20, 26],
        'min_samples_split': [2, 5, 10],
        # 'max_leaf_nodes': [None, 5]
    }
    # Use R^2 as the scoring function.
    gs_scoring_func = make_scorer(perf_metric_r2)
    # Create the grid search object and fit the data.
    grid_search = GridSearchCV(regressor, param_grid=gs_params,
                               scoring=gs_scoring_func, cv=cv_sets)
    model = grid_search.fit(X, y)
    # Print the optimal params.
    print("GradientBoosting")
    print(model.best_params_)
    return model

def gridsearch(X, y, weight):
    model = Pipeline([('vect', CountVectorizer(tokenizer=tokenize_filtered)),
                      ('clf', SVC())])
    param_range = np.logspace(-4, 3, 8)
    param_grid = [{
        'clf__C': param_range,
        'clf__gamma': param_range,
        'clf__kernel': ['rbf']
    }]
    cv = ShuffleSplit(len(X))  # the original hard-coded the dataset size, 6422
    gs = GridSearchCV(estimator=model,
                      param_grid=param_grid,
                      fit_params={'clf__sample_weight': weight},  # fixed typo
                      cv=cv,
                      scoring='recall',
                      n_jobs=2)
    gs.fit(X, y)
    print('best score :', gs.best_score_)
    print('best params :', gs.best_params_)
    print(gs.grid_scores_)

def train_model(clf_factory, X, Y):
    # Setting random state to get deterministic behavior.
    cv = ShuffleSplit(n=len(X), n_iter=10, test_size=0.3, indices=True,
                      random_state=0)
    train_errors = []
    test_errors = []
    scores = []
    precisions, recalls, thresholds = [], [], []
    precision_recall_scores = []
    for train_index, test_index in cv:
        X_train, y_train = X[train_index], Y[train_index]
        X_test, y_test = X[test_index], Y[test_index]
        clf = clf_factory()  # the factory must be called to build a classifier
        clf.fit(X_train, y_train)
        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)
        scores.append(test_score)
        probability = clf.predict_proba(X_test)
        precision, recall, pr_thresholds = precision_recall_curve(
            y_test, probability[:, 1])
        precision_recall_scores.append(auc(recall, precision))
        precisions.append(precision)
        recalls.append(recall)
        thresholds.append(pr_thresholds)
    return (scores, precision_recall_scores, precisions, recalls, thresholds,
            test_errors, train_errors)

def split_data(city_data):
    """Randomly shuffle the sample set. Divide it into 70 percent training
    and 30 percent testing data."""
    # Get the features and labels from the Boston housing data.
    X, y = city_data.data, city_data.target
    from sklearn.cross_validation import ShuffleSplit
    ss = ShuffleSplit(len(X), n_iter=1, test_size=0.3, random_state=0)
    # Grab the single train/test split (n_iter=1).
    for train_indices, test_indices in ss:
        pass
    X_train = X[train_indices]
    y_train = y[train_indices]
    X_test = X[test_indices]
    y_test = y[test_indices]
    return X_train, y_train, X_test, y_test

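# Usage sketch: split_data expects any object with .data and .target, e.g.
# the Boston housing set this snippet was written against (available in
# scikit-learn of that era).
from sklearn.datasets import load_boston

X_train, y_train, X_test, y_test = split_data(load_boston())
print(X_train.shape, X_test.shape)
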
def ModelLearning(X, y):
    # Calculates the performance of several models with varying sizes of
    # training data. The learning and testing scores for each model are then
    # plotted over 10 cross-validation sets.
    cv = ShuffleSplit(X.shape[0], n_iter=10, test_size=0.2, random_state=0)
    # Generate nine training set sizes spanning up to 80% of the data.
    train_sizes = np.rint(np.linspace(1, X.shape[0] * 0.8 - 1, 9)).astype(int)
    # Create the figure window.
    fig = pl.figure(figsize=(10, 7))
    # Create four different models based on max_depth.
    for k, depth in enumerate([1, 3, 6, 10]):
        # Create a decision tree regressor at max_depth = depth.
        regressor = DecisionTreeRegressor(max_depth=depth)
        # Calculate the training and testing scores.
        sizes, train_scores, test_scores = curves.learning_curve(
            regressor, X, y, cv=cv, train_sizes=train_sizes, scoring='r2')
        # Find the mean and standard deviation for smoothing.
        train_std = np.std(train_scores, axis=1)
        train_mean = np.mean(train_scores, axis=1)
        test_std = np.std(test_scores, axis=1)
        test_mean = np.mean(test_scores, axis=1)
        # Subplot the learning curve.
        ax = fig.add_subplot(2, 2, k + 1)
        ax.plot(sizes, train_mean, 'o-', color='r', label='Training Score')
        ax.plot(sizes, test_mean, 'o-', color='g', label='Testing Score')
        ax.fill_between(sizes, train_mean - train_std,
                        train_mean + train_std, alpha=0.15, color='r')
        ax.fill_between(sizes, test_mean - test_std,
                        test_mean + test_std, alpha=0.15, color='g')
        # Labels
        ax.set_title('max_depth = %s' % (depth))
        ax.set_xlabel('Number of Training Points')
        ax.set_ylabel('Score')
        ax.set_xlim([0, X.shape[0] * 0.8])
        ax.set_ylim([-0.05, 1.05])
    # Visual
    ax.legend(bbox_to_anchor=(1.05, 2.05), loc='lower left', borderaxespad=0.)
    fig.suptitle('Decision Tree Regressor Learning Performances',
                 fontsize=16, y=1.03)
    fig.tight_layout()
    fig.show()

def ModelComplexity(X, y, max_depth=np.arange(1, 11), beta=0.5):
    """
    Calculates the performance of the model as model complexity increases.
    The training and validation scores are then plotted.
    """
    # Create 10 cross-validation sets for training and testing.
    cv = ShuffleSplit(X.shape[0], n_iter=10, test_size=0.2, random_state=0)
    # Calculate the training and testing scores.
    train_scores, test_scores = curves.validation_curve(
        DecisionTreeClassifier(), X, y, param_name="max_depth",
        param_range=max_depth, cv=cv,
        scoring=make_scorer(fbeta_score, beta=beta))
    # Find the mean and standard deviation for smoothing.
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    # Plot the validation curve.
    pl.figure(figsize=(7, 5))
    pl.title('Decision Tree Classifier Complexity Performance')
    pl.plot(max_depth, train_mean, 'o-', color='r', label='Training Score')
    pl.plot(max_depth, test_mean, 'o-', color='g', label='Validation Score')
    # Optional smoothing bands, disabled in the original:
    # pl.fill_between(max_depth, train_mean - train_std,
    #                 train_mean + train_std, alpha=0.15, color='r')
    # pl.fill_between(max_depth, test_mean - test_std,
    #                 test_mean + test_std, alpha=0.15, color='g')
    # Visual aesthetics
    pl.legend(loc='lower right')
    pl.xlabel('Maximum Depth')
    pl.ylabel('Score')
    pl.ylim([-0.05, 1.05])
    pl.show()

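# Usage sketch (hypothetical): fbeta-scored complexity curve on synthetic
# binary labels. Assumes pl is matplotlib.pylab and the legacy imports
# (ShuffleSplit, curves, make_scorer, fbeta_score, DecisionTreeClassifier)
# from the surrounding module.
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=400, random_state=0)
ModelComplexity(X_demo, y_demo, beta=0.5)
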
def get_corrcoef(X):
    # Subsample 5% of the rows, then compute pairwise MIC (maximal information
    # coefficient) between features. Despite the pcc/PCC naming, this uses
    # MINE's MIC; np.corrcoef would give Pearson's r instead.
    div = ShuffleSplit(X.shape[0], n_iter=1, test_size=0.05, random_state=0)
    for train, test in div:
        X = X[np.array(test)]
        break
    X = X.transpose()
    pcc = np.ones((X.shape[0], X.shape[0]))
    m = MINE()
    t = time()
    for i in range(0, 1):
        for j in range(1, 20):
            m.compute_score(X[i], X[j])
            pcc[i, j] = pcc[j, i] = m.mic()  # np.corrcoef(X[i], X[j])[0, 1]
            print(i, j, pcc[i, j], time() - t)
    np.savetxt(os.path.join(CODE_PATH, 'feat_sim_pcc_2.csv'), pcc,
               fmt='%.3f', delimiter=',')
    print('Done computing the MIC similarity matrix, using', time() - t, 's')

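# Note on the API used throughout: every snippet above relies on the legacy
# sklearn.cross_validation.ShuffleSplit (deprecated in scikit-learn 0.18,
# removed in 0.20). A minimal sketch of the modern equivalent, for reference:
import numpy as np
from sklearn.model_selection import ShuffleSplit  # modern location

X = np.arange(20).reshape(10, 2)
ss = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)  # n_iter -> n_splits
for train_idx, test_idx in ss.split(X):  # iterate via .split(X), not the object
    pass
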