def perform_classifier_cross_validation(classifier, dtm_train, targets_train, dtm_test, targets_test):
    cv = 3
    k_fold = KFold(len(targets_train), n_folds=cv, shuffle=True, random_state=42)
    scoring = 'f1_macro'
    scores = cross_validation.cross_val_score(classifier, dtm_train, targets_train, cv=k_fold, scoring=scoring)
    print("Same classifier with cross validation:")
    print("Scores for folds (" + str(cv) + "): " + str(scores))
    print(scoring + ": %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    targets_train_predicted = cross_validation.cross_val_predict(classifier, dtm_train, targets_train, cv=cv)
    print_classifier_metrics(targets_train, targets_train_predicted, "train-with-cv")
    targets_test_predicted = cross_validation.cross_val_predict(classifier, dtm_test, targets_test, cv=cv)
    print_classifier_metrics(targets_test, targets_test_predicted, "test-with-cv")
    return classifier
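# Note: `print_classifier_metrics` is not defined in the snippet above. A minimal
# sketch of what such a helper might look like, assuming it only wraps the standard
# sklearn.metrics reports; the signature is taken from the call sites above, the body
# is an assumption for illustration.
from sklearn import metrics

def print_classifier_metrics(targets_true, targets_predicted, label):
    # Print accuracy, a per-class report and the confusion matrix for one split.
    print("Metrics for " + label)
    print("accuracy: %0.4f" % metrics.accuracy_score(targets_true, targets_predicted))
    print(metrics.classification_report(targets_true, targets_predicted))
    print(metrics.confusion_matrix(targets_true, targets_predicted))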
def apply_cross_validated_learning(datasetname, X, y, resultsfolder, nfolds=5):
    dataspacename = datasetname + "_nfolds-" + str(nfolds)
    experimentrootpath = IOtools.ensure_dir(os.path.join(resultsfolder, dataspacename))
    scorefilepath = os.path.join(experimentrootpath, metaexperimentation.scorefilename + ".csv")
    metaexperimentation.initialize_score_file(scorefilepath)

    # SVM
    kernels = ["linear", "rbf", "sigmoid", "poly"]
    Cs = [1, 10, 100, 1000]
    for kernel in kernels:
        for c in Cs:
            alg = "SVM"
            modelname = "_m-" + alg + "_k-" + kernel + "_C-" + str(c)
            experimentname = "nfolds-" + str(nfolds) + modelname
            clf = svm.SVC(kernel=kernel, C=c)
            ypredicted = cross_validation.cross_val_predict(clf, X, y, cv=nfolds)
            #print metrics.accuracy_score(y, ypredicted)
            reportresults(y, ypredicted, experimentname, experimentrootpath, scorefilepath)

    # Naive Bayes
    NBmodels = [naive_bayes.MultinomialNB(), naive_bayes.GaussianNB()]
    for nbmodel in NBmodels:
        alg = "NB"
        modelname = "_m-" + nbmodel.__class__.__name__
        experimentname = "nfolds-" + str(nfolds) + modelname
        ypredicted = cross_validation.cross_val_predict(nbmodel, X, y, cv=nfolds)
        reportresults(y, ypredicted, experimentname, experimentrootpath, scorefilepath)
def main():
    parser = argparse.ArgumentParser(description='Train an ML model')
    required = parser.add_argument_group('required options')
    required.add_argument('-x', '--trainfile', required=True, help='File containing training data')
    required.add_argument('-y', '--targetfile', required=True, help='File containing target data')
    #required.add_argument('-o', '--modelfile', required=True, help='Output filename for trained model object')
    #required.add_argument('-t', '--targettype', default=int)
    args = parser.parse_args()

    #X = np.loadtxt(args.trainfile, skiprows=1)
    X = np.loadtxt(args.trainfile)
    #Y = np.loadtxt(args.targetfile, dtype=args.targettype)
    #Y = np.loadtxt(args.targetfile)
    Y = np.genfromtxt(args.targetfile, dtype='str')
    assert len(X) == len(Y), "length mismatch between train and target data"

    clf1 = linear_model.LogisticRegression(penalty='l2', C=1e5, solver='newton-cg', tol=0.00001)
    clf1.fit(X, Y)
    predicted1 = cross_validation.cross_val_predict(clf1, X, Y, cv=2)
    print("Prediction accuracy of logistic regression : ", metrics.accuracy_score(Y, predicted1))
    #predicted=cross_validation.cross_val_predict(clf1,x,x_tr,cv=2)

    clf2 = svm.SVC(C=1e5, kernel='rbf')
    clf2.fit(X, Y)
    predicted2 = cross_validation.cross_val_predict(clf2, X, Y, cv=2)
    print("Prediction accuracy of SVM : ", metrics.accuracy_score(Y, predicted2))

    clf3 = naive_bayes.BernoulliNB(alpha=1.9)
    clf3.fit(X, Y)
    predicted3 = cross_validation.cross_val_predict(clf3, X, Y, cv=2)
    print("Prediction accuracy of naive bayes : ", metrics.accuracy_score(Y, predicted3))

    clf4 = tree.DecisionTreeClassifier(criterion='entropy')
    clf4.fit(X, Y)
    predicted4 = cross_validation.cross_val_predict(clf4, X, Y, cv=2)
    print("Prediction accuracy of decision trees : ", metrics.accuracy_score(Y, predicted4))

    #with open(args.modelfile, "wb") as outfile:
    #    pickle.dump(clf1, outfile, pickle.HIGHEST_PROTOCOL)
    with open('bin_file_lr', "wb") as outfile1:
        pickle.dump(clf1, outfile1, pickle.HIGHEST_PROTOCOL)
    with open('bin_file_svm', "wb") as outfile2:
        pickle.dump(clf2, outfile2, pickle.HIGHEST_PROTOCOL)
    with open('bin_file_bayes', "wb") as outfile3:
        pickle.dump(clf3, outfile3, pickle.HIGHEST_PROTOCOL)
    with open('bin_file_dtree', "wb") as outfile4:
        pickle.dump(clf4, outfile4, pickle.HIGHEST_PROTOCOL)
def test_cross_val_predict_sparse_prediction():
    # check that cross_val_predict gives same result for sparse and dense input
    X, y = make_multilabel_classification(n_classes=2, n_labels=1,
                                          allow_unlabeled=False,
                                          return_indicator=True,
                                          random_state=1)
    X_sparse = csr_matrix(X)
    y_sparse = csr_matrix(y)
    classif = OneVsRestClassifier(SVC(kernel="linear"))
    preds = cval.cross_val_predict(classif, X, y, cv=10)
    preds_sparse = cval.cross_val_predict(classif, X_sparse, y_sparse, cv=10)
    preds_sparse = preds_sparse.toarray()
    assert_array_almost_equal(preds_sparse, preds)
def test_cross_val_predict_pandas():
    # check cross_val_score doesn't destroy pandas dataframe
    types = [(MockDataFrame, MockDataFrame)]
    try:
        from pandas import Series, DataFrame
        types.append((Series, DataFrame))
    except ImportError:
        pass
    for TargetType, InputFeatureType in types:
        # X dataframe, y series
        X_df, y_ser = InputFeatureType(X), TargetType(y)
        check_df = lambda x: isinstance(x, InputFeatureType)
        check_series = lambda x: isinstance(x, TargetType)
        clf = CheckingClassifier(check_X=check_df, check_y=check_series)
        cval.cross_val_predict(clf, X_df, y_ser)
def run(params):
    train = loadDataFrame(params, 'train')
    if params['test']:
        test = loadDataFrame(params, 'test')
    train = runPreprocess(train, params)
    clf = getSpecifiedClf(params)
    try:
        dataset, target = splitDatasetTarget(train, params['target'])
    except:
        raise Exception('Target not specified')
    try:
        cross_val = params['cross_validate']
    except:
        cross_val = False
    clfName = getNameFromModel(clf)
    if cross_val and clfName != 'XGBClassifier':
        print('Beginning cross validation')
        predicted = cross_validation.cross_val_predict(clf, dataset, target, cv=5, n_jobs=-1)
        accuracyChecker(target, predicted)
        return
    if clfName == 'XGBClassifier':
        print('Xgboost CV selected. Beginning to find optimal rounds')
        clf = xgboostCV(clf, dataset, target)
        print('Xgboost Accuracy on 80-20 split (for speed)')
        trainX, testX, trainY, testY = splitTrainTest(dataset, target)
        clf.fit(trainX, trainY)
        predicted = clf.predict(testX)
        accuracyChecker(testY, predicted)
def get_testing_metrics(model, X, y, metrics, as_indexes, n_folds, X_test=None):
    y_pred = cross_val_predict(
        model, X, y,
        cv=StratifiedKFold(y, n_folds=n_folds, shuffle=True, random_state=RANDOM_STATE)
    )
    print "y_pred", y_pred
    model.fit(X, y)
    result = get_y_true_y_pred_based_metrics(y, y_pred, metrics)
    if FEATURES in metrics:
        result[FEATURES] = model.get_support(indices=True)
    if OBJECTS in metrics:
        if as_indexes:
            result[OBJECTS] = [get_data_keeper().get_object_name_by_index(index) for (index,) in X]
        else:
            result[OBJECTS] = list(X.index)
    if TEST_PREDICTIONS in metrics:
        result[TEST_PREDICTIONS] = X_test, model.predict(X_test)
    return result
def cross_validate(self):
    progress_logger.info("Starting cross validation.")
    validate_clf = linear_model.LogisticRegression(class_weight=self.weights)
    predictions = cross_validation.cross_val_predict(validate_clf, self.X, self.Y.ravel(), cv=5)
    fp_count = 0.0
    tp_count = 0.0
    fn_count = 0.0
    tn_count = 0.0
    miscount = 0.0
    for i in range(len(predictions)):
        prediction = predictions[i]
        expected = self.Y[i][0]
        if prediction == 1 and expected == 1:
            tp_count += 1
        elif prediction == 1 and expected == 0:
            fp_count += 1
        elif prediction == 0 and expected == 1:
            fn_count += 1
        elif prediction == 0 and expected == 0:
            tn_count += 1
        else:
            miscount += 1
    if miscount > 0:
        debug_logger.warn("During cross validation, found {} miscounts.".format(miscount))
    total_count = fp_count + tp_count + fn_count + tn_count
    self.validation_accuracy = (tp_count + tn_count) / total_count if total_count != 0 else 0.0
    fp_rate = fp_count / (fp_count + tn_count) if fp_count + tn_count != 0 else 0.0
    fn_rate = fn_count / (fn_count + tp_count) if fn_count + tp_count != 0 else 0.0
    progress_logger.info("Confusion matrix - True positives: {}, False positives: {}, False negatives: {}, True negatives: {}".format(
        tp_count, fp_count, fn_count, tn_count))
    progress_logger.info("Validation Accuracy: {}".format(self.validation_accuracy))
    progress_logger.info("False positive rate: {}".format(fp_rate))
    progress_logger.info("False negative rate: {}".format(fn_rate))
def crossvalidation(x, y):
    """
    Cross validation metric. Also plot confusion matrix and save cls if flags are set to 1.
    :param x: features (valence, arousal)
    :param y: target (emotion)
    :return:
    """
    c_array = np.logspace(0, 3, 4)
    gamma_array = np.logspace(-3, 3, 7)

    # feature scaling
    if feature_scaling:
        std_scale = preprocessing.StandardScaler().fit(x)
        x = std_scale.transform(x)

    for c in c_array:
        for gamma in gamma_array:
            clf = svm.SVC(kernel='linear', C=c, gamma=gamma)
            #kernel= rbf
            #kernel= poly
            #kernel= linear
            scores = cross_validation.cross_val_score(clf, x, y, cv=3)
            print("Accuracy: %0.2f (+/- %0.2f) %f %f" % (scores.mean(), scores.std() * 2, c, gamma))
            pred = cross_validation.cross_val_predict(clf, x, y, cv=3)
            print("Classes accuracy: ", classes_accuracy(y, pred))
            print(np.array(y))
            print(pred)

    #plot last one, not best, CARE!!!
    if plot_confusion_matrix:
        confusion_matrix.prepare_plot(y, pred)
    if save_clf:
        clf.fit(x, y)
        joblib.dump(clf, 'classifiers\\' + configuration.get('clf_name') + '.pkl')
def main():
    pickle_folder = '../pickles_no_rms'
    pickle_folders_to_load = [f for f in os.listdir(pickle_folder) if os.path.isdir(join(pickle_folder, f))]
    pickle_folders_to_load = sorted(pickle_folders_to_load)
    pickle_folders_to_load = [p for p in pickle_folders_to_load if 'drums1__' not in p]

    sdr_type = 'background'

    fits = []
    sdrs = []
    for pick in pickle_folders_to_load:
        beat_spec_name = join(pickle_folder, pick, pick + '__beat_spec.pick')
        beat_spec = pickle.load(open(beat_spec_name, 'rb'))
        entropy, log_mean = beat_spectrum_prediction_statistics(beat_spec)
        fit_X = [entropy, log_mean]
        fits.append(fit_X)

        sdrs_name = join(pickle_folder, pick, pick + '__sdrs.pick')
        sdr_vals = pickle.load(open(sdrs_name, 'rb'))
        cur_sdr = sdr_vals[sdr_type][0]
        sdrs.append(cur_sdr)

    fits = np.array(fits)
    sdrs = np.array(sdrs).reshape(-1, 1)

    knn = neighbors.KNeighborsRegressor(5, weights='distance')
    scores = cross_validation.cross_val_predict(knn, fits, sdrs, cv=10, verbose=1)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean() - sdrs.mean(), scores.std() * 2))
def classify_cv(data, cats, k):
    clf = svm.SVC(gamma=0.001, C=100.)
    vect = TfidfVectorizer(analyzer='word', stop_words=stopwords)
    tfidf_matrix = vect.fit_transform(data)
    predicted = cross_validation.cross_val_predict(clf, tfidf_matrix, cats, cv=k)
    conf_matrix = metrics.confusion_matrix(cats, predicted)
    print(metrics.classification_report(cats, predicted))
def main(): pickle_folder = "../mir_1k/pickles" pickle_folders_to_load = [f for f in os.listdir(pickle_folder) if "__beat_spec.pick" in f] pickle_folders_to_load = sorted(pickle_folders_to_load) sdr_type = "background" fits = [] sdrs = [] for pick in pickle_folders_to_load: pick = pick.replace("__beat_spec.pick", "") beat_spec_path = join(pickle_folder, pick + "__beat_spec.pick") beat_spec = pickle.load(open(beat_spec_path, "rb")) entropy, log_mean = beat_spectrum_prediction_statistics(beat_spec) fit_X = [entropy, log_mean] fits.append(fit_X) sdrs_name = join(pickle_folder, pick + "__sdrs.pick") sdr_vals = pickle.load(open(sdrs_name, "rb")) cur_sdr = sdr_vals[sdr_type][0] sdrs.append(cur_sdr) fits = np.array(fits) sdrs = np.array(sdrs).reshape(-1, 1) knn = neighbors.KNeighborsRegressor(5, weights="distance") scores = cross_validation.cross_val_predict(knn, fits, sdrs, cv=10, verbose=1) print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
def main():
    start_time = time.clock()
    vectors, labels, anchors = prepare_data()
    print("Gonna go out and classify. Wish me luck.")
    support_vector_machine = grid_search.GridSearchCV(svm.SVC(), PARAM_GRID, cv=GRID_SEARCH_CV)
    classifier = pipeline.make_pipeline(preprocessing.StandardScaler(), support_vector_machine)
    predicted_labels = cross_validation.cross_val_predict(classifier, vectors, labels, cv=CV)
    end_time = time.clock()

    file_string = str(datetime.datetime.today()) + "-" + "-".join(
        DIRECTORIES + [str(ANCHORS_PER_CLASS) + "anchors", str(ITEMS_PER_CLASS) + "items"]) + ".txt"
    with open(os.path.join("reports", file_string), "w") as f:
        date = "date: " + str(datetime.datetime.now())
        compressor_filters = pretty_print(lzma_filters)
        anchors_used = "anchors used: " + pretty_print(anchors)
        time_indication = "indication of time spent: " + str(end_time - start_time)
        anchors = "anchors per class: " + str(ANCHORS_PER_CLASS)
        preloaded_anchors = "Used preloaded anchors: " + str(USE_REPRESENTATIVE_ANCHORS)
        grid_search_cv = "grid search cv: " + str(GRID_SEARCH_CV)
        cv = "cv: " + str(CV)
        report = metrics.classification_report(labels, predicted_labels, digits=4)
        print report + "\n"
        print time_indication
        f.writelines("\n".join([date, compressor_filters, anchors_used, time_indication,
                                anchors, preloaded_anchors, grid_search_cv, cv, report]))
def run_xval(prefix, clf, data, cv, features=cgo13_features, seed=1):
    X = features(data)
    y = getlabels(data)
    predicted = cross_validation.cross_val_predict(clf, X, y, cv=cv)
    return Metrics(prefix, data, predicted, clf)
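# `Metrics` is not defined in this snippet. A minimal stand-in, assuming it is just a
# record of the run built from the positional arguments used above; the real project
# may compute derived scores internally.
import collections

Metrics = collections.namedtuple("Metrics", ["prefix", "data", "predicted", "clf"])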
def test_knn_score_equal_sklearn_loocv_score(self):
    acc, correct, cmat = \
        score(self.distance, self.label, k=5, metric='distance')
    # scoring only one k value, so take just the first elements:
    acc = acc[0, 0]
    correct = correct[0]
    cmat = cmat[0]
    # This should work too, but is much slower than using precomp. dist.
    #=======================================================================
    # knclassifier = KNeighborsClassifier(n_neighbors=5, algorithm='brute',
    #                                     metric='cosine')
    #=======================================================================
    knclassifier = KNeighborsClassifier(n_neighbors=5, algorithm='brute',
                                        metric='precomputed')
    n = self.distance.shape[0]  # for LOO-CV
    loo_cv = LeaveOneOut(n)
    predicted_sklearn = cross_val_predict(
        knclassifier, self.distance, self.label, cv=loo_cv)
    acc_sklearn = accuracy_score(self.label, predicted_sklearn)
    if not np.allclose(acc, acc_sklearn):
        return self.assertAlmostEqual(acc, acc_sklearn, places=7)
    else:
        correct_sklearn = predicted_sklearn == self.label
        equal_prediction = np.all(correct == correct_sklearn)
        msg = """Accuracies of hub toolbox k-NN and sklearn-kNN are almost equal,
                 but the predictions per data point are not."""
        return self.assertTrue(equal_prediction, msg)
def main():
    dataTuples = getDataInFormat()
    print "Length of dataTuples is: ", len(dataTuples)
    shuffle(dataTuples)
    trainTuples = dataTuples
    del dataTuples
    ids, labels, vectors = getLabelsAndVectors(trainTuples)
    del trainTuples
    followerCountsList = loadFollowerCountsFromFile()
    space = getSpace(vectors)
    reducedSpace = getReducedSpace(vectors, space)
    spaceWithMetaFeatures = augmentSpace(reducedSpace, emotionFeatures)
    print "Total # of features in your space is: ", len(space)
    print "Total # of features in your reducedSpace is: ", len(reducedSpace)
    oneHotVectors = getOneHotVectors(ids, labels, vectors, spaceWithMetaFeatures, followerCountsList)
    trainVectors, trainLabels = getOneHotVectorsAndLabels(oneHotVectors)
    del oneHotVectors
    clf = OneVsRestClassifier(SVC(C=1, kernel='linear', gamma=0.1, verbose=False, probability=False))
    clf.fit(trainVectors, trainLabels)
    print "\nDone fitting classifier on training data...\n"
    print "=" * 50, "\n"
    print "Results with 10-fold cross validation:\n"
    print "=" * 50, "\n"
    predicted = cross_validation.cross_val_predict(clf, trainVectors, trainLabels, cv=10)
    print "*" * 20
    print "\t accuracy_score\t", metrics.accuracy_score(trainLabels, predicted)
    print "*" * 20
    print "precision_score\t", metrics.precision_score(trainLabels, predicted)
    print "recall_score\t", metrics.recall_score(trainLabels, predicted)
    print "\nclassification_report:\n\n", metrics.classification_report(trainLabels, predicted)
    print "\nconfusion_matrix:\n\n", metrics.confusion_matrix(trainLabels, predicted)
def regression_test(X, Y):
    norm_train_x = preprocessing.MinMaxScaler((-1, 1)).fit_transform(X)
    max_layer_size = len(x_cols)**2
    max_layers = [Layer("Sigmoid", units=max_layer_size/4),
                  Layer("Sigmoid", units=max_layer_size/2),
                  # Layer("Sigmoid", units=max_layer_size/2),
                  # Layer("Sigmoid", units=max_layer_size/4),
                  Layer("Linear")]
    nn = Regressor(layers=max_layers, learning_rate=0.08, n_iter=300)
    regressors = [('Random Forest Regressor', RandomForestRegressor(n_estimators=100), False),
                  ('AdaBoost Regressor', AdaBoostRegressor(), False),
                  ('SVR', SVR(), False),
                  ('Neural Net w/ Sigmoid -> Sigmoid -> Linear', nn, True)]
    for name, reg, norm in regressors:
        if norm:
            train_x = norm_train_x
        else:
            train_x = X
        print name
        preds = cross_validation.cross_val_predict(reg, train_x, Y, cv=K)
        print 'R^2:', metrics.r2_score(Y, preds)
def checkSkflowAccuracy(dataset, target):
    # baseline: 0.6923 with max_feat=0.5
    classifier = RandomForestClassifier(max_depth=8, n_estimators=500, n_jobs=8,
                                        random_state=1, max_features=0.9)
    predicted = cross_validation.cross_val_predict(classifier, dataset, target, cv=5)
    score = metrics.accuracy_score(target, predicted)
    print("Accuracy: " + str(score))
    print(metrics.confusion_matrix(target, predicted, labels=[0, 1, 2, 3, 4, 5]))
def training(features, targets, feature_description, validation_features, model_flag):
    """
    Train the data with XGBoost model and 10-cross fold validation method.
    Output the result in confusion matrix.
    :param model_flag:
    :param validation_features:
    :param features: X, 2-D matrix
    :param targets: Y 1-D target array
    :param feature_description: brief description of the feature
    """
    model_name = model_name_dict[model_flag]
    model = model_dict[model_flag]
    model.fit(features, targets)
    prediction = model.predict(validation_features)
    file_names = np.load('ZL_validation_file_names.npy')
    validation_result = open('validation_result_' + model_name + feature_description, 'w')

    # output validation result with specified format.
    p = re.compile('(validation\.[0-9]+)')
    for i in range(len(prediction)):
        # format: validation_xxxxx type
        print >> validation_result, \
            p.findall(file_names[i])[0].replace('.', '_'), \
            type_array[int(prediction[i])]
    validation_result.close()

    prediction = cross_validation.cross_val_predict(model, features, targets, cv=10)
    cm = confusion_matrix(targets, prediction)
    output_confusion_matrix_tex(cm, model_name + '_' + feature_description)
def eval_log_reg(the_training_data, the_truth):
    K_FOLD = 10

    # Logistic regression
    lr = linear_model.LogisticRegression()

    # Evaluate
    scores = cross_validation.cross_val_score(lr, the_training_data, the_truth, cv=K_FOLD)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    predicted = cross_validation.cross_val_predict(lr, the_training_data, the_truth, cv=K_FOLD)
    print "Confusion matrix:"
    print metrics.confusion_matrix(the_truth, predicted)

    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        the_training_data, the_truth, test_size=1.0/K_FOLD, random_state=0)
    lr.fit(X_train, y_train)
    labels = X_train.columns
    coefficients = [(labels[i], val) for i, val in enumerate(lr.__dict__['coef_'][0])]
    coefficients.sort(key=lambda x: abs(x[1]), reverse=True)
    print "Most predictive features:"
    for i in range(0, 5):
        print " %s: %0.2f" % (coefficients[i][0], coefficients[i][1])
    numExamples = np.shape(X_train)[0]
    print "Training examples: %d" % numExamples
    usedUtterances = [example.split(".csv_")[0] for example in X_train.index]
    numUtterances = len(set(usedUtterances))
    print "Training utterances: %d" % numUtterances
    return [scores.mean(), scores.std() * 2, len(coefficients), numExamples, numUtterances]
def fit(self, X, y):
    '''
    fit the model
    '''
    if self.use_append == True:
        self.__X = X
        self.__y = y
    elif self.use_append == False:
        self.__y = y
        temp = []

    for clf in self.stage_one_clfs:
        y_pred = cross_val_predict(clf[1], X, y, cv=5, n_jobs=1)
        clf[1].fit(X, y)
        y_pred = np.reshape(y_pred, (len(y_pred), 1))
        if self.use_append == True:
            self.__X = np.hstack((self.__X, y_pred))
        elif self.use_append == False:
            temp.append(y_pred)
        if self.print_scores == True:
            score = mean_squared_error(self.__y, y_pred)
            print("Score of %s: %0.3f" % (clf[0], score))

    if self.use_append == False:
        self.__X = np.array(temp).T[0]

    # fit the second stage models
    for clf in self.stage_two_clfs:
        clf[1].fit(self.__X, self.__y)
def run_svm(x, y):
    s = svm.SVR()
    scores = cross_validation.cross_val_score(s, x, y, cv=10)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    predictions = cross_validation.cross_val_predict(s, x, y, cv=10)
    return predictions
def transform(self, X):
    # Purpose of skip is to skip the estimator
    if self.skip:
        return X

    # Is the data being transformed the same as the training data
    is_train_data = False
    if isinstance(X, pd.DataFrame) and self.hashed_value == hash(X.values.data.tobytes()):
        is_train_data = True
    if isinstance(X, np.ndarray) and self.hashed_value == hash(X.data.tobytes()):
        is_train_data = True

    # If the dataset is the training data, use CV predictions
    if is_train_data:
        feature = cross_val_predict(clone(self.model), X, self.y)  #, cv=self.train_cv)
    # Otherwise, use the model to predict
    else:
        feature = self.model.predict(X)

    # Add feature to dataset
    if isinstance(X, pd.DataFrame):
        X[self.feature_name] = feature
    if isinstance(X, np.ndarray):
        X = np.c_[X, feature]
    return X
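# The snippet above only shows transform(). A minimal sketch of the fit() side it
# implies is given here as an assumption: it records a hash of the training data so
# transform() can later recognise it and substitute out-of-fold predictions, and it
# fits the wrapped model on the full training set for use on unseen data.
def fit(self, X, y=None):
    if self.skip:
        return self
    self.y = y
    # Remember what the training data looked like (same hashing as in transform()).
    if isinstance(X, pd.DataFrame):
        self.hashed_value = hash(X.values.data.tobytes())
    else:
        self.hashed_value = hash(X.data.tobytes())
    # Fit the wrapped estimator on all of the training data for later prediction.
    self.model.fit(X, y)
    return self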
def predict_evaluate_models(fn, ax=None, sel=["Penalties_Conceeded", "Tries_Scored"], goal="Referee", verbosity=0):
    class_weight = 'auto'
    X, y, names = data_prepare(fn, sel=sel, goal=goal, verbosity=verbosity-1)
    if verbosity > 2:
        y_shuffled = y.copy()
        np.random.shuffle(y_shuffled)
        print("All zeros accuracy:", 1.0 - np.sum(y) / len(y))
        print("y_shuffled f1_score:", metrics.f1_score(y, y_shuffled))
    n_folds = 10
    cv = cross_validation.StratifiedKFold(y, n_folds=n_folds)
    #cv = cross_validation.LeaveOneOut(n=len(y))
    results = []
    for sclf in ('svm', 'svmp', 'svmr', 'lgCV', 'gnb', 'rf', 'knc'):
        clf = get_clf(sclf, class_weight=class_weight)
        y_pred = cross_validation.cross_val_predict(clf, X, y, cv=cv)
        #print "pred:", y_pred
        res = [
            metrics.accuracy_score(y, y_pred),
            metrics.precision_score(y, y_pred),
            metrics.recall_score(y, y_pred),
            metrics.f1_score(y, y_pred),
        ]
        if verbosity > 0:
            print(sclf, res)
        results.append((sclf, res))
    return results
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--data_dir", "-dr",
                        default="/home/1546/code/keyword_extraction/stanford_parser/no_location_features")
    parser.add_argument('--method', '-m', type=int, default=0, choices=range(4),
                        help="""choose methods from:
                        0:linear_svc
                        1:logistic regression
                        2:naive bayes
                        3:decision tree""")
    parser.add_argument("--top_size", "-ts", type=int, default=20)
    parser.add_argument("--need_positive", "-np", action='store_true')
    args = parser.parse_args()

    X, y, entity_info = load_data_set(args.data_dir)
    clf = get_classifier(args.method)
    predicted = cross_validation.cross_val_predict(clf, X, y, cv=5)
    #accuracy = metrics.accuracy_score(y,predicted)
    #f1 = metrics.f1_score(y,predicted)
    #print "performance:"
    #print "accuracy: %f, f1: %f" %(accuracy,f1)
    show_performance_on_entity_types(y, predicted, entity_info)
    print classification_report(y, predicted)
def tune_and_train_rf(X_train, y_train, strat_k_fold=None):
    '''
    Uses oob estimates to find optimal max_depth between None + 0...20
    Refits with best max_depth
    '''
    oob_r2 = []
    cv_list = [None] + range(1, 20)
    for md in cv_list:
        rf = RandomForestRegressor(n_estimators=100, max_depth=md, oob_score=True,
                                   random_state=0, n_jobs=-1)
        rf.fit(X_train, y_train)
        oob_r2.append(rf.oob_score_)

    best_max_depth = cv_list[np.argmax(oob_r2)]
    print("best max_depth: %s" % best_max_depth)

    # CV
    rf = RandomForestRegressor(n_estimators=100, max_depth=best_max_depth, oob_score=True,
                               random_state=0, n_jobs=-1)
    cv_results = None
    if strat_k_fold:
        y_predicted_cv = cross_val_predict(rf, X_train, y_train, cv=strat_k_fold, n_jobs=-1)
        cv_r2 = []
        cv_mae = []
        for k_train, k_test in strat_k_fold:
            cv_r2.append(r2_score(y_train[k_test], y_predicted_cv[k_test]))
            cv_mae.append(mean_absolute_error(y_train[k_test], y_predicted_cv[k_test]))
        cv_results = {'y_predicted_cv': y_predicted_cv,
                      'cv_r2': cv_r2,
                      'cv_mae': cv_mae,
                      'oob_r2': oob_r2}

    # refit
    rf.fit(X_train, y_train)
    return rf, cv_results
def main():
    dataset = samples.get_dataset()
    X, y, page_labels = build_Xy_from_pages_dataset(dataset)
    clf = create_classifier()

    # this gives the prediction result for every element
    # when it was in the test dataset during cross validation
    cv_iter = cross_validation.LabelKFold(page_labels, n_folds=10)
    predicted = cross_validation.cross_val_predict(clf, X, y, cv=cv_iter)

    cm = metrics.confusion_matrix(y, predicted)
    print('\nConfusion matrix:')
    print(cm, '\n\n')
    print(metrics.classification_report(y, predicted))

    print('Training and peeking at the word weights...')
    X_train, y_train = X[:-20], y[:-20]
    clf = get_trained_classifier(X_train, y_train)
    cv = clf.steps[-2][1]
    svc = clf.steps[-1][1]

    word_weights = zip(svc.coef_[0], cv.vocabulary_)
    print('Top 10 weights for negative cases')
    for weight, word in sorted(word_weights)[:10]:
        print('%0.5f %s' % (weight, word))
    print('\nTop 10 weights for positive cases')
    for weight, word in sorted(word_weights)[-10:][::-1]:
        print('%0.5f %s' % (weight, word))

    import pickle
    with open('classifier.pickle', 'w') as f:
        pickle.dump(clf, f)
def ada_boost_cv(x_train, y_train, cv, max_tree_depth, n_estimators, learning_rate):
    tree_classifier = DecisionTreeClassifier(max_depth=max_tree_depth, class_weight="balanced")
    ada_boost_classifier = AdaBoostClassifier(base_estimator=tree_classifier,
                                              n_estimators=n_estimators,
                                              learning_rate=learning_rate)
    y_bar = cross_val_predict(estimator=ada_boost_classifier, X=x_train, y=y_train, cv=cv, n_jobs=cv)
    # cross_val_predict only fits internal clones, so the classifier itself must be
    # fitted before predict_proba can be called on it.
    ada_boost_classifier.fit(x_train, y_train)
    y_bar_proba = ada_boost_classifier.predict_proba(x_train)
    print(list(zip(y_bar, y_bar_proba)))

    cm = confusion_matrix(y_train, y_bar)
    accuracy_negative = cm[0, 0] / np.sum(cm[0, :])
    accuracy_positive = cm[1, 1] / np.sum(cm[1, :])
    precision = cm[1, 1] / (cm[1, 1] + cm[0, 1])
    recall = cm[1, 1] / (cm[1, 1] + cm[1, 0])
    f1_score = 2 * precision * recall / (precision + recall)
    return accuracy_positive, accuracy_negative, precision, recall, f1_score
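# For reference, the hand-computed quantities above correspond to ready-made scorers
# in sklearn.metrics; a minimal sketch, assuming binary labels {0, 1} with 1 as the
# positive class (as the confusion-matrix indexing above implies).
from sklearn.metrics import precision_score, recall_score, f1_score

def binary_cv_metrics(y_true, y_pred):
    # recall of the positive class == accuracy_positive,
    # recall of the negative class (pos_label=0) == accuracy_negative
    return (recall_score(y_true, y_pred, pos_label=1),
            recall_score(y_true, y_pred, pos_label=0),
            precision_score(y_true, y_pred),
            recall_score(y_true, y_pred),
            f1_score(y_true, y_pred))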
def kfCrossVal(loansData):
    # Import required libraries
    from sklearn.cross_validation import cross_val_predict
    from sklearn import linear_model
    import sklearn.metrics as met
    import matplotlib.pyplot as plt
    from sklearn.preprocessing import PolynomialFeatures

    # Create linear regression model using FICO score as the only predictor
    # Interest Rate is the dependent variable
    lr = linear_model.LinearRegression()
    y = loansData.as_matrix(columns=['Interest.Rate'])
    x = loansData[['Loan.Length', 'FICO.Score']].as_matrix()

    # Run the kfold cross validation and store the results as an array
    predicted = cross_val_predict(lr, x, y, cv=10)

    # Try and run as quadratic?
    # POLY2 = smf.ols(formula = 'Y ~ 1 + X + I(X**2)', data=TRAIN_DF).fit()

    # Calculate MAE, MSE, and R2
    print("Mean Absolute Error: {}".format(met.mean_absolute_error(y, predicted)))
    print("Mean Squared Error: {}".format(met.mean_squared_error(y, predicted)))
    print("R Squared: {}".format(met.r2_score(y, predicted)))

    # Plot the actual versus predicted values
    fix, ax = plt.subplots()
    ax.scatter(y, predicted)
    ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
    ax.set_xlabel('Measured')
    ax.set_ylabel('Predicted')
    plt.show()
def benchmark(clf_class, params, name):
    print("parameters:", params)
    t0 = time()
    clf = clf_class(**params).fit(X_train, y_train)
    print("done in %fs" % (time() - t0))

    t0 = time()
    pred = clf.predict(X_test)
    print("done in %fs" % (time() - t0))
    #execute_prediction(clf)
    print(clf.score(X_test, y_test))

    predicted = cross_validation.cross_val_predict(clf, X, y, cv=10)
    score = metrics.accuracy_score(y, predicted)
    print(score)

    print("Classification report on test set for classifier:")
    print(clf)
    print()
    print(classification_report(y_test, pred, target_names=target_names))

    cm = confusion_matrix(y_test, pred)
    print("Confusion matrix:")
    print(cm)

    # Show confusion matrix
    #pl.matshow(cm)
    #pl.title('Confusion matrix of the %s classifier' % name)
    #pl.colorbar()
    np.set_printoptions(precision=2)
    print('Confusion matrix, without normalization')
    plt.figure()
    plot_confusion_matrix(cm)
# plt.figure()
# for color, i, target_name in zip(colors, [0, 1, 2], target_names):
#     plt.scatter(X_r2[Y == i, 0], X_r2[Y == i, 1], alpha=.8, color=color,
#                 label=target_name)
# plt.legend(loc='best', shadow=False, scatterpoints=1)
# plt.title('LDA of Data')
plt.show()

import ipdb
ipdb.set_trace()

h = 0.02  # step size in mesh

clf = LogisticRegression(C=1e5, penalty='l2')
clf.fit(X, Y)
predicted = cross_validation.cross_val_predict(clf, X, Y, cv=5)
print "accuracy score: ", metrics.accuracy_score(Y, predicted)
print "precision score: ", metrics.precision_score(Y, predicted, average='weighted')
print "recall score: ", metrics.recall_score(Y, predicted, average='weighted')

Y_test = clf.predict_proba(X_test)

# create submission
submission = pd.DataFrame(Y_test, columns=['predict_0', 'predict_1', 'predict_2'])
submission.head()
submission['id'] = testDF.index.values
cols = submission.columns.tolist()
cols = cols[-1:] + cols[:-1]
submission = submission[cols]
                                n_folds=5, shuffle=True, random_state=1)

# Perform cross-validation
scores = cross_validation.cross_val_score(cv=kf, estimator=clf, X=X_train, y=y_train, scoring='accuracy')
print('Scores: ' + str(scores))
print('Accuracy: %0.2f (+/- %0.2f)' % (scores.mean(), 2 * scores.std()))

# Gather predictions
predictions = cross_validation.cross_val_predict(cv=kf, estimator=clf, X=X_train, y=y_train)

accuracy_score = metrics.accuracy_score(y_train, predictions)
print('accuracy score: ' + str(accuracy_score))

confusion_matrix = metrics.confusion_matrix(y_train, predictions)

class_names = encoder.classes_.tolist()

# Train the classifier
clf.fit(X=X_train, y=y_train)

model = {'classifier': clf, 'classes': encoder.classes_, 'scaler': X_scaler}

# Save classifier to disk
keep_probes = []
for sname in selectors.keys():
    subject_probes = np.array([i for i, x in enumerate(data['subjects']) if x == sname])
    subject_probes = subject_probes[selectors[sname]]
    keep_probes += list(subject_probes)
data['neural_responses'] = data['neural_responses'][:, keep_probes]

# filter out the target stimuli (80 -- fruit)
#for dropstim in [60, 70, 80, 90]:
for dropstim in [80]:
    keepidx = data['image_category'] != dropstim
    data['image_category'] = data['image_category'][keepidx]
    data['neural_responses'] = data['neural_responses'][keepidx]

# uncomment for a permutation test
#data['image_category'] = np.random.permutation(data['image_category'])

# obtain CV predictions
print 'Data size', data['neural_responses'].shape
clf = RandomForestClassifier(n_estimators=3000)
predicted = cross_validation.cross_val_predict(clf, data['neural_responses'],
                                                data['image_category'], cv=n_cv)

# display results
print confusion_matrix(data['image_category'], predicted)
print f1_score(data['image_category'], predicted, average='weighted')
if __name__ == "__main__":
    input_data = load_input_data()
    target_data = load_target_data()
    print("Number of data points: ", len(target_data))

    decisionTree = RandomForestClassifier(max_features="log2")

    # Cross validation using K-fold and leave one out
    cv = KFold(len(target_data), n_folds=2, shuffle=True)
    cv2 = LeaveOneOut(len(target_data))  # only needs the number of points

    # Calculating scores for the model
    scores = cross_val_score(decisionTree, input_data, target_data, cv=cv)
    print("SCORES: ", scores)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    # Value of the output when it was in the test set
    estimated_results = cross_val_predict(decisionTree, input_data, target_data, cv=cv)
    print("PREDICTED VALUES:", estimated_results)

    # Train the model
    decisionTree.fit(input_data, target_data)
    predicted = decisionTree.predict(input_data)
    expected = target_data
    print(metrics.classification_report(expected, predicted))
    print(metrics.confusion_matrix(expected, predicted))
#-logistic, Author: ssb--
from sklearn import metrics, cross_validation
from sklearn import datasets
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('dataSample1.csv')
samples = df.loc[:, ['Openness', 'Conscientousness', 'Extraversion', 'Agreeableness',
                     'Emotional_Range', 'Conversation', 'Openness to Change', 'Hedonism',
                     'Self-enhancement', 'Self-transcendence']]
target = df.loc[:, 'Profession']
cv_folds = cross_validation.StratifiedKFold(target, n_folds=5, shuffle=False, random_state=0)

from sklearn.linear_model import LogisticRegression
Logistic_predict = cross_validation.cross_val_predict(LogisticRegression(), samples, target, cv=cv_folds)

# the original per-fold loop used R-style syntax (`for (fold_no in 1:cv_folds):`);
# the report is computed once over the pooled out-of-fold predictions instead
report = classification_report(target, Logistic_predict)
print(report)
X = data.reshape((len(data), -1))
X = preprocessing.scale(X)
Y = target.flatten()
#X, Y = shuffle(X, Y)
print(X.shape, Y.shape)

clf = RandomForestRegressor(n_estimators=50, criterion='mse')
#clf = LinearRegression()
#clf = SVR()
#clf = BayesianRidge(compute_score=True)

Y_pred = cross_val_predict(clf, X, Y, cv=10)

r, p = pearsonr(Y, Y_pred)
print("Correlation: ", r)
print("Variance explained: ", r**2)
print("P-value: ", p**2)
print("RMSE: ", mean_squared_error(Y, Y_pred)**0.5)

Y_pred = Y_pred[Y != 0]
Y = Y[Y != 0]
print("MAPE: ", mean_absolute_percentage_error(Y, Y_pred))

target_pred = Y_pred.reshape(-1, 16)

for i in range(0, 16):
    plt.text(j, i, cm[i, j],
             horizontalalignment="center",
             color="white" if cm[i, j] > thresh else "black")

plt.tight_layout()
plt.subplots_adjust(bottom=0.15)
plt.ylabel('Classe correta')
plt.xlabel('Classe predita')

tabela = PrettyTable(['Modelo', 'f1', 'Mean Squared Error'])

# In[6]:

# Logistic Regression

# Model
logistic = lm.LogisticRegression().fit(X, y)
predicted = cv.cross_val_predict(logistic, X, y, cv=10)

# Cross Validation
scores = cv.cross_val_score(lm.LogisticRegression(), X, y, cv=10, scoring='f1_weighted')
print('Regressao Logistica')
print(scores.mean())

# Evaluation
cnf_matrix = metrics.confusion_matrix(y, predicted)
cr = metrics.classification_report(y, predicted)
print(cr)
with open('cr.txt', 'w') as text_file:
    text_file.write(cr)
    text_file.write('\n')
mse = metrics.mean_squared_error(y, predicted)
tabela.add_row(['Regressao Logistica', scores.mean(), mse])

# Normalized confusion matrix
# In[62]:

model = lm.fit(df, score)

# In[63]:

accuracyscore = cross_val_score(model, df, score, cv=6)

# In[64]:

accuracyscore

# In[65]:

predictions = cross_val_predict(model, df, score, cv=6)

# In[66]:

plt.scatter(score, predictions)
plt.xlabel("True Value")
plt.ylabel("Predictions")
plt.show()

# In[67]:

R_square = metrics.r2_score(score, predictions)

# In[68]:

R_square
print X_test.shape, y_test.shape

# fit a model
lm = linear_model.LinearRegression()
model = lm.fit(X_train, y_train)
predictions = lm.predict(X_test)

## The line / model
plt.scatter(y_test, predictions)
plt.xlabel("True Values")
plt.ylabel("Predictions")

print "Score:", model.score(X_test, y_test)

'''
Now let's try out k-fold cross-validation. Again scikit-learn provides useful functions
to do the heavy lifting. The function cross_val_predict returns the predicted values
for each data point when it's in the testing slice.
'''
from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn import metrics

# Perform 6-fold cross validation
scores = cross_val_score(model, df, y, cv=6)
print "Cross-validated scores:", scores

# Make cross validated predictions
predictions = cross_val_predict(model, df, y, cv=6)
plt.scatter(y, predictions)

accuracy = metrics.r2_score(y, predictions)
print "Cross-Predicted Accuracy:", accuracy
so it is recommended to predict with a model other than regression
'''
plt.scatter(y_train, model.predict(x_train))
plt.xlabel('true values')
plt.ylabel('predictions')
plt.title('train')

plt.scatter(y_test, pred)
plt.xlabel('true values')
plt.ylabel('predictions')
plt.title('test')

# cross validation
from sklearn.cross_validation import cross_val_score, cross_val_predict
scores = cross_val_score(model, x_train, y_train, cv=6)
pred = cross_val_predict(model, x_train, y_train, cv=6)
plt.scatter(y, pred)

# k fold cross validation
import numpy as np
from sklearn.model_selection import KFold
x = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([1, 2, 3, 4])
kf = KFold(n_splits=2)
kf.get_n_splits(x)
for train_index, test_index in kf.split(x):
    print('train:', train_index, 'test:', test_index)
    print(y[train_index], y[test_index])

# leave one out
                          hidden_layer_sizes=(25), learning_rate='constant',
                          learning_rate_init=0.001, max_iter=200, momentum=0.9,
                          nesterovs_momentum=True, power_t=0.5, random_state=1,
                          shuffle=True, solver='lbfgs', tol=0.0001,
                          validation_fraction=0.1, verbose=False, warm_start=False)

y_pred = cross_val_predict(estimator, X, y, cv=10)
print(classification_report(y, y_pred))
#print metrics.confusion_matrix(y, y_pred)
print("AUC score ", roc_auc_score(y, y_pred))

# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1)

a = 0
if a == 0:
    # Set the parameters by cross-validation
    tuned_parameters = [{
print(train1.columns)
X = train1.values
X_test = test1.values

# In[ ]:

#kernel = 1*RBF(length_scale=1.0)
kernel = 1.0**2 * Matern(length_scale=1.0, length_scale_bounds=(1e-05, 100000.0), nu=0.5)
gp = GaussianProcessRegressor(kernel=kernel, alpha=5e-9, optimizer='fmin_l_bfgs_b',
                              n_restarts_optimizer=0, normalize_y=False,
                              copy_X_train=True, random_state=2016)
clf = Pipeline([('scaler', StandardScaler()), ('gp', gp)])

y_log_centered = y_log - y_log.mean()
y_pred = cross_val_predict(clf, X, y_log_centered, cv=5, n_jobs=-1)
y = np.expm1(y_log)
y_pred = np.expm1(y_pred + y_log.mean())
score = rmsle(y, y_pred)
print(score)
# 0.1459

# In[ ]:

import matplotlib as mpl
import matplotlib.pyplot as plt
# get_ipython().magic(u'matplotlib inline')
import seaborn as sns
sns.set(style="whitegrid", color_codes=True)

plt.scatter(y_pred, y)
                              skip_header=1,
                              converters={1: day_to_number,
                                          3: number_from_end_string,
                                          4: number_from_end_string})
network_X = network_file[:, (0, 1, 2, 3, 4, 6)]
network_Y = network_file[:, 5]

fixed_set_RMSE = []
average_RMSE = []
for poly_degree in range(1, 8):
    regr = make_pipeline(PolynomialFeatures(poly_degree), LinearRegression())
    # the original passed cv, n_jobs, verbose, fit_params and pre_dispatch positionally;
    # keyword arguments make the intent explicit
    predicted = cross_validation.cross_val_predict(regr, network_X, network_Y, cv=10, n_jobs=1)
    scores = cross_validation.cross_val_score(regr, network_X, network_Y, cv=10,
                                              scoring='mean_squared_error')
    print '----poly_degree---', poly_degree
    print 'All RMSEs', numpy.sqrt(-scores)
    print 'Mean RMSE', numpy.mean(numpy.sqrt(-scores))
    print 'Best RMSE', numpy.min(numpy.sqrt(-scores))
    fixed_set_RMSE.append(numpy.mean(numpy.sqrt(-scores[0])))
    average_RMSE.append(numpy.mean(numpy.sqrt(-scores)))

#Residual
y_test.reset_index(drop=True, inplace=True)
x_test

#SUBMISSION SETUP
tuned_clf.fit(x_train, y_train)
tuned_clf.score(x_test, y_test)

results = cval.cross_val_score(gr_clf, x_train, y_train, scoring='f1_micro', cv=kf_total, n_jobs=-1)
results

cpred = cval.cross_val_predict(gr_clf, x_test, y_test, cv=kf_total, n_jobs=-1)
cpred
cpred = pd.DataFrame(prediction, columns=['damage_grade'])
cpred.set_index(test_values['building_id'], inplace=True)
cpred.to_csv('prediction.csv')

tuned_clf.fit(x_sample, y_sample)
tuned_clf.score(x_test, y_test)
prediction = tuned_clf.predict(test_values)
prediction = pd.DataFrame(prediction, columns=['damage_grade'])
prediction.set_index(test_id['building_id'], inplace=True)
prediction.to_csv('prediction.csv')

x_sample

#MORE TEST STUFF
x_sample.corr()
x_sample.drop(columns=['secondary_use'], inplace=True)
X = pd.read_csv('sample/Classifier_Features.csv')
# select small samples (200 rows) for example
X = X[0:200]
X.fillna(0, inplace=True)
X.drop(['REVIEW_TEXT'], axis=1, inplace=True)
#print X.head()
Y = X.pop('CLASS')  # store label 'CLASS' in Y
numeric_variables = list(X.dtypes[X.dtypes != "object"].index)
X = X[numeric_variables]

cl0 = DecisionTreeClassifier(max_depth=5)
cl1 = RandomForestClassifier(n_estimators=100, criterion="gini", n_jobs=2)

# use sklearn cross_validation package to fit model
predicted_0 = cross_validation.cross_val_predict(cl0, X, Y, cv=10)
predicted_1 = cross_validation.cross_val_predict(cl1, X, Y, cv=10)

print(" ")
print("********************Classification Model Results ************************")
print("--------------------------------------------------------------")
print("Decision Tree Accuracy: ", metrics.accuracy_score(Y, predicted_0))
print("Confusion Matrix For Decision Tree Classifier")
print(metrics.confusion_matrix(Y, predicted_0))
print("AUC Score : ", metrics.roc_auc_score(Y, predicted_0))
print("Recall : ", metrics.recall_score(Y, predicted_0))
print("Average Precision Score : ", metrics.average_precision_score(Y, predicted_0))
from sklearn.model_selection import cross_val_predict
import numpy as np

predictors = ["Pclass", "Sex", "Age", "Fare", "NlengthD", "NameLength", "FsizeD", "Title", "Deck"]

# Initialize our algorithm with the default parameters
# n_estimators is the number of trees we want to make
# min_samples_split is the minimum number of rows we need to make a split
# min_samples_leaf is the minimum number of samples we can have at the place where a tree branch ends (the bottom points of the tree)
rf = RandomForestClassifier(random_state=1, n_estimators=10, min_samples_split=2, min_samples_leaf=1)
kf = KFold(titanic.shape[0], n_folds=5, random_state=1)
cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=50)

predictions = cross_validation.cross_val_predict(rf, titanic[predictors], titanic["Survived"], cv=kf)
predictions = pd.Series(predictions)
scores = cross_val_score(rf, titanic[predictors], titanic["Survived"], scoring='f1', cv=kf)
# Take the mean of the scores (because we have one for each fold)
print(scores.mean())

# In[ ]:

predictors = ["Pclass", "Sex", "Age", "Fare", "NlengthD", "NameLength", "FsizeD", "Title", "Deck", "TicketNumber"]
rf = RandomForestClassifier(random_state=1, n_estimators=50, max_depth=9, min_samples_split=6, min_samples_leaf=4)
rf.fit(titanic[predictors], titanic["Survived"])
kf = KFold(titanic.shape[0], n_folds=5, random_state=1)
def search_models(model, param_dists, train_features, train_pred_true, test_features, n_iter=10, scoring='r2', cv=5, n_jobs=-1, pred_column='prediction', file_path='', file_basename='', show_svg=False): r"""Search hyper-parameters for best regression model and report. Args: model (sklearn): Regression model from `sklearn`. param_dists (dict): Dictionary with keys as string parameter names and values as `scipy.stats` distributions [1]_ or lists. train_features (pandas.DataFrame): The data to be fit for training. train_pred_true (pandas.Series): The target relative to `train_features` for regression. test_features (pandas.DataFrame): The data to be fit for testing as a final evaluation. n_iter (int, optional, default=10): Number of parameter settings that are sampled. Trades off runtime vs quality of the solution. scoring (str, optional, default='r2'): Scoring function. Default is R^2, coefficient of determination. See scikit-learn model evaluation documentation [2]_. cv (int, optional, default=5): Number of folds for K-fold cross-validation. n_jobs (int, optional, default=-1): The number of CPUs to use to do the computation. -1 is all CPUs. pred_column (str, optional, default='prediction'): Name for output prediction column in CSV. file_path (str, optional, default=''): Path for generated files. file_basename (str, optional, default=''): Base name for generated files. show_svg (bool, optional, default=False): Show SVG plot. Returns: None Raises: ValueError See Also: sklearn.grid_search.RandomizedSearchCV References: .. [1] http://docs.scipy.org/doc/scipy/reference/stats.html .. [2] http://scikit-learn.org/stable/modules/model_evaluation.html """ # TODO: move to demo/main.py as a top-level script. # TODO: outliers by Bonferroni correcte p-values # TODO: outliers by prediction distribution # Check input. if not isinstance(train_features, pd.DataFrame): raise ValueError("`train_features` must be a `pandas.DataFrame`") if not isinstance(train_pred_true, pd.Series): raise ValueError("`train_pred_true` must be a `pandas.Series`") if not isinstance(test_features, pd.DataFrame): raise ValueError("`test_features` must be a `pandas.DataFrame`") # Search for best model and report. search = sk_gs.RandomizedSearchCV(estimator=model, param_distributions=param_dists, n_iter=n_iter, scoring=scoring, n_jobs=n_jobs, cv=cv) time_start = time.time() search.fit(X=train_features, y=train_pred_true) time_stop = time.time() print(("Elapsed search time (seconds) = {elapsed:.1f}").format( elapsed=time_stop - time_start)) model_best = search.best_estimator_ print(("Best params = {params}").format(params=search.best_params_)) grid_best = max(search.grid_scores_, key=lambda elt: elt.mean_validation_score) if not np.isclose(search.best_score_, grid_best.mean_validation_score): raise AssertionError( "Program error. Max score from `search.grid_scores_` was not found correctly." ) print(("Best score (R^2) = {mean:.4f} +/- {std:.4f}").format( mean=grid_best.mean_validation_score, std=np.std(grid_best.cv_validation_scores))) train_pred_best = sk_cv.cross_val_predict(estimator=model_best, X=train_features, y=train_pred_true, cv=cv, n_jobs=n_jobs) print("Score from best model training predictions (R^2) = {score:.4f}". format(score=sk_met.r2_score(y_true=train_pred_true, y_pred=train_pred_best))) train_pred_default = sk_cv.cross_val_predict(estimator=model, X=train_features, y=train_pred_true, cv=cv, n_jobs=n_jobs) print("Score from default model training predictions (R^2) = {score:.4f}". 
format(score=sk_met.r2_score(y_true=train_pred_true, y_pred=train_pred_default))) if hasattr(model_best, 'feature_importances_'): print("Plot feature importances from best model:") plot_feature_importances(model=model_best, train_features=train_features) print("Plot actual vs predicted values from best model:") plot_actual_vs_predicted(y_true=train_pred_true, y_pred=train_pred_best) # Create predictions for `test_features`. # Order by index, save as CSV, and graph. test_pred_best = model_best.predict(X=test_features) file_csv = r'predictions_{name}.csv'.format(name=file_basename) path_csv = os.path.join(file_path, file_csv) print("Predictions CSV file =\n{path}".format(path=path_csv)) df_csv = pd.DataFrame(data=test_pred_best, index=test_features.index, columns=[pred_column]).sort_index() df_csv.to_csv(path_or_buf=path_csv, header=True, index=True, quoting=None) if hasattr(model_best.estimators_[0], 'tree_'): file_dot = r'graph_{name}.dot'.format(name=file_basename) path_dot = os.path.join(file_path, file_dot) print("Graphviz dot and SVG files =\n{path}\n{path}.svg".format( path=path_dot)) sk_tree.export_graphviz(decision_tree=model_best.estimators_[0], out_file=path_dot, feature_names=test_features.columns) cmd = ['dot', '-Tsvg', path_dot, '-O'] # Use pre-Python 3.5 subprocess API for backward compatibility. subprocess.check_call(args=cmd) if show_svg: display(SVG(filename=path_dot + '.svg')) return None
    test_predictions = alg.predict(titanic[predictors].iloc[test, :])
    predictions.append(test_predictions)

predictions = np.concatenate(predictions, axis=0)
# predictions[predictions>0.5]=1
# predictions[predictions<=0.5]=0
# accuracy=sum(predictions[predictions==titanic["Survived"]])/len(predictions)
# print(accuracy)
predictions_new = list(range(len(predictions)))
titanic_survived_list = list(titanic['Survived'])
summ = 0
for i in list(range(len(predictions))):
    if predictions[i] > 0.5:
        predictions_new[i] = 1
    else:
        predictions_new[i] = 0
    if predictions_new[i] == titanic_survived_list[i]:
        summ = summ + 1
# print predictions_new
# print titanic['Survived']
accuracy = summ / float(len(predictions))
print(accuracy)

from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression  # logistic regression
alg = LogisticRegression(random_state=1)
# note: cross_val_predict returns the out-of-fold predicted labels, so the mean printed
# below is the mean prediction, not a cross-validated accuracy (cross_val_score gives that)
scores = cross_validation.cross_val_predict(alg, titanic[predictors], titanic["Survived"], cv=3)
print(scores.mean())
def predict(classifier, x, y):
    predictions = cross.cross_val_predict(classifier, x, y, cv=10)
    return predictions
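# Hypothetical usage of the predict() helper above on a small toy dataset; the
# `cross` alias for sklearn.cross_validation and the choice of classifier are
# assumptions made for illustration.
from sklearn import cross_validation as cross
from sklearn import datasets, metrics
from sklearn.linear_model import LogisticRegression

iris = datasets.load_iris()
predictions = predict(LogisticRegression(), iris.data, iris.target)
print("out-of-fold accuracy:", metrics.accuracy_score(iris.target, predictions))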
def cross_val_predict_score(model, actuals, predictions):
    # despite the parameter names, the second and third arguments are passed to
    # cross_val_predict as the feature matrix X and the targets y, and the
    # out-of-fold predictions are returned
    from sklearn.cross_validation import cross_val_predict
    return cross_val_predict(model, actuals, predictions)
import os
import numpy as np
from ranknet import RankNet
from sklearn.cross_validation import cross_val_predict

os.system("rm -rf testlog")
data1 = np.random.rand(1000, 30)
data2 = np.random.rand(1000, 30)
label = [True] * 1000

rn = RankNet(hidden_units=[20, 10], learning_rate=0.01, verbose=True)
data = rn.pack_data(data1, data2)
err = rn.pretrain(data1)
print("Reconstruction Error", err)
cost = rn.fit(data, logdir="logfine")
print("Cost", cost)
score = rn.get_scores(data1)

if False:
    cvpred = cross_val_predict(rn, data, label, cv=2)

input1 = np.random.rand(10, 30)
input2 = np.random.rand(10, 30)
input_ = rn.pack_data(input1, input2)
prob = rn.predict_prob(input_)
pred = rn.predict(input_)
score = rn.get_scores(input1)
score = rn.get_scores(input2)
# Feature correlation
Feature_corr = df.corr()  # .corr is used to find correlation
plt.figure(figsize=(30, 30))
sns.heatmap(Feature_corr, cbar=True, square=True, cmap='coolwarm')

# Random Forest Regressor Model and important features
clf = RandomForestRegressor(n_estimators=500, max_features=25)
clf.fit(X_train, Y_train)
importance = clf.feature_importances_
X1 = df.drop(['Progression'], axis=1)
dfi = pd.DataFrame(importance, index=X1.columns, columns=["Importance"])
dfi = dfi.sort_values(['Importance'], ascending=False)
dfi.plot(kind='bar', color='Purple')

# Cross-validated data Prediction
Predicted_Train = cross_val_predict(clf, X_train, Y_train, cv=5)
fig_Train, ax_Train = plt.subplots()
ax_Train = sns.regplot(x=Y_train, y=Predicted_Train,
                       scatter_kws={"color": "green", 's': 60},
                       line_kws={"color": "gold", "lw": 3},
                       marker="o")
plt.ylim(-0.15, 0.05)
plt.xlim(-0.15, 0.05)
ax_Train.set_xlabel('Real Progression Rate')
data_all['Sex'] = data_all['Sex'].map({'male': 1, 'female': 2})
sns.countplot(data_all['Cabin'])
data_all['Cabin'] = data_all['Cabin'].astype('category').cat.codes
data_all.info()

#### Build the model
data_all.drop('Survived', 1, inplace=True)
X_train = data_all[0:len(train_data)]
X_test = data_all.iloc[len(train_data):]
# Scikit-learn needs one train dataset and one label dataset (i.e. the answers)
Y_label = train_data.Survived

# cross validation
# decision tree
Y_pred = cross_validation.cross_val_predict(DecisionTreeClassifier(), X_train, Y_label, cv=10)
acc_decision_tree = metrics.accuracy_score(Y_label, Y_pred)

# Random Forest
Y_pred = cross_validation.cross_val_predict(
    RandomForestClassifier(n_estimators=1000), X_train, Y_label, cv=10)
acc_rf = metrics.accuracy_score(Y_label, Y_pred)
#print (metrics.classification_report(Y_label, Y_pred) )

# Logistic Regression
Y_pred = cross_validation.cross_val_predict(LogisticRegression(), X_train, Y_label, cv=10)
acc_LR = metrics.accuracy_score(Y_label, Y_pred)

# SVC
Y_pred = cross_validation.cross_val_predict(SVC(), X_train, Y_label, cv=10)
    result = []
    for i in selected:
        result.append(all_elem[i])
    return result

housing_file = numpy.genfromtxt('../../Datasets/housing_data.csv', delimiter=',', skip_header=1)
housing_X = housing_file[:, (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)]
housing_Y = housing_file[:, 13]

fixed_set_RMSE = []
average_RMSE = []
for poly_degree in range(1, 5):
    regr = make_pipeline(PolynomialFeatures(poly_degree), LinearRegression())
    # the original passed cv, n_jobs, verbose, fit_params and pre_dispatch positionally;
    # keyword arguments make the intent explicit
    predicted = cross_validation.cross_val_predict(regr, housing_X, housing_Y, cv=10, n_jobs=1)
    scores = cross_validation.cross_val_score(regr, housing_X, housing_Y, cv=10,
                                              scoring='mean_squared_error')
    print '----poly_degree---', poly_degree
    print 'All RMSEs', numpy.sqrt(-scores)
    print 'Mean RMSE', numpy.mean(numpy.sqrt(-scores))
    print 'Best RMSE', numpy.min(numpy.sqrt(-scores))
    fixed_set_RMSE.append(numpy.mean(numpy.sqrt(-scores[0])))
    average_RMSE.append(numpy.mean(numpy.sqrt(-scores)))

#Residual
residual = []
for i in range(len(housing_X)):
    residual.append(housing_Y[i] - predicted[i])
print "**************************************" # print "lrate: ", learning_rate # print "alpha: ", alpha # print "n_est: ", n_estimators #gb=GradientBoostingRegressor(max_depth=1,learning_rate=0.04,n_estimators=100) #gb=GradientBoostingRegressor(loss='lad',max_depth=1,learning_rate=0.05,n_estimators=440) #gb=GradientBoostingRegressor(loss='huber',max_depth=1,learning_rate=0.45,n_estimators=200,alpha=0.45) #gb=GradientBoostingRegressor(loss='quantile',max_depth=1,learning_rate=0.028,n_estimators=109,alpha=0.36,criterion="friedman_mse") #gb=GradientBoostingRegressor(loss='quantile',max_depth=1,learning_rate=0.028,n_estimators=109,alpha=0.36,criterion="friedman_mse",subsample=0.6) gb = GradientBoostingRegressor() gb.fit(x, y) gbsc = gb.score(x, y) print "R2 score is: ", gbsc gbsc_hpc = gb.score(x_hpc, y_hpc) print "R2 score on hpc is: ", gbsc_hpc predictions = cross_val_predict(gb, x, y, cv=10) np.clip(predictions, 0, 1, out=predictions) predictions_h = gb.predict(x_hpc) np.clip(predictions_h, 0, 1, out=predictions_h) err = np.mean(abs((predictions - y) / y)) var = np.var(abs((predictions - y) / y)) print "Cross-Predicted Relative Error: ", err print "Cross-Predicted Var of Relative Error: ", var np.savetxt('result1.txt', predictions, delimiter='\n', fmt='%.3f') err = np.mean(abs((predictions - y))) var = np.var(abs((predictions - y))) print "Cross-Predicted Abs Error: ", err print "Cross-Predicted Var of Abs Error: ", var err_h = np.mean(abs((predictions_h - y_hpc) / y_hpc)) var_h = np.var(abs((predictions_h - y_hpc) / y_hpc)) print predictions_h
def main(): #picklef = open(config_file, 'r') #config_dict = pickle.load(picklef) print "\n=========================" print "SURROGATE MODEL GENERATOR" print "=========================" print "PARSE AND CLEAN DATA" print "=========================" # load design and target data into a pandas dataframe from the input csv dataframe = pd.read_csv(input_data_file) # drop rows (samples) with NaNs in them dataframe = dataframe[dataframe.isnull() == False] # split the dataframe into design and target dataframes design_data = dataframe[features] design_labels = design_data.axes target_data = dataframe[targets] target_labels = target_data.axes if DEBUG: print "\nFeatures:\n", design_data print "\nTargets:\n", target_data print "\nParsed data shapes\n design data: ", np.shape( design_data), "\n target data: ", np.shape(target_data) print " #samples: %d\n #input parameters: %d" % (np.shape(design_data)[0], np.shape(design_data)[1]) print " #output parameters: %d" % np.shape(target_data)[1] if DEBUG: print "design data:" print design_data print "target_data:" print target_data if test_split > 0.0: print "\n=========================" print "SPLIT TRAIN AND TEST DATASETS" print "=========================" # split the data into a training set and a testing set for validation later. X_train, X_test, Y_train, Y_test = cross_validation.train_test_split( design_data, target_data, test_size=test_split) print "\nX_train, Y_train:", np.shape(X_train), np.shape(Y_train) print "X_test, Y_test:", np.shape(X_test), np.shape(Y_test) print "training sample size: %d" % np.shape(X_train)[0] print "testing sample size: %d" % np.shape(X_test)[0] if DEBUG: print "X_train:\n", X_train print "Y_train:\n", Y_train else: X_train = design_data Y_train = target_data X_test, Y_test = [], [] # standardize the training data to mean 0 and variance 1 if normalize is True: print "\n=========================" print "DATA NORMALIZATION AND SCALING" print "=========================" # initialize a StandardScaler object to calculate the means and scaling values of each design # parameter (that is, it calculates the means and stdevs over the columns). # We then use the scaler object to transform the entire input data set (except for the design ID # number) to their normalized values. 
X_train_scaler = preprocessing.MinMaxScaler( feature_range=(0, 1)).fit(X_train) X_train_scaled = pd.DataFrame(X_train_scaler.transform(X_train), columns=X_train.axes[1]) if test_split > 0.0: X_test_scaler = preprocessing.MinMaxScaler( feature_range=(0, 1)).fit(X_test) X_test_scaled = pd.DataFrame(X_test_scaler.transform(X_test), columns=X_test.axes[1]) else: X_test_scaled = [] print "\n feature min: ", X_train_scaler.data_min_ print " feature max: ", X_train_scaler.data_max_ print " feature range: ", X_train_scaler.data_range_ print " feature scales: \n", X_train_scaler.scale_ print "\nScaled training inputs:" print " shape: ", np.shape(X_train_scaled) if DEBUG: print "\n X_train_scaled:\n", X_train_scaled print "\nScaled testing inputs:" print " shape:", np.shape(X_test_scaled) print "\n X_test_scaled:\n", X_test_scaled Y_train_scaler = preprocessing.MinMaxScaler( feature_range=(0, 1)).fit(Y_train) Y_train_scaled = pd.DataFrame(Y_train_scaler.transform(Y_train), columns=Y_train.axes[1]) if test_split > 0.0: Y_test_scaler = preprocessing.MinMaxScaler( feature_range=(0, 1)).fit(Y_test) Y_test_scaled = pd.DataFrame(Y_test_scaler.transform(Y_test), columns=Y_test.axes[1]) else: Y_test_scaled = [] print "\n output min: ", Y_train_scaler.data_min_ print " output max: ", Y_train_scaler.data_max_ print " output range: ", Y_train_scaler.data_range_ print " output scales: \n", Y_train_scaler.scale_ print "\nScaled training inputs:" print " shape: ", np.shape(Y_train_scaled) if DEBUG: print "\n Y_train_scaled:\n", Y_train_scaled print "\nScaled testing inputs:" print " shape:", np.shape(Y_test_scaled) print "\n Y_test_scaled:\n", Y_test_scaled #print "\nBefore scaling:" #print np.shape(X_train) #print X_train # This is just for visualizing the normalization transformations with histograms if DEBUG is True and 1: fig, axes = plt.subplots(np.shape(X_train)[1], sharex=True, sharey=True) for ax, label in izip(axes, X_train.axes[1]): ax.hist(X_train[label], bins=7) ax.set_title(label) fig.suptitle( "Distribution of design parameters before normalization") fig, axes = plt.subplots(np.shape(X_train_scaled)[1], sharex=True, sharey=True) print X_train_scaled.axes for ax, label in izip(axes, X_train_scaled.axes[1]): ax.hist(X_train_scaled[label], bins=7) ax.set_title(label) fig.suptitle( "Distribution of design parameters after normalization") if len(Y_train) is not 0 and len(Y_train_scaled) is not 0: fig, axes = plt.subplots(np.shape(Y_train)[1], sharex=True, sharey=True) for ax, label in izip(axes, Y_train.axes[1]): ax.hist(Y_train[label], bins=7) ax.set_title(label) fig.suptitle( "Distribution of performance parameters before normalization" ) fig, axes = plt.subplots(np.shape(Y_train_scaled)[1], sharex=True, sharey=True) for ax, label in izip(axes, Y_train_scaled.axes[1]): ax.hist(Y_train_scaled[label], bins=7) ax.set_title(label) fig.suptitle( "Distribution of performance parameters after normalization" ) plt.show() else: X_train_scaled = X_train X_test_scaled = X_test print "\n=========================" print "SUPPORT VECTOR REGRESSION" print "=========================" surrogate_models = [ ] # Array to hold the surrogate model objects for each output parameter # If gridsearch is True, use scikit-learn's gridsearch to systematically search for optimal # hyperparameter values. Else, we use hyperparameter values set by the user to construct and # train surrogate models for each performance variable. if gridsearch: # construct a surrogate model for each output parameter (performance metric) print "My God... 
They're learning..." for n, target_parameter in enumerate(Y_train_scaled): print "\n------------------------" print target_parameter print "------------------------" if DEBUG: print Y_train_scaled[target_parameter] model = generate_optimized_surrogate( X_train_scaled, Y_train_scaled[target_parameter], label=target_parameter, C_range=C_range, epsilon_range=epsilon_scale, grid_iter=optimize_iter, scoring=model_scoring) surrogate_models.append(model) else: for n, target_parameter in enumerate(Y_train_scaled): print "\n------------------------" print target_parameter print "------------------------" model = SVR(kernel='rbf', C=C_tuple[n], epsilon=epsilon_tuple[n], gamma='auto').fit(X_train_scaled, Y_train_scaled[target_parameter]) surrogate_models.append(model) print "\nSurrogate models:\n", surrogate_models """ print np.shape(surrogate_model) print surrogate_model # make predictions over the output surrogate data. #prediction_outputs = [model.predict(X_train_scaled) for model in surrogate_model] prediction_outputs = surrogate_model[1].predict(X_train_scaled) print np.shape(prediction_outputs) print prediction_outputs """ # If the sampled data was split into training and testing sets, evaluate the generated models # on the testing data. Otherwise, compute cross-validated scores using the training data. # First, instantiate a list to hold our scaler (transformation) objects to transform the values # predicted by the models to the range of the performance metrics being modeled. Y_scalers = [] for n, model in enumerate(surrogate_models): print "\n------------------------" print targets[n] print "------------------------" if test_split > 0.0: print "\n=========================" print "MODEL EVALUATION" print "=========================" predictions = model.predict(X_test_scaled) target_values = Y_test[targets[n]] # reverse-transform the outputs and predictions back to their original values Y_test_scaler = preprocessing.MinMaxScaler().fit( Y_test[targets[n]].reshape(-1, 1)) predictions = Y_test_scaler.inverse_transform( predictions.reshape(-1, 1)) #print Y_test[:,n] #print predictions #result_array = np.column_stack((Y_test[:,n].reshape(-1,1), predictions)) print "test values, predicted values" print target_values, predictions print "model score:", metrics.mean_squared_error( target_values, predictions) #print "model score: ", model.score(target_values, predictions) print "model parameters:" parameters = model.get_params() print ' C: ', parameters['C'] print ' epsilon: ', parameters['epsilon'] #print ' gamma: ', parameters['gamma'] # If a testing set was not set aside, use Leave-One-Out (LOO) cross-validation else: scaled_target_values = Y_train_scaled[targets[n]].values target_values = Y_train[targets[n]].values scores = cross_validation.cross_val_score( model, X_train_scaled.values, scaled_target_values, scoring='mean_squared_error', cv=len(Y_train_scaled)) avg_score = np.mean(scores) score_std = np.std(scores) print "model avg score: %1.5f (+/-%1.5f)" % (-avg_score, score_std) predictions = cross_validation.cross_val_predict( model, X_train_scaled.values, scaled_target_values, cv=len(Y_train_scaled)) # Make a scaler and inverse transform the predictions back to their original, unscaled ranges Y_test_scaler = preprocessing.MinMaxScaler().fit(target_values) predictions = Y_test_scaler.inverse_transform(predictions) Y_scalers.append(Y_test_scaler) print "Y_scalers[%d]: " % n, Y_scalers[n] # plot the predicted vs actual values fig, ax = plt.subplots() ax.scatter(predictions, target_values, 
marker='x') ax.plot(target_values, target_values, c='b', linestyle='--') ax.set_xlabel("Predicted Values") ax.set_ylabel("Actual Values") ax.set_title("Predicted vs Actual Target Values: %s" % targets[n]) fig.savefig('%s%s_%s_predicted_vs_actual.png' % (output_directory, data_title, targets[n])) """ if test_split > 0.0: print "\n=========================" print "MODEL EVALUATION" print "=========================" # step through each model and evaluate its performance on the testing data for n, model in enumerate(surrogate_models): print "\n------------------------" print targets[n] print "------------------------" predictions = model.predict(X_test_scaled) target_values = Y_test[targets[n]] # reverse-transform the outputs and predictions back to their original values Y_test_scaler = preprocessing.MinMaxScaler().fit(Y_test[targets[n]].reshape(-1,1)) predictions = Y_test_scaler.inverse_transform(predictions.reshape(-1,1)) #print Y_test[:,n] #print predictions #result_array = np.column_stack((Y_test[:,n].reshape(-1,1), predictions)) print "test values, predicted values" print target_values, predictions print "model score:", metrics.mean_squared_error(target_values, predictions) #print "model score: ", model.score(target_values, predictions) print "model parameters:" parameters = model.get_params() print ' C: ', parameters['C'] print ' epsilon: ', parameters['epsilon'] #print ' gamma: ', parameters['gamma'] # plot the predicted vs actual values fig, ax = plt.subplots() ax.scatter(predictions, target_values, marker = 'x') ax.plot(target_values, target_values, c='b', linestyle='--') ax.set_xlabel("Predicted Values") ax.set_ylabel("Actual Values") ax.set_title("Predicted vs Actual Target Values: %s" %targets[n]) fig.savefig('%s%s_predicted_vs_actual.png' %(output_directory, targets[n])) else: print "\n=========================" print "MODEL CROSS-VALIDATION" print "=========================" # Use cross-validation to evaluate the models created above for n, model in enumerate(surrogate_models): print "\n------------------------" print targets[n] print "------------------------" scaled_target_values = Y_train_scaled[targets[n]].values target_values = Y_train[targets[n]].values scores = cross_validation.cross_val_score(model, X_train_scaled.values, scaled_target_values, scoring = 'mean_squared_error', cv = len(Y_train_scaled)) avg_score = np.mean(scores) score_std = np.std(scores) print "model avg score: %1.5f (+/-%1.5f)" %(-avg_score, score_std) predictions = cross_validation.cross_val_predict(model, X_train_scaled.values, scaled_target_values, cv = len(Y_train_scaled)) # Make a scaler and inverse transform the predictions back to their original, unscaled ranges Y_test_scaler = preprocessing.MinMaxScaler().fit(target_values) predictions = Y_test_scaler.inverse_transform(predictions) # plot the predicted vs actual values fig, ax = plt.subplots() ax.scatter(predictions, target_values, marker = 'x') ax.plot(target_values, target_values, c='b', linestyle='--') ax.set_xlabel("Predicted Values") ax.set_ylabel("Actual Values") ax.set_title("Predicted vs Actual Target Values: %s" %targets[n]) fig.savefig('%s%s_predicted_vs_actual.png' %(output_directory, targets[n])) """ if save_models is True: model_file = data_title + "_surrogate_models.pkl" input_scaler_file = data_title + "_input_scalers.pkl" scaler_file = data_title + "_datascalers.pkl" models_savefile = output_directory + model_file input_scalers_savefile = output_directory + input_scaler_file scalers_savefile = output_directory + scaler_file 
#models_savefile = "%s%s_surrogate_models.pkl" %(output_directory, data_name) #scalers_savefile = "%s%s_datascalers.pkl" %(output_directory, data_name) with open(models_savefile, 'w') as f: pickle.dump(surrogate_models, f) with open(input_scalers_savefile, 'w') as f: pickle.dump(X_train_scaler, f) with open(scalers_savefile, 'w') as f: pickle.dump(Y_scalers, f) return surrogate_models, Y_scalers
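# --- Minimal sketch (not part of the original script): how the surrogate models, input
# scaler, and per-target output scalers pickled above might be loaded back and used to
# predict performance metrics for a new design point. The file names mirror the save
# step above, but the concrete values of data_title, output_directory, and the
# 3-feature new_design array are assumptions for illustration only.
import pickle
import numpy as np

data_title = "example"       # assumed prefix, matching the save step above
output_directory = "./"      # assumed output directory

with open(output_directory + data_title + "_surrogate_models.pkl", 'rb') as f:
    surrogate_models = pickle.load(f)
with open(output_directory + data_title + "_input_scalers.pkl", 'rb') as f:
    input_scaler = pickle.load(f)
with open(output_directory + data_title + "_datascalers.pkl", 'rb') as f:
    Y_scalers = pickle.load(f)

# One new, unscaled design point with the same number of features used in training
# (three features assumed here purely as an example).
new_design = np.array([[1.0, 2.0, 3.0]])
new_design_scaled = input_scaler.transform(new_design)

# Each model predicts one (scaled) performance metric; the matching output scaler
# maps the prediction back to the metric's original range.
for model, y_scaler in zip(surrogate_models, Y_scalers):
    scaled_prediction = model.predict(new_design_scaled)
    prediction = y_scaler.inverse_transform(scaled_prediction.reshape(-1, 1))
    print(prediction.ravel())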
def linearRegression(X, Y, datasetName, workFlow=False, workFlowNo=0): """ :param X: data consisting of features (excluding class variable) :param Y: column vector consisting of class variable :param datasetName: Network / Housing :param workFlow: for question 3 :return: models linear regression and performs 10 fold Cross Validation displays statistics of various features and plots graphs for predicted/residual and actual values """ print "LINEAR REGRESSION" print "Executing..." print # can change to model on the entire dataset but by convention splitting the dataset is a better option X_train, X_test, Y_train, Y_test = cross_validation.train_test_split( X, Y, test_size=0.10, random_state=5) lm = linear_model.LinearRegression( ) # sklearn Linear Regression model used for cross validation lm.fit(X_train, Y_train) scores = cross_validation.cross_val_score( lm, X, Y, cv=10, scoring='mean_squared_error') # cross validation 10 folds predicted = cross_validation.cross_val_predict(lm, X, Y, cv=10) # differentiate between workflow execution and whole dataset execution if workFlow: est = sm.OLS(Y, X).fit( ) # panda OLS library used to build model on entire dataset and provide stats on variable print est.summary() else: print "WORKFLOW_" + str(workFlowNo) print "Estimated intercept coefficient: ", lm.intercept_ estimatedCoefficients = pd.DataFrame( zip(X.columns, lm.coef_), columns=['Features', 'EstimatedCoefficients']) print estimatedCoefficients rmseEstimator = np.mean((scores * -1)**0.5) # average of the RMSE values print print "RMSE Values of Estimator : " + str(rmseEstimator) print # plot graph for Fitted values vs Actual values plt.scatter(Y, predicted) plt.figure(1) plt.xlabel("Actual Median Value") plt.ylabel("Predicted Median Value") plt.title('Fitted values vs Actual Values') plt.plot([Y.min(), Y.max()], [Y.min(), Y.max()], 'k--', lw=4) if workFlow: plt.savefig( "../Graphs/{0}/Question {1}/LR - Fitted vs Actual.png".format( datasetName, '2a' if datasetName == "Network" else '4a')) else: plt.savefig( "../Graphs/{0}/Question 3/LR - Fitted vs Actual WorkFlow {1}.png". format(datasetName, workFlowNo)) # plot graph for Residual values vs Fitted Values plt.figure(2) plt.xlabel('Fitted Values') plt.ylabel('Residuals') plt.title('Residuals vs Fitted Values plot') plt.scatter(predicted, predicted - Y, c='b', s=40, alpha=0.5) plt.hlines(y=0, xmin=-10, xmax=50) if workFlow: plt.savefig( "../Graphs/{0}/Question {1}/LR - Residuals vs Fitted.png".format( datasetName, '2a' if datasetName == "Network" else '4a')) else: plt.savefig( "../Graphs/{0}/Question 3/LR - Residuals vs Fitted WorkFlow {1}.png" .format(datasetName, workFlowNo)) plt.show()
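# --- Standalone sketch of the RMSE-from-CV-scores step used in linearRegression above.
# With the scikit-learn 0.17-era API that this file uses, 'mean_squared_error' scores
# come back negated, so they are sign-flipped before taking the square root (newer
# releases spell the scorer 'neg_mean_squared_error' and live in sklearn.model_selection).
# The Boston housing data is used only as a convenient stand-in dataset.
import numpy as np
from sklearn import datasets, linear_model
from sklearn.cross_validation import cross_val_score

boston = datasets.load_boston()
lm = linear_model.LinearRegression()

# Ten-fold CV returns one (negative) MSE per fold.
scores = cross_val_score(lm, boston.data, boston.target, cv=10,
                         scoring='mean_squared_error')
rmse_per_fold = np.sqrt(-scores)
print("RMSE per fold: %s" % rmse_per_fold)
print("Mean RMSE: %0.3f" % np.mean(rmse_per_fold))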
from sklearn import datasets
from sklearn import linear_model
from sklearn.cross_validation import cross_val_predict
import matplotlib.pyplot as plt

boston = datasets.load_boston()
#print(boston.DESCR)
#print(boston.target)
print(boston.data)
# CRIM (crime rate)  ZN (share of residential lots over 25,000 sq. ft.)
# INDUS (share of non-retail business land)  CHAS (bounds the Charles River or not)  NOX (air-pollution level)  RM (rooms per dwelling)
# AGE (share of owner-occupied homes)  DIS (distance to the city centre)  RAD (accessibility to highways)
# TAX (property-tax rate)  PTRATIO (pupil-teacher ratio)  B (share of Black residents)
# LSTAT (share of lower-income residents)  MEDV (median home value)
lr = linear_model.LinearRegression()
predict = cross_val_predict(lr, boston.data, boston.target, cv=10)
plt.figure()
plt.scatter(boston.target, predict)
y = boston.target
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
plt.plot()
plt.show()
print(predict)

# In[1]:

from sklearn import datasets
import matplotlib.pyplot as plt
import numpy as np

data = datasets.fetch_olivetti_faces()
#print(data.DESCR)
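# --- Small sketch (an assumption about where the Olivetti snippet above was headed):
# inspect the dataset's shapes and display the first face. fetch_olivetti_faces()
# returns a Bunch with `images` (400 x 64 x 64), `data` (400 x 4096), and `target`,
# which are used directly from the `data` object fetched above.
print(data.images.shape)
print(data.data.shape)
print(data.target.shape)

plt.figure()
plt.imshow(data.images[0], cmap='gray')
plt.title("Olivetti face, subject %d" % data.target[0])
plt.show()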
def randomForestRegression(X, Y, datasetName): """ :param X: data consisting of features (excluding class variable) :param Y: column vector consisting of class variable :param datasetName: Network / Housing :return: models random forest regression with fine-tuning of tree depth and maximum number of trees """ print "RANDOM FOREST REGRESSION" print "Executing..." print # fine tuning of depth of tree depth = range(4, 15) rmse = [] for eachDepth in range( len(depth)): # test depth of tree using 10 folds cross validation estimator = RandomForestRegressor(n_estimators=20, max_depth=depth[eachDepth]) scores = cross_validation.cross_val_score(estimator, X, Y, cv=10, scoring='mean_squared_error') rmse.append((np.mean(scores) * -1)**(0.5)) # plot graphs for RMSE vs Depth of Tree plt.figure(1) plt.plot(depth, rmse) plt.title('RMSE vs Depth of Tree') plt.xlabel('Depth') plt.ylabel('Root Mean Square Error') plt.savefig("../Graphs/{0}/Question 2b/RMSE vs Depth of Tree.png".format( datasetName)) bestDepth = depth[rmse.index( min(rmse))] # best depth obtained on basis of RMSE # fine tuning of number of maximum tree noOfTrees = range(20, 220, 40) rmse = [] for eachTreeSize in range( len(noOfTrees) ): # test maximum number of tree using 10 folds cross validation estimator = RandomForestRegressor(n_estimators=noOfTrees[eachTreeSize], max_depth=bestDepth) scores = cross_validation.cross_val_score(estimator, X, Y, cv=10, scoring='mean_squared_error') rmse.append((np.mean(scores) * -1)**(0.5)) bestTrees = noOfTrees[rmse.index(min(rmse))] # plot graphs for RMSE vs Maximum No. of Tree plt.figure(2) plt.plot(noOfTrees, rmse) plt.title('RMSE vs Maximum No of Tree') plt.xlabel('Number of Tree') plt.ylabel('Root Mean Square Error') plt.savefig( "../Graphs/{0}/Question 2b/RMSE vs No of Tree.png".format(datasetName)) rfe = RandomForestRegressor(n_estimators=bestTrees, max_depth=bestDepth) predicted = cross_validation.cross_val_predict(rfe, X, Y, cv=10) # plot graph for Fitted values vs Actual values plt.figure(3) plt.scatter(Y, predicted) plt.xlabel("Actual Median Value") plt.ylabel("Predicted Median Value") plt.title('Fitted values vs Actual Values') plt.plot([Y.min(), Y.max()], [Y.min(), Y.max()], 'k--', lw=4) plt.savefig("../Graphs/{0}/Question {1}/LR - Fitted vs Actual.png".format( datasetName, '2b' if datasetName == "Network" else '4a')) plt.figure(4) plt.xlabel('Fitted Values') plt.ylabel('Residuals') plt.title('Residuals vs Fitted Values plot') plt.scatter(predicted, predicted - Y, c='b', s=40, alpha=0.5) plt.hlines(y=0, xmin=-10, xmax=50) plt.savefig( "../Graphs/{0}/Question {1}/LR - Residuals vs Fitted.png".format( datasetName, '2b' if datasetName == "Network" else '4a')) # print the best rmse of random forest with maximum no of trees = 140 and depth = 9 print "RANDOM FOREST REGRESSION" print "PARAMETER TUNING" print "Max Depth : " + str(bestDepth) print "Number of Maximum Tree " + str(bestTrees) print "Root Mean Squared Error : " + str(min(rmse)) plt.show()
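# --- Alternative sketch (not in the original): tune max_depth and n_estimators jointly
# with an exhaustive grid search instead of the two sequential sweeps in
# randomForestRegression above. GridSearchCV is imported from sklearn.grid_search to
# match the 0.17-era API used in this file (sklearn.model_selection.GridSearchCV in
# newer releases). X and Y are assumed to be the same feature matrix and target vector
# passed to randomForestRegression.
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV

param_grid = {
    'max_depth': range(4, 15),
    'n_estimators': range(20, 220, 40),
}
search = GridSearchCV(RandomForestRegressor(),
                      param_grid,
                      scoring='mean_squared_error',
                      cv=10)
search.fit(X, Y)  # X, Y assumed to be defined at this point

print("Best parameters: %s" % search.best_params_)
print("Best RMSE: %0.3f" % np.sqrt(-search.best_score_))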
from sklearn import datasets

boston = datasets.load_boston()
# print the dataset description
print(boston.DESCR)

# import the linear support vector regression model
from sklearn.svm import LinearSVR
# import the cross-validation helper
from sklearn.cross_validation import cross_val_predict

feature = boston.data
target = boston.target
# build the linear support vector regression model
model = LinearSVR()
# cross-validate, splitting the dataset into 10 equal folds
predictions = cross_val_predict(model, feature, target, cv=10)

import matplotlib.pyplot as plt
# scatter plot of true vs predicted values
plt.scatter(target, predictions)
# draw the 45-degree reference line
plt.plot([target.min(), target.max()], [target.min(), target.max()], 'r--', lw=2)
# label the axes
plt.xlabel('true_target')
plt.ylabel('prediction')
plt.show()
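# --- Optional follow-up sketch (not in the original): LinearSVR is sensitive to feature
# scale, so one common refinement is to standardize the features inside a Pipeline
# before cross-validating. This reuses the `feature`, `target`, and `predictions`
# variables from the example above; the pipeline itself is an assumption, not part of
# the original example.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

scaled_model = make_pipeline(StandardScaler(), LinearSVR())
scaled_predictions = cross_val_predict(scaled_model, feature, target, cv=10)

print('MSE without scaling: %0.3f' % metrics.mean_squared_error(target, predictions))
print('MSE with scaling:    %0.3f' % metrics.mean_squared_error(target, scaled_predictions))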