# for i in range(0,Total_data_number): for z in randomized_list: temp_feature.append(feature_list_of_all_instances[z]) temp_class.append(class_list_of_all_instances[z]) # continue feature_list_of_all_instances = temp_feature class_list_of_all_instances = temp_class # feature_list_of_all_instances = feature_list_of_all_instances.tolist() # class_list_of_all_instances = class_list_of_all_instances.tolist() data = [] for i in range(0, Total_data_number): data.append(i) # kf = cross_validation.KFold(Total_data_number, n_folds=5, shuffle=True) # # # Cs = numpy.logspace(-6, -1, 10) # # # clf = GridSearchCV(estimator='svc',param_grid=dict(C = Cs) , n_jobs=-1 ) # # print("Starting K fold data to Svm ... ") l = 0 for iteration, data in enumerate(kf, start=1): # print(iteration, data[0], data[1]) train_set_indexes = data[0] test_set_indexes = data[1] temp_total_dataset = []
# parser.add_argument("last_sub_idx", help="last sub", # type=int, default=len(all_subjects)) args = parser.parse_args() start_idx = args.start_sub_idx end_idx = args.end_sub_idx for experiment_counter, subject in enumerate(all_subjects[start_idx:end_idx]): file_name = os.path.join(data_base_dir, subject) all_data_per_char, target_per_char, train_mode_per_block, all_data_per_char_as_matrix, target_per_char_as_matrix = create_data_rep_training( file_name, -200, 800, downsampe_params=8) for rep_per_sub, cross_validation_indexes in enumerate(list(cross_validation.KFold(len(train_mode_per_block)/10, n_folds=4, random_state=42, shuffle=True))): # seperate randomally batch_size = 20 select = 1 train_as_p300 = False train_indexes = train_mode_per_block == 1 validation_indexes = train_mode_per_block == 2 test_indexes = train_mode_per_block != 1 if train_as_p300:
testdex = testing.index else: training = pd.read_csv('../input/train.csv', index_col="item_id", parse_dates=["activation_date"]) traindex = training.index testing = pd.read_csv('../input/test.csv', index_col="item_id", parse_dates=["activation_date"]) testdex = testing.index ntrain = training.shape[0] ntest = testing.shape[0] kf = cross_validation.KFold(ntrain, n_folds=NFOLDS, shuffle=True, random_state=SEED) y = training.deal_probability.copy() training.drop("deal_probability", axis=1, inplace=True) print('Train shape: {} Rows, {} Columns'.format(*training.shape)) print('Test shape: {} Rows, {} Columns'.format(*testing.shape)) print("Combine Train and Test") df = pd.concat([training, testing], axis=0) del training, testing gc.collect() categorical = [ "region", "city", "parent_category_name", "category_name", "user_type", "image_top_1", "param_1", "param_2", "param_3"
print("Finished feature extraction over {} windows".format(len(X))) print("Unique labels found: {}".format(set(y))) sys.stdout.flush() # %%--------------------------------------------------------------------------- # # Train & Evaluate Classifier # # ----------------------------------------------------------------------------- n = len(y) n_classes = len(class_names) # TODO: Train your classifier! cv = cross_validation.KFold(n, n_folds=10, shuffle=False, random_state=None) tree = DecisionTreeClassifier(criterion="entropy", max_depth=3) dtavgacc = 0.0 dtavgprecision = 0.0 dtavgrecall = 0.0 for i, (train_indexes, test_indexes) in enumerate(cv): X_train = X[train_indexes, :] y_train = y[train_indexes] X_test = X[test_indexes, :] y_test = y[test_indexes] tree.fit(X_train, y_train) y_pred = tree.predict(X_test) conf = confusion_matrix(y_test, y_pred, labels=[0,1,2]) dtaccuracy1 = tree.score(X_test, y_test)
def trainAndTestNet(): unsupervisedData, data, labels = createTrainingSet() print np.unique(np.argmax(labels, axis=1)) print "data.shape" print data.shape print "labels.shape" print labels.shape # Random data for training and testing kf = cross_validation.KFold(n=len(data), k=5) for train, test in kf: break print data data = common.scale(data) unsupervisedData = None activationFunction = activationfunctions.Rectified() rbmActivationFunctionVisible = activationfunctions.Identity() rbmActivationFunctionHidden = activationfunctions.RectifiedNoisy() unsupervisedLearningRate = 0.0001 supervisedLearningRate = 0.001 momentumMax = 0.99 trainData = data[train] trainLabels = labels[train] # net = db.DBN(4, [1200, 1500, 1000, len(args.emotions)], # binary=False, # activationFunction=activationFunction, # rbmActivationFunctionVisible=rbmActivationFunctionVisible, # rbmActivationFunctionHidden=rbmActivationFunctionHidden, # unsupervisedLearningRate=unsupervisedLearningRate, # supervisedLearningRate=supervisedLearningRate, # momentumMax=momentumMax, # nesterovMomentum=True, # rbmNesterovMomentum=True, # rmsprop=True, # miniBatchSize=20, # hiddenDropout=0.5, # visibleDropout=0.8, # momentumFactorForLearningRateRBM=False, # firstRBMheuristic=False, # rbmVisibleDropout=1.0, # rbmHiddenDropout=1.0, # preTrainEpochs=10, # sparsityConstraintRbm=False, # sparsityRegularizationRbm=0.001, # sparsityTragetRbm=0.01) # # net.train(trainData, trainLabels, maxEpochs=200, # validation=False, # unsupervisedData=unsupervisedData) # # probs, predicted = net.classify(data[test]) net = cnn.CNN(30, 40, len(args.emotions)) net.train(trainData, trainLabels) probs, predicted = net.classify(data[test]) actualLabels = labels[test] correct = 0 errorCases = [] for i in xrange(len(test)): actual = actualLabels[i] print probs[i] if predicted[i] == np.argmax(actual): correct += 1 else: errorCases.append(i) print "correct" print correct print "percentage correct" print correct * 1.0 / len(test) confMatrix = 
confusion_matrix(np.argmax(actualLabels, axis=1), predicted) print "confusion matrix" print confMatrix with open(args.net_file, "wb") as f: pickle.dump(net, f) return net
def predict_news_article(): ''' It would seem we are essentially unable to predict which articles will get news stories, conditioned on their already receiving a press release! This is actually kind of interesting, since we *are* able to predict (certainly better than chance) which articles will get press release OR news articles. This supports the conclusions of Chambers et al: it seems the press release selection and process is the crucial thing. ''' X, y, vectorizer = get_X_y() lr = LogisticRegression(penalty="l2", fit_intercept=True) parameters = {"C": [.1, .01, .001]} clf0 = GridSearchCV(lr, parameters, scoring='accuracy') print "fitting model..." clf0.fit(X, y) print "done." print texify_most_informative_features(vectorizer, clf0, "predictive features") kf = cross_validation.KFold(X.shape[0], shuffle="true", n_folds=5) fs, aucs = [], [] fold = 0 for train, test in kf: clf = GridSearchCV(lr, parameters, scoring='accuracy') clf.fit(X[train], y[train]) probs = clf.predict_proba(X[test]) #aucs.append(sklearn.metrics.roc_auc_score(y[test], probs)) cur_auc = sklearn.metrics.roc_auc_score(y[test], probs[:, 1]) aucs.append(cur_auc) preds = clf.predict(X[test]) fs.append(sklearn.metrics.f1_score(y[test], preds)) if fold == 0: fpr, tpr, thresholds = sklearn.metrics.roc_curve( y[test], probs[:, 1]) pylab.clf() fout = "roc" pylab.plot(fpr, tpr, label="ROC curve (area = %0.2f)" % cur_auc) pylab.plot([0, 1], [0, 1], 'k--') pylab.xlim((-0.025, 1.025)) pylab.ylim((-0.025, 1.025)) pylab.xlabel("false positive rate") pylab.ylabel("true positive rate") pylab.title("ROC curve (area = %0.2f)" % cur_auc) pylab.tight_layout() pylab.savefig(fout) fold += 1 print "average auc: %s" % (sum(aucs) / float(len(aucs))) print "average fs: %s" % (sum(fs) / float(len(fs))) #print "ABOUT TO RETURN" pdb.set_trace() return clf0
def testPicklingDBN(): data, labels = readKanade(False, None, equalize=False) print "data.shape" print data.shape print "labels.shape" print labels.shape # Random data for training and testing kf = cross_validation.KFold(n=len(data), n_folds=5) for train, test in kf: break if args.relu: activationFunction = Rectified() unsupervisedLearningRate = 0.05 supervisedLearningRate = 0.01 momentumMax = 0.95 data = scale(data) rbmActivationFunctionVisible = Identity() rbmActivationFunctionHidden = RectifiedNoisy() else: activationFunction = Sigmoid() rbmActivationFunctionVisible = Sigmoid() rbmActivationFunctionHidden = Sigmoid() unsupervisedLearningRate = 0.5 supervisedLearningRate = 0.1 momentumMax = 0.9 trainData = data[train] trainLabels = labels[train] # TODO: this might require more thought net = db.DBN(5, [1200, 1500, 1500, 1500, 7], binary=1 - args.relu, activationFunction=activationFunction, rbmActivationFunctionVisible=rbmActivationFunctionVisible, rbmActivationFunctionHidden=rbmActivationFunctionHidden, unsupervisedLearningRate=unsupervisedLearningRate, supervisedLearningRate=supervisedLearningRate, momentumMax=momentumMax, nesterovMomentum=True, rbmNesterovMomentum=True, rmsprop=True, miniBatchSize=20, hiddenDropout=0.5, visibleDropout=0.8, rbmVisibleDropout=1.0, rbmHiddenDropout=1.0, preTrainEpochs=1) net.train(trainData, trainLabels, maxEpochs=10, validation=False, unsupervisedData=None, trainingIndices=train) initialDict = net.__dict__ with open(args.netFile, "wb") as f: pickle.dump(net, f) with open(args.netFile, "rb") as f: net = pickle.load(f) afterDict = net.__dict__ del initialDict['rbmActivationFunctionHidden'] del initialDict['rbmActivationFunctionVisible'] del afterDict['rbmActivationFunctionHidden'] del afterDict['rbmActivationFunctionVisible'] for key in initialDict: assert key in afterDict if isinstance(initialDict[key], (np.ndarray, np.generic)): assert np.arrays_equal(initialDict[key], afterDict[key]) else: assert initialDict[key] == afterDict[key]
Y_pred_NB = modelNB.predict(X_test) printMetrics(Y_pred, Y_predNB) #99.86 #%% # USING CROSS VALIDATION: LOGISTIC REGRESSION #from sklearn.svm import SVC classifier = LogisticRegression() #classifier = svm.SVC(kernel = 'rbf', C = 1, gamma = 0.001) # performing kfold_cross_validation kfold_cv = cross_validation.KFold(n=len(X_train), n_folds=20) print(kfold_cv) #Running the model using scoring metric as Accuracy kfold_cv_result = cross_validation.cross_val_score(estimator=classifier, X=X_train, y=Y_train, cv=kfold_cv) #print(kfold_cv_result) #finding the mean print(kfold_cv_result.mean()) # 99.659 # MAX ACCURACY OF KFOLD FOR LOGISTIC REGRESSION:
#V = mat(V).T # Components to be included as features #k_pca = 3 #X = X*V[:,0:k_pca] #N, M = X.shape # Parameters for neural network classifier n_hidden_units = 2 # number of hidden units n_train = 5 # number of networks trained in each k-fold learning_goal = 10 # stop criterion 1 (train mse to be reached) max_epochs = 64 # stop criterion 2 (max epochs in training) show_error_freq = 3 # frequency of training status updates # K-fold crossvalidation K = 10 # only five folds to speed up this example CV = cross_validation.KFold(N, K, shuffle=True) # Variable for classification error errors = np.zeros(K) error_hist = np.zeros((max_epochs, K)) bestnet = list() k = 0 for train_index, test_index in CV: print('\nCrossvalidation fold: {0}/{1}'.format(k + 1, K)) # extract training and test set for current CV fold X_train = X[train_index, :] y_train = y[train_index] X_test = X[test_index, :] y_test = y[test_index] # X_train = X[train_index]
train_tags_all_subject = [] test_tags_all_subject = [] time_noise = 0 for experiment_counter, subject in enumerate( all_subjects[start_idx:end_idx]): print "start subject:{}".format(subject) file_name = os.path.join(data_base_dir, subject) all_data_per_char, target_per_char, train_mode_per_block, all_data_per_char_as_matrix, target_per_char_as_matrix = create_data_rep_training( file_name, -200 + time_noise, 800 + time_noise, downsampe_params=8) for rep_per_sub, cross_validation_indexes in enumerate( list( cross_validation.KFold(len(train_mode_per_block) / 10, n_folds=4, random_state=42, shuffle=True))): batch_size = 20 select = 1 train_as_p300 = False train_indexes = train_mode_per_block == 1 validation_indexes = train_mode_per_block == 2 test_indexes = train_mode_per_block != 1 if train_as_p300: data_generator_batch = triplet_data_generator_no_dict( all_data_per_char_as_matrix[train_indexes], target_per_char_as_matrix[train_indexes], batch_size=batch_size, select=select,
np.save("train_X.npy", train_X) np.save("train_y.npy", train_y) print "Done" """ print "loading.." train_X = np.load("train_X.npy") train_y = np.load("train_y.npy") print train_X.shape, train_y.shape ################## XGBoost ############### ################## XGBoost ############### print "Building models.." cv_scores = [] kf = cross_validation.KFold(train_X.shape[0], n_folds=8, shuffle=True, random_state=2015) for dev_index, val_index in kf: dev_X, val_X = train_X[dev_index, :], train_X[val_index, :] dev_y, val_y = train_y[dev_index], train_y[val_index] sc = preprocessing.StandardScaler() dev_X = sc.fit_transform(dev_X) val_X = sc.transform(val_X) runNN(dev_X, dev_y, val_X, test_y=val_y) break
from sklearn.qda import QDA from sklearn.svm import LinearSVC, SVC from create_lagged_series import create_lagged_series if __name__ == "__main__": snpret = create_lagged_series("^GSPC", datetime.datetime(2001, 1, 10), datetime.datetime(2005, 12, 31), lags=5) X = snpret[["Lag1", "Lag2"]] y = snpret["Direction"] kf = cross_validation.KFold(len(snpret), n_folds=10, indices=False, shuffle=True, random_state=42) for train_index, test_index in kf: X_train = X.ix[X.index[train_index]] X_test = X.ix[X.index[test_index]] y_train = y.ix[y.index[train_index]] y_test = y.ix[y.index[test_index]] print("Hit Rates/Confusion Matrices:\n") model = SVC(C=1000000.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0001, kernel='rbf',
for score_col in scores: predictions_all.ix[:, score_col] = predictions_all.ix[:, score_col].map( logit) unique_models = predictions_all.modelname.unique() n_obs = essays.meta_data().shape[0] predictions_matrix = np.zeros((len(scores), unique_models.shape[0], n_obs)) for n_score, score in enumerate(scores): for n_model, model in enumerate(unique_models): predictions_matrix[ n_score, n_model, :] = predictions_all_list[n_model][score] predictions_matrix = predictions_matrix.transpose(2, 0, 1) # set up cross validatio cvsets = cross_validation.KFold(len(trainset), 10, random_state=0) trainset = np.where(essays.meta_data().essay_type == "TRAINING")[0] testset = np.where(essays.meta_data().essay_type == "VALIDATION")[0] cvsets = [(trainset[tr], trainset[te]) for tr, te in cvsets] + [(trainset, testset)] # set up final dataframe predictions_df = pd.DataFrame({ 'id': range(essays.meta_data().shape[0]), 'student_id': essays.meta_data()["student_id"], 'test_id': essays.meta_data()["test_id"], 'essay_type': essays.meta_data()["essay_type"],
splits = 5 features = [] for i in range(amount): if (i % 10 == 0): print(i, "/", amount) #features[i] = extractor.splitColorFeatures(thumbs[i],splits) harald = extractor.calculateDarktoBrightRatio(thumbs[i]) rian = extractor.splitColorFeatures(thumbs[i], splits) features.append(numpy.append(harald, rian)) #model = grid_search.GridSearchCV(svm.SVC(),{'kernel' : ['poly'], 'C' : [1, 10, 100, 1000], 'degree' : [4,7,10], 'shrinking' : [True, False]}) #model.fit(features, classes) #print(model.best_estimator_) #print('\a') print("Producing KFold indexes") kfold = cv.KFold(amount, n_folds=5, shuffle=True) model = lda.LDA() #model = svm.SVC(kernel = 'linear') #model = qda.QDA() score = cross_validation.cross_val_score(model, features, classes, cv=kfold) print("scores ", score) print("mean score ", score.mean()) #model = svm.SVC(kernel = 'linear', probability = True) model = lda.LDA() #model = neighbors.KNeighborsClassifier(n_neighbors = 1) scores = score_calculation.loglossKFold(features, classes, model, 5) print("logloss scores ", scores) print("logloss score mean ", numpy.mean(scores), " ", numpy.std(scores)) #predictions = cross_validation.cross_val_predict(model, features, classes, cv = kfold)
def outer_cross():
    """Run the outer loop of a two-level cross-validation and plot results.

    For each of ``OUTER_K`` shuffled KFold splits the number of hidden units
    is chosen by ``inner_cross`` on the training part, a network is then
    selected by ``find_best_network`` and simulated on both the training and
    the test part.  The per-fold results are collected in ``summary_dict``
    and plotted with ``plot_result`` / ``plot_error_vs_units`` after all
    folds have finished.  Nothing is returned.
    """
    # NOTE(review): ``X`` (and presumably ``stats``/``np``) come from this
    # star import; a function-level ``import *`` is only accepted by
    # Python 2 -- confirm before porting.
    from Reading_data import *
    folder_prefix = ['Outer_', 'Inner_']
    # Normalize data
    X = stats.zscore(X)
    # (the original carried a "shuffling X" note here, but no shuffling is
    # done at this point; the KFold below shuffles instead)
    x_index = [24, 25, 31, 33]
    y_index = 54
    doc_nr = 1000
    y = X[:doc_nr, y_index]  # attribute 54 is used as the prediction target
    X = X[:doc_nr, x_index]  # these attributes are used as input
    N, M = X.shape
    print "Max value in all attribute :", np.max(X)
    print "Min value in all attribute ", np.min(X)
    print "X shape : ", X.shape

    # ============ parameters for the ANN =====================
    n_hidden_units = np.arange(2, 13)  # candidate numbers of hidden units
    n_train = 2  # number of networks trained in each k-fold
    learning_goal = 100  # stop criterion 1 (train mse to be reached)
    max_epochs = 65  # stop criterion 2 (max epochs in training)
    show_error_freq = 10  # frequency of training status updates
    # =========================================================
    # Positions of the fields inside each ``summary_dict`` tuple.
    BEST_NEURONS_NR = 0
    NET_TRAIN_ERROR = 1
    BEST_TRAIN_ERROR = 2
    Y_TEST = 3
    Y_TEST_EST = 4
    Y_TRAIN = 5
    Y_TRAIN_EST = 6
    MEAN_TEST_ERR_VS_UNITS = 7
    MEAN_TRAIN_ERR_VS_UNIT = 8
    # ==================================================================
    OUTER_K = 3
    INNER_K = 5
    # These three arrays are allocated but not referenced again below.
    best_hidden_units = np.zeros(OUTER_K)
    Train_errors = np.zeros(OUTER_K)
    Test_errors = np.zeros(OUTER_K)
    # create the two folders for the diagrams
    folder_one = create_new_dir(folder_prefix[0])
    folder_two = create_new_dir(folder_prefix[1])
    #===================================================================
    # {k: (best_neurons_nr, net_train_error, best_train_error, y_test,
    #      y_test_est, y_train, y_train_est, mean_test_err_vs_unit,
    #      mean_train_err_vs_unit)}
    summary_dict = {}
    f = 0  # index of the current outer fold
    OUTER_CV = cross_validation.KFold(N, OUTER_K, shuffle=True)
    for train_index, test_index in OUTER_CV:
        print('\nOuter Crossvalidation fold: {0}/{1}'.format(f + 1, OUTER_K))
        # extract training and test set for current CV fold
        X_train = X[train_index, :]
        # NOTE(review): two-index access on ``y`` only works if ``y`` is
        # 2-D (e.g. X is a numpy matrix) -- confirm against Reading_data.
        y_train = y[train_index, :]
        X_test = X[test_index, :]
        y_test = y[test_index, :]
        # Inner cross-validation picks the number of hidden units;
        # ``mean_best_train_err_vs_unit`` is not used afterwards.
        best_neurons_nr, mean_test_err_vs_unit, mean_train_err_vs_unit, mean_best_train_err_vs_unit = inner_cross(
            X_train, y_train, INNER_K, n_hidden_units, n_train, learning_goal,
            max_epochs, show_error_freq)
        bestnet, best_train_error, net_train_errors = find_best_network(
            X_train, y_train, n_train, best_neurons_nr, learning_goal,
            max_epochs, show_error_freq)
        y_test_est = bestnet.sim(X_test)
        y_train_est = bestnet.sim(X_train)
        summary_dict[f] = (best_neurons_nr, net_train_errors,
                           best_train_error, y_test, y_test_est, y_train,
                           y_train_est, mean_test_err_vs_unit,
                           mean_train_err_vs_unit)
        #for index in x_index:
        #    new_index=x_index.index(index)
        #    plot_featue_vs_residual(attributeNames[index],X_train[:,new_index],index,abs(y_test_est-y_test),folder_one)
        f += 1
    # After all folds are done, visualize the collected results.
    for key in summary_dict:
        value = summary_dict[key]
        plot_result(key, value[BEST_NEURONS_NR], value[Y_TEST],
                    value[Y_TEST_EST], folder_one)
        plot_error_vs_units(key, value[MEAN_TRAIN_ERR_VS_UNIT],
                            value[MEAN_TEST_ERR_VS_UNITS], n_hidden_units,
                            folder_two)
def runGradientBoosting(train_file, train_label_file, test_file, test_label_file): ############################################## # Load Data train_data = csv_io.read_data(train_file) train_label = csv_io.read_data(train_label_file) train_data = np.array([x[0:] for x in train_data]) train_label = np.array([x[0] for x in train_label]) ############################################## # Fit regression model params = { 'n_estimators': 50, 'max_depth': 10, 'min_samples_split': 2, 'learning_rate': 0.15 } rf = ensemble.GradientBoostingRegressor(**params) # Doing cross validation using 10 folds cv = cross_validation.KFold(len(train_data), n_folds=3) average_importance = 0 average_score = 0 for traincv, testcv in cv: print train_data[traincv].shape, train_label[traincv].shape rf.fit(train_data[traincv], train_label[traincv]) average_score += rf.score(train_data[testcv], train_label[testcv]) #print rf.predict(train_data[testcv]) print rf.feature_importances_ mse = mean_squared_error(train_label[testcv], rf.predict(train_data[testcv])) mse_train = mean_squared_error(train_label[traincv], rf.predict(train_data[traincv])) print("MSE: %.4f" % mse) print("MSE: %.4f" % mse_train) print "average score is" + str(average_score / 10) # End doing cross validaiton # Predicting using test data test_data = csv_io.read_data(test_file) test_data = np.array([x[0:] for x in test_data]) #test_label = csv_io.read_data(test_label_file) #test_label = np.array( x[1] for x in test_label ) max_probs = [] min_mse = 1 for i in range(1, 20): rf.fit(train_data, train_label) predicted_probs = rf.predict_proba(test_data) mse_train = mean_squared_error(train_label, rf.predict(train_data)) if min_mse > mse_train: min_mse = mse_train max_probs = predicted_probs print np.array(max_probs).shape #print mean_squared_error(test_label, max_probs) postprocessing.getBenchmark(max_probs)
plt.semilogx(alphas, np.array(scores) - np.array(scores_std) / np.sqrt(len(X)), 'b--') plt.ylabel('CV score') plt.xlabel('alpha') plt.axhline(np.max(scores), linestyle='--', color='.5') ############################################################################## # Bonus: how much can you trust the selection of alpha? # To answer this question we use the LassoCV object that sets its alpha # parameter automatically from the data by internal cross-validation (i.e. it # performs cross-validation on the training data it receives). # We use external cross-validation to see how much the automatically obtained # alphas differ across different cross-validation folds. lasso_cv = linear_model.LassoCV(alphas=alphas) k_fold = cross_validation.KFold(len(X), 3) print("Answer to the bonus question:", "how much can you trust the selection of alpha?") print() print("Alpha parameters maximising the generalization score on different") print("subsets of the data:") for k, (train, test) in enumerate(k_fold): lasso_cv.fit(X[train], y[train]) print("[fold {0}] alpha: {1:.5f}, score: {2:.5f}".format( k, lasso_cv.alpha_, lasso_cv.score(X[test], y[test]))) print() print("Answer: Not very much since we obtained different alphas for different") print("subsets of the data and moreover, the scores for these alphas differ") print("quite substantially.")
def run_pipe(input_files, input_labels, use_modules, no_proc):
    """Run the SVR workflow: leave-one-out evaluation, then a final model.

    Args:
        input_files: array whose first axis indexes samples; the remaining
            axes are flattened into features.
        input_labels: one target value per sample.
        use_modules: string of stage flags; 'a' enables Ward agglomeration,
            'b' the direction cutoff, 'c' standard scaling and 'd'
            univariate (FPR) feature selection.
        no_proc: number of parallel jobs for the cross-validation.

    Returns:
        ``(cv_pred, corr, p, final_agglo, final_univ, final_scaler,
        bool_pos, final_model)``; a stage that is switched off is reported
        as ``0``.
    """
    # Positions whose sum over the sample axis is NaN are excluded from the
    # connectivity graph ...
    voxel_sums = np.sum(input_files, 0)
    connect = sklim.grid_to_graph(*input_files[0].shape,
                                  mask=np.invert(np.isnan(voxel_sums)))

    # ... and from the flattened (samples x features) matrix.
    n_samples = input_files.shape[0]
    feature_matrix = input_files.reshape((n_samples, -1))
    column_sums = np.sum(feature_matrix, 0)
    feature_matrix = feature_matrix[:, np.invert(np.isnan(column_sums))]

    # As many folds as samples, i.e. every sample is held out once.
    n_labels = len(input_labels)
    loo = sklcv.KFold(n_labels, n_folds=n_labels)
    print('Starting svr')
    runner = jl.Parallel(n_jobs=no_proc, verbose=1, pre_dispatch=no_proc * 2)
    cv_pred = runner(
        jl.delayed(do_model)(feature_matrix[train], input_labels[train],
                             feature_matrix[test], connect, use_modules)
        for train, test in loo)
    cv_pred = np.array(cv_pred)
    corr, p = ss.pearsonr(cv_pred[:, 0], input_labels)

    # Final model: each enabled stage is refitted on all data, using the
    # median over folds of the matching column of ``cv_pred`` as its
    # parameter.
    print('creating final model')
    final_agglo = 0
    bool_pos = 0
    final_scaler = 0
    final_univ = 0

    if 'a' in use_modules:
        n_clusters = int(np.median(cv_pred[:, 1]))
        final_agglo = sklcl.WardAgglomeration(connectivity=connect,
                                              n_clusters=n_clusters)
        feature_matrix = final_agglo.fit_transform(feature_matrix)

    if 'b' in use_modules:
        bool_pos, _ = direction_cutoff(feature_matrix)
        feature_matrix = feature_matrix[:, bool_pos]

    if 'c' in use_modules:
        final_scaler = sklpre.StandardScaler()
        feature_matrix = final_scaler.fit_transform(feature_matrix)

    if 'd' in use_modules:
        final_univ = sklfs.SelectFpr(alpha=np.median(cv_pred[:, 2]))
        feature_matrix = final_univ.fit_transform(feature_matrix,
                                                  input_labels)

    final_model = sklsvm.NuSVR(kernel='linear', C=100, degree=1,
                               nu=np.median(cv_pred[:, 3]))
    final_model.fit(feature_matrix, input_labels)

    return (cv_pred, corr, p, final_agglo, final_univ, final_scaler,
            bool_pos, final_model)
# Fit a per-column scaler X_scaler = StandardScaler().fit(X) # Apply the scaler to X X_train = X_scaler.transform(X) y_train = np.array(label_list) # Convert label strings to numerical encoding encoder = LabelEncoder() y_train = encoder.fit_transform(y_train) # Create classifier clf = svm.SVC(kernel='linear') # Set up 5-fold cross-validation kf = cross_validation.KFold(len(X_train), n_folds=5, shuffle=True, random_state=1) # Perform cross-validation scores = cross_validation.cross_val_score(cv=kf, estimator=clf, X=X_train, y=y_train, scoring='accuracy') print('Scores: ' + str(scores)) print('Accuracy: %0.2f (+/- %0.2f)' % (scores.mean(), 2 * scores.std())) # Gather predictions predictions = cross_validation.cross_val_predict(cv=kf, estimator=clf, X=X_train,
def linear_regression(self, lr_input, sqlContext, cc_output):
    """Fit a Lasso model on the stored training data and score new rows.

    Args:
        lr_input: data frame to score; its column names select the feature
            subset and it must expose a ``confidence`` column.
        sqlContext: Spark SQL context used to turn the scored pandas frame
            into a Spark data frame.
        cc_output: pandas data frame that receives the predictions in a new
            ``score`` column (modified in place).

    Returns:
        Always 1.  The scored rows are written as text to the path built
        from ``spark_conf.hdfs_path['classifier_output']``.
    """
    # try:
    logging.info('Performing Regression')
    dfi = pd.read_csv(spark_conf.file_path['data_update_path'] +
                      'Encoded_Classified_1.txt',
                      sep='|',
                      encoding='ISO-8859-1')
    #dfi_test = pd.read_csv(spark_conf.classifier_input['output_file']+'Encoded_Classified_test.txt', sep = '|', encoding = 'ISO-8859-1')
    dfi_test_new = cc_output
    dfi_test = lr_input
    input_list = list(dfi_test)
    #print input_list
    # NOTE(review): ``corr`` is only used by the commented-out heatmap below.
    corr = dfi.corr()
    #sns.heatmap(corr,xticklabels=corr.columns,yticklabels=corr.columns)
    feature_cols = [
        'likes', 'comment_count', 'user_level_num', 'Average', 'Avoid!',
        'Blah!', 'Good Enough', 'Great!', 'Insane!', 'Not rated', 'Very Bad',
        'Well...', 'Big Foodie', 'Connoisseur', 'Foodie', 'Super Foodie',
        'Bad Ambience', 'Bad Food', 'Bad Service', 'Good Ambience',
        'Good Food', 'Good Service', 'Not Worthy',
        'binarized_user_foodie_level', 'binarized_rating_text',
        'binarized_class_name'
    ]
    # Use only the known feature columns that are present in the input.
    # The same list is used for both training and scoring, so the column
    # order is consistent between the two.
    feature_cols_1 = list(set(input_list).intersection(feature_cols))
    # print feature_cols_1
    X_train = dfi[:-1]  # every training row except the last one
    # print len(X_train)
    X_test = dfi_test[0:]  # all rows of the input
    # print len(X_test)
    y_train = dfi.confidence[:-1]
    # print len(y_train)
    # NOTE(review): ``y_test`` is only referenced by commented-out code.
    y_test = dfi_test.confidence[0:]
    #print len(y_test)
    X = X_train[feature_cols_1]
    y = y_train
    Xtest = X_test[feature_cols_1]
    regr = linear_model.Lasso(alpha=0.0000000001,
                              fit_intercept=True,
                              normalize=False,
                              precompute=False,
                              copy_X=True,
                              max_iter=1000,
                              tol=0.0001,
                              warm_start=False,
                              positive=False,
                              random_state=None,
                              selection='cyclic')
    regr.fit(X, y)
    # 10-fold shuffled cross-validation of the same estimator.
    # NOTE(review): ``scores`` is computed but never reported or returned.
    shuffle = cross_validation.KFold(len(X),
                                     n_folds=10,
                                     shuffle=True,
                                     random_state=0)
    scores = cross_validation.cross_val_score(regr, X, y, cv=shuffle)
    #print("Accuracy: %.3f%% (%.3f%%)") % (scores.mean()*100.0, scores.std()*100.0)
    #print regr.intercept_
    #print (regr.coef_)
    #print mean_squared_error(regr.predict(Xtest), y_test)**0.5
    #print regr.predict(Xtest)
    #print regr.score(X,y)
    # Attach the predictions by position (``.values``), not by index.
    se = pd.Series(regr.predict(Xtest))
    dfi_test_new['score'] = se.values
    # dfi_test['xyz'] = se.values
    print list(dfi_test_new)
    df_s = sqlContext.createDataFrame(dfi_test_new)
    #df_s.show()
    #print df_s.count()
    # Each row is converted to a list, passed through ``filter_data`` and
    # saved as text; the file name carries ``spark_conf.utc_time()[1]``.
    df_s.rdd.map(lambda x: list(x)).map(lambda y: filter_data(
        y)).saveAsTextFile(spark_conf.hdfs_path['classifier_output'] +
                           '%s.txt' % spark_conf.utc_time()[1])
    # dfi_test.to_csv(spark_conf.classifier_input['output_file']+'final_Output.txt',sep='|',encoding="ISO-8859-1")
    return 1
# Try cross-validation(We did a 3 fold crossvalidation and arbitaryly assigning the following paremeters) """bug: the following code arise an warning: overflow encountered in exp. Check out the discussion about it later http://comments.gmane.org/gmane.comp.python.scikit-learn/3730""" # 3-fold cross validation means: # Ex: >>X = np.array([2, 3, 1, 0,12,10,22,11,22,111,23,12]), # >>kfold = cross_validation.KFold(len(X), n_folds=3) 3-fold means 2/3 train, 1/3 test # ###ytrain [ 12 10 22 11 22 111 23 12] ytest [2 3 1 0] ###ytrain [ 2 3 1 0 22 111 23 12] ytest [12 10 22 11] ###ytrain [ 2 3 1 0 12 10 22 11] ytest [ 22 111 23 12] from sklearn.ensemble import GradientBoostingClassifier from sklearn import cross_validation kfold = cross_validation.KFold(len(X), n_folds=3) gbc= GradientBoostingClassifier(n_estimators=100, max_depth=1, learning_rate=1.0,random_state=0) crossScores=[gbc.fit(X[train], y[train]).score(X[test], y[test]) for train, test in kfold] print crossScores, 'average crossvalidation score ='+str(sum(crossScores)/3) ##return : 0.55 """Algorithm evaluation starts here!!!!""" seperateIdx= len(X)*60/100 X_train = X[0:seperateIdx] y_train = y[0:seperateIdx] X_test = X[seperateIdx:] y_test = y[seperateIdx:]
def c_validation(X, y, k, function):
    """Evaluate ``function`` with k-fold cross-validation and plot the result.

    Args:
        X: feature array, indexed by row.
        y: label array aligned with ``X``.
        k: number of folds.
        function: callable ``function(trainSet, trainLabels, testSet)``
            returning predictions that are read as ``predictions[i][0]``.

    Per fold, four figures are printed as percentages: the mean relative
    error ("Loss"), the mean absolute error ("Average error") and the share
    of predictions within 5% / 10% of the label.  Predictions above 1 are
    replaced (in place) by the mean training label.  After all folds the
    predictions and labels are plotted and the fold averages printed.

    Returns:
        The relative-error loss averaged over the ``k`` folds.
    """
    folds = cross_validation.KFold(X.shape[0], n_folds=k)

    totalloss = 0
    totalpercentageloss = 0
    totalsuccess5 = 0
    totalsuccess10 = 0
    res = []   # predictions of every fold, in fold order
    corr = []  # matching true labels

    for trainIndex, testIndex in folds:
        trainSet, testSet = X[trainIndex], X[testIndex]
        trainLabels, testLabels = y[trainIndex], y[testIndex]

        # Mean training label: the fallback for out-of-range predictions.
        avg = sum(trainLabels) / trainLabels.shape[0]

        predictedLabels = function(trainSet, trainLabels, testSet)

        n_test = testLabels.shape[0]
        loss = 0
        percentageloss = 0
        success5 = 0
        success10 = 0
        for i in range(testSet.shape[0]):
            # NOTE(review): the source was unindented; the index print is
            # taken to belong to the NaN check -- confirm.
            if np.isnan(predictedLabels[i][0]):
                print(i)
                continue

            if predictedLabels[i][0] > 1:
                predictedLabels[i][0] = avg

            label = testLabels[i]
            error = abs(predictedLabels[i][0] - label)
            loss += error / (label * n_test)
            percentageloss += error / n_test

            relative_error = error / label
            if relative_error < 0.05:
                success5 += 1 / n_test
                success10 += 1 / n_test
            elif relative_error < 0.1:
                success10 += 1 / n_test

        print('Loss: ', 100 * loss, '%')
        print('Average error: ', 100 * percentageloss, '%')
        print('Success 0.05: ', 100 * success5, '%')
        print('Success 0.1: ', 100 * success10, '%')

        totalloss += loss
        totalpercentageloss += percentageloss
        totalsuccess5 += success5
        totalsuccess10 += success10
        res.extend(predictedLabels)
        corr.extend(testLabels[:])

    plt.plot(res, linestyle='', marker='.')
    plt.plot(corr, linestyle='', marker='.')
    plt.show()

    print('Total Loss: ', 100 * totalloss / k, '%')
    print('Total Average Error: ', 100 * totalpercentageloss / k, '%')
    print('Total success 0.05: ', 100 * totalsuccess5 / k, '%')
    print('Total success 0.1: ', 100 * totalsuccess10 / k, '%')
    return totalloss / k
# -*- coding: utf-8 -*- """ @author: Yu """ import numpy from sklearn import linear_model, cross_validation data = numpy.genfromtxt('random_housing_data.csv', delimiter=',', skip_header=1) alphas = [0.1, 0.01, 0.001] others = data[:, (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)] medv = data[:, 13] rmse = [] tenfold = cross_validation.KFold(len(others), 10, True) ridge = linear_model.RidgeCV(alphas, False, False, None, tenfold, None, False) ridge.fit(others, medv) predicted = ridge.predict(others) rmse.append(numpy.sqrt(((predicted - medv)**2).mean())) print 'RMSE: \n', rmse print 'Alpha: \n', ridge.alpha_
bothClassifier = SVC() end = time.time() ######################################### # genderClassifier.fit(train, genders) # ageClassifier.fit(train, ages) # with open('image_classifiers.bin', 'wb') as fp: # pickle.dump(genderClassifier, fp) # pickle.dump(ageClassifier, fp) # fp.close() # Run 10-Fold Validation ######################################### cv = cross_validation.KFold(size, n_folds=10) resultsGender = [] resultsAge = [] resultsBoth = [] i = 1 for traincv, testcv in cv: print "Starting iteration ", i start = time.time() s = time.time() genderClassifier.fit(train[traincv[0]:traincv[-1]], genders[traincv[0]:traincv[-1]]) ageClassifier.fit(train[traincv[0]:traincv[-1]], ages[traincv[0]:traincv[-1]]) bothClassifier.fit(train[traincv[0]:traincv[-1]], both[traincv[0]:traincv[-1]])
def getHyperParamsAndBestNet(): unsupervisedData, data, labels = createTrainingSet() print np.unique(np.argmax(labels, axis=1)) print "data.shape" print data.shape print "labels.shape" print labels.shape print data data = common.scale(data) unsupervisedData = None activationFunction = activationfunctions.Rectified() rbmActivationFunctionVisible = activationfunctions.Identity() rbmActivationFunctionHidden = activationfunctions.RectifiedNoisy() tried_params = [] percentages = [] best_index = 0 index = 0 best_correct = 0 # Random data for training and testing kf = cross_validation.KFold(n=len(data), n_folds=10) for train, test in kf: unsupervisedLearningRate = random.uniform(0.0001, 0.2) supervisedLearningRate = random.uniform(0.0001, 0.2) momentumMax = random.uniform(0.7, 1) tried_params += [{ 'unsupervisedLearningRate': unsupervisedLearningRate, 'supervisedLearningRate': supervisedLearningRate, 'momentumMax': momentumMax }] trainData = data[train] trainLabels = labels[train] # net = db.DBN(4, [1200, 1500, 1000, len(args.emotions)], # binary=False, # activationFunction=activationFunction, # rbmActivationFunctionVisible=rbmActivationFunctionVisible, # rbmActivationFunctionHidden=rbmActivationFunctionHidden, # unsupervisedLearningRate=unsupervisedLearningRate, # supervisedLearningRate=supervisedLearningRate, # momentumMax=momentumMax, # nesterovMomentum=True, # rbmNesterovMomentum=True, # rmsprop=True, # miniBatchSize=20, # hiddenDropout=0.5, # visibleDropout=0.8, # momentumFactorForLearningRateRBM=False, # firstRBMheuristic=False, # rbmVisibleDropout=1.0, # rbmHiddenDropout=1.0, # preTrainEpochs=10, # sparsityConstraintRbm=False, # sparsityRegularizationRbm=0.001, # sparsityTragetRbm=0.01) # # net.train(trainData, trainLabels, maxEpochs=200, # validation=False, # unsupervisedData=unsupervisedData) # # probs, predicted = net.classify(data[test]) net = cnn.CNN(30, 40, len(args.emotions)) net.train(trainData, trainLabels) probs, predicted = net.classify(data[test]) 
actualLabels = labels[test] correct = 0 for i in xrange(len(test)): actual = actualLabels[i] print probs[i] if predicted[i] == np.argmax(actual): correct += 1 percentage_correct = correct * 1.0 / len(test) print "percentage correct" print percentage_correct if percentage_correct > best_correct: best_index = index best_correct = percentage_correct with open(args.net_file, "wb") as f: pickle.dump(net, f) percentages += [percentage_correct] index += 1 print 'best params' print tried_params[best_index] print 'precision' print best_correct
elif option == '-l': label_file = value else: assert False, "Option %s not available" % option if not data_file or not label_file: Usage() data = np.genfromtxt(data_file, delimiter=',') labels = np.genfromtxt(label_file, delimiter='\n') #data=data[labels!=2] # Normalizing data data = preprocessing.scale(data) preds_1 = np.zeros(data.shape[0]) preds_2 = np.zeros(data.shape[0]) preds_3 = np.zeros(data.shape[0]) preds_4 = np.zeros(data.shape[0]) for train_index, test_index in cross_validation.KFold(data.shape[0], n_folds=10): #for train_index, test_index in cross_validation.LeaveOneOut(data.shape[0]): print 'Running for a split...' estimator = LogisticRegression() estimator.fit(data[train_index], labels[train_index]) preds_1[test_index] = estimator.predict(data[test_index]) estimator = sklearn.dummy.DummyClassifier(strategy='stratified',random_state=0) estimator.fit(data[train_index], labels[train_index]) preds_2[test_index] = estimator.predict(data[test_index]) estimator = RandomForestClassifier(n_estimators=20, n_jobs=5) estimator.fit(data[train_index], labels[train_index]) preds_3[test_index] = estimator.predict(data[test_index]) #GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
def run_stack(SEED): model = "DIV Long-Lat KNN5 55 Imp" print "Running GB, RF, ET stack." trainBase = csv_io.read_data("PreProcessData/training_PreProcess5_40.csv", skipFirstLine=False, split="\t") test = csv_io.read_data("PreProcessData/test_PreProcess5_40.csv", skipFirstLine=False, split="\t") weights = csv_io.read_data("PreProcessData/Weights.csv", skipFirstLine=False) #random.seed(SEED) #random.shuffle(trainBase) avg = 0 NumFolds = 5 # 5 is good, but 10 yeilds a better mean since outliers are less significant. (note, predictions are less reliable when using 10). predicted_list = [] bootstrapLists = [] # use this for quick runs. # note RF with 150 crashes on 30 features # GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50), # GradientBoostingRegressor(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125), # RandomForestRegressor(n_estimators=100, n_jobs=1), #RandomForestRegressor(n_estimators=75, n_jobs=1), # clfs = [ExtraTreesRegressor(n_estimators=50, min_density=0.2, n_jobs=1, bootstrap=True, random_state=1), # SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=2**-5.5, kernel='rbf', probability=False, shrinking=True, tol=0.001, verbose=False) # ] #knn 5 at 3.45 #knn 15 at 3.31 #knn 25 at 3.30 #knn 40 at 3.31 # KNeighborsRegressor(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2), # KNeighborsRegressor(n_neighbors=15, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2), # KNeighborsRegressor(n_neighbors=25, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2), # LinearRegression at 3.77 # Ridge at 3.77 # SGD 4.23 #Gauss at 13 # LinearRegression(fit_intercept=True, normalize=False, copy_X=True), # Ridge(alpha=1.0, fit_intercept=True, normalize=False, copy_X=True, tol=0.001), # SGDRegressor(loss='squared_loss', penalty='l2', alpha=0.0001, rho=0.84999999999999998, fit_intercept=True, 
n_iter=5, shuffle=False, verbose=0, epsilon=0.10000000000000001, p=None, seed=0, learning_rate='invscaling', eta0=0.01, power_t=0.25, warm_start=False), # GaussianNB() # clfs = [KNeighborsRegressor(n_neighbors=15, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2), # KNeighborsRegressor(n_neighbors=25, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2),KNeighborsRegressor(n_neighbors=35, weights='uniform', algorithm='auto', leaf_size=30, warn_on_equidistant=False, p=2) # ] # GB, 125 est is minimum, score is bad below this, explore higher and other dimensions. ****************** # clfs = [GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=100, random_state=166), # GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=100, random_state=166), # GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=10, n_estimators=100, random_state=166), # GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=200, random_state=166), # GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=200, random_state=166),GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=10, n_estimators=200, random_state=166) # ] # about 1 hour run time, and 3.10 score. 
#GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=400, random_state=166) # about 2 hours run time at 3.05 #GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=400, random_state=166) # about 2 hours run time at 3.06 #GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=800, random_state=166) # about 4 hours run time at 3.06 #GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=8, n_estimators=800, random_state=166) # 6/2000 on 40 features is 2.97 # I clfs = [ GradientBoostingRegressor(max_features=40, learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=3000, random_state=166, min_samples_leaf=1) ] # use this for quick runs. # clfs = [GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50, random_state=166), # GradientBoostingClassifier(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125, random_state=551), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=80, random_state=441), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=80, random_state=331), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=80, random_state=221), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=120, random_state=91), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=120, random_state=81), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=120, random_state=71), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=160, random_state=61), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=160, random_state=51), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=160, random_state=41), # 
GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=200, random_state=31), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=200, random_state=21), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=200, random_state=10), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=200, random_state=19), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=240, random_state=18), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=240, random_state=17), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=240, random_state=16), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=280, random_state=15), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=280, random_state=14), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=280, random_state=13), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=320, random_state=12), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=320, random_state=11), # RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='gini'), # RandomForestClassifier(n_estimators=100, n_jobs=1, criterion='entropy'), # ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1), # ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5)] # use this for quick runs. 
reduced estimators to 50 # clfs = [SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, # gamma=2**-5.5, kernel='rbf', probability=False, shrinking=True, # tol=0.001, verbose=False) # ] #GradientBoostingRegressor(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50), #ExtraTreesRegressor(n_estimators=50, min_density=0.2, n_jobs=1, bootstrap=True, random_state=1) # clfs = [ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1), # ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2), # ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5), # ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='entropy', bootstrap=True, random_state=6), # GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50), # GradientBoostingClassifier(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=80), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=80), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=80), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=120), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=120), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=120), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=160), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=160), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=160), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=200), # 
GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=200), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=200), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=200), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=240), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=240), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=240), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=280), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=280), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=280), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=320), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=320), # RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1), # RandomForestClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2), # RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5), # RandomForestClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='entropy', bootstrap=True, random_state=7)] # full algorithm stack. 
# clfs = [ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1), # ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2), # ExtraTreesClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='gini', bootstrap=True, random_state=3), # ExtraTreesClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='gini', bootstrap=True, random_state=4), # ExtraTreesClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5), # ExtraTreesClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='entropy', bootstrap=True, random_state=6), # ExtraTreesClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='entropy', bootstrap=True, random_state=7), # ExtraTreesClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='entropy', bootstrap=True, random_state=8), # GradientBoostingClassifier(learn_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50), # GradientBoostingClassifier(learn_rate=0.02, subsample=0.2, max_depth=8, n_estimators=125), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=80), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=80), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=80), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=120), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=120), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=120), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=160), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=160), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=160), # 
GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=200), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=200), # GradientBoostingClassifier(lesarn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=200), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=4, n_estimators=200), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=240), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=240), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=240), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=280), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=280), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=3, n_estimators=280), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=1, n_estimators=320), # GradientBoostingClassifier(learn_rate=0.01, subsample=0.2, max_depth=2, n_estimators=320), # RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='gini', bootstrap=True, random_state=1), # RandomForestClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='gini', bootstrap=True, random_state=2), # RandomForestClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='gini', bootstrap=True, random_state=3), # RandomForestClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='gini', bootstrap=True, random_state=4), # RandomForestClassifier(n_estimators=150, min_density=0.2, n_jobs=1, criterion='entropy', bootstrap=True, random_state=5), # RandomForestClassifier(n_estimators=150, min_density=0.09, n_jobs=1, criterion='entropy', bootstrap=True, random_state=6), # RandomForestClassifier(n_estimators=150, min_density=0.05, n_jobs=1, criterion='entropy', bootstrap=True, random_state=7), # 
RandomForestClassifier(n_estimators=150, min_density=0.02, n_jobs=1, criterion='entropy', bootstrap=True, random_state=8)] print "Data size: ", len(trainBase), len(test) dataset_blend_train = np.zeros((len(trainBase), len(clfs))) dataset_blend_test = np.zeros((len(test), len(clfs))) trainNew = [] trainTestNew = [] testNew = [] trainNewSelect = [] trainTestNewSelect = [] testNewSelect = [] print "Scaling" targetPre = [x[0] for x in trainBase] trainPre = [x[1:] for x in trainBase] testPre = [x[0:] for x in test] #print trainPre[0] scaler = preprocessing.Scaler().fit(trainPre) trainScaled = scaler.transform(trainPre) testScaled = scaler.transform(testPre) #print scaler.mean_ #print scaler.std_ print "Begin Training" for ExecutionIndex, clf in enumerate(clfs): print clf avg = 0 predicted_list = [] dataset_blend_test_set = np.zeros((len(test), NumFolds)) foldCount = 0 #Stratified for classification...[trainBase[i][0] for i in range(len(trainBase))] Folds = cross_validation.KFold(len(trainBase), k=NumFolds, indices=True) for train_index, test_index in Folds: #trainBaseTemp = [trainBase[i] for i in train_index] #target = [x[0] for x in trainBaseTemp] #train = [x[1:] for x in trainBaseTemp] #testBaseTemp = [trainBase[i] for i in test_index] #targetTest = [x[0] for x in testBaseTemp] #trainTest = [x[1:] for x in testBaseTemp] #test = [x[0:] for x in test] target = [targetPre[i] for i in train_index] train = [trainScaled[i] for i in train_index] targetTest = [targetPre[i] for i in test_index] trainTest = [trainScaled[i] for i in test_index] print print "Iteration: ", foldCount print "LEN: ", len(train), len(target) clf.fit(train, target) prob = clf.predict(trainTest) dataset_blend_train[test_index, ExecutionIndex] = prob probSum = 0 weightSum = 0 # totalOffByHalf = 0 # totalPositive = 0 # totalPositiveOffByHalf = 0 # totalPositivePredictions = 0 for i in range(0, len(prob)): probX = prob[i] probSum += weights[test_index[i]][0] * math.fabs( targetTest[i] - probX) weightSum += 
weights[test_index[i]][0] #print "Weight", weights[test_index[i]][0], "Index: ",i, "Test_Index: ",test_index[i] , "Actual: ", targetTest[i], "Predicted: ", probX # log loss cal #probSum += int(targetTest[i])*log(probX)+(1-int(targetTest[i]))*log(1-probX) # if ( math.fabs(probX - int(targetTest[i])) > 0.5 ): # totalOffByHalf = totalOffByHalf + 1 # if ( int(targetTest[i]) == 1 ): # totalPositive = totalPositive + 1 # if ( int(targetTest[i]) == 1 and probX < 0.5): # totalPositiveOffByHalf = totalPositiveOffByHalf + 1 # if (probX > 0.5): # totalPositivePredictions = totalPositivePredictions + 1 # print # print "Stats:" # print "Total Off By > 0.5 ", totalOffByHalf # print "Total Positive ", totalPositive # print "Total Positive Off By Half ", totalPositiveOffByHalf # print "Total Positive Predictions ", totalPositivePredictions #print -probSum/len(prob) print "Score: ", probSum / weightSum avg += (probSum / weightSum) / NumFolds predicted_probs = clf.predict(testScaled) #predicted_list.append([x[1] for x in predicted_probs]) dataset_blend_test_set[:, foldCount] = predicted_probs #[0] foldCount = foldCount + 1 dataset_blend_test[:, ExecutionIndex] = dataset_blend_test_set.mean(1) #print "Saving NP" #np.savetxt('temp/dataset_blend_test_set.txt', dataset_blend_test_set) #np.savetxt('temp/dataset_blend_test_set.mean.txt', dataset_blend_test_set.mean(1) ) #np.savetxt('temp/dataset_blend_test.txt', dataset_blend_test) #print "Done Saving NP" now = datetime.datetime.now() #print dataset_blend_test_set.mean(1) csv_io.write_delimited_file_single( "../predictions/Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_test_set.mean(1)) csv_io.write_delimited_file_single( "../predictions/Target_Stack_" + now.strftime("%Y%m%d%H%M%S") + "_" + str(avg) + "_" + str(clf)[:12] + ".csv", dataset_blend_train[:, ExecutionIndex]) csv_io.write_delimited_file("../predictions/RunLog.csv", [ now.strftime("%Y %m %d %H %M %S"), str(avg), str(clf), 
str(NumFolds), model, "", "" ], filemode="a", delimiter=",") print "------------------------Average: ", avg #np.savetxt('temp/dataset_blend_train.txt', dataset_blend_train) return dataset_blend_train, dataset_blend_test
mmwrite(os.path.join(save_dir,save_stem+'_Omega.mtx'),Omega) # h5write(os.path.join(save_dir,save_stem+'_W0_ipsi.h5'), # np.zeros((Y_ipsi.shape[0],X.shape[0]))) # h5write(os.path.join(save_dir,save_stem+'_W0_contra.h5'), # np.zeros((Y_contra.shape[0],X.shape[0]))) if cross_val_matrices: from sklearn import cross_validation fid=open(cmdfile,'w') n_inj=X.shape[1] # Sets up nested outer/inner cross-validation. The inner loop is for # model selection (validation), the outer for testing. if cross_val=='LOO': outer_sets=cross_validation.LeaveOneOut(n_inj) else: outer_sets=cross_validation.KFold(n_inj, n_folds=cross_val, shuffle=True, random_state=shuffle_seed) for i,(train,test) in enumerate(outer_sets): X_train=X[:,train] X_test=X[:,test] Y_train_ipsi=Y_ipsi[:,train] Y_test_ipsi=Y_ipsi[:,test] Omega_train = Omega[:,train] Omega_test = Omega[:,test] Y_train_contra=Y_contra[:,train] Y_test_contra=Y_contra[:,test] # setup some directories outer_dir=os.path.join(save_dir,'cval%d'%i) try: os.mkdir(outer_dir) except OSError:
# LARS Regression import pandas from sklearn import cross_validation from sklearn.linear_model import Lars url = "https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data" names = [ 'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV' ] dataframe = pandas.read_csv(url, delim_whitespace=True, names=names) array = dataframe.values X = array[:, 0:13] Y = array[:, 13] num_folds = 10 num_instances = len(X) seed = 7 kfold = cross_validation.KFold(n=num_instances, n_folds=num_folds, random_state=seed) model = Lars() scoring = 'mean_squared_error' results = cross_validation.cross_val_score(model, X, Y, cv=kfold, scoring=scoring) print(results.mean())
def calculate(path = None, n_estimators = None, max_depth = None, max_features = None, vectorizer_max_features = None, neutral_sentiment = None):
    """Score a bag-of-words random forest with tenfold cross-validation.

    Reviews and ratings are extracted from ``path``, the reviews are turned
    into word-count features, and a random forest is trained and scored on
    each of ten folds.

    Args:
        path: location handed to ``extract_reviews_and_rating``. When ``None``
            every parameter is read from ``sys.argv[1:7]`` instead.
        n_estimators: number of trees in the forest.
        max_depth: maximum depth of each tree.
        max_features: ``max_features`` of the random forest.
        vectorizer_max_features: vocabulary size limit of the vectorizer.
        neutral_sentiment: flag forwarded to ``extract_reviews_and_rating``.
            On the command line the values "", "0", "false", "no" and "off"
            (case-insensitive) mean False; anything else means True.

    Returns:
        The accuracy averaged over the folds.
    """
    # Read console parameters if function is not called from python.
    if path is None:
        path = sys.argv[1]
        n_estimators = int(sys.argv[2])
        max_depth = int(sys.argv[3])
        max_features = int(sys.argv[4])
        vectorizer_max_features = int(sys.argv[5])
        # bool() of any non-empty string is True, so "False" or "0" used to
        # switch the flag on; interpret the text instead.
        flag_text = sys.argv[6].strip().lower()
        neutral_sentiment = flag_text not in ('', '0', 'false', 'no', 'off')

    # Get only text reviews and star ratings from entire data set.
    print('Extracting data...')
    reviews, ratings = extract_reviews_and_rating(path, neutral_sentiment)

    # Limit vocabulary size to vectorizer_max_features.
    vectorizer = CountVectorizer(
        analyzer = "word",
        tokenizer = None,
        preprocessor = None,
        stop_words = None,
        max_features = vectorizer_max_features
    )

    # Initialize a random forest classifier.
    forest = RandomForestClassifier(
        n_estimators = n_estimators,
        max_depth = max_depth,
        max_features = max_features
    )

    # Create bag of words features. Train and test rows come from the same
    # review list, so a single dense matrix serves both; transforming the
    # reviews a second time would only duplicate it in memory.
    train_data_features = vectorizer.fit_transform(reviews).toarray()
    test_data_features = train_data_features

    # Prepare train and test indices for tenfold cross validation.
    n_folds = 10
    kf = cross_validation.KFold(len(reviews), n_folds = n_folds)

    sum_accuracy = 0

    # Tenfold cross validation loop.
    for train, test in kf:
        # Make sure the fold indices are numpy arrays before indexing.
        train = np.asarray(train)
        test = np.asarray(test)

        # Train classifier.
        print('Training classifier...')
        forest = forest.fit(train_data_features[train], ratings[train])

        # Use trained classifier to predict sentiment of test data.
        print('Processing test data...')
        result = forest.predict(test_data_features[test])

        # Calculate prediction accuracy.
        accuracy = accuracy_score(ratings[test], result)
        print('Accuracy: ' + str(accuracy) + '\n')

        # Sum each score to calculate average.
        sum_accuracy += accuracy

    # Display average accuracy for tenfold cross validation.
    accuracy = sum_accuracy / n_folds
    print('Average accuracy: ' + str(accuracy) + '\n')

    return accuracy