def main():
    """Split the elections data, impute/transform it and write the CSV sets.

    Rows are tagged in a temporary ``split`` column (0 = train, 1 = test,
    2 = validation) using two of five shuffled folds, so each of test and
    validation holds one fifth of the rows.  The raw and the transformed
    versions of every subset are written under ``dataset/`` and the fitted
    label encoder is pickled to ``encoder.pickle``.
    """
    def write_splits(frame, prefix):
        # One CSV per subset, without the helper 'split' column.
        for split_id, subset in ((0, 'train'), (1, 'test'), (2, 'validation')):
            frame[frame['split'] == split_id].drop('split', axis=1).to_csv(
                'dataset/' + prefix + '_' + subset + '.csv', index=False)

    df = pd.read_csv('dataset/ElectionsData.csv')

    df['split'] = 0
    indices = KFold(n=len(df), n_folds=5, shuffle=True)._iter_test_indices()
    # next() works on Python 2 and 3; .loc avoids chained assignment, which
    # may silently write to a temporary copy.
    df.loc[next(indices), 'split'] = 1
    df.loc[next(indices), 'split'] = 2

    raw_data = df.copy()
    write_splits(raw_data, 'raw')

    (all_features, discrete_features, continuous_features,
     categorical_features, numeric_features) = split_features_by_type(df)
    features_to_keep = {'Yearly_ExpensesK', 'Yearly_IncomeK',
                        'Overall_happiness_score', 'Most_Important_Issue',
                        'Avg_Residancy_Altitude', 'Will_vote_only_large_party',
                        'Financial_agenda_matters'}

    df = mark_negative_values_as_nan(df)
    df = outlier_detection(df, continuous_features)

    # fill missing values by correlated features.
    fill_f1_by_f2_linear(df, 'Yearly_ExpensesK', 'Avg_monthly_expense_on_pets_or_plants')
    fill_f1_by_f2_linear(df, 'Yearly_IncomeK', 'Avg_size_per_room')
    # not perfectly correlated, but better than nothing
    fill_f1_by_f2_linear(df, 'Overall_happiness_score', 'Political_interest_Total_Score')
    fill_f1_by_f2_discrete(df, 'Most_Important_Issue', 'Last_school_grades')
    fill_f1_by_f2_linear(df, 'Avg_Residancy_Altitude', 'Avg_monthly_expense_when_under_age_21')
    fill_f1_by_f2_discrete(df, 'Will_vote_only_large_party', 'Looking_at_poles_results')
    fill_f1_by_f2_discrete(df, 'Financial_agenda_matters', 'Vote')

    # Whatever is still missing gets the mean of the rows sharing its Vote.
    for c in features_to_keep:
        rows_to_fix = df[c].isnull()
        for row, value in enumerate(rows_to_fix):
            if value:
                df.loc[row, c] = df[df.Vote == df.Vote[row]][c].mean()

    df = df[list(features_to_keep) + ['Vote', 'split']]
    reduce_Most_Important_Issue(df)
    z_score_scaling(df, list(features_to_keep.intersection(set(continuous_features))))
    l_encoder = label_encoder(df)
    df = categorical_features_transformation(df)

    # Pickle streams are binary; the context manager guarantees the file is
    # flushed and closed even if dumping fails.
    with open('encoder.pickle', 'wb') as encoder_file:
        pickle.dump(l_encoder, encoder_file)

    write_splits(df, 'transformed')
def cv_valid(data, cutoff, folds, make_syn):
    """Build the cross-validation index splits for labelled samples.

    Args:
        data: record container; ``data['time_seen']`` is read here and the
            whole object is passed to ``get_labels`` / ``valid_by_types``.
        cutoff: forwarded to ``get_labels``.
        folds: number of sandbox/enterprise folds to create.
        make_syn: when true, the test part of the extra (``folds + 1``-th)
            splunk fold is returned as ``syn_touples``.

    Returns:
        ``(cv_sandbox, cv_enterprise, syn_touples, cv_time)``.  The first two
        are lists of ``[train_idx, test_idx]`` pairs, ``syn_touples`` is an
        index array (empty unless ``make_syn``) and ``cv_time`` is a single
        ``[train_idx, test_idx]`` pair split at the median first-seen time.
    """
    log.info('Creating CV splits.')
    y = get_labels(data, cutoff)
    # Only samples labelled 0 or 1 take part in the splits.
    valid_idx = [i for i, y_val in enumerate(y) if y_val == 0 or y_val == 1]
    log.info('Data label distribution: total={0}, benign={1}, malware={2}, ambigious={3}, client_unlabeled={4}, no_vt={5}, unknown={6}.'.format(
        len(y), np.sum(y==0), np.sum(y==1), np.sum(y==-2), np.sum(y==-1),
        np.sum(y==-3), np.sum(np.isnan(y))))

    first_seen = data['time_seen']
    cuckoo_idx, splunk_idx = valid_by_types(data, valid_idx)
    log.info('Time split stats: min(days)={}, max(days)={}, std(days)={}.'.format(
        np.min(first_seen[cuckoo_idx])/86400.0,
        np.max(first_seen[cuckoo_idx])/86400.0,
        np.std(first_seen[cuckoo_idx])/86400.0))

    # Time split: cuckoo samples first seen before the median go to train.
    time_cut = np.median(first_seen[valid_idx])
    train = []
    test = []
    for i in cuckoo_idx:
        if first_seen[i] < time_cut:
            train.append(i)
        else:
            test.append(i)
    cv_time = [[np.array(train), np.array(test)]]

    cv_cuckoo = StratifiedKFold(y[cuckoo_idx], n_folds=folds, shuffle=True)
    # KFold needs at least as many samples as folds, and folds + 1 folds are
    # requested (the extra one feeds syn_touples), so strictly more than
    # `folds` splunk samples are required.
    if len(splunk_idx) > folds:
        cv_splunk = KFold(len(splunk_idx), n_folds=folds+1, shuffle=True)
    else:
        cv_splunk = [[[], []] for _ in range(folds + 1)]

    syn_touples = []
    cv_sandbox = []
    cv_enterprise = []
    # zip() stops after `folds` pairs, leaving the extra splunk fold unused.
    for count, (cuckoo, splunk) in enumerate(zip(cv_cuckoo, cv_splunk), 1):
        train = [cuckoo_idx[i] for i in cuckoo[0]]
        train.extend([splunk_idx[i] for i in splunk[0]])
        test_sandbox = [cuckoo_idx[i] for i in cuckoo[1]]
        test_enterprise = [splunk_idx[i] for i in splunk[1]]
        # add the malware from cuckoo box
        test_enterprise.extend([cuckoo_idx[i] for i in cuckoo[1] if y[cuckoo_idx[i]]==1])

        train = np.array(train)
        curr_sandbox = [train, np.array(test_sandbox)]
        curr_enterprise = [train, np.array(test_enterprise)]
        cv_sandbox.append(curr_sandbox)
        cv_enterprise.append(curr_enterprise)

        cuckoo_idx_c, splunk_idx_c = valid_by_types(data, curr_sandbox[0])
        cuckoo_idx_d, splunk_idx_d = valid_by_types(data, curr_sandbox[1])
        log.info('Created sandbox split %d: training size=%d (benign=%d,malware=%d,cuckoo=%d,splunk=%d), testing size=%d (benign=%d,malware=%d,cuckoo=%d,splunk=%d).' % (
            count, len(curr_sandbox[0]),
            np.sum(y[curr_sandbox[0]]==0), np.sum(y[curr_sandbox[0]]==1),
            len(cuckoo_idx_c), len(splunk_idx_c),
            len(curr_sandbox[1]),
            np.sum(y[curr_sandbox[1]]==0), np.sum(y[curr_sandbox[1]]==1),
            len(cuckoo_idx_d), len(splunk_idx_d)))
        if len(curr_enterprise[1]) > 0:
            log.info('Created enterprise split %d: training size=%d (benign=%d,malware=%d), testing size=%d (benign=%d,malware=%d).' % (
                count, len(curr_enterprise[0]),
                np.sum(y[curr_enterprise[0]]==0), np.sum(y[curr_enterprise[0]]==1),
                len(curr_enterprise[1]),
                np.sum(y[curr_enterprise[1]]==0), np.sum(y[curr_enterprise[1]]==1)))

    if make_syn:
        # The splunk fold with index `folds` is the one zip() never consumed.
        for fold_no, (_, held_out) in enumerate(cv_splunk):
            if fold_no == folds:
                syn_touples.extend(held_out)
    syn_touples = np.array(syn_touples)
    return cv_sandbox, cv_enterprise, syn_touples, cv_time
from sklearn import datasets
from sklearn.gaussian_process import GaussianProcess
from sklearn.cross_validation import cross_val_score, KFold

# Load the diabetes regression data set bundled with scikit-learn
diabetes = datasets.load_diabetes()
X, y = diabetes.data, diabetes.target

# Instantiate a GP model with one correlation parameter per feature (10)
gp = GaussianProcess(regr='constant', corr='absolute_exponential',
                     theta0=[1e-4] * 10, thetaL=[1e-12] * 10,
                     thetaU=[1e-2] * 10, nugget=1e-2, optimizer='Welch')

# Fit the GP model to the data performing maximum likelihood estimation
gp.fit(X, y)

# Deactivate maximum likelihood estimation for the cross-validation loop
gp.theta0 = gp.theta_  # Given correlation parameter = MLE
gp.thetaL, gp.thetaU = None, None  # None bounds deactivate MLE

# Perform a cross-validation estimate of the coefficient of determination
# using the cross_validation module. NOTE: n_jobs=1 runs the folds in a
# single process; raise n_jobs to use more CPUs.
K = 20  # folds
R2 = cross_val_score(gp, X, y=y, cv=KFold(y.size, K), n_jobs=1).mean()
print("The %d-Folds estimate of the coefficient of determination is R2 = %s"
      % (K, R2))
# Wrap each level-1 regressor in the project's SklearnHelper so they share a
# common constructor signature (estimator class, seed, parameter dict).
et = SklearnHelper(clf=ExtraTreesRegressor, seed=SEED, params=et_params)
ada = SklearnHelper(clf=AdaBoostRegressor, seed=SEED, params=ada_params)
gb_regressor = SklearnHelper(clf=GradientBoostingRegressor, seed=SEED, params=gb_regressor_params)
svm = SklearnHelper(clf=LinearSVR, seed=SEED, params=svm_params)
# -------------------------------------------------------------------------------------------------
# This is where our results differ from the reference author's, because we
# do not have the full 3M records with engineered features.
ntrain = x_train.shape[0]
print(ntrain)
ntest = x_test_201610.shape[0] # size of one test set
print(ntest)
# NOTE(review): random_state is passed without shuffle=True — presumably the
# folds are contiguous and unshuffled; confirm against the KFold docs.
kf = KFold(ntrain, n_folds= NFOLDS, random_state=SEED)
# -------------------------------------------------------------------------------------------------
# Linear support vector regressor (LinearSVR): out-of-fold train predictions
# plus one prediction set per test month.
svm_oof_train, svm_oof_test_201610, svm_oof_test_201611, svm_oof_test_201612, svm_oof_test_201710, svm_oof_test_201711, svm_oof_test_201712 = get_oof(svm,x_train, y_train, x_test_201610, x_test_201611, x_test_201612, x_test_201710, x_test_201711, x_test_201712)
print("SVM Training is complete")
# -------------------------------------------------------------------------------------------------
# Extra Trees
et_oof_train, et_oof_test_201610, et_oof_test_201611, et_oof_test_201612, et_oof_test_201710, et_oof_test_201711, et_oof_test_201712 = get_oof(et, x_train, y_train, x_test_201610, x_test_201611, x_test_201612, x_test_201710, x_test_201711, x_test_201712)
print("Extra Trees Regressor Training is complete")
# -------------------------------------------------------------------------------------------------
# Random Forest (`rf` is not defined in this part of the file — TODO confirm
# it is created elsewhere)
rf_oof_train, rf_oof_test_201610, rf_oof_test_201611, rf_oof_test_201612, rf_oof_test_201710, rf_oof_test_201711, rf_oof_test_201712 = get_oof(rf,x_train, y_train, x_test_201610, x_test_201611, x_test_201612, x_test_201710, x_test_201711, x_test_201712)
print("Random Forest Regressor Training is complete")
def _clamped_item_prediction(sim_row, user_ratings, rated):
    """Return the similarity-weighted rating clamped to [0, 5] (3.0 if no weight)."""
    pred = 3.0
    normaliser = np.sum(np.absolute(sim_row[rated]))
    if normaliser > 0:
        pred = np.dot(sim_row, user_ratings) / normaliser
    if pred < 0:
        pred = 0
    if pred > 5:
        pred = 5
    return pred


def crossValidation(data):
    """Pick the item-similarity measure with the lowest 10-fold RMSE.

    Args:
        data: indexable sequence of ``(user, item, rating)`` records with
            1-based user and item ids (they are used as ``id - 1`` indices
            into a ``users`` x ``items`` matrix).

    Returns:
        ``(Mat, sim_mat_item)``: the rating matrix built from all of *data*
        and the winning similarity matrix (cosine, Jaccard or Pearson, in
        that tie-break order).

    Side effects:
        Prints the per-fold and the average RMSE of the three measures and
        writes the averages plus the index of the winner to
        ``rmse_item.txt``.
    """
    k_fold = KFold(n=len(data), n_folds=10)

    Mat = np.zeros((users, items))
    for e in data:
        Mat[e[0] - 1][e[1] - 1] = e[2]

    sim_item_cosine, sim_item_jaccard, sim_item_pearson = similarity_item(Mat)
    # Fixed order: index 0 = cosine, 1 = Jaccard, 2 = Pearson.
    sim_matrices = (sim_item_cosine, sim_item_jaccard, sim_item_pearson)

    fold_rmse = [[] for _ in sim_matrices]
    for train_indices, test_indices in k_fold:
        # Rating matrix restricted to this fold's training records.
        M = np.zeros((users, items))
        for i in train_indices:
            e = data[i]
            M[e[0] - 1][e[1] - 1] = e[2]

        true_rate = []
        pred_rates = [[] for _ in sim_matrices]
        for i in test_indices:
            e = data[i]
            user = e[0]
            item = e[1]
            true_rate.append(e[2])

            # item-based: an item with no training rating keeps the 3.0 default
            item_seen = np.count_nonzero(M[:, item - 1])
            rated = M[user - 1] > 0
            for sim, preds in zip(sim_matrices, pred_rates):
                if item_seen:
                    preds.append(_clamped_item_prediction(
                        sim[item - 1], M[user - 1], rated))
                else:
                    preds.append(3.0)

        scores = [sqrt(mean_squared_error(true_rate, preds))
                  for preds in pred_rates]
        for per_measure, score in zip(fold_rmse, scores):
            per_measure.append(score)
        print("\t".join(str(score) for score in scores))

    rmse = [sum(per_measure) / float(len(per_measure))
            for per_measure in fold_rmse]
    print("\t".join(str(value) for value in rmse))

    req_sim = rmse.index(min(rmse))
    # The context manager closes the report even if a write fails.
    with open("rmse_item.txt", "w") as f_rmse:
        f_rmse.write("\t".join(str(value) for value in rmse) + "\n")
        print(req_sim)
        f_rmse.write(str(req_sim))

    return Mat, sim_matrices[req_sim]
test = admissions[admissions['fold'] == fold] lr.fit(train[['gpa']], train['actual_label']) test['predicted_label'] = lr.predict(test[['gpa']]) correct_predictions = test[(test['predicted_label']) == ( test['actual_label'])] fold_accuracies.append(len(correct_predictions) / len(test)) return (fold_accuracies) accuracies = train_and_test(admissions, fold_ids) print(accuracies) average_accuracy = np.mean(accuracies) print(average_accuracy) ## 5. Sklearn ## from sklearn.cross_validation import KFold from sklearn.cross_validation import cross_val_score admissions = pd.read_csv("admissions.csv") admissions["actual_label"] = admissions["admit"] admissions = admissions.drop("admit", axis=1) kf = KFold(len(admissions), 5, shuffle=True, random_state=8) lr = LogisticRegression() accuracies = cross_val_score(lr, admissions[['gpa']], admissions['actual_label'], scoring='accuracy', cv=kf) average_accuracy = sum(accuracies) / len(accuracies) print(accuracies, average_accuracy)
print loans.info()

# In[ ]:

# In[96]:

# Analyse the data with logistic regression, a classic binary classifier.
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_predict, KFold
lr = LogisticRegression()
cols = loans.columns
train_cols = cols.drop("loan_status")
# Every column except the target is used as a feature.
features = loans[train_cols]
target = loans["loan_status"]
kf = KFold(features.shape[0], random_state=1)
predictions = cross_val_predict(lr, features, target, cv=kf)
# NOTE(review): the new Series gets a fresh 0..n-1 index; the comparisons
# below assume `loans` has the same index — confirm.
predictions = pd.Series(predictions)

# In[98]:

# False positives: predicted 1, actual 0
fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
fp = len(predictions[fp_filter])
# True positives: predicted 1, actual 1
tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
tp = len(predictions[tp_filter])
# False negatives: predicted 0, actual 1
fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
fn = len(predictions[fn_filter])
#True negative
def stacking_classifier(folds, models):
    """Stack *models* with a logistic-regression blender and score the result.

    Each level-1 model is fitted once per fold on the module-level training
    data; its out-of-fold predictions form one column of the blender's
    training matrix, and the mean of its per-fold predictions on the
    validation data forms the matching column of the blender's input.

    Args:
        folds: number of cross-validation folds.
        models: level-1 models; the one at index 0 is handled as the neural
            network (one-hot targets, ``predict_classes``).

    Returns:
        ``(score, predicted_level2_bin)``: accuracy on
        ``target_validation_bin`` and the blender's predictions.
    """
    model_count = len(models)
    train_rows = regressors_train_pca.shape[0]
    validation_rows = regressors_validation_pca.shape[0]

    splits = list(KFold(len(target_train_bin), n_folds=folds, shuffle = True, random_state = 1991))

    # rows x models: out-of-fold predictions / averaged validation predictions
    level1_train = np.zeros((train_rows, model_count))
    level1_validation = np.zeros((validation_rows, model_count))

    for model_no, model in enumerate(models):
        print('Training Regression Model [{}] - {}'.format(model_no, model))
        is_ann = model_no == 0
        # rows x folds; averaged over the folds once the model is done
        per_fold_validation = np.zeros((validation_rows, len(splits)))

        for fold_no, (fit_rows, held_out_rows) in enumerate(splits):
            print('Fold [{}]'.format(fold_no))
            fit_X = regressors_train_pca[fit_rows]
            fit_y = target_train_bin[fit_rows]
            held_out_X = regressors_train_pca[held_out_rows]

            if is_ann:
                model.fit(fit_X, to_categorical(fit_y), validation_split=0.2,
                          epochs=5, batch_size=16, verbose=2)
                predict = model.predict_classes
            else:
                model.fit(fit_X, fit_y)
                predict = model.predict

            # Level-1 output: the data the blender is trained and run on.
            level1_train[held_out_rows, model_no] = predict(held_out_X).flatten()
            per_fold_validation[:, fold_no] = predict(regressors_validation_pca).flatten()

        level1_validation[:, model_no] = per_fold_validation.mean(1)

    # Level 2: blend the level-1 predictions.
    bclf = LogisticRegression()
    bclf.fit(level1_train, target_train_bin)

    predicted_level2_bin = bclf.predict(level1_validation)
    score = accuracy_score(target_validation_bin, predicted_level2_bin)
    return score, predicted_level2_bin
def loadDataSet(filename):
    """Load a tab-separated numeric file into feature and label matrices.

    Every line must hold float-parsable fields separated by tabs; the last
    field of a line is its label.

    Args:
        filename: path of the text file to read.

    Returns:
        ``(features, labels)`` as numpy matrices: all columns but the last,
        and the last column (shape ``(rows, 1)``).

    Raises:
        ValueError: if a field cannot be converted to float.
    """
    # The context manager closes the file; the nested comprehension builds
    # real lists, so it also works where map() returns a lazy iterator.
    with open(filename) as data_file:
        dataSet = [[float(field) for field in line.strip().split('\t')]
                   for line in data_file]
    # np.asmatrix is what the removed np.mat alias pointed to.
    dataMat = np.asmatrix(dataSet)
    n = np.shape(dataMat)[1]
    return dataMat[:, :n - 1], dataMat[:, -1]


if __name__ == "__main__":
    x, y = loadDataSet('../2-knn/datingTestSet2.txt')
    # Split the data set
    m = np.shape(x)[0]
    kf = KFold(m, n_folds=5, shuffle=True)  # the 1000 samples go into 5 folds
    clf = neighbors.KNeighborsClassifier(n_neighbors=3)
    for iteration, data in enumerate(kf, start=1):
        # data[0] holds the training indices, data[1] the test indices
        clf.fit(x[data[0]], np.ravel(y[data[0]]))
        answer = clf.predict(x[data[1]])
        # %-formatting prints the same text on Python 2 and 3
        print('iteration %d' % iteration)
        print(classification_report(y[data[1]], answer))

    # Train the KNN classifier on a single hold-out split instead:
    # x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)
    # clf = neighbors.KNeighborsClassifier(n_neighbors=3)
    # clf.fit(x_train,np.ravel(y_train))
    # answer = clf.predict(x_test)
    # print(classification_report(y_test,answer))
    # precision,recall,thresholds = precision_recall_curve(y_test,answer)  (binary classification only)
def run_cross_validation_create_models(nfolds=10):
    """Train or restore one VGG-16 model per driver fold and report log loss.

    The folds are built over the unique drivers, so a driver's images are
    never split between training and validation.  Weights are cached per
    fold under ``cache/``; a fold is trained only when its weight file is
    missing or restoring is switched off.

    Args:
        nfolds: number of driver folds.
    """
    # training configuration
    batch_size = 16
    nb_epoch = 25
    random_state = 51
    restore_from_last_checkpoint = 1

    (train_data, train_target, train_id,
     driver_id, unique_drivers) = read_and_normalize_train_data()

    yfull_train = dict()
    kf = KFold(len(unique_drivers), n_folds=nfolds, shuffle=True,
               random_state=random_state)
    sum_score = 0
    for num_fold, (train_drivers, test_drivers) in enumerate(kf, start=1):
        model = VGG_16()

        unique_list_train = [unique_drivers[i] for i in train_drivers]
        X_train, Y_train, train_index = copy_selected_drivers(
            train_data, train_target, driver_id, unique_list_train)
        unique_list_valid = [unique_drivers[i] for i in test_drivers]
        X_valid, Y_valid, test_index = copy_selected_drivers(
            train_data, train_target, driver_id, unique_list_valid)

        print('Start KFold number {} from {}'.format(num_fold, nfolds))
        print('Split train: ', len(X_train), len(Y_train))
        print('Split valid: ', len(X_valid), len(Y_valid))
        print('Train drivers: ', unique_list_train)
        print('Test drivers: ', unique_list_valid)

        kfold_weights_path = os.path.join(
            'cache', 'weights_kfold_vgg16_' + str(num_fold) + '.h5')
        must_train = (restore_from_last_checkpoint == 0
                      or not os.path.isfile(kfold_weights_path))
        if must_train:
            callbacks = [
                EarlyStoppingByLossVal(monitor='val_loss', value=0.00001,
                                       verbose=1),
                EarlyStopping(monitor='val_loss', patience=5, verbose=0),
                ModelCheckpoint(kfold_weights_path, monitor='val_loss',
                                save_best_only=True, verbose=0),
            ]
            model.fit(X_train, Y_train, batch_size=batch_size,
                      nb_epoch=nb_epoch, shuffle=True, verbose=1,
                      validation_data=(X_valid, Y_valid),
                      callbacks=callbacks)
        # Always continue from the weights stored on disk when they exist.
        if os.path.isfile(kfold_weights_path):
            model.load_weights(kfold_weights_path)

        predictions_valid = model.predict(X_valid.astype('float32'),
                                          batch_size=batch_size, verbose=1)
        score = log_loss(Y_valid, predictions_valid)
        print('Score log_loss: ', score)
        # Weight the fold score by its size for the overall average below.
        sum_score += score * len(test_index)

        # Store valid predictions under their original sample index
        for sample_index, prediction in zip(test_index, predictions_valid):
            yfull_train[sample_index] = prediction

    score = sum_score / len(train_data)
    print("Log_loss train independent avg: ", score)

    predictions_valid = get_validation_predictions(train_data, yfull_train)

    print('Final log_loss: {}, nfolds: {} epoch: {}'.format(
        score, nfolds, nb_epoch))
    info_string = 'loss_' + str(score) \
                  + '_folds_' + str(nfolds) \
                  + '_ep_' + str(nb_epoch)
    # `model` is the one created for the last fold.
    save_useful_data(predictions_valid, train_id, model, info_string)

    # Cross-check the weighted average against the loss over all predictions.
    score1 = log_loss(train_target, predictions_valid)
    if abs(score1 - score) > 0.0001:
        print('Check error: {} != {}'.format(score, score1))
feats = df_train.drop("revenue", axis=1)
X = feats.values  # features
y = df_train["revenue"].values  # target

# Drop the rows whose revenue exceeds 10,000,000.  NumPy arrays have no
# pop(), and removing items while indexing by position skips rows, so the
# outliers are filtered with a boolean mask that covers every row.
outlier_mask = y > 10000000
for _ in range(int(outlier_mask.sum())):
    print("sdfjsd")  # one marker line per removed row, as before
X = X[~outlier_mask]
y = y[~outlier_mask]

### Linear Regression ###
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

kf = KFold(len(y), n_folds=15, shuffle=True)
y_pred = np.zeros(len(y), dtype=y.dtype)  # where we'll accumulate predictions
lr = LinearRegression()

# CV Loop
for train_index, test_index in kf:
    # for each iteration of the for loop we'll do a test train split
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Fit the scaler on the training fold only and reuse it for the test
    # fold, so no test statistics leak into training.
    t = StandardScaler()
    X_train = t.fit_transform(X_train)
    lr.fit(X_train, y_train)  # Train on the training data
    X_test = t.transform(X_test)
9. others from shutil import copyfile copyfile(src,file) # make and write file script=open(os.path.join(output_file,'Dodge_'+str(idx)+'.txt'),'a') script.write('1'+'\n') script.close() 10. glob path = os.path.join('..','data','train',fld,'*jpg') files = glob.glob(path) 11. sklearn #K-Folds cross validation iterator from sklearn.cross_validation import KFold kf = KFold(len(X_train), n_folds=n_fold, shuffle=True, random_state=random_state) for train_idx, cv_idx in kf: 12. keras callbacks = [EarlyStopping(monitor='val_loss', patience=3, verbose=0)] model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch, shuffle=True, verbose=2, validation_data=(X_valid, Y_valid), callbacks=callbacks) model.add(Convolution2D(12,4,4, border_mode='same',trainable=False)) 13. pandas result1 = pd.DataFrame(predictions, columns=['ALB', 'BET', 'DOL', 'LAG', 'NoF', 'OTHER', 'SHARK', 'YFT']) result1.loc[:, 'image'] = pd.Series(test_id, index=result1.index)
#%%
# Lasso
model = linear_model.Lasso(alpha = 0.001)
model.fit(trainX, trainY)
prediction = model.predict(testX)
# NOTE: for regressors score() is the coefficient of determination (R^2);
# the printed label is kept unchanged.
print("Lasso Accuracy: ", model.score(testX,testY))

#%%
# Ridge
model = linear_model.Ridge(alpha = 0.05, normalize=True)
model.fit(trainX, trainY)
prediction = model.predict(testX)
print("Ridge Accuracy: ", model.score(testX,testY))

#%%
# In this KFold API `n` is the number of samples, not the number of folds:
# KFold(n=10) would only ever index the first 10 rows (in 3 folds).  Use all
# rows of `wine` and ask for 10 folds explicitly.
kfold = KFold(n=len(wine), n_folds=10, random_state=10)

#%%
# Mean 10-fold accuracy of each classifier on the first 11 columns.
cvMean = []
results = []
classifiers = ['Linear Svm','Radial Svm','Logistic Regression','Decision Tree','KNN']
models = [svm.SVC(kernel='linear'),svm.SVC(kernel='rbf'),LogisticRegression(),DecisionTreeClassifier(),KNeighborsClassifier(n_neighbors=3)]
for i in models:
    model = i
    result = cross_val_score(model, wine[wine.columns[:11]], wine['quality'],cv=kfold, scoring='accuracy')
    results.append(result)
    cvMean.append(result.mean())
new_models_df = pd.DataFrame(cvMean, index=classifiers)
new_models_df.columns = ['CV Mean']
new_models_df
def get_ten_fold_crossvalid_perfermance(self, fisher_mode, settings=None):
    # Run a 10-fold cross-validation over the sequences of self.ddi_obj,
    # scoring a linear SVM and a stacked denoising autoencoder on every fold,
    # and save all collected scores through saveAsCsv.
    #
    # fisher_mode is only recorded in each result row; settings is not used
    # in this method.
    #
    # NOTE(review): the block nesting below was reconstructed from
    # whitespace-collapsed source. In particular, the report is assumed to be
    # written once, after the fold loop — confirm against the original file.
    analysis_scr = []  # one tuple per (fold, method, train/test) evaluation
    predicted_score = False
    reduce_ratio = 1
    #for seq_no in range(1, self.ddi_obj.total_number_of_sequences+1):
    #subset_size = math.floor(self.ddi_obj.total_number_of_sequences / 10.0)
    kf = KFold(self.ddi_obj.total_number_of_sequences, n_folds=10)
    #for subset_no in range(1, 11):
    # Pair every (train, test) split with a 1-based fold number.
    for ((train_index, test_index), subset_no) in izip(kf, range(1, 11)):
        #for train_index, test_index in kf;
        print("Subset:", subset_no)
        print("Train index: ", train_index)
        print("Test index: ", test_index)
        #logger.info('subset number: ' + str(subset_no))
        if 1:
            print "SVM"
            #start_index = int((subset_no - 1) * subset_size + 1)
            #if subset_no == 10:
            #    end_index = int(max(start_index + subset_size, self.ddi_obj.total_number_of_sequences))
            #else:
            #    end_index = int(start_index + subset_size)
            #print start_index, end_index
            #(train_X_10fold, train_y_10fold),(train_X_reduced, train_y_reduced), (test_X, test_y) = self.ddi_obj.get_ten_fold_crossvalid_one_subset(start_index, end_index, reduce_ratio = reduce_ratio)
            (train_X_10fold, train_y_10fold), (train_X_reduced, train_y_reduced), (
                test_X, test_y) = self.ddi_obj.get_ten_fold_crossvalid_one_subset(
                    train_index, test_index, reduce_ratio=reduce_ratio)
            # The scaler is fitted on the training part only and then applied
            # to both parts.
            standard_scaler = preprocessing.StandardScaler().fit(
                train_X_reduced)
            scaled_train_X = standard_scaler.transform(train_X_reduced)
            scaled_test_X = standard_scaler.transform(test_X)
            Linear_SVC = LinearSVC(C=1, penalty="l2")
            Linear_SVC.fit(scaled_train_X, train_y_reduced)
            predicted_test_y = Linear_SVC.predict(scaled_test_X)
            isTest = True
            # SVM scores on the held-out fold
            analysis_scr.append(
                (self.ddi, subset_no, fisher_mode, 'SVM', isTest) + tuple(
                    performance_score(test_y, predicted_test_y).values()))
            predicted_train_y = Linear_SVC.predict(scaled_train_X)
            isTest = False
            # SVM scores on the data it was fitted on
            analysis_scr.append(
                (self.ddi, subset_no, fisher_mode, 'SVM', isTest) + tuple(
                    performance_score(train_y_reduced,
                                      predicted_train_y).values()))

        # direct deep learning
        min_max_scaler = Precessing_Scaler_0_9()
        # The result of fit() is overwritten by transform() on the next line.
        X_train_pre_validation_minmax = min_max_scaler.fit(train_X_reduced)
        X_train_pre_validation_minmax = min_max_scaler.transform(
            train_X_reduced)
        x_test_minmax = min_max_scaler.transform(test_X)
        pretraining_X_minmax = min_max_scaler.transform(train_X_10fold)
        # 60/40 split of the fold's training data into train and validation.
        x_train_minmax, x_validation_minmax, y_train_minmax, y_validation_minmax = train_test_split(
            X_train_pre_validation_minmax,
            train_y_reduced,
            test_size=0.4,
            random_state=42)
        finetune_lr = 1
        batch_size = 100
        pretraining_epochs = cal_epochs(5000,
                                        x_train_minmax,
                                        batch_size=batch_size)
        #pretrain_lr=0.001
        pretrain_lr = 0.001
        training_epochs = 1500
        hidden_layers_sizes = [100, 100]
        corruption_levels = [0.1, 0.1]
        if 1:
            print "direct deep learning"
            sda = trainSda(x_train_minmax, y_train_minmax,
                         x_validation_minmax, y_validation_minmax ,
                         x_test_minmax, test_y,
                         hidden_layers_sizes = hidden_layers_sizes, corruption_levels = corruption_levels, batch_size = batch_size , \
                         training_epochs = training_epochs, pretraining_epochs = pretraining_epochs,
                         pretrain_lr = pretrain_lr, finetune_lr=finetune_lr
             )
            print 'hidden_layers_sizes:', hidden_layers_sizes
            print 'corruption_levels:', corruption_levels
            training_predicted = sda.predict(x_train_minmax)
            y_train = y_train_minmax
            isTest = False
            # deep-learning scores on the training part
            analysis_scr.append((
                self.ddi, subset_no, fisher_mode, 'DL', isTest
            ) + tuple(
                performance_score(y_train, training_predicted).values()))
            test_predicted = sda.predict(x_test_minmax)
            y_test = test_y
            isTest = True
            # deep-learning scores on the held-out fold
            analysis_scr.append(
                (self.ddi, subset_no, fisher_mode, 'DL', isTest) +
                tuple(performance_score(y_test, test_predicted).values()))

        if 0:
            # Disabled experiment: deep learning using unlabeled data for
            # pretraining.
            print 'deep learning with unlabel data'
            pretraining_epochs = cal_epochs(5000,
                                            pretraining_X_minmax,
                                            batch_size=batch_size)
            sda_unlabel = trainSda(x_train_minmax, y_train_minmax,
                         x_validation_minmax, y_validation_minmax ,
                         x_test_minmax, test_y,
                         pretraining_X_minmax = pretraining_X_minmax,
                         hidden_layers_sizes = hidden_layers_sizes, corruption_levels = corruption_levels, batch_size = batch_size , \
                         training_epochs = training_epochs, pretraining_epochs = pretraining_epochs,
                         pretrain_lr = pretrain_lr, finetune_lr=finetune_lr
             )
            print 'hidden_layers_sizes:', hidden_layers_sizes
            print 'corruption_levels:', corruption_levels
            training_predicted = sda_unlabel.predict(x_train_minmax)
            y_train = y_train_minmax
            isTest = False
            analysis_scr.append(
                (self.ddi, subset_no, fisher_mode, 'DL_U', isTest) + tuple(
                    performance_score(y_train, training_predicted,
                                      predicted_score).values()))
            test_predicted = sda_unlabel.predict(x_test_minmax)
            y_test = test_y
            isTest = True
            analysis_scr.append(
                (self.ddi, subset_no, fisher_mode, 'DL_U', isTest) + tuple(
                    performance_score(y_test, test_predicted,
                                      predicted_score).values()))

        if 0:
            # Disabled experiment: deep learning using a split network. Each
            # half of the feature columns gets its own autoencoder stack and
            # the two new representations are concatenated again.
            print 'deep learning using split network'
            # get the new representation for A set. first 784-D
            pretraining_epochs = 5000
            hidden_layers_sizes = [100, 100, 100]
            corruption_levels = [0, 0, 0]
            x = x_train_minmax[:, :x_train_minmax.shape[1] / 2]
            print "original shape for A", x.shape
            a_MAE_A = train_a_MultipleAEs(
                x,
                pretraining_epochs=pretraining_epochs,
                pretrain_lr=pretrain_lr,
                batch_size=batch_size,
                hidden_layers_sizes=hidden_layers_sizes,
                corruption_levels=corruption_levels)
            new_x_train_minmax_A = a_MAE_A.transform(
                x_train_minmax[:, :x_train_minmax.shape[1] / 2])
            x = x_train_minmax[:, x_train_minmax.shape[1] / 2:]
            print "original shape for B", x.shape
            a_MAE_B = train_a_MultipleAEs(
                x,
                pretraining_epochs=pretraining_epochs,
                pretrain_lr=pretrain_lr,
                batch_size=batch_size,
                hidden_layers_sizes=hidden_layers_sizes,
                corruption_levels=corruption_levels)
            new_x_train_minmax_B = a_MAE_B.transform(
                x_train_minmax[:, x_train_minmax.shape[1] / 2:])
            new_x_test_minmax_A = a_MAE_A.transform(
                x_test_minmax[:, :x_test_minmax.shape[1] / 2])
            new_x_test_minmax_B = a_MAE_B.transform(
                x_test_minmax[:, x_test_minmax.shape[1] / 2:])
            new_x_validation_minmax_A = a_MAE_A.transform(
                x_validation_minmax[:, :x_validation_minmax.shape[1] / 2])
            new_x_validation_minmax_B = a_MAE_B.transform(
                x_validation_minmax[:, x_validation_minmax.shape[1] / 2:])
            new_x_train_minmax_whole = np.hstack(
                (new_x_train_minmax_A, new_x_train_minmax_B))
            new_x_test_minmax_whole = np.hstack(
                (new_x_test_minmax_A, new_x_test_minmax_B))
            new_x_validationt_minmax_whole = np.hstack(
                (new_x_validation_minmax_A, new_x_validation_minmax_B))
            finetune_lr = 1
            batch_size = 100
            pretraining_epochs = cal_epochs(5000,
                                            x_train_minmax,
                                            batch_size=batch_size)
            #pretrain_lr=0.001
            pretrain_lr = 0.001
            training_epochs = 1500
            hidden_layers_sizes = [100, 100, 100]
            corruption_levels = [0, 0, 0]
            sda_transformed = trainSda(new_x_train_minmax_whole, y_train_minmax,
                 new_x_validationt_minmax_whole, y_validation_minmax ,
                 new_x_test_minmax_whole, y_test,
                 hidden_layers_sizes = hidden_layers_sizes, corruption_levels = corruption_levels, batch_size = batch_size , \
                 training_epochs = training_epochs, pretraining_epochs = pretraining_epochs,
                 pretrain_lr = pretrain_lr, finetune_lr=finetune_lr
                 )
            print 'hidden_layers_sizes:', hidden_layers_sizes
            print 'corruption_levels:', corruption_levels
            training_predicted = sda_transformed.predict(
                new_x_train_minmax_whole)
            y_train = y_train_minmax
            isTest = False
            analysis_scr.append(
                (self.ddi, subset_no, fisher_mode, 'DL_S', isTest) + tuple(
                    performance_score(y_train, training_predicted,
                                      predicted_score).values()))
            test_predicted = sda_transformed.predict(
                new_x_test_minmax_whole)
            y_test = test_y
            isTest = True
            analysis_scr.append(
                (self.ddi, subset_no, fisher_mode, 'DL_S', isTest) + tuple(
                    performance_score(y_test, test_predicted,
                                      predicted_score).values()))

    # The report name encodes the hyper-parameters of the run. `filename` and
    # `current_date` are not defined in this method — presumably module-level
    # names; TODO confirm. y_test / test_predicted hold the values of the
    # last fold here.
    report_name = filename + '_' + '_test10fold_'.join(
        map(str, hidden_layers_sizes)
    ) + '_' + str(pretrain_lr) + '_' + str(finetune_lr) + '_' + str(
        reduce_ratio) + '_' + str(training_epochs) + '_' + current_date
    saveAsCsv(predicted_score, report_name,
              performance_score(y_test, test_predicted, predicted_score),
              analysis_scr)
lasagne.layers.set_all_param_values(net['prob'], d['param values']) for i, (tr_ix, val_ix) in enumerate(kf): print('CV Fold', i) X_tr = X[tr_ix] y_tr = y[tr_ix] X_val = X[val_ix] y_val = y[val_ix] #net['new_output'] = DenseLayer(net['pool5/7x7_s1'], num_units=10, nonlinearity=softmax, W=lasagne.init.Normal(0.01)) lasagne.layers.set_all_param_values(net['prob'], d['param values']) learning_rate.set_value(0.0002) for epoch in range(2): kf2 = KFold(len(y_tr), n_folds=np.floor(len(y_tr) / BATCH_SIZE), shuffle=True, random_state=1) progbar = Progbar(np.floor(len(y_tr) / BATCH_SIZE)) for j, (_, ix) in enumerate(kf2): loss, acc = train_batch(ix) progbar.add(1) learning_rate.set_value(learning_rate.get_value() * learning_rate_decay) v_ix = range(len(y_val)) t_ix = range(len(y_tr)) np.random.shuffle(v_ix) np.random.shuffle(t_ix) tr_loss_tot = 0.
data = featureFormat(my_dataset, features_list) ### split into labels and features (this line assumes that the first ### feature in the array is the label, which is why "poi" must always ### be first in features_list labels, features = targetFeatureSplit(data) ### machine learning goes here! ### please name your classifier clf for easy export below ### deploying feature selection features_train, features_test, labels_train, labels_test = cross_validation.train_test_split( features, labels, test_size=0.1, random_state=42) ### use KFold for split and validate algorithm kf = KFold(len(labels), 3) for train_indices, test_indices in kf: #make training and testing sets features_train = [features[ii] for ii in train_indices] features_test = [features[ii] for ii in test_indices] labels_train = [labels[ii] for ii in train_indices] labels_test = [labels[ii] for ii in test_indices] t0 = time() clf = DecisionTreeClassifier() clf.fit(features_train, labels_train) score = clf.score(features_test, labels_test) print 'accuracy before tuning ', score print "Decision tree algorithm time:", round(time() - t0, 3), "s"
# Zero the mask from index 30 onwards along its second axis, then wrap the
# result in a NIfTI image that reuses the affine of mask_img.
process_mask[:, 30:] = 0
process_mask_img = nibabel.Nifti1Image(process_mask, mask_img.get_affine())

### Searchlight computation ###################################################

# Make processing parallel
# /!\ As each thread will print its progress, n_jobs > 1 could mess up the
#     information output.
n_jobs = 1

### Define the cross-validation scheme used for validation.
# Here we use a KFold cross-validation on the session, which corresponds to
# splitting the samples in 4 folds and make 4 runs using each fold as a test
# set once and the others as learning sets
from sklearn.cross_validation import KFold
cv = KFold(y.size, n_folds=4)

import nilearn.decoding
# The radius is the one of the Searchlight sphere that will scan the volume
searchlight = nilearn.decoding.SearchLight(mask_img,
                                           process_mask_img=process_mask_img,
                                           radius=5.6, n_jobs=n_jobs,
                                           verbose=1, cv=cv)
searchlight.fit(fmri_img, y)

### F-scores computation ######################################################
from nilearn.input_data import NiftiMasker

# For decoding, standardizing is often very important
from sklearn.externals import joblib
import time
from sklearn.naive_bayes import MultinomialNB

# Load the training table and copy the four predictor columns into a dense
# float matrix, one column per attribute.
filename = '/Users/jzhy/Downloads/train.csv'
data = pd.read_csv(filename)
X = numpy.zeros((len(data.x), 4))
for column, attribute in enumerate(('x', 'y', 'accuracy', 'time')):
    X[:, column] = getattr(data, attribute)
Y = data.place_id

# NOTE(review): XX (the scaled copy of X) is computed but the model below is
# fed the unscaled X.
XX = preprocessing.scale(X)
YY = numpy.unique(Y)
class_column = YY.reshape((len(YY), -1))

# One fold per 10000 labels (plus one); each fold's *test* indices are used
# as one incremental-training batch.
kf = KFold(len(X), n_folds=len(Y) / 10000 + 1)
clf = MultinomialNB()
i = 0
for i, (train, test) in enumerate(kf, 1):
    clf.partial_fit(X[test, :], Y[test], class_column)
    print(i)

joblib.dump(clf, 'MultinomialNB.pkl')
exit()
# Rows whose 'index' value is at or past ix_1 form the second pre-ictal group.
ix_1 = 1990401  # Index of the first sample of the category
second_preictal_index = df[df['index'] >= ix_1].index.tolist()

# Keep pre-ictal (class 1, 'index' below ix_1) and inter-ictal (class 0) rows
# in separate frames, each ordered by 'index'.
preictal_df = df.loc[(df['class'] == 1) & (df['index'] < ix_1)].sort_values('index', axis=0)
interictal_df = df.loc[df['class'] == 0].sort_values('index', axis=0)

# Unshuffled folds: 25 over the pre-ictal rows, 24 over the inter-ictal rows.
preictal_folds = list(KFold(n=preictal_df.shape[0], n_folds=25, shuffle=False))
interictal_folds = list(KFold(n=interictal_df.shape[0], n_folds=24, shuffle=False))

# Collect the test-set positions.
test_i = []  # inter-ictal sample indices for the test set
test_p = []  # pre-ictal sample indices for the test set
for i in range(25):
    if (i + 1) % 5 == 0:
        # Every 5th pre-ictal fold; its positions are shifted by the number
        # of inter-ictal rows.
        tr_p, tt_p = preictal_folds[i]
        test_p.extend(list(tt_p + interictal_df.shape[0]))
    if (i + 1) % 6 == 0 and i < 24:
        # Every 6th inter-ictal fold; only 24 of those folds exist.
        tr_i, tt_i = interictal_folds[i]
        test_i.extend(list(tt_i))
def main():
    """Greedy forward feature selection with logistic regression, then tuning.

    Reads the path of an ``.npz`` archive holding arrays ``X`` and ``y`` from
    the last command-line argument and holds out 20% of the rows for testing.
    On each of 5 folds of the remaining rows it repeatedly adds the column
    that gives the highest fold AUC, stopping as soon as no column improves
    it (or the AUC reaches 1).  The union of the columns chosen over all
    folds is written next to the input file, scored on the held-out rows,
    and then used for a grid search over ``LogisticRegression`` settings.

    Exits with status -1 when no filename argument is given.
    """
    if len(sys.argv) == 1:
        print('need filename')
        sys.exit(-1)
    infilename = sys.argv[-1]
    print(infilename)

    npzfile = np.load(infilename)
    X = npzfile['X']
    y = npzfile['y']

    # split the data into 8:2 -> training:testing
    trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.2, random_state=0)
    print('feature size: ' + str(np.shape(X)))

    n_features = X.shape[1]
    feature_index = set()  # union of the columns selected on every fold
    kf = KFold(trainX.shape[0], n_folds=5)  # kfold on training set for feature selection
    for train_index, test_index in kf:
        trainX_train, trainX_test = trainX[train_index], trainX[test_index]
        # The fold indices address rows of trainX, so the labels have to come
        # from trainy (not the full y) to stay aligned with the features.
        trainy_train, trainy_test = trainy[train_index], trainy[test_index]

        auc_best_global = 0  # best auc reached on this fold
        xtrainBest = None    # selected columns, training part of the fold
        xtestBest = None     # selected columns, testing part of the fold
        residual_col_indices = set(range(n_features))  # columns not selected yet

        for i in range(n_features):
            colInd_best = -1
            auc_best_local = 0
            for colInd in residual_col_indices:
                # Candidate column as an (n_rows, 1) matrix.
                train_col = trainX_train[:, colInd].reshape(trainX_train.shape[0], -1)
                test_col = trainX_test[:, colInd].reshape(trainX_test.shape[0], -1)
                if i == 0:  # first feature: nothing to stack onto yet
                    xtrainCur, xtestCur = train_col, test_col
                else:
                    xtrainCur = np.hstack((xtrainBest, train_col))
                    xtestCur = np.hstack((xtestBest, test_col))

                clf = LogisticRegression()
                clf.fit(xtrainCur, trainy_train)
                auc = roc_auc_score(trainy_test, clf.predict(xtestCur))
                if auc_best_local < auc:
                    auc_best_local = auc
                    colInd_best = colInd

            print('auc = ' + str(auc_best_local) + '\tcolInd_best = ' + str(colInd_best))

            if not auc_best_global < auc_best_local:
                # No remaining column improves the fold auc: stop this fold.
                break

            best_train_col = trainX_train[:, colInd_best].reshape(trainX_train.shape[0], -1)
            best_test_col = trainX_test[:, colInd_best].reshape(trainX_test.shape[0], -1)
            if i == 0:
                xtrainBest, xtestBest = best_train_col, best_test_col
            else:
                xtrainBest = np.hstack((xtrainBest, best_train_col))
                xtestBest = np.hstack((xtestBest, best_test_col))

            print('feature index to add: ' + str(colInd_best))
            feature_index.add(colInd_best)
            residual_col_indices.remove(colInd_best)
            auc_best_global = auc_best_local
            if auc_best_global == 1:
                break

        print('auc_best_global found on current trainX_test fold: ' + str(auc_best_global))

    print('# features selected = ' + str(len(feature_index)))
    feature_index = list(feature_index)
    print('feature_index = ' + str(feature_index))

    # should NOT sort feature_index before test!
    outfilename = infilename[0:-8] + 'selected.npz'
    np.savez(outfilename, X=X[:, feature_index], y=y)

    # Baseline: default logistic regression on the selected columns.
    clf = LogisticRegression()
    clf.fit(trainX[:, feature_index], trainy)
    testy_pred = clf.predict(testX[:, feature_index])
    auc_test = roc_auc_score(testy, testy_pred)
    print('auc test = ' + str(auc_test))

    # ---------------------------------- tune params ----------------------------------
    # Set the parameters by cross-validation
    tuned_parameters = [
        {},
        {'penalty': ['l2'], 'C': np.logspace(-5, 4, 10), 'solver': ['sag'], 'max_iter': [500]},
        {'penalty': ['l2'], 'C': np.logspace(-5, 4, 10), 'solver': ['newton-cg'], 'max_iter': [500]},
        {'penalty': ['l2'], 'C': np.logspace(-5, 4, 10), 'solver': ['lbfgs'], 'max_iter': [500]},
        {'penalty': ['l2', 'l1'], 'C': np.logspace(-5, 4, 10), 'solver': ['liblinear'], 'max_iter': [500]},
    ]
    clf = GridSearchCV(LogisticRegression(class_weight='balanced'), tuned_parameters, cv=5, scoring=None)
    clf.fit(trainX[:, feature_index], trainy)
    print("Best parameters set found on development set:")
    print(clf.best_params_)

    y_true, y_pred = testy, clf.predict(testX[:, feature_index])
    # Score the tuned model's own predictions, not the baseline's.
    auc = roc_auc_score(y_true, y_pred)
    print('accuracy = ' + str(accuracy_score(y_true, y_pred)))
    print('auc = ' + str(auc))
# Build the model wrapper from the hyper-parameters in scope.
# NOTE(review): the enclosing scope is outside this view; the names used here
# are presumably parameters of a surrounding function -- confirm.
miz = aud_model.Functional_Model(input_neurons=input_neurons,
                                 dropout1=dropout1,
                                 cross_validation=cross_validation,
                                 act1=act1, act2=act2, act3=act3,
                                 nb_filter=nb_filter,
                                 filter_length=filter_length,
                                 num_classes=num_classes, model=model,
                                 dimx=dimx, dimy=dimy)
np.random.seed(68)  # fix NumPy's global random seed
if cross_validation:
    # Shuffled K-fold split over the training samples, seeded for repeatability.
    kf = KFold(len(tr_X), folds, shuffle=True, random_state=42)
    results = []
    for train_indices, test_indices in kf:
        # Gather this fold's samples by index ...
        train_x = [tr_X[ii] for ii in train_indices]
        train_y = [tr_y[ii] for ii in train_indices]
        test_x = [tr_X[ii] for ii in test_indices]
        test_y = [tr_y[ii] for ii in test_indices]
        #train_y = to_categorical(train_y,num_classes=len(labels))
        #test_y = to_categorical(test_y,num_classes=len(labels))
        # ... and convert the lists to arrays.
        train_x = np.array(train_x)
        train_y = np.array(train_y)
        test_x = np.array(test_x)
        test_y = np.array(test_y)
        print "Development Mode"
def main():
    """Train an MLP on one cross-validation split and write test predictions.

    Loads the pickled dataset ``ds.npy`` from the directory given with
    ``-d/--data``, z-normalises the training and test features together, and
    uses the last of 5 ``KFold`` splits of the training rows as the
    train/validation split.  The network is trained by minibatch gradient
    descent with patience-based early stopping on the validation error, and
    the second output column for every test row is written to a CSV file.
    """
    parser = argparse.ArgumentParser(
        description="Transform csv files into numpy array")
    parser.add_argument('-d', '--data', required=True,
                        help="The data directory")
    args = parser.parse_args()

    learning_rate = 0.0001
    L1_reg = 0.00
    L2_reg = 0.0001
    n_epochs = 1000
    batch_size = 32
    n_hidden = 1000

    # SECURITY: unpickling executes arbitrary code; only point --data at
    # trusted files.  Binary mode is what pickle requires, and the handle is
    # closed as soon as the load finishes.
    with open(os.path.join(args.data, 'ds.npy'), 'rb') as ds_file:
        ds = pickle.load(ds_file)
    trX = ds['trX'].toarray()
    trY = ds['trY'].astype(np.int32)
    teI = ds['teI']
    teX = ds['teX'].toarray()

    # Normalise train and test rows together, then split them apart again.
    n_train_rows = trX.shape[0]
    n_test_rows = teX.shape[0]
    allX = np.vstack((trX, teX))
    means, stds = calculate_mean_and_std(allX)
    normailize_by_zvalue(means, stds, allX)
    #normailize_by_minmax(allX);
    trX = allX[0:n_train_rows, :]
    teX = allX[n_train_rows:n_train_rows + n_test_rows, :]

    # Only the last of the 5 folds is used as the train/validation split.
    trainIds, testIds = list(KFold(trX.shape[0], n_folds=5))[-1]

    cv_train_X = theano.shared(trX[trainIds, :], 'cv_train_X')
    cv_test_X = theano.shared(trX[testIds, :], 'cv_test_X')
    cv_train_Y = theano.shared(trY[trainIds], 'cv_train_Y')
    cv_test_Y = theano.shared(trY[testIds], 'cv_test_Y')

    ncvtr = len(trainIds)
    ncvte = len(testIds)
    nte = teX.shape[0]
    n_train_batches = int(np.ceil(ncvtr * 1.0 / batch_size))
    n_valid_batches = int(np.ceil(ncvte * 1.0 / batch_size))
    n_test_batches = int(np.ceil(nte * 1.0 / batch_size))
    # Wrapped only after its row count has been read from the plain array.
    teX = theano.shared(teX)

    rng = np.random.RandomState(1234)

    print('... building the model')
    # Symbolic [left, right) row bounds of the minibatch being processed.
    left = T.lscalar()
    right = T.lscalar()
    x = T.matrix('x')
    y = T.ivector('y')

    # NOTE(review): n_in is fixed at 149; presumably the feature count of
    # trX -- confirm before reusing this on another dataset.
    classifier = MLP(rng=rng, input=x, n_in=149, n_hidden=n_hidden, n_out=2)
    cost = (classifier.negative_log_likelihood(y)
            + L1_reg * classifier.L1
            + L2_reg * classifier.L2_sqr)

    validate_model = theano.function(
        inputs=[left, right],
        outputs=classifier.errors(y),
        givens={
            x: cv_test_X[left:right],
            y: cv_test_Y[left:right]
        })

    # Plain gradient-descent update for every parameter.
    gparams = [T.grad(cost, param) for param in classifier.params]
    updates = [(param, param - learning_rate * gparam)
               for param, gparam in zip(classifier.params, gparams)]

    train_model = theano.function(
        inputs=[left, right],
        outputs=cost,
        updates=updates,
        givens={
            x: cv_train_X[left:right],
            y: cv_train_Y[left:right]
        })

    test_model = theano.function(
        inputs=[left, right],
        outputs=classifier.output,
        givens={
            x: teX[left:right],
        })

    print('... training')
    # early-stopping parameters
    patience = 10000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    # go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = np.inf
    best_iter = 0
    # Never updated below (no test error is computed in this function), so
    # the final report always shows 0 for it.
    test_score = 0.

    epoch = 0
    done_looping = False
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):
            train_model(minibatch_index * batch_size,
                        min((minibatch_index + 1) * batch_size, ncvtr))

            iteration = (epoch - 1) * n_train_batches + minibatch_index
            if (iteration + 1) % validation_frequency == 0:
                validation_losses = [
                    validate_model(i * batch_size,
                                   min((i + 1) * batch_size, ncvte))
                    for i in range(n_valid_batches)
                ]
                this_validation_loss = np.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                if this_validation_loss < best_validation_loss:
                    # Extend patience only for a significant improvement.
                    if (this_validation_loss <
                            best_validation_loss * improvement_threshold):
                        patience = max(patience,
                                       iteration * patience_increase)
                    best_validation_loss = this_validation_loss
                    best_iter = iteration

            if patience <= iteration:
                done_looping = True
                break

    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))

    # Network outputs for every test row, computed batch by batch.
    probs = np.vstack([
        test_model(i * batch_size, min((i + 1) * batch_size, nte))
        for i in range(n_test_batches)
    ])

    with open('/local/db/uqdxingz/Santander/sub/mlp.csv', 'w') as fid:
        fid.write('ID,TARGET\n')
        for i, test_id in enumerate(teI):
            fid.write('%d,%.9f\n' % (int(test_id), probs[i][1]))
# NOTE(review): the statements down to ax.scatter(...) advance `count`, so
# they look like the body of a loop whose header lies outside this view --
# confirm the nesting against the original file.
xs = xyz[0][count]
ys = xyz[1][count]
zs = xyz[2][count]
count = count+1
ax.scatter(xs, ys, zs, c=plt.cm.coolwarm(zs), alpha=.4)
plt.show()

# training data: first two columns are the inputs, third column the target
xy = mat['dados_rbf'][:,:2]
z = mat['dados_rbf'][:,2].reshape(-1,1)
ntd = xy.shape[0] # number of data points
#nf = x.shape[1] # number of features

# meta-parameters
n = 8 # number of neurons
kf = KFold(ntd, n_folds=3)
i = 1

# cross-validation
for train_index, test_index in kf:
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = xy[train_index], xy[test_index]
    y_train, y_test = z[train_index], z[test_index]
    # rbf regression
    rbf = RBF(2, n, 1)
    rbf.train(X_train, y_train)
    zest = rbf.test(X_test)
    MSE = mean_squared_error(y_test, zest)
# Turn the string ids into integers: the first character is dropped and the
# rest parsed as an int, negated when the id contains the letter 'p'.
train_feat['id'] = train_feat['id'].apply(lambda x: 0 - int(x[1:]) if 'p' in x else int(x[1:]))
test_feat['id'] = test_feat['id'].apply(lambda x: 0 - int(x[1:]) if 'p' in x else int(x[1:]))

# Every column except the label and the listed exclusions is a predictor.
predictors = train_feat.columns.drop(
    ['label', 'enddate', 'hy_16.0', 'hy_91.0', 'hy_94.0'])

# The message printed below reads "starting 5-fold CV training...".
print('开始CV 5折训练...')
scores = []
t0 = time.time()
mean_score = []
train_preds = np.zeros(len(train_feat))
test_preds = np.zeros(len(test_feat))
# Shuffled 5-fold split with a fixed seed.
kf = KFold(len(train_feat), n_folds=5, shuffle=True, random_state=520)
for i, (train_index, test_index) in enumerate(kf):
    # LightGBM datasets for the training and held-out part of this fold.
    lgb_train = lgb.Dataset(train_feat[predictors].iloc[train_index],
                            train_feat['label'].iloc[train_index])
    lgb_test = lgb.Dataset(train_feat[predictors].iloc[test_index],
                           train_feat['label'].iloc[test_index])
    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'max_depth': 20,
        'num_leaves': 150,
        'learning_rate': 0.01,
        'subsample': 0.7,
max_auc = 0.62 # best auc so far best_parameter = {'nodes': 200, 'weight decay': 1e-5} average_aucs = np.array([]) # array to store all average aucs for different parameters sd_aucs = np.array([]) # to store all standard deviation of aucs for i in range(len(grid)): print('---'*30) print('remaining iterations:', len(grid) - i) print('best auc so far is:', max_auc) print('best set of parameters are', best_parameter) next_parameters = {'nodes': grid[i][0], 'weight decay': grid[i][1],'regulization': grid[i][2]} print('Now try: ', next_parameters) nb_folds = 5 kfolds = KFold(len(y), nb_folds) #av_roc = 0. auc = np.array([]) # array to store all aucs in each fold f = 0 for train, valid in kfolds: print('---'*20) print('Fold', f+1) # counting folds f += 1 # splitting the folds X_train = X[train] X_valid = X[valid] Y_train = Y[train]
def make_mf_sliced_classification(subset_tr, subset_te, clf, n_round=3, target_col='median_relevance'):
    """Build per-query meta-features from repeated 2-fold predictions of ``clf``.

    For every distinct ``query_stem`` in ``subset_tr`` a bag-of-words model
    is fitted on the ``title_ext`` texts of that query from both subsets.
    Only terms that occur in both subsets are kept, and the ``feat_list``
    columns are appended.  ``clf`` is then fitted ``n_round`` times on
    shuffled 2-fold splits of the query's training rows; out-of-fold
    predictions fill the training meta-feature and the hold-out predictions
    are averaged into the test meta-feature.

    Args:
        subset_tr: training frame; ``query_stem``, ``title_ext``,
            ``target_col`` and the ``feat_list`` columns are read.
        subset_te: hold-out frame; ``query_stem``, ``title_ext`` and the
            ``feat_list`` columns are read.
        clf: estimator exposing ``fit`` and ``predict``; refitted in place.
        n_round: number of repeated 2-fold splits per query.
        target_col: name of the label column in ``subset_tr``.

    Returns:
        Tuple ``(mf_tr, mf_te)`` of 1-D arrays, one value per row of
        ``subset_tr`` and ``subset_te`` respectively.
    """
    print '\n [make_mf_slice]'
    print clf
    mf_tr = np.zeros(len(subset_tr))
    mf_te = np.zeros(len(subset_te))
    #query-slice
    for cur_query in subset_tr.query_stem.value_counts().index:
        mask_tr = subset_tr.query_stem == cur_query
        mask_te = subset_te.query_stem == cur_query
        # build Bow: vocabulary of uni- and bi-grams over both subsets
        vect = CountVectorizer(min_df=1, ngram_range=(1, 2))
        txts = (list((subset_tr[mask_tr]['title_ext']).values) + list(
            (subset_te[mask_te]['title_ext']).values))
        vect.fit(txts)
        X_loc_base = vect.transform(
            list((subset_tr[mask_tr]['title_ext']).values)).todense()
        X_loc_hold = vect.transform(
            list((subset_te[mask_te]['title_ext']).values)).todense()
        y_loc_train = subset_tr[mask_tr][target_col].values
        # intersect terms: the product of the column sums is positive only
        # for terms present in both the base and the hold-out matrix
        feat_counts = np.array(np.sum(X_loc_base, axis=0))[0] * np.array(
            np.sum(X_loc_hold, axis=0))[0]
        feat_mask = np.where(feat_counts > 0)[0]
        # build final feats matrix
        # NOTE(review): feat_list is not defined in this function; presumably
        # a module-level list of column names -- confirm.
        X_loc_base = np.hstack(
            (X_loc_base[:, feat_mask], subset_tr[mask_tr][feat_list]))
        X_loc_hold = np.hstack(
            (X_loc_hold[:, feat_mask], subset_te[mask_te][feat_list]))
        # metafeatures iterators (accumulators for this query)
        tmp_tr = np.zeros(sum(mask_tr))
        tmp_te = np.zeros(sum(mask_te))
        #print y_loc_train.shape, X_loc_base.shape
        for i in range(n_round):
            # a different seed per round gives a different shuffle
            kf = KFold(len(y_loc_train), n_folds=2, shuffle=True,
                       random_state=42 + i * 1000)
            for ind_tr, ind_te in kf:
                X_tr = X_loc_base[ind_tr]
                X_te = X_loc_base[ind_te]
                y_tr = y_loc_train[ind_tr]
                y_te = y_loc_train[ind_te]
                clf.fit(X_tr, y_tr)
                tmp_tr[ind_te] += clf.predict(X_te)
                # 0.5 == 1 / n_folds: averages the two fold models
                tmp_te += clf.predict(X_loc_hold) * 0.5
        mf_tr[mask_tr.values] = tmp_tr / n_round
        mf_te[mask_te.values] = tmp_te / n_round
    # NOTE(review): indentation was lost in this view; the report below is
    # assumed to run once after all queries. `i` is whatever value the last
    # round loop left behind.
    y_valid = subset_tr[target_col].values
    kappa = pykappa.quadratic_weighted_kappa(y_valid, np.round(mf_tr))
    acc = np.mean(y_valid == np.round(mf_tr))
    print '[{}] kappa:{}, acc:{}'.format(i, kappa, acc)
    return (mf_tr, mf_te)
def cross_validate(X, y, pca_reduce=True):
    """Run shuffled 10-fold logistic regression and report/plot the metrics.

    When ``pca_reduce`` equals ``True`` the features first go through
    ``dimensionality_reduction``.  The mean accuracy, precision and recall
    over the folds and the summed confusion matrix are printed, then a ROC
    curve and a precision-recall curve are shown.

    Both curves are drawn from the labels and hard predictions of the last
    fold only, because those are the values left in scope after the loop.
    """
    if pca_reduce == True:
        X = pd.DataFrame(dimensionality_reduction(X, y))

    folds = KFold(len(X), n_folds=10, shuffle=True)
    feature_matrix = X.as_matrix()
    target_vector = y.as_matrix()

    accuracies, conf, precisions, recalls = [], [], [], []
    for train, test in folds:
        X_train, y_train = feature_matrix[train], target_vector[train]
        X_test, y_test = feature_matrix[test], target_vector[test]

        clf = LogisticRegression()
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)

        conf.append(confusion_matrix(y_test, predictions))
        recalls.append(recall_score(y_test, predictions))
        precisions.append(precision_score(y_test, predictions))
        accuracies.append(clf.score(X_test, y_test))

    print('Accuracy: ' + str(np.average(accuracies)))
    print('Precision: ' + str(np.average(precisions)))
    print('Recall: ' + str(np.average(recalls)))

    # Sum of the per-fold 2x2 confusion matrices.
    confusion = np.zeros((2, 2))
    for fold_confusion in conf:
        confusion += fold_confusion
    print(confusion)

    # ROC curve and its area.
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test[:], predictions[:])
    roc_area = auc(false_pos_rate, true_pos_rate)

    plt.figure()
    lw = 2
    plt.plot(false_pos_rate, true_pos_rate, color='darkorange', lw=lw,
             label='ROC curve (area = %0.2f)' % roc_area)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc="lower right")
    plt.show()

    # Precision-recall curve and average precision.
    pr_precision, pr_recall, _ = precision_recall_curve(
        y_test[:], predictions[:])
    mean_precision = average_precision_score(y_test[:], predictions[:])

    plt.clf()
    plt.plot(pr_recall, pr_precision, lw=lw, color='navy',
             label='Precision-Recall curve')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('Precision-Recall example: AUC={0:0.2f}'.format(mean_precision))
    plt.legend(loc="lower left")
    plt.show()
# Remaining command-line options; all but --minbin are boolean switches.
parser.add_argument('-b','--minbin', help='Minimum categorical bin size', type=int, default=1)
parser.add_argument('-cv','--cv', action='store_true')
parser.add_argument('-codetest','--codetest', action='store_true')
parser.add_argument('-getcached', '--getcached', action='store_true')
parser.add_argument('-extra', '--extra', action='store_true')
# Parsed options as a plain dict keyed by option name.
m_params = vars(parser.parse_args())

# Load data
X, y, X_sub, ids = data.load(m_params)

print("BNP Parabas: classification...\n")
# NOTE(review): a regressor is scored with log_loss below; presumably its
# predictions are treated as class probabilities -- confirm.
clf = ExtraTreesRegressor(n_estimators=700, max_features=60, min_samples_split= 4, max_depth=40, n_jobs=-1, min_samples_leaf=2)

if m_params['cv']:
    # do cross validation scoring
    kf = KFold(X.shape[0], n_folds=4, shuffle=True, random_state=1)
    scr = np.zeros([len(kf)])        # log loss per fold
    oob_pred = np.zeros(X.shape[0])  # out-of-fold prediction for every row
    for i, (tr_ix, val_ix) in enumerate(kf):
        clf.fit(X[tr_ix], y[tr_ix])
        pred = clf.predict(X[val_ix])
        oob_pred[val_ix] = np.array(pred)
        scr[i] = log_loss(y[val_ix], np.array(pred))
        # The value printed here is measured on the held-out fold.
        print('Train score is:', scr[i])
    # Log loss over all out-of-fold predictions.
    print(log_loss(y, oob_pred))
    print oob_pred[1:10]
    # Pickle the out-of-fold predictions; the mean fold score is part of
    # the file name.
    oob_filename = '../output/oob_pred_extrees_' + str(np.mean(scr)) + '.p'
    pkl.dump(oob_pred, open(oob_filename, 'wb'))
else:
x_valid, "lr", 19)
    return xgb_train, xgb_test, cv_scores


def nbsvm(x_train, y_train, x_valid):
    """Run ``stacking`` with ``lightgbm``, the tag "nbsvm" and the constant 19.

    The three values produced by ``stacking`` are returned unchanged.
    """
    xgb_train, xgb_test, cv_scores = stacking(lightgbm, x_train, y_train, x_valid, "nbsvm", 19)
    return xgb_train, xgb_test, cv_scores


# NOTE(review): this import rebinds the module-level name `lightgbm`, which
# nbsvm() passes to stacking() when called below. If an earlier definition
# in the file uses the same name it is shadowed from here on -- confirm.
import lightgbm
from sklearn.cross_validation import KFold

folds = 5
seed = 2018
# NOTE(review): kf is not passed to anything in this view; presumably it is
# read as a global by stacking() -- verify.
kf = KFold(train_x.shape[0], n_folds=folds, shuffle=True, random_state=seed)
lgb_train, lgb_test, m = nbsvm(train_x, train_y, test_x)

# Macro F1 of the arg-max class over the training predictions, truncated to
# 7 characters for use in the output file name.
score = f1_score(train_y, np.argmax(lgb_train, axis=1), labels=range(0, 19), average='macro')
score = str(score)[:7]
print(score)

# Save the predicted-probability file
train_prob = pd.DataFrame(lgb_train)
# NOTE(review): the column count is taken from lgb_test, not lgb_train;
# presumably both have the same number of columns -- confirm.
train_prob.columns = [
    "class_prob_%s" % i for i in range(1, lgb_test.shape[1] + 1)
]
train_prob["id"] = list(train_id["id"])
train_prob.to_csv('../sub_prob/train_prob_nblr_cv_%s.csv' % score, index=None)
# -*- coding: utf-8 -*- """ Created on Tue Sep 6 10:10:40 2016 """ import numpy as np from sklearn.cross_validation import KFold y = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2]) print("kfold") kf = KFold(9, n_folds=3) print(len(kf)) for train, test in kf: print(train, test) print() for train, test in kf: print(y[train], y[test]) print() from sklearn.cross_validation import StratifiedKFold print("StratifiedKFold") kf = StratifiedKFold(y, n_folds=3) print(len(kf)) for train, test in kf: print(train, test) print() for train, test in kf:
# 2. Compute TF-IDF features for all texts. Note that this task asks for
# TF-IDF over the whole data set. With this approach the training features
# use information from the test sample, but that is legitimate here because
# no target values from the test set are used. In practice the features of
# the test objects are often known at training time and may be used when
# fitting the algorithm.
vectorizer = TfidfVectorizer()
# Keep the matrix: it is reused by the grid search and by the final fit, so
# the corpus is vectorised only once.
X_tfidf = vectorizer.fit_transform(X)

# 3. Find the smallest best parameter C in [10^-5, 10^-4, ... 10^4, 10^5]
# for an SVM with a linear kernel (kernel='linear') using 5-fold
# cross-validation. Use random_state=241 for both SVM and KFold, and
# accuracy as the quality measure.
grid = {'C': np.power(10.0, np.arange(-5, 6))}
cv = KFold(y.size, n_folds=5, shuffle=True, random_state=241)
model = SVC(kernel='linear', random_state=241)
gs = grid_search.GridSearchCV(model, grid, scoring='accuracy', cv=cv)
gs.fit(X_tfidf, y)

# The strict '>' keeps the first entry that reaches the best mean score;
# presumably grid_scores_ follows the ascending order of the grid, which
# makes that the smallest such C.
score = 0
C = 0
for attempt in gs.grid_scores_:
    if attempt.mean_validation_score > score:
        score = attempt.mean_validation_score
        C = attempt.parameters['C']

# 4. Train the SVM on the whole sample with the optimal C found above.
model = SVC(kernel='linear', random_state=241, C=C)
model.fit(X_tfidf, y)
# Centred heading for the metrics report of the current classifier.
metricas = ("Metricas del modelo " + nombreClasificador).capitalize()
imprimirTextoCentrado(metricas, tamanoConsola)
# General metrics on the training and test sets.
mostrarMetricasGenerales(model, x_train, y_train, y_pred_train, "train")
mostrarMetricasGenerales(model, x_test, y_test, y_pred_test, "test")
imprimirTextoCentrado("", tamanoConsola, "*")
imprimirTextoCentrado("Metricas importantes para clasificación",
                      tamanoConsola, "*")
# Classification metrics; each helper's return value is kept as that
# set's confusion matrix.
confusion_matrix_train = mostrarMetricasClasificacion(
    model, x_train, y_train, y_pred_train, "train")
imprimirTextoCentrado("", tamanoConsola, "#")
confusion_matrix_test = mostrarMetricasClasificacion(
    model, x_test, y_test, y_pred_test, "test")

# Create a k-fold cross-validation iterator.
# Note: by default the score used is the one returned by the estimator's
#       own score method (accuracy).
cv = KFold(n=len(y_train), n_folds=5, shuffle=True, random_state=0)
scores = cross_val_score(model, x_train, y_train, cv=cv)
print("Scores: ", (scores))
# Mean of the fold scores with its standard error.
print("Mean score: {0:.3f} (+/-{1:.3f})".format(np.mean(scores), sem(scores)))
print(
    "*******************************************************************")
###########################################################################
if generarCompar:
    # Plotting
    # This section plots the results obtained: the emotion classification
    # against the independent variables, which in this case are the pixels
    # of the face images.
    mostrarGraficacionPrediVsReal(x_train, y_train, y_pred_train, "train",
                                  nombreClasificador)