def modelfit(alg, dtrain, predictors, dtest=None, dscore=None, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain['target'].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics=['logloss'],
                          early_stopping_rounds=early_stopping_rounds, show_progress=False)
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain['target'], eval_metric='logloss')

    # Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]

    if isinstance(dtest, pd.DataFrame):
        dtest_predprob = alg.predict_proba(dtest[predictors])[:, 1]
    if isinstance(dscore, pd.DataFrame):
        dscore_predprob = alg.predict_proba(dscore[predictors])[:, 1]
        np.savetxt('XGBoost_pred_raw.csv', dscore_predprob, delimiter=",")

    # Print model report:
    print "\nModel Report"
    print "Accuracy : %.4g" % metrics.accuracy_score(dtrain['target'].values, dtrain_predictions)
    print "Metric Score (Train): %f" % metrics.log_loss(dtrain['target'], dtrain_predprob)
    if isinstance(dtest, pd.DataFrame):
        print "Metric Score (Test): %f" % metrics.log_loss(dtest['target'], dtest_predprob)

    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    plt.show()
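# --- Hedged usage sketch (added; not from the original source). Drives modelfit()
# above on synthetic data. It assumes the older xgboost API the snippet targets
# (Python 2 prints, alg.booster(), show_progress in xgb.cv) and the snippet's
# implied module-level imports, recreated here.
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn import metrics
from xgboost.sklearn import XGBClassifier

rng = np.random.RandomState(0)
train = pd.DataFrame(rng.randn(500, 5), columns=['f%d' % i for i in range(5)])
train['target'] = (train['f0'] + rng.randn(500) > 0).astype(int)
predictors = ['f%d' % i for i in range(5)]

xgb1 = XGBClassifier(learning_rate=0.1, n_estimators=200, max_depth=3,
                     objective='binary:logistic', seed=27)
modelfit(xgb1, train, predictors)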
def Test():
    """Testing ConstrainedMultinomialClassifier

    Compare the results with scikit-learn LogisticRegression v.15

    Returns
    -------
    Log Loss for Logistic Regression, ConstrainedMultinomialClassifier
    Accuracy for Logistic Regression, ConstrainedMultinomialClassifier
    """
    n = 1000; p = 10; k = 3
    X = np.random.randn(n, p)
    beta = np.random.binomial(1, .5, (p, k))
    log_odd = X.dot(beta)
    prob = np.exp(log_odd) / (1 + np.exp(log_odd))
    y = np.array([np.argmax(i) for i in prob])
    lb = LabelBinarizer()
    Y = lb.fit_transform(y)
    w = randn(k, p)
    cut = n / 2
    train = np.arange(cut); valid = np.arange(cut, n)  # Split Train and Test
    b = [(0, None)] * (p + 1) * k  # Constraint on Beta
    cl1 = LogisticRegression()
    cl2 = ConstrainedMultinomialClassifier(bounds=b)
    cl1.fit(X[train], y[train])
    cl2.fit(X[train], y[train])
    prob1 = cl1.predict_proba(X[valid])
    prob2 = cl2.predict_proba(X[valid])
    print log_loss(y[valid], prob1)
    print log_loss(y[valid], prob2)
    yhat1 = cl1.predict(X[valid])
    yhat2 = cl2.predict(X[valid])
    print accuracy_score(y[valid], yhat1)
    print accuracy_score(y[valid], yhat2)
def generic_cv_reg(X, y, model, n_folds, random_state):
    kf = cross_validation.KFold(y.shape[0], n_folds=n_folds, shuffle=True, random_state=random_state)
    trscores, cvscores, times = [], [], []
    i = 0
    stack_train = np.zeros((len(y)))  # stacked out-of-fold predictions
    threshold = 0.000001
    for i, (train_fold, validate) in enumerate(kf):
        i = i + 1
        t = time()
        trscore = log_loss(y.iloc[train_fold],
                           model.fit(X.iloc[train_fold], y.iloc[train_fold]).predict(X.iloc[train_fold]))
        validation_prediction = model.predict(X.iloc[validate])
        # clip predictions away from 0/1 so log_loss stays finite
        validation_prediction[validation_prediction > 1 - threshold] = 1 - threshold
        validation_prediction[validation_prediction < threshold] = threshold
        cvscore = log_loss(y.iloc[validate], validation_prediction)
        trscores.append(trscore); cvscores.append(cvscore); times.append(time() - t)
        stack_train[validate] = validation_prediction
    print("TRAIN %.5f | TEST %.5f | TIME %.2fm (1-fold)" % (np.mean(trscores), np.mean(cvscores), np.mean(times) / 60))
    print(model.get_params(deep=True))
    print("\n")
    return np.mean(cvscores), stack_train
def check_lambda(dirnm, datanm_train, datanm_valid, datanm_orig_train, datanm_orig_valid, samples_per_class, Cs, num_classes):
    spct = 10 * 70
    tdata, tlabels = load_full(dirnm + datanm_train, spct)
    print tdata.shape, tlabels.shape

    spct = 10
    otdata, otlabels = load_full(dirnm + datanm_orig_train, spct)

    spct = 10 * 30
    vdata, vlabels = load_full(dirnm + datanm_valid, spct)

    spct = 10
    ovdata, ovlabels = load_full(dirnm + datanm_orig_valid, spct)

    # artificial
    ans = np.zeros((len(Cs), 4))
    for i, C in enumerate(Cs):
        clf = LogisticRegression(C=C, penalty='l2', multi_class='ovr', tol=0.001,
                                 n_jobs=-1, verbose=0, solver='newton-cg')
        clf.fit(tdata, tlabels)

        out_train = clf.predict_proba(tdata)
        out_valid = clf.predict_proba(vdata)
        out_train_real = clf.predict_proba(otdata)
        out_valid_real = clf.predict_proba(ovdata)

        ans[i, 0] += log_loss(tlabels, out_train)
        ans[i, 1] += log_loss(vlabels, out_valid)
        ans[i, 2] += log_loss(otlabels, out_train_real)
        ans[i, 3] += log_loss(ovlabels, out_valid_real)

    np.savez("logreg_lambda", ans=ans, Cs=Cs, num_classes=num_classes, samples_per_class=samples_per_class)
    return ans
def simple_model(data, test):
    targets = data.target
    X, tX, y, ty = train_test_split(data.drop("target", axis=1), targets, test_size=0.2, random_state=2016)
    predictions = []
    print("\n\nTraining")

    # Sklearn GBM
    clf = GradientBoostingClassifier(n_estimators=2500, learning_rate=0.026, max_depth=2, random_state=2015)
    cal = CalibratedClassifierCV(clf, cv=5, method="isotonic")
    cal.fit(X, y)
    pred = cal.predict_proba(tX)[:, 1]
    print("\n\tValidation for Calibrated GBM")
    print("\t", log_loss(ty, pred))
    print("\t", roc_auc_score(ty, pred))
    # ens["gbm"] = pred
    predictions.append(cal.predict_proba(test)[:, 1])

    # XGBoost
    data = X.values
    label = y.values
    dtrain = xgb.DMatrix(data, label=label)
    datat = tX.values
    dtest = xgb.DMatrix(datat)

    param = {}
    param['objective'] = 'binary:logistic'
    param['eta'] = 0.1
    param['max_depth'] = 8
    param['eval_metric'] = 'auc'
    param['silent'] = 1
    param['min_child_weight'] = 2
    param['subsample'] = 0.5
    param['colsample_bytree'] = 0.5
    param['nthread'] = 4
    num_round = 50

    bst = xgb.train(param, dtrain, num_round)
    pred = bst.predict(dtest)
    print("\n\tValidation for XGBoost")
    print("\t", log_loss(ty, pred))
    print("\t", roc_auc_score(ty, pred))
    # ens["xgb"] = pred
    # use the XGBoost model here; the original appended the calibrated GBM's test predictions twice
    predictions.append(bst.predict(xgb.DMatrix(test.values)))

    predictions = sum(predictions) / len(predictions)
    return predictions
def generic_cv_np(X, y, model, n_folds, random_state):
    kf = cross_validation.KFold(y.shape[0], n_folds=n_folds, shuffle=True, random_state=random_state)
    trscores, cvscores, times = [], [], []
    i = 0
    stack_train = np.zeros((len(y)))  # stacked out-of-fold predictions
    for i, (train_fold, validate) in enumerate(kf):
        i = i + 1
        t = time()
        model.fit(X[train_fold,], y[train_fold])
        trscore = log_loss(y[train_fold], model.predict_proba(X[train_fold,]))
        # keep only the positive-class column so it fits the 1-D stack_train array
        validation_prediction = model.predict_proba(X[validate,])[:, 1]
        cvscore = log_loss(y[validate], validation_prediction)
        trscores.append(trscore); cvscores.append(cvscore); times.append(time() - t)
        stack_train[validate] = validation_prediction
    print("TRAIN %.5f | TEST %.5f | TIME %.2fm (1-fold)" % (np.mean(trscores), np.mean(cvscores), np.mean(times) / 60))
    print(model.get_params())
    print("\n")
    return np.mean(cvscores), stack_train
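# --- Hedged usage sketch (added; not from the original source). Exercises
# generic_cv_np() above on synthetic binary data; assumes the pre-0.18 sklearn
# (sklearn.cross_validation) that the snippet targets, plus its implied globals.
import numpy as np
from time import time
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

X = np.random.randn(200, 4)
y = (X[:, 0] + 0.5 * X[:, 1] > 0).astype(int)
cv_mean, oof_predictions = generic_cv_np(X, y, LogisticRegression(), n_folds=5, random_state=0)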
def svm_grid_search():
    # get data
    training_input, training_target, validation_input, validation_target = prepare_input()

    # set up scorer for grid search. log-loss is error, not score, so set greater_is_better to false,
    # and log-loss requires a probability
    log_loss_scorer = make_scorer(log_loss, greater_is_better=False, needs_proba=True)

    training_input = training_input[:100000]
    training_target = training_target[:100000]

    print training_input.shape[0]
    print training_target.shape[0]

    start = time.time()
    svm = SVC(random_state=31, probability=True)
    svm_parameters = {'C': [.001, .01, .1, 1, 10, 100], 'kernel': ["rbf", "sigmoid"]}
    svm_grid_obj = GridSearchCV(svm, svm_parameters, log_loss_scorer, verbose=2, n_jobs=-1)
    svm_grid_obj = svm_grid_obj.fit(training_input, training_target)
    svm = svm_grid_obj.best_estimator_
    print "Best params: " + str(svm_grid_obj.best_params_)

    svm_train_error = log_loss(training_target, svm.predict_proba(training_input))
    svm_validation_error = log_loss(validation_target, svm.predict_proba(validation_input))
    print "Best SVM training error: {:02.4f}".format(svm_train_error)
    print "Best SVM validation error: {:02.4f}".format(svm_validation_error)
    end = time.time()
    print "SVM grid search took {:02.4f} seconds".format(end - start)
    return svm
def check_vb(dirnm, datanm_train, datanm_valid, C, num_classes):
    spct = 10 * 70
    tdata, tlabels = load_full(dirnm + datanm_train, spct)
    #print tdata.shape, tlabels.shape

    spct = 10 * 30
    vdata, vlabels = load_full(dirnm + datanm_valid, spct)

    h = np.arange(0, 310, 10)
    h[0] += 1

    # artificial
    ans = np.zeros((h.size, 2))
    tind = kget(tlabels, num_classes, h[-1])
    vind = kget(vlabels, num_classes, h[-1])
    for l in xrange(0, h.size):
        clf = LogisticRegression(C=C, penalty='l2', multi_class='ovr', tol=0.001,
                                 n_jobs=-1, verbose=0, solver='newton-cg')
        clf.fit(tdata[tind[:h[l] * num_classes]], tlabels[tind[:h[l] * num_classes]])

        out_train = clf.predict_proba(tdata[tind[:h[l] * num_classes]])
        out_valid = clf.predict_proba(vdata[vind[:h[l] * num_classes]])

        ans[l, 0] += log_loss(tlabels[tind[:h[l] * num_classes]], out_train)
        ans[l, 1] += log_loss(vlabels[vind[:h[l] * num_classes]], out_valid)

    np.savez("logreg_bv", ans=ans, C=C, num_classes=num_classes)
    return ans
def xgb_common(train2, y, test2, v, z, N_seeds, N_splits, cname, xgb_params):
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        cname2 = cname + str(s)
        v[cname2], z[cname2] = 0, 0
        xgb_params['seed'] = s + 4242
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            dtrain = xgb.DMatrix(train2.ix[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.ix[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            v.loc[ival, cname2] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname2] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: ' % (xgb_params['seed'], n + 1, skf.n_splits), score, now())
            scores.append(score)
        z[cname2] /= N_splits

    vloss = [metrics.log_loss(y, prestore(v[cname + str(i)])) for i in range(N_seeds)]
    print('validation loss: ', vloss, np.mean(vloss), np.std(vloss))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
def xgb2(train2, y, test2, v, z):
    cname = sys._getframe().f_code.co_name
    v[cname], z[cname] = 0, 0
    N_splits = 5
    scores = []
    skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True)
    xgb_params = dict(
        max_depth=3,
        learning_rate=0.01,
        subsample=0.7,
        #colsample_bytree = 0.8,
        objective='binary:logistic',
        eval_metric='logloss',
        seed=1,
        silent=1
    )
    dtest = xgb.DMatrix(test2)
    for n, (itrain, ival) in enumerate(skf.split(train2, y)):
        print('step %d of %d' % (n + 1, skf.n_splits), now())
        dtrain = xgb.DMatrix(train2.ix[itrain], y[itrain])
        dvalid = xgb.DMatrix(train2.ix[ival], y[ival])
        watch = [(dtrain, 'train'), (dvalid, 'valid')]
        clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=1000)

        p = clf.predict(dvalid)
        v.loc[ival, cname] += pconvert(p)
        score = metrics.log_loss(y[ival], p)
        z[cname] += pconvert(clf.predict(dtest))
        print(cname, 'seed %d step %d: ' % (xgb_params['seed'], n + 1), score, now())
        scores.append(score)

    print('validation loss: ', metrics.log_loss(y, v[cname]))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
    z[cname] /= N_splits
def rf_fit():
    train_inp, valid_inp, train_target, valid_target = prepare_input()

    rf = RandomForestClassifier(random_state=31, n_jobs=-1, verbose=1,
                                n_estimators=100, min_samples_split=5)
    start = time.time()
    rf.fit(train_inp, train_target)
    end = time.time()
    print "fitting took {:0.4} seconds".format(end - start)

    training_output = rf.predict_proba(train_inp)
    validation_output = rf.predict_proba(valid_inp)

    training_error = log_loss(train_target, training_output)
    validation_error = log_loss(valid_target, validation_output)

    print "Train error: {:02.4f}".format(training_error)
    print "Validation error: {:02.4f}".format(validation_error)

    joblib.dump(rf, rf_filename)
    return rf
def cross_valid(X, y, params, iterations, n_folds=6, silent=True):
    print 'Running cross validation'
    pprint.pprint(params)
    print 'Iterations:', iterations
    print 'X shape', X.shape

    y_size = len(y)
    if hasattr(X, 'values'):
        X = X.values
    y = np.array(y)
    kf = cross_validation.KFold(y_size, n_folds=n_folds, shuffle=True, random_state=params['seed'])
    y_pred = np.zeros((y_size, 9))
    logs = []
    for train, test in kf:
        X_train, X_test = X[train, :], X[test, :]
        y_train, y_test = y[train], y[test]
        predictions = predict(X_train, y_train, X_test, params, iterations, None if silent else y_test)
        y_pred[test] = predictions
        logs.append(metrics.log_loss(y_test, predictions))
        print 'Current log_loss:', logs[-1]

    print 'Final log_loss: %s (avg: %s, stddev: %s)' % (
        metrics.log_loss(y, y_pred), np.mean(logs), np.std(logs))
def xgboostcv(max_depth, eta, num_rounds, gamma, min_child_weight, max_delta_step,
              subsample, colsample_bytree, silent=True, seed=1234):
    print('\nRunning XGBOOST on the cluster')
    # Call xgboost in distributed mode (CLI input for params)
    xgb_run = ['max_depth=%s' % int(max_depth),
               'eta=%s' % eta,
               'silent=%s' % silent,
               'gamma=%s' % gamma,
               'min_child_weight=%s' % int(min_child_weight),
               'max_delta_step=%s' % max_delta_step,
               'subsample=%s' % subsample,
               'eval_metric=logloss',
               'colsample_bytree=%s' % colsample_bytree,
               'seed=%s' % seed,
               'objective=binary:logistic',
               'eval[eval_set]=%s' % deval,
               'eval[train_set]=%s' % dtrain,
               'num_round=%s' % int(num_rounds),
               'data=%s' % dtrain,
               'model_out=%s' % model_output]
    argv = ['wormhole/repo/dmlc-core/tracker/dmlc_yarn.py',  # Where your instance is found!!
            '-n', '16',
            'wormhole/bin/xgboost.dmlc',  # Where your instance is found!!
            './examples/xgboost-avazu.txt'] + xgb_run
    print(' '.join(argv))

    # Cluster specific ENV VARS.
    Popen(argv, env={'JAVA_HOME': '/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.45-28.b13.el6_6.x86_64/',
                     'HADOOP_HOME': '/usr/',
                     'HADOOP_HDFS_HOME': '/usr/lib/hadoop-hdfs/',
                     'PATH': os.getenv('PATH')}).communicate()

    # Export model to local filesystem
    try:
        os.remove("avazu.model")
    except OSError:
        pass
    Popen(["hadoop", "fs", "-copyToLocal", "/tmp/avazu.model", "."]).communicate()

    # Delete stored model.
    Popen(["hadoop", "fs", "-rm", "/tmp/avazu.model"]).communicate()

    # Load Model file
    bst = xgb.Booster(model_file='avazu.model')
    y_pred = bst.predict(dtest)
    y_valid = dtest.get_label()
    print('logloss = ', log_loss(y_valid, y_pred))

    # We are maximizing the function, so return the negated loss.
    return -log_loss(y_valid, y_pred)
def plot_score(test_predictions, y_test, train_predictions, y_train, color, learning_rate):
    test_loss = [log_loss(y_test, pred) for pred in test_predictions]
    train_loss = [log_loss(y_train, pred) for pred in train_predictions]

    plt.plot(test_loss, color, linewidth=2)
    plt.plot(train_loss, color + '--', linewidth=2)
    losses[learning_rate] = test_loss
def svm_model(train_data_features, train_data_cross_validation_classwise_features, test_data_features, labels,
              labels_cross_validation_classwise, using_cross_validation2, kf, settings):
    if using_cross_validation2:
        C_base = 4.5
        C_step = 0.5  # 0.005
        C = C_base
        _results = []
        if(len(train_data_cross_validation_classwise_features) > 0):
            """train_all = np.append(train_data_features, train_data_cross_validation_classwise_features, axis=0)
            labels_all = np.append(labels, labels_cross_validation_classwise)
            kf_all = KFold(len(train_all)-1, n_folds=int(settings['Data']['CrossValidation2']), shuffle=True)
            for train, test in kf_all:
                svc = SVC(kernel="linear", C=C, probability=True)
                model = svc.fit(train_all[train], labels_all[train])
                predicted_classes = model.predict(train_all[test])
                predicted_classes_train = model.predict(train_all[train])
                class_probabilities = model.predict_proba(train_all[test])
                print("C: ",C," n points:", len(predicted_classes), " percentage: ",(labels_all[test] != predicted_classes).sum()*100/len(predicted_classes),"% percentage_train: ", (labels_all[train] != predicted_classes_train).sum()*100/len(predicted_classes_train),"%")
                _results.append((labels_all[test] != predicted_classes).sum())
                C += C_step"""
            for c in pl.frange(C_base, 9, C_step):
                svc = SVC(kernel="linear", C=c, probability=True)
                model = svc.fit(train_data_features, labels)
                predicted_classes = model.predict(train_data_cross_validation_classwise_features)
                class_probabilities = model.predict_proba(train_data_cross_validation_classwise_features)
                print("C: ", c, " N points:", len(predicted_classes), " percentage: ",
                      (labels_cross_validation_classwise != predicted_classes).sum() * 100 / len(predicted_classes), "%")
                print("Log_loss: ", log_loss(labels_cross_validation_classwise, class_probabilities))
            for c in pl.frange(1, 3, 1):
                svc = SVC(kernel="linear", C=c, probability=True)
                model = svc.fit(train_data_features, labels)
                predicted_classes = model.predict(train_data_cross_validation_classwise_features)
                class_probabilities = model.predict_proba(train_data_cross_validation_classwise_features)
                print("C: ", c, " N points:", len(predicted_classes), " percentage: ",
                      (labels_cross_validation_classwise != predicted_classes).sum() * 100 / len(predicted_classes), "%")
                print("Log_loss: ", log_loss(labels_cross_validation_classwise, class_probabilities))
        else:
            for train, test in kf:
                svc = SVC(kernel="linear", C=C, probability=True)
                model = svc.fit(train_data_features[train], labels[train])
                predicted_classes = model.predict(train_data_features[test])
                predicted_classes_train = model.predict(train_data_features[train])
                class_probabilities = model.predict_proba(train_data_features[test])
                print("C: ", C, " n points:", len(predicted_classes), " percentage: ",
                      (labels[test] != predicted_classes).sum() * 100 / len(predicted_classes), "% percentage_train: ",
                      (labels[train] != predicted_classes_train).sum() * 100 / len(predicted_classes_train), "%")
                _results.append((labels[test] != predicted_classes).sum())
                C += C_step
            C = C_base + C_step * _results.index(min(_results))
        print("C: ", C)
        if(len(train_data_cross_validation_classwise_features) > 0):
            svc = SVC(kernel="linear", C=C, probability=True)
            model = svc.fit(train_data_features, labels)
            predicted_classes = model.predict(train_data_cross_validation_classwise_features)
            class_probabilities = model.predict_proba(train_data_cross_validation_classwise_features)
            print("C: ", C, " N points:", len(predicted_classes), " percentage: ",
                  (labels_cross_validation_classwise != predicted_classes).sum() * 100 / len(predicted_classes), "%")
            print("Log_loss: ", log_loss(labels_cross_validation_classwise, class_probabilities))
        svc = SVC(kernel="linear", C=C, probability=True)
        model = svc.fit(train_data_features, labels)
        return model.predict_proba(test_data_features), model.predict(test_data_features), model
    else:
        svc = SVC(kernel="linear", C=8, probability=True)
        model = svc.fit(train_data_features, labels)
        return model.predict_proba(test_data_features), model.predict(test_data_features), model
def go_by_category_2(category):
    input, targets, scaler = TrainingFactory.get_training_data_by_category(category, 10000)
    input_train, input_test, target_train, target_test = train_test_split(input, targets, test_size=0.1)

    test_data_sparse = TestingFactory.get_test_data(limit=1000)
    test_data_scaled = scaler.transform(test_data_sparse)
    test_data = csr_matrix(test_data_scaled)

    classif = SVC(kernel='rbf', C=0.1, tol=0.001, probability=True)
    classif.fit(input_train, target_train)

    output_targets_proba = classif.predict_proba(input_test)
    outputs_predicted_proba = [item[1] for item in output_targets_proba]
    output_targets = classif.predict(input_test)
    # print output_targets.tolist()
    # print outputs_predicted_proba
    # print target_test

    # score the predicted probabilities, not the hard class labels
    print log_loss(target_test, outputs_predicted_proba)
    accuracy = accuracy_score(target_test, output_targets)
    print accuracy
    print confusion_matrix(target_test, output_targets)

    testing_output = classif.predict_proba(test_data)
    testing_output_proba = [item[1] for item in testing_output]
    print testing_output_proba

    return accuracy, output_targets, testing_output_proba
def train_model_with_feature(config_name, clf_name, fill_na_opt, PCA_n_comp, clf, X, X_test, y):
    if PCA_n_comp != -1:
        pca = PCA(PCA_n_comp)  # PCA dimension reduction
        logger.info('PCA fit on count matrix')
        # rescale num to (0,1)
        X_all = pca.fit_transform(minmax_scale(np.vstack([X, X_test])))
        X, X_test = X_all[:X.shape[0], :], X_all[X.shape[0]:, :]
        logger.info('PCA fit done')

    logger.info('start training')
    print 'training size', X.shape, 'test size', X_test.shape
    X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.9)
    if clf_name == 'xgb':
        clf.fit(X_train, y_train, eval_metric='mlogloss')
    else:
        clf.fit(X_train, y_train)
    logger.info(clf_name + '-' + fill_na_opt + '-pca(' + str(PCA_n_comp) + ') train log-loss='
                + str(log_loss(y_train, clf.predict_proba(X_train))))
    logger.info(clf_name + '-' + fill_na_opt + '-pca(' + str(PCA_n_comp) + ') validate log-loss='
                + str(log_loss(y_val, clf.predict_proba(X_val))))

    clf.fit(X, y)
    y_pred = clf.predict_proba(X_test)
    df_test[group_list] = y_pred
    logger.info('finish training')

    # , 'phone_brand_en', 'device_model_en'
    df_test.to_csv('output/' + config_name + '-' + clf_name + '-' + fill_na_opt + '-pca'
                   + str(PCA_n_comp) + '-' + str(datetime.datetime.now().strftime('%Y-%m-%d-%H-%M'))
                   + '.csv', columns=['device_id'] + group_list, index=False)
    logger.info('finish outputing result')
def ctr_gbdt(model='sklearn-clicklog', from_cache=False, train_dataset_length=100000, test_dataset_length=100000):
    TRAIN_FILE, TEST_FILE = create_dataset(model, from_cache, train_dataset_length, test_dataset_length)

    prediction_model = GradientBoostingClassifier(
        loss='deviance',
        learning_rate=0.1,
        n_estimators=30,
        subsample=1.0,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_depth=5,
    )

    x_train, y_train = clean_data(TRAIN_FILE)
    x_test, y_test = clean_data(TEST_FILE)

    with Timer('fit model'):
        prediction_model.fit(x_train, y_train)

    with Timer('evaluate model'):
        y_prediction_train = prediction_model.predict_proba(x_train)
        y_prediction_test = prediction_model.predict_proba(x_test)

        loss_train = log_loss(y_train, y_prediction_train)
        loss_test = log_loss(y_test, y_prediction_test)

    print 'loss_train: %s' % loss_train
    print 'loss_test: %s' % loss_test
def main(job_id, params):
    print job_id, params
    params = get_params(params)
    print job_id, params

    crimes = np.load(DATA_FILE)
    model = RandomForestClassifier(n_estimators=params['n_estimators'],
                                   criterion=params['criterion'],
                                   max_depth=None if params['max_depth'] < 1 else params['max_depth'],
                                   min_samples_split=params['min_samples_split'],
                                   min_samples_leaf=params['min_samples_leaf'],
                                   max_features=params['max_features'],
                                   min_weight_fraction_leaf=0.0,
                                   max_leaf_nodes=None,
                                   bootstrap=True,
                                   oob_score=False,
                                   n_jobs=4,
                                   random_state=42,
                                   verbose=0,
                                   warm_start=False,
                                   class_weight=None)
    model.fit(crimes['features_train'], crimes['labels_train'])

    loss_train = log_loss(crimes['labels_train'], model.predict_proba(crimes['features_train']))
    loss_val = log_loss(crimes['labels_val'], model.predict_proba(crimes['features_val']))
    loss_all = log_loss(crimes['labels'], model.predict_proba(crimes['features']))

    print 'loss_all: ', loss_all
    print 'loss_train: ', loss_train
    print 'loss_val: ', loss_val

    return loss_val
def xgb_base(train2, y, test2, v, z, xgb_params, N_splits, N_seeds, cname, base_seed=42):
    v[cname], z[cname] = 0, 0
    scores = []
    dtest = xgb.DMatrix(test2)
    for s in range(N_seeds):
        xgb_params['seed'] = s + base_seed
        skf = model_selection.StratifiedKFold(n_splits=N_splits, shuffle=True, random_state=s + base_seed)
        for n, (itrain, ival) in enumerate(skf.split(train2, y)):
            dtrain = xgb.DMatrix(train2.ix[itrain], y[itrain])
            dvalid = xgb.DMatrix(train2.ix[ival], y[ival])
            watch = [(dtrain, 'train'), (dvalid, 'valid')]
            clf = xgb.train(xgb_params, dtrain, 10000, watch, early_stopping_rounds=100, verbose_eval=False)

            p = clf.predict(dvalid)
            v.loc[ival, cname] += pconvert(p)
            score = metrics.log_loss(y[ival], p)
            z[cname] += pconvert(clf.predict(dtest))
            print(cname, 'seed %d step %d of %d: ' % (xgb_params['seed'], n + 1, skf.n_splits), score, now())
            scores.append(score)

    z[cname] /= N_splits * N_seeds
    v[cname] /= N_seeds
    print('validation loss: ', metrics.log_loss(y, prestore(v[cname])))
    cv = np.array(scores)
    print(cv, cv.mean(), cv.std())
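# --- Hedged sketch (added; not from the original source) of the helpers that
# xgb_base()/xgb_common() above assume but never define. pconvert/prestore are
# taken here to be logit/inverse-logit, so fold predictions accumulate in
# log-odds space before being mapped back to probabilities; `now` is a timestamp
# helper. All three names and their exact forms are assumptions.
import datetime
import numpy as np
from scipy.special import logit, expit

def pconvert(p):
    # probabilities -> clipped log-odds before summing across folds/seeds (assumed)
    return logit(np.clip(p, 1e-6, 1 - 1e-6))

def prestore(v):
    # accumulated log-odds -> probabilities before scoring (assumed)
    return expit(v)

def now():
    return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')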
def main():
    X, Y, encoder, scale = load_train_data('train.csv')
    estimators = 500
    X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, test_size=0.2, random_state=0)
    X_train_real, X_test_real, Y_train_real, Y_test_real = train_test_split(X_train, Y_train, test_size=0.2, random_state=42)
    log.info('Loaded training file')
    X_test, _ = load_csv_file('test.csv', cut_end=False)
    log.info('Loaded test file')

    # Classifier Setup
    tree_clf = ExtraTreesClassifier(n_estimators=estimators, n_jobs=-1,
                                    random_state=42, max_depth=55, min_samples_split=1)
    clf = make_pipeline(TfidfTransformer(), DenseTransformer(), tree_clf)

    log.info('Fitting ExtraTrees pipeline')
    clf.fit(X_train_real, Y_train_real)
    clf_probs = clf.predict_proba(X_test_real)
    score = log_loss(Y_test_real, clf_probs)
    log.info('Log Loss score un-calibrated = %f' % score)

    # Calibrate Classifier using ground truth in X,Y_valid
    sig_clf = CalibratedClassifierCV(clf, method="isotonic", cv="prefit")
    log.info('Fitting CalibratedClassifierCV')
    sig_clf.fit(X_valid, Y_valid)
    sig_clf_probs = sig_clf.predict_proba(X_test_real)
    sig_score = log_loss(Y_test_real, sig_clf_probs)
    log.info('Log loss score calibrated = %f' % sig_score)

    # Ok lets predict the test data with our funky new classifier
    sig_submission_probs = sig_clf.predict_proba(X_test)
    write_out_submission(sig_submission_probs, 'submission.csv')
def check_lambda(datanm, samples_per_class, depv, num_classes, criterion, num_iter=100):
    data, labels = load_full(datanm, samples_per_class)
    slo = StratifiedShuffleSplit(labels, n_iter=num_iter, test_size=0.3, train_size=0.7, random_state=None)
    ans = np.zeros((len(depv), 4))
    for train_index, test_index in slo:
        train_data = [data[train_index, :], labels[train_index]]
        valid_data = [data[test_index, :], labels[test_index]]

        for i, d in enumerate(depv):
            clf = DecisionTreeClassifier(criterion=criterion, splitter='best', max_depth=d,
                                         min_samples_split=2, min_samples_leaf=1,
                                         min_weight_fraction_leaf=0.0, max_features=None,
                                         random_state=None, max_leaf_nodes=None,
                                         class_weight=None, presort=False)
            clf.fit(train_data[0], train_data[1])

            out_train = clf.predict_proba(train_data[0])
            out_valid = clf.predict_proba(valid_data[0])

            ans[i, 0] += log_loss(train_data[1], out_train)
            ans[i, 1] += log_loss(valid_data[1], out_valid)
            ans[i, 2] += brier(train_data[1], out_train, num_classes)
            ans[i, 3] += brier(valid_data[1], out_valid, num_classes)

    ans[:, :] /= num_iter
    # save the depth grid that was actually used (the original referenced an undefined `mdep`)
    np.savez("rand_forest_lambda_" + criterion, ans=ans, depv=depv, num_iter=num_iter,
             num_classes=num_classes, samples_per_class=samples_per_class)
    return ans
def train_model(estimator, xtr, xcv, ytr, ycv):
    model_list = get_model_name_list()
    # for rfc, rfr, etc, etr
    if type(estimator) in model_list[:4]:
        estimator.fit(xtr, ytr)
        # for rfc, etc (classifiers with predict_proba)
        if hasattr(estimator, 'predict_proba'):
            train_predict = estimator.predict_proba(xtr)
            cv_predict = estimator.predict_proba(xcv)
        # for rfr, etr
        else:
            train_predict = estimator.predict(xtr)
            cv_predict = estimator.predict(xcv)
        best_iter = 0
    # for xgbc, xgbr
    elif type(estimator) in model_list[4:]:
        estimator.fit(xtr, ytr, early_stopping_rounds=35, eval_metric='logloss',
                      eval_set=[(xcv, ycv)], verbose=True)
        best_iter = estimator.best_iteration
        # for xgbc
        if hasattr(estimator, 'predict_proba'):
            train_predict = estimator.predict_proba(xtr, ntree_limit=best_iter)
            cv_predict = estimator.predict_proba(xcv, ntree_limit=best_iter)
        # for xgbr
        else:
            train_predict = estimator.predict(xtr, ntree_limit=best_iter)
            cv_predict = estimator.predict(xcv, ntree_limit=best_iter)

    train_loss = log_loss(ytr, train_predict)
    cv_loss = log_loss(ycv, cv_predict)
    return train_loss, cv_loss, best_iter
def gb_get_min_loss(clf, verbose=False):
    j = 0
    min_loss_test = 1
    print()
    for i, quality_train, quality_test in zip(
        range(1, 250 + 1),
        clf.staged_predict_proba(X_train),
        clf.staged_predict_proba(X_test)
    ):
        loss_train = log_loss(y_train, quality_train)
        loss_test = log_loss(y_test, quality_test)

        if min_loss_test > loss_test:
            min_loss_test = loss_test
            j = i

        if (verbose):
            print(
                'Iteration:', i, ' ',
                'Train:', '{0:.3f}'.format(loss_train), ' ',
                'Test:', '{0:.3f}'.format(loss_test), ' ',
                '-' if min_loss_test == loss_test else '+'
            )
    return min_loss_test, j
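# --- Hedged usage sketch (added; not from the original source). gb_get_min_loss()
# above reads module-level clf/X_train/X_test/y_train/y_test; those globals are
# recreated here on synthetic data, with n_estimators=250 matching the hard-coded
# range(1, 250 + 1).
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

X = np.random.randn(400, 5)
y = (X[:, 0] + X[:, 1] > 0).astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
clf = GradientBoostingClassifier(n_estimators=250, random_state=0).fit(X_train, y_train)
min_loss, best_iteration = gb_get_min_loss(clf, verbose=True)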
def fit_model_and_test(params):
    crimes = np.load(DATA_FILE)
    features_train = crimes['features_train']
    all_labels = sorted(list(set(np.unique(crimes['labels_train'])) | set(np.unique(crimes['labels_val']))))
    hidden_units = int(params['hidden_units'])
    batch_size = 64

    labels_train = create_labels(crimes['labels_train'], all_labels)
    labels_vals = create_labels(crimes['labels_val'], all_labels)
    labels_full = create_labels(crimes['labels'], all_labels)

    labels_train = np_utils.to_categorical(labels_train)
    labels_vals = np_utils.to_categorical(labels_vals)
    labels_full = np_utils.to_categorical(labels_full)

    model = create_model_and_fit(features_train, labels_train, hidden_units, len(all_labels),
                                 params['layers'], params['input_dropout'], params['hidden_dropout'],
                                 batch_size, crimes['features_val'], labels_vals)

    loss_train = log_loss(labels_train, model.predict_proba(crimes['features_train']))
    loss_val = log_loss(labels_vals, model.predict_proba(crimes['features_val']))
    loss_all = log_loss(labels_full, model.predict_proba(crimes['features']))

    print 'loss_all: ', loss_all
    print 'loss_train: ', loss_train
    print 'loss_val: ', loss_val
    sys.stdout.flush()

    return loss_val, model, crimes, all_labels
def check_vb(datanm, samples_per_class, Cs, num_classes, num_iter=100):
    data, labels = load_full(datanm, samples_per_class)
    slo = StratifiedShuffleSplit(labels, n_iter=num_iter, test_size=0.5, train_size=0.5, random_state=None)
    ans = np.zeros((len(Cs), samples_per_class/2, 2))
    for train_index, test_index in slo:
        train_data = [data[train_index, :], labels[train_index]]
        valid_data = [data[test_index, :], labels[test_index]]

        for l in xrange(samples_per_class/2):
            ind_train = []
            ind_valid = []
            for k in xrange(num_classes):
                ind_train = ind_train + np.where(train_data[1] == k)[0].tolist()[:l+1]
                ind_valid = ind_valid + np.where(valid_data[1] == k)[0].tolist()[:l+1]

            ctrain_data = [train_data[0][ind_train], train_data[1][ind_train]]
            cvalid_data = [valid_data[0][ind_valid], valid_data[1][ind_valid]]

            for i, C in enumerate(Cs):
                clf = LogisticRegression(C=C, penalty='l2', multi_class='ovr', tol=0.001,
                                         n_jobs=-1, verbose=0)  # , solver='newton-cg')
                clf.fit(ctrain_data[0], ctrain_data[1])

                out_train = clf.predict_proba(ctrain_data[0])
                out_valid = clf.predict_proba(cvalid_data[0])

                ans[i, l, 0] += log_loss(ctrain_data[1], out_train)
                ans[i, l, 1] += log_loss(cvalid_data[1], out_valid)

    ans /= num_iter
    np.savez("logreg_bv", ans=ans, Cs=Cs, num_iter=num_iter, num_classes=num_classes,
             samples_per_class=samples_per_class)
    return ans
def learn(learning_rate, X_train, y_train, X_test, y_test):
    model = GradientBoostingClassifier(
        n_estimators=250,
        verbose=True,
        random_state=241,
        learning_rate=learning_rate
    )
    model.fit(X_train, y_train)

    # plot scores
    test_score = list(range(250))
    train_score = list(range(250))
    for i, predictions in enumerate(model.staged_decision_function(X_test)):
        predictions = [x[0] for x in predictions.tolist()]  # unpack the (n, 1) staged output
        predictions = [1 / (1 + math.exp(-x)) for x in predictions]  # sigmoid to probabilities
        test_score[i] = log_loss(y_test, predictions)
    for i, predictions in enumerate(model.staged_decision_function(X_train)):
        predictions = [x[0] for x in predictions.tolist()]  # unpack the (n, 1) staged output
        predictions = [1 / (1 + math.exp(-x)) for x in predictions]  # sigmoid to probabilities
        train_score[i] = log_loss(y_train, predictions)

    plt.figure()
    plt.plot(test_score, 'r', linewidth=2)
    plt.plot(train_score, 'g', linewidth=2)
    plt.legend(['test', 'train'])
    plt.show()

    return train_score, test_score
def rf_grid_search():
    train_inp, valid_inp, train_target, valid_target = prepare_input()

    # set up scorer for grid search. log-loss is error, not score, so set greater_is_better to false,
    # and log-loss requires a probability
    log_loss_scorer = make_scorer(log_loss, greater_is_better=False, needs_proba=True)

    train_inp = train_inp[:100000]
    train_target = train_target[:100000]

    start = time.time()
    random_forest = RandomForestClassifier(random_state=31)
    # r_forest_parameters = {'n_estimators': [120, 300, 500, 800, 1200], 'max_depth': [5, 8, 15, 25, 30, None],
    #                        'max_features': ['log2', 'sqrt', None],
    #                        'min_samples_split': [1, 2, 5, 10, 15, 100], 'min_samples_leaf': [1, 2, 5, 10]}
    # 75.1 minutes to run with these parameters - 72 fits
    r_forest_parameters = {'min_samples_split': [2, 5, 10, 20, 50, 100],
                           'min_samples_leaf': [1, 2, 5, 10, 50, 100]}

    # grid search too slow to not use all cores, and wayyyy too slow to have no output.
    r_forest_grid_obj = GridSearchCV(random_forest, r_forest_parameters, log_loss_scorer, verbose=2, n_jobs=-1)
    r_forest_grid_obj = r_forest_grid_obj.fit(train_inp, train_target)
    random_forest = r_forest_grid_obj.best_estimator_
    print "Best params: " + str(r_forest_grid_obj.best_params_)

    random_forest_train_error = log_loss(train_target, random_forest.predict_proba(train_inp))
    random_forest_validation_error = log_loss(valid_target, random_forest.predict_proba(valid_inp))
    print "Best random forest training error: {:02.4f}".format(random_forest_train_error)
    print "Best random forest validation error: {:02.4f}".format(random_forest_validation_error)
    end = time.time()
    print "RF grid search took {:02.4f} seconds".format(end - start)
    return random_forest
def run_cross_validation(nfolds=10):
    img_rows, img_cols = 32, 32
    batch_size = 32
    nb_epoch = 100
    random_state = 51

    train_data, train_target, train_id = read_and_normalize_train_data(img_rows, img_cols)
    test_data, test_id = read_and_normalize_test_data(img_rows, img_cols)

    yfull_train = dict()
    yfull_test = []
    kf = KFold(len(train_data), n_folds=nfolds, shuffle=True, random_state=random_state)
    num_fold = 0
    sum_score = 0
    for train_index, test_index in kf:
        model = create_model(img_rows, img_cols)
        X_train, X_valid = train_data[train_index], train_data[test_index]
        Y_train, Y_valid = train_target[train_index], train_target[test_index]

        num_fold += 1
        print('Start KFold number {} from {}'.format(num_fold, nfolds))
        print('Split train: ', len(X_train), len(Y_train))
        print('Split valid: ', len(X_valid), len(Y_valid))

        callbacks = [
            EarlyStopping(monitor='val_loss', patience=5, verbose=0),
        ]
        model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch,
                  shuffle=True, verbose=2, validation_data=(X_valid, Y_valid),
                  callbacks=callbacks)

        predictions_valid = model.predict(X_valid, batch_size=batch_size, verbose=1)
        score = log_loss(Y_valid, predictions_valid)
        print('Score log_loss: ', score)

        # Store valid predictions
        for i in range(len(test_index)):
            yfull_train[test_index[i]] = predictions_valid[i]

        # Store test predictions
        test_prediction = model.predict(test_data, batch_size=batch_size, verbose=2)
        yfull_test.append(test_prediction)

    predictions_valid = get_validation_predictions(train_data, yfull_train)
    score = log_loss(train_target, predictions_valid)
    print("Log_loss train independent avg: ", score)

    print('Final log_loss: {}, rows: {} cols: {} nfolds: {} epoch: {}'.format(
        score, img_rows, img_cols, nfolds, nb_epoch))
    perc = getPredScorePercent(train_target, train_id, predictions_valid)
    print('Percent success: {}'.format(perc))

    info_string = 'loss_' + str(score) \
        + '_r_' + str(img_rows) \
        + '_c_' + str(img_cols) \
        + '_folds_' + str(nfolds) \
        + '_ep_' + str(nb_epoch)

    test_res = merge_several_folds_mean(yfull_test, nfolds)
    create_submission(test_res, test_id, info_string)
def log_res(train_data_features, train_data_cross_validation_classwise_features, test_data_features, labels,
            labels_cross_validation_classwise, using_cross_validation2, kf, settings):
    if using_cross_validation2:
        logres_C = 1
        logres_results = []
        if(len(train_data_cross_validation_classwise_features) > 0):
            """train_all = np.append(train_data_features, train_data_cross_validation_classwise_features, axis=0)
            labels_all = np.append(labels, labels_cross_validation_classwise)
            kf_all = KFold(len(train_all)-1, n_folds=int(settings['Data']['CrossValidation2']), shuffle=True)
            for train, test in kf_all:
                C = logres_C
                p = 'l1'
                clf_l1_LR = LogisticRegression(C=C, penalty=p, tol=0.01)
                model = clf_l1_LR.fit(train_all[train], labels_all[train])
                predicted_classes = model.predict(train_all[test])
                predicted_classes_train = model.predict(train_all[train])
                print("N points:", len(predicted_classes), " percentage: ",(labels_all[test] != predicted_classes).sum()*100/len(predicted_classes),"%, percentage_train: ", (labels_all[train] != predicted_classes_train).sum()*100/len(predicted_classes_train))
                logres_results.append((labels_all[test] != predicted_classes).sum())
                logres_C += 1"""
            for c in pl.frange(logres_C, 15, 1):
                clf_l1_LR = LogisticRegression(C=c, solver='lbfgs', penalty='l2', tol=0.01)
                model = clf_l1_LR.fit(train_data_features, labels)
                predicted_classes = model.predict(train_data_cross_validation_classwise_features)
                predicted_classes_train = model.predict(train_data_features)
                class_probabilities = model.predict_proba(train_data_cross_validation_classwise_features)
                logres_results.append(log_loss(labels_cross_validation_classwise, class_probabilities))
                print("N points:", len(predicted_classes), " percentage: ",
                      (labels_cross_validation_classwise != predicted_classes).sum() * 100 / len(predicted_classes),
                      "%, percentage_train: ", (labels != predicted_classes_train).sum() * 100 / len(predicted_classes_train))
                print("Log_loss: ", log_loss(labels_cross_validation_classwise, class_probabilities))
        else:
            for train, test in kf:
                C = logres_C
                p = 'l1'
                clf_l1_LR = LogisticRegression(C=C, penalty=p, tol=0.01)
                model = clf_l1_LR.fit(train_data_features[train], labels[train])
                predicted_classes = model.predict(train_data_features[test])
                predicted_classes_train = model.predict(train_data_features[train])
                print("N points:", len(predicted_classes), " percentage: ",
                      (labels[test] != predicted_classes).sum() * 100 / len(predicted_classes),
                      "%, percentage_train: ", (labels[train] != predicted_classes_train).sum() * 100 / len(predicted_classes_train))
                logres_results.append((labels[test] != predicted_classes).sum())
                logres_C += 1
        print(logres_results)
        logres_C = logres_results.index(min(logres_results)) + 1
        print("Log Res C: ", logres_C)
        if(len(train_data_cross_validation_classwise_features) > 0):
            clf_l1_LR = LogisticRegression(C=logres_C, penalty='l2', tol=0.01)
            model = clf_l1_LR.fit(train_data_features, labels)
            predicted_classes = model.predict(train_data_cross_validation_classwise_features)
            predicted_classes_train = model.predict(train_data_features)
            class_probabilities = model.predict_proba(train_data_cross_validation_classwise_features)
            print("N points:", len(predicted_classes), " percentage: ",
                  (labels_cross_validation_classwise != predicted_classes).sum() * 100 / len(predicted_classes),
                  "%, percentage_train: ", (labels != predicted_classes_train).sum() * 100 / len(predicted_classes_train))
            print("Log_loss: ", log_loss(labels_cross_validation_classwise, class_probabilities))
        clf_l1_LR = LogisticRegression(C=logres_C, penalty='l1', tol=0.01)
        model = clf_l1_LR.fit(train_data_features, labels)
        return model.predict_proba(test_data_features), model.predict(test_data_features), model
    else:
        C = 1
        p = 'l1'
        clf_l1_LR = LogisticRegression(C=C, penalty=p, tol=0.01)
        model = clf_l1_LR.fit(train_data_features, labels)
        return model.predict_proba(test_data_features), model.predict(test_data_features), model
def error(p, x, y):
    preds = blended(p, x)
    err = log_loss(y, preds)  # it seems a 1-D label vector can be scored directly against a probability matrix with log_loss
    return err
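# --- Hedged usage sketch (added; not from the original source). error() above is
# shaped for weight optimization over a model blend; `blended` is assumed here to
# be a normalized weighted average of per-model probability columns, which is one
# common choice but not confirmed by the source.
import numpy as np
from scipy.optimize import minimize
from sklearn.metrics import log_loss

def blended(p, x):
    # assumed form: one column of predicted probabilities per base model
    w = np.asarray(p, dtype=float)
    return x.dot(w / w.sum())

x = np.column_stack([np.random.rand(100), np.random.rand(100)])
y = (x[:, 0] > 0.5).astype(int)
res = minimize(error, x0=[0.5, 0.5], args=(x, y), bounds=[(0.05, 1.0)] * 2)
print(res.x, res.fun)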
# model = pickle.load(open('E:/nmb/nmb_data/cp/m03_mels_CatBoostClassifier.data', 'rb'))  # rb : read
# time >>

# evaluate
y_pred = model.predict(x_test)
# print(y_pred[:100])
# print(y_pred[100:])

accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
hamm_loss = hamming_loss(y_test, y_pred)
# use distinct result names so the sklearn metric functions are not shadowed
hinge = hinge_loss(y_test, y_pred)
logloss = log_loss(y_test, y_pred)  # note: computed on hard predictions; log_loss normally expects probabilities

print("accuracy : \t", accuracy)
print("recall : \t", recall)
print("precision : \t", precision)
print("f1 : \t", f1)
print("hamming_loss : \t", hamm_loss)
print("hinge_loss : \t", hinge)  # a cross-entropy-like margin loss suited to SVMs
print("log_loss : \t", logloss)  # a concept similar to cross-entropy loss

# prediction data
pred_pathAudio = 'E:/nmb/nmb_data/pred_voice/'
files = librosa.util.find_files(pred_pathAudio, ext=['wav'])
files = np.asarray(files)
for file in files:
def stacking(clf, train_x, train_y, test_x, clf_name, class_num=3):
    train = np.zeros((train_x.shape[0], class_num))
    test = np.zeros((test_x.shape[0], class_num))
    test_pre = np.empty((folds, test_x.shape[0], class_num))
    cv_scores = []
    for i, (train_index, test_index) in enumerate(kf):
        tr_x = train_x[train_index]
        tr_y = train_y[train_index]
        te_x = train_x[test_index]
        te_y = train_y[test_index]

        if clf_name in ["rf", "ada", "gb", "et", "lr", "knn", "mnb", "ovr", "gnb"]:
            clf.fit(tr_x, tr_y)
            pre = clf.predict_proba(te_x)
            train[test_index] = pre
            test_pre[i, :] = clf.predict_proba(test_x)
            cv_scores.append(log_loss(te_y, pre))
        elif clf_name in ["lsvc"]:
            clf.fit(tr_x, tr_y)
            pre = clf.decision_function(te_x)
            train[test_index] = pre
            test_pre[i, :] = clf.decision_function(test_x)
            cv_scores.append(log_loss(te_y, pre))
        elif clf_name in ["xgb"]:
            train_matrix = clf.DMatrix(tr_x, label=tr_y, missing=-1)
            test_matrix = clf.DMatrix(te_x, label=te_y, missing=-1)
            z = clf.DMatrix(test_x, label=te_y, missing=-1)
            params = {'booster': 'gbtree',
                      'objective': 'multi:softprob',
                      'eval_metric': 'mlogloss',
                      'gamma': 1,
                      'min_child_weight': 1.5,
                      'max_depth': 4,
                      'lambda': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.03,
                      'tree_method': 'exact',
                      'seed': 2017,
                      'nthread': 12,
                      "num_class": class_num
                      }
            num_round = 10000
            early_stopping_rounds = 100
            watchlist = [(train_matrix, 'train'), (test_matrix, 'eval')]
            if test_matrix:
                model = clf.train(params, train_matrix, num_boost_round=num_round, evals=watchlist,
                                  early_stopping_rounds=early_stopping_rounds)
                pre = model.predict(test_matrix, ntree_limit=model.best_ntree_limit)
                train[test_index] = pre
                test_pre[i, :] = model.predict(z, ntree_limit=model.best_ntree_limit)
                cv_scores.append(log_loss(te_y, pre))
        elif clf_name in ["lgb"]:
            train_matrix = clf.Dataset(tr_x, label=tr_y)
            test_matrix = clf.Dataset(te_x, label=te_y)
            #z = clf.Dataset(test_x, label=te_y)
            #z = test_x
            params = {
                'boosting_type': 'gbdt',
                #'boosting_type': 'dart',
                'objective': 'multiclass',
                'metric': 'multi_logloss',
                'min_child_weight': 1.5,
                'num_leaves': 2**4,
                'lambda_l2': 10,
                'subsample': 0.7,
                'colsample_bytree': 0.7,
                'colsample_bylevel': 0.7,
                'learning_rate': 0.03,
                'tree_method': 'exact',
                'seed': 2017,
                'nthread': 12,
                "num_class": class_num,
                'silent': True,
            }
            num_round = 10000
            early_stopping_rounds = 100
            if test_matrix:
                model = clf.train(params, train_matrix, num_round, valid_sets=test_matrix,
                                  early_stopping_rounds=early_stopping_rounds)
                pre = model.predict(te_x, num_iteration=model.best_iteration)
                train[test_index] = pre
                test_pre[i, :] = model.predict(test_x, num_iteration=model.best_iteration)
                cv_scores.append(log_loss(te_y, pre))
        elif clf_name in ["nn"]:
            from keras.layers import Dense, Dropout, BatchNormalization, SReLU
            from keras.optimizers import SGD, RMSprop
            from keras.callbacks import EarlyStopping, ReduceLROnPlateau
            from keras.utils import np_utils
            from keras.regularizers import l2
            from keras.models import Sequential
            clf = Sequential()
            clf.add(Dense(64, input_dim=tr_x.shape[1], activation="relu", W_regularizer=l2()))
            #clf.add(SReLU())
            #clf.add(Dropout(0.2))
            clf.add(Dense(64, activation="relu", W_regularizer=l2()))
            #clf.add(SReLU())
            #clf.add(Dense(64, activation="relu", W_regularizer=l2()))
            # model.add(Dropout(0.2))
            clf.add(Dense(class_num, activation="softmax"))
            clf.summary()
            early_stopping = EarlyStopping(monitor='val_loss', patience=20)
            reduce = ReduceLROnPlateau(min_lr=0.0002, factor=0.05)
            clf.compile(optimizer="rmsprop", loss="categorical_crossentropy")
            clf.fit(tr_x, tr_y, batch_size=2560, nb_epoch=1000,
                    validation_data=[te_x, te_y],
                    callbacks=[early_stopping, reduce])
            pre = clf.predict_proba(te_x)
            train[test_index] = pre
            test_pre[i, :] = clf.predict_proba(test_x)
            cv_scores.append(log_loss(te_y, pre))
        else:
            raise IOError("Please add new clf.")
        print "%s now score is:" % clf_name, cv_scores
        with open("score.txt", "a") as f:
            f.write("%s now score is:" % clf_name + str(cv_scores) + "\n")
    test[:] = test_pre.mean(axis=0)
    print "%s_score_list:" % clf_name, cv_scores
    print "%s_score_mean:" % clf_name, np.mean(cv_scores)
    with open("score.txt", "a") as f:
        f.write("%s_score_mean:" % clf_name + str(np.mean(cv_scores)) + "\n")
    return train.reshape(-1, class_num), test.reshape(-1, class_num)
def run_cross_validation_create_models(num_fold=5):
    # Input image dimensions
    batch_size = 4
    nb_epoch = 50
    restore_from_last_checkpoint = 1

    data, target = preprocess_data()
    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.3, random_state=42)

    t1 = DenseNet(classes=4, input_shape=(300, 300, 3), depth=40, growth_rate=12,
                  bottleneck=True, reduction=0.5)
    # model = Model(input=img_input, output = output_3)
    # model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
    # model.summary()
    # model = base_model()

    top_model = Sequential()
    top_model.add(t1)
    top_model.add(Flatten(input_shape=t1.output_shape[1:]))
    top_model.add(Dense(256, activation='relu'))
    top_model.add(Dropout(0.5))
    top_model.add(Dense(4, activation='softmax'))

    # note that it is necessary to start with a fully-trained
    # classifier, including the top classifier,
    # in order to successfully do fine-tuning
    # add the model on top of the convolutional base
    model = Sequential()
    model.add(top_model)
    model.summary()
    model.compile(loss='categorical_crossentropy',
                  optimizer=SGD(lr=1e-2, momentum=0.9),
                  metrics=['accuracy'])

    num_fold += 1
    print('Start KFold number {} from {}'.format(num_fold, num_fold))
    print('Split train:', len(X_train), len(y_train))
    print('Split test:', len(X_test), len(y_test))

    kfold_weights_path = os.path.join('cache', 'weights_kfold_vgg16_' + str(num_fold) + '.h5')
    # if not os.path.isfile(kfold_weights_path) or restore_from_last_checkpoint == 0:
    callbacks = [
        # EarlyStoppingbyLossVal(monitor = 'val_loss', value = 0.00001, verbose = 1),
        # EarlyStopping(monitor = 'val_loss', patience = 5, verbose = 1),
        ModelCheckpoint(kfold_weights_path, monitor='val_loss', save_best_only=True, verbose=0),
        TensorBoard(log_dir='./LogsForAUC', write_images=True)
    ]
    cnn = model.fit(X_train, y_train, batch_size=batch_size, epochs=nb_epoch,
                    shuffle=True, verbose=1, validation_data=(X_test, y_test),
                    callbacks=callbacks)
    if os.path.isfile(kfold_weights_path):
        model.load_weights(kfold_weights_path)

    score1 = model.evaluate(X_test, y_test, show_accuracy=True, verbose=0)
    print('Score on test was : ', score1)
    # predict on the test split so the log_loss below compares like with like
    # (the original predicted on X_train but scored against y_test)
    predictions = model.predict(X_test.astype('float32'), batch_size=batch_size, verbose=1)
    score = log_loss(y_test, predictions)
    print('Score log_loss on test is', score)

    plt.plot(cnn.history['acc'])
    plt.plot(cnn.history['val_acc'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()
    # summarize history for loss
    plt.plot(cnn.history['loss'])
    plt.plot(cnn.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()
    pd.DataFrame(cnn.history).to_csv("/historyAUC.csv")
def __call__(self, y_true_proba, y_proba):
    score = log_loss(y_true_proba, y_proba)
    return score
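# --- Hedged sketch (added; not from the original source). The __call__ above looks
# like the body of a scoring object; a minimal wrapper class of that shape is
# assumed here to show how it would be invoked. The class name is hypothetical.
from sklearn.metrics import log_loss

class LogLossScore(object):
    def __call__(self, y_true_proba, y_proba):
        score = log_loss(y_true_proba, y_proba)
        return score

score_fn = LogLossScore()
print(score_fn([0, 1, 1], [[0.9, 0.1], [0.2, 0.8], [0.3, 0.7]]))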
X.append(xi)
y.append(yi)

X = np.asarray(X)
y = np.asarray(y)
Test_ind = np.asarray(Test_ind)

X_train, y_train = X[:int(datanum*0.6)], y[:int(datanum*0.6)]
X_valid, y_valid = X[int(datanum*0.6):int(datanum*0.8)], y[int(datanum*0.6):int(datanum*0.8)]
X_train_valid, y_train_valid = X[:int(datanum*0.8)], y[:int(datanum*0.8)]
X_test, y_test = X[int(datanum*0.8):], y[int(datanum*0.8):]

clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train_valid, y_train_valid)
clf_probs = clf.predict_proba(X_test)
score = log_loss(y_test, clf_probs)
print clf.score(X_test, y_test)

x_pro = clf.predict_proba(X)
Test_res = np.c_[Test_ind, x_pro]
val_res = np.c_[X_test, y_test]
val_res = np.c_[val_res, clf_probs]

Real_ind = []
rdata = []
infile = open(realdata, 'rb')
for line in infile:
    line = line.strip('\n')
    sent = line.split('\t')
    Real_ind.append(sent)
log = pd.DataFrame(columns=log_cols)
for clf in classifiers:
    clf.fit(X_train, y_train)
    name = clf.__class__.__name__
    # print("=" * 30)
    # print(name)
    # print('****Results****')
    train_predictions = clf.predict(X_test)
    acc = accuracy_score(y_test, train_predictions)  # from sklearn
    # print("Accuracy: {:.4%}".format(acc))
    train_predictions = clf.predict_proba(X_test)
    ll = log_loss(y_test, train_predictions)
    # print("Log Loss: {}".format(ll))
    log_entry = pd.DataFrame([[name, acc * 100, ll]], columns=log_cols)
    log = log.append(log_entry)
    # print("=" * 30)
print log
# LogisticRegression 59.090909 4.173216

######################
# I would like to choose
# LogisticRegression
# for future improvements by tuning their hyper-parameters
######################
if log_reg:
    # We initialise the Exhaustive Grid Search, we leave the scoring as the default function of
y_pred = rfc_model.predict(X_test)
precScore = precision_score(Y_test, y_pred, average="macro")
print "precision score", precScore

predProb = rfc_model.predict_proba(X_test)
print "y predicted", set(y_pred)
ll = log_loss(Y_test, predProb)  # use a distinct name so sklearn's log_loss is not shadowed
print "log loss", ll

acc = accuracy_score(Y_test, y_pred)
print "Accuracy is : ", acc

gnb = GaussianNB()
print("DT F1-score: ", f1_score_dt) # Logistic Regression # In[143]: # predicted y yhat_lg = LR.predict(x_test) yhat_lg_prob = LR.predict_proba(x_test) # jaccard jaccard_lg = jaccard_similarity_score(y_test, yhat_lg) print("LR Jaccard index: ", jaccard_lg) # f1_score f1_score_lg = f1_score(y_test, yhat_lg, average='weighted') print("LR F1-score: ", f1_score_lg) # logloss logloss_lg = log_loss(y_test, yhat_lg_prob) print("LR log loss: ", logloss_lg) # # Report # You should be able to report the accuracy of the built model using different evaluation metrics: # | Algorithm | Jaccard | F1-score | LogLoss | # |--------------------|---------|----------|---------| # | KNN | 0.56302 | 0.547122 | NA | # | Decision Tree | 0.56285 | 0.534773 | NA | # | LogisticRegression | 0.52435 | 0.509146 | 0.68563 |
def computeloss_lib(p, labels):
    return metrics.log_loss(labels, p)
def calculateLoss(self, y_true, pred):
    """This function calculates the cross entropy for analytical purposes"""
    return log_loss(y_true, pred)
def train_linear_classifier_model(
    learning_rate,
    steps,
    batch_size,
    training_examples,
    training_targets,
    validation_examples,
    validation_targets):
  """Trains a linear classification model.

  In addition to training, this function also prints training progress information,
  as well as a plot of the training and validation loss over time.

  Args:
    learning_rate: A `float`, the learning rate.
    steps: A non-zero `int`, the total number of training steps. A training step
      consists of a forward and backward pass using a single batch.
    batch_size: A non-zero `int`, the batch size.
    training_examples: A `DataFrame` containing one or more columns from
      `california_housing_dataframe` to use as input features for training.
    training_targets: A `DataFrame` containing exactly one column from
      `california_housing_dataframe` to use as target for training.
    validation_examples: A `DataFrame` containing one or more columns from
      `california_housing_dataframe` to use as input features for validation.
    validation_targets: A `DataFrame` containing exactly one column from
      `california_housing_dataframe` to use as target for validation.

  Returns:
    A `LinearClassifier` object trained on the training data.
  """
  periods = 10
  steps_per_period = steps / periods

  # Create a linear classifier object.
  my_optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
  my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
  linear_classifier = tf.estimator.LinearClassifier(
      feature_columns=construct_feature_columns(training_examples),
      optimizer=my_optimizer)

  # Create input functions.
  training_input_fn = lambda: my_input_fn(
      training_examples,
      training_targets["median_house_value_is_high"],
      batch_size=batch_size)
  predict_training_input_fn = lambda: my_input_fn(
      training_examples,
      training_targets["median_house_value_is_high"],
      num_epochs=1,
      shuffle=False)
  predict_validation_input_fn = lambda: my_input_fn(
      validation_examples,
      validation_targets["median_house_value_is_high"],
      num_epochs=1,
      shuffle=False)

  # Train the model, but do so inside a loop so that we can periodically assess
  # loss metrics.
  print "Training model..."
  print "LogLoss (on training data):"
  training_log_losses = []
  validation_log_losses = []
  for period in range(0, periods):
    # Train the model, starting from the prior state.
    linear_classifier.train(input_fn=training_input_fn, steps=steps_per_period)

    # Take a break and compute predictions.
    training_probabilities = linear_classifier.predict(input_fn=predict_training_input_fn)
    training_probabilities = np.array([item['probabilities'] for item in training_probabilities])

    validation_probabilities = linear_classifier.predict(input_fn=predict_validation_input_fn)
    validation_probabilities = np.array([item['probabilities'] for item in validation_probabilities])

    training_log_loss = metrics.log_loss(training_targets, training_probabilities)
    validation_log_loss = metrics.log_loss(validation_targets, validation_probabilities)

    # Occasionally print the current loss.
    print "  period %02d : %0.2f" % (period, training_log_loss)

    # Add the loss metrics from this period to our list.
    training_log_losses.append(training_log_loss)
    validation_log_losses.append(validation_log_loss)
  print "Model training finished."

  # Output a graph of loss metrics over periods.
  plt.ylabel("LogLoss")
  plt.xlabel("Periods")
  plt.title("LogLoss vs. Periods")
  plt.tight_layout()
  plt.plot(training_log_losses, label="training")
  plt.plot(validation_log_losses, label="validation")
  plt.legend()
  plt.show()

  return linear_classifier
def run_cross_validation_train_models(train_data, train_target, model_struc, nfolds=10, nb_epoch=200):
    # input image dimensions
    batch_size = 600
    yfull_train = dict()
    # kf = KFold(, n_folds=nfolds, shuffle=True, random_state=random_state)
    skf = StratifiedKFold(n_splits=nfolds, shuffle=True)
    num_fold = 0
    sum_score = 0
    models = []
    for train_index, test_index in skf.split(train_data, train_target):
        X_train = train_data[train_index]
        Y_train = train_target[train_index]
        X_valid = train_data[test_index]
        Y_valid = train_target[test_index]

        num_fold += 1
        print('-- Start KFold train number {} from {}'.format(num_fold, nfolds))
        print('---- Split train: ', len(X_train), len(Y_train))
        print('---- Split valid: ', len(X_valid), len(Y_valid))

        model = make_cnn(model_struc, train_data.shape[1:], verbose=False)
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        model.fit(X_train, Y_train, batch_size=batch_size, epochs=nb_epoch,
                  shuffle=True, verbose=1, validation_data=(X_valid, Y_valid))

        predictions_valid, summary = DL_mcc(model, X_valid, Y_valid)
        score = log_loss(Y_valid, predictions_valid)
        print('-- Score log_loss: ', score)
        sum_score += score * len(test_index)

        # Store valid predictions
        for i in range(len(test_index)):
            yfull_train[test_index[i]] = predictions_valid[i]

        models.append(model)

    score = sum_score / len(train_data)
    print("-- Train: Log_loss independent avg: ", score)

    ytrue = train_target[list(yfull_train.keys())]
    pred = np.array(list(yfull_train.values()))
    binary_y = score_to_binary(pred.reshape(-1))
    # print(ytrue)
    summary = PNmetrics2(ytrue, binary_y)
    info_string = '-- loss_' + str(score) + '_folds_' + str(nfolds) + '_ep_' + str(nb_epoch)
    return (info_string, models, summary)
    'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN'
]  # only the weekday and district are used as classifier input features

# The line below adds the hour attribute; just uncomment it to enable
# features = features + [x for x in range(0, 24)]  # add the hour of the crime as a feature

training, validation = train_test_split(trainData, test_size=.40)  # split into train (3/5) and test (2/5) sets

##################################################################
## Naive Bayes model, compute log_loss
model = BernoulliNB()
nbStart = time.time()
model.fit(training[features], training['crime'])  # this one is fast
nbCostTime = time.time() - nbStart
predicted = np.array(model.predict_proba(validation[features]))
print("Naive Bayes training took %f seconds" % (nbCostTime))
print("Naive Bayes log loss: %f" % (log_loss(validation['crime'], predicted)))  # 2.617892 / 2.582167; the latter with the hour feature added

##################################################################
## Logistic Regression model, compute log_loss
model = LogisticRegression(C=.01)
lrStart = time.time()
model.fit(training[features], training['crime'])  # this one is much slower
lrCostTime = time.time() - lrStart
predicted = np.array(model.predict_proba(validation[features]))
print("Logistic Regression training took %f seconds" % (lrCostTime))  # close to 2 min
print("Logistic Regression log loss: %f" % (log_loss(validation['crime'], predicted)))  # 2.624773 / 2.592119; still worse than NB, the latter with the hour feature added

# With these three categorical features, Naive Bayes still has an edge over Logistic Regression
# (lower log loss) while training in a fraction of the time -- a simple model, but still effective.
# Incidentally, the NB model trained in 1.13s already reaches the top 35% of the Kaggle
# leaderboard; with some optimization,
for i,img_id in tqdm(enumerate(labels['id'])): img = read_img(img_id,'train',(INPUT_SIZE,INPUT_SIZE)) x = xception.preprocess_input(np.expand_dims(img.copy(),axis=0)) x_train[i] = x print 'Train Image shape: {} size: {:,}'.format(x_train.shape,x_train.size) x_test = np.zeros((len(sample_submission),INPUT_SIZE,INPUT_SIZE,3),dtype='float32') for i,img_id in tqdm(enumerate(sample_submission['id'])): img = read_img(img_id,'test',(INPUT_SIZE,INPUT_SIZE)) x = xception.preprocess_input(np.expand_dims(img.copy(),axis=0)) x_test[i] = x print 'Test Image shape: {} size: {:,}'.format(x_test.shape,x_test.size) print x_train.shape xception_bottleneck = xception.Xception(weights='imagenet',include_top=False,pooling=POOLING) train_x_bf = xception_bottleneck.predict(x_train,batch_size=32,verbose=1) test_x_bf = xception_bottleneck.predict(x_test,batch_size=32,verbose=1) print 'Xception train bottleneck features shape: {} size: {:,}'.format(train_x_bf.shape,train_x_bf.size) logreg = LogisticRegression(multi_class='multinomial',solver='lbfgs',random_state=SEED) logreg.fit(train_x_bf,(y_train*range(NUM_CLASSES)).sum(axis=1)) train_probs = logreg.predict_proba(train_x_bf) train_preds = logreg.predict(train_x_bf) print 'Xception train loss: {}'.format(log_loss(y_train,train_probs)) print 'Xception train accuracy: {}'.format(accuracy_score((y_train*range(NUM_CLASSES)).sum(axis=1),train_preds)) test_probs = logreg.predict_proba(test_x_bf) test_preds = logreg.predict(test_x_bf) result = pd.DataFrame(data=test_probs,index=sample_submission.id,columns=unique_breeds,dtype='float32',copy=True) result.to_csv('my_submission.csv')
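# The loss and accuracy above are computed on the same data the logistic
# regression was fit on, so they are optimistic. A minimal sketch of a
# held-out check on the bottleneck features (the split and variable names
# are assumptions, not part of the original):
from sklearn.model_selection import train_test_split

Xtr, Xva, ytr, yva = train_test_split(
    train_x_bf, (y_train * range(NUM_CLASSES)).sum(axis=1),
    test_size=0.2, random_state=SEED)
logreg_cv = LogisticRegression(multi_class='multinomial', solver='lbfgs', random_state=SEED)
logreg_cv.fit(Xtr, ytr)
print('Xception valid loss: {}'.format(log_loss(yva, logreg_cv.predict_proba(Xva))))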
def gbdt_lr_predict(data, category_feature, continuous_feature):  # 0.43616
    # One-hot encode the categorical features
    print('Starting one-hot encoding...')
    for col in category_feature:
        onehot_feats = pd.get_dummies(data[col], prefix=col)
        data.drop([col], axis=1, inplace=True)
        data = pd.concat([data, onehot_feats], axis=1)
    print('One-hot encoding finished')
    train = data[data['flag'] != -1]
    target = train.pop('flag')
    test = data[data['flag'] == -1]
    test.drop(['flag'], axis=1, inplace=True)

    # Split the dataset
    print('Splitting the dataset...')
    x_train, x_val, y_train, y_val = train_test_split(train, target, test_size=0.2, random_state=2018)

    print('Training the GBDT...')
    gbm = lgb.LGBMRegressor(
        objective='binary',
        subsample=0.8,
        min_child_weight=0.5,
        colsample_bytree=0.7,
        num_leaves=100,
        max_depth=12,
        learning_rate=0.05,
        n_estimators=10,
    )
    gbm.fit(
        x_train, y_train,
        eval_set=[(x_train, y_train), (x_val, y_val)],
        eval_names=['train', 'val'],
        eval_metric='binary_logloss',
        # early_stopping_rounds = 100,
    )
    model = gbm.booster_

    print('Extracting leaf indices from the trained trees')
    gbdt_feats_train = model.predict(train, pred_leaf=True)
    gbdt_feats_test = model.predict(test, pred_leaf=True)
    gbdt_feats_name = ['gbdt_leaf_' + str(i) for i in range(gbdt_feats_train.shape[1])]
    df_train_gbdt_feats = pd.DataFrame(gbdt_feats_train, columns=gbdt_feats_name)
    df_test_gbdt_feats = pd.DataFrame(gbdt_feats_test, columns=gbdt_feats_name)

    print('Building the new dataset...')
    train = pd.concat([train, df_train_gbdt_feats], axis=1)
    test = pd.concat([test, df_test_gbdt_feats], axis=1)
    train_len = train.shape[0]
    data = pd.concat([train, test])
    del train
    del test
    gc.collect()

    # # Normalize the continuous features
    # print('Starting normalization...')
    # scaler = MinMaxScaler()
    # for col in continuous_feature:
    #     data[col] = scaler.fit_transform(data[col].values.reshape(-1, 1))
    # print('Normalization finished')

    # One-hot encode the leaf indices
    print('Starting one-hot encoding...')
    for col in gbdt_feats_name:
        print('this is feature:', col)
        onehot_feats = pd.get_dummies(data[col], prefix=col)
        data.drop([col], axis=1, inplace=True)
        data = pd.concat([data, onehot_feats], axis=1)
    print('One-hot encoding finished')
    train = data[:train_len]
    test = data[train_len:]
    del data
    gc.collect()
    x_train, x_val, y_train, y_val = train_test_split(train, target, test_size=0.3, random_state=2018)

    # LR
    print('Training the LR...')
    lr = LogisticRegression()
    #lbl = preprocessing.LabelEncoder()
    #x_train['hav_car_grp_ind'] = lbl.fit_transform(x_train['hav_car_grp_ind'].astype(str))
    #x_train['hav_hou_grp_ind'] = lbl.fit_transform(x_train['hav_hou_grp_ind'].astype(str))
    #x_train['job_year'] = lbl.fit_transform(x_train['job_year'].astype(str))
    lr.fit(x_train, y_train)
    tr_logloss = log_loss(y_train, lr.predict_proba(x_train)[:, 1])
    print('tr-logloss: ', tr_logloss)
    val_logloss = log_loss(y_val, lr.predict_proba(x_val)[:, 1])
    print('val-logloss: ', val_logloss)

    print('Predicting...')
    y_pred = lr.predict_proba(test)[:, 1]
    print('Writing results...')
    for i in y_pred:
        print(i)
    #res = pd.read_csv('data/test.csv')
    #submission = pd.DataFrame({'id': res['id'], 'flag': y_pred})
    #submission.to_csv('submission/submission_gbdt+lr_trlogloss_%s_vallogloss_%s.csv' % (tr_logloss, val_logloss), index = False)
    print('Done')
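# The leaf-index one-hot loop above can be expressed more compactly with
# sklearn's OneHotEncoder, which avoids the column-by-column get_dummies
# passes over the concatenated data. A minimal sketch under the same variable
# names (an alternative formulation, not the original author's code):
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(handle_unknown='ignore')
leaf_train = enc.fit_transform(gbdt_feats_train)  # sparse matrix of leaf indicators
leaf_test = enc.transform(gbdt_feats_test)        # unseen leaves map to all-zero rows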
def evaluate(self, feat_index, feat_val, label):
    y_pred = self.predict(feat_index, feat_val)
    if self.metric_type == 'auc':
        return roc_auc_score(label, y_pred)
    elif self.metric_type == 'logloss':
        return log_loss(label, y_pred)
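# A tiny self-contained illustration of the two metrics this method wraps,
# on a made-up four-sample problem:
from sklearn.metrics import roc_auc_score, log_loss

label = [0, 1, 1, 0]
y_pred = [0.1, 0.8, 0.6, 0.3]
print('auc:     %.4f' % roc_auc_score(label, y_pred))
print('logloss: %.4f' % log_loss(label, y_pred))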
def check_all_variants(
        start_config={
            "dataset": "Superstore",
            "file_name_list": ["Superstore", "Superstore_train", "Superstore_test", "Superstore_valid"],
            "col_name_list": ["Kundenname", "Produktname"],
            "show_progress": False,
            "use_user_info": False,
            "user_features": ["Segment", "Kategorie"],
            "user_features_prep_types": ["one_hot", "one_hot"],
            "n_info_cols": 0,
            "approach": "binary",
            "model_type": "complement",
            "count": True,
            "split_ratio": [0.7, 0.2, 0.1],
            "split": "clients",
            "alpha": 1.0,
            "fit_prior": True,
            "train_batch_size": 5000,
            "pred_batch_size": 5000,
            "n_Produkte": 1915,
            "n_Kunden": 784,
            "info_string": "",
            "fit_set": "train",
            "pred_set": "test",
        },
        param_dict={
            "approach": ["multi", "binary"],
            "model_type": ["multinomial", "complement", "bernoulli"],
            "split": ["clients", "orders"],
            "use_user_info": [False, True],
            "count": [True, False],
            "alpha": [1.0, 0.9, 0.8, 0.7],
            "fit_prior": [True, False],
            "user_features": [["Segment"], ["Kategorie"], ["Segment", "Kategorie"]],
            "user_features_prep_types": [["one_hot"], ["one_hot"], ["one_hot", "one_hot"]]
        }):
    top_n_list = [10, 20, 50, 100, 200, 500]
    # "fit_prior" and "alpha" are appended to below, so they must be
    # initialized here as well (missing in the original, which raised KeyError)
    full_out_dict = {
        "approach": [], "model_type": [], "split": [], "use_user_info": [],
        "threshold": [], "count": [], "info_str": [], "filename": [],
        "fit_prior": [], "alpha": [],
        "mse": [], "neg_log_loss": [], "Accuracy": [], "Precision": [],
        "Recall": [], "F1": [], "tn": [], "fp": [], "fn": [], "tp": []
    }
    for top_n in top_n_list:
        full_out_dict["top_" + str(top_n) + "_score"] = []
    dataset = start_config["dataset"]
    #configure.do(dataset)
    with open(dataset + "/json_files/config.json", "w") as fp:
        json.dump(start_config, fp, indent=5)

    # itertools.product iterates the rightmost iterable fastest, so this
    # flattens the original eight nested loops while preserving their order
    import itertools
    for (approach, model_type, split, count, fit_prior, alpha, use_user_info,
         (user_features, user_features_prep_types)) in itertools.product(
            param_dict["approach"], param_dict["model_type"],
            param_dict["split"], param_dict["count"],
            param_dict["fit_prior"], param_dict["alpha"],
            param_dict["use_user_info"],
            list(zip(param_dict["user_features"], param_dict["user_features_prep_types"]))):
        if not use_user_info and not user_features == param_dict["user_features"][-1]:
            continue
        update_config(dataset, approach, model_type, split, use_user_info,
                      user_features, user_features_prep_types, count, fit_prior, alpha)
        if use_user_info:
            info_str = str(user_features)
        else:
            info_str = ""
        print()
        print("Process with new config:")
        print("approach", "model_type", "split", "use_user_info", "info_str", "count", "fit_prior", "alpha")
        print(approach, model_type, split, use_user_info, info_str, count, fit_prior, alpha)
        print()
        serve_data.do(dataset)
        NaiveBayes.do(dataset)
        with open(dataset + "/json_files/config.json", "r") as fp:
            config = json.load(fp)
        title = dataset + "_predictions_" + \
            "fit" + config["fit_set"] + \
            "_pred" + config["pred_set"] + \
            "_" + config["model_type"] + \
            "_approach" + str(config["approach"]) + \
            "_split" + config["split"] + \
            "_count" + str(config["count"]) + \
            "_info" + str(config["use_user_info"]) + config["info_string"]
        pred_file = dataset + "/npy_files/" + title + ".npy"
        if split == "orders":
            KPM = np.sign(np.load(dataset + "/npy_files/test_KPM.npy"))
        elif split == "clients":
            KPM = np.sign(np.load(dataset + "/npy_files/full_KPM.npy")
                          [np.load(dataset + "/npy_files/test_index.npy")])
        if approach == "binary":
            threshold = 0.5
        elif approach == "multi":
            threshold = 1 / config["n_Produkte"]
        n_orders = np.sum(KPM, axis=None)
        predictions = np.load(pred_file)
        y_prop = predictions.flatten()
        y_soll = KPM.flatten()
        y_pred = y_prop > threshold
        top_n_score_list = []
        for top_n in top_n_list:
            n_hits = 0
            for client_index in range(len(predictions)):
                bought_items = np.argwhere(KPM[client_index] == 1)[:, 0]
                for item_index in bought_items:
                    if item_index in np.array(sorted(
                            zip(predictions[client_index],
                                np.arange(len(predictions[client_index]))),
                            reverse=True))[:, 1][:top_n]:
                        n_hits += 1
            top_n_score_list.append(n_hits / n_orders)
        cmat = metrics.confusion_matrix(y_soll, y_pred)
        [[tn, fp], [fn, tp]] = cmat
        out_dict = {
            "filename": str(pred_file),
            "mse": float(metrics.mean_squared_error(y_soll, y_prop)),
            "neg_log_loss": float(metrics.log_loss(y_soll, y_prop)),
            "Accuracy": float(metrics.accuracy_score(y_soll, y_pred)),
            "Precision": float(metrics.precision_score(y_soll, y_pred)),
            "Recall": float(metrics.recall_score(y_soll, y_pred)),
            "F1": float(metrics.f1_score(y_soll, y_pred)),
            "tn": int(tn), "fp": int(fp), "fn": int(fn), "tp": int(tp)
        }
        for top_n, score in zip(top_n_list, top_n_score_list):
            full_out_dict["top_" + str(top_n) + "_score"].append(float(score))
        print(pred_file + ":")
        print("MSE", out_dict["mse"])
        print("neg_log_loss", out_dict["neg_log_loss"])
        print("Accuracy", out_dict["Accuracy"])
        print("Precision", out_dict["Precision"])
        print("Recall", out_dict["Recall"])
        print("F1", out_dict["F1"])
        print("Confusion Matrix (tn,fp,fn,tp)")
        print(cmat)
        for top_n, score in zip(top_n_list, top_n_score_list):
            print(str(score * 100) + "%\tof the actual purchases fall within the top", top_n, "recommended products")
        full_out_dict["filename"].append(str(pred_file))
        full_out_dict["approach"].append(str(approach))
        full_out_dict["model_type"].append(str(model_type))
        full_out_dict["split"].append(str(split))
        full_out_dict["count"].append(str(count))
        full_out_dict["use_user_info"].append(str(use_user_info))
        full_out_dict["threshold"].append(float(threshold))
        full_out_dict["info_str"].append(str(info_str))
        full_out_dict["fit_prior"].append(fit_prior)
        full_out_dict["alpha"].append(float(alpha))
        full_out_dict["mse"].append(float(out_dict["mse"]))
        full_out_dict["neg_log_loss"].append(float(out_dict["neg_log_loss"]))
        full_out_dict["Accuracy"].append(float(out_dict["Accuracy"]))
        full_out_dict["Precision"].append(float(out_dict["Precision"]))
        full_out_dict["Recall"].append(float(out_dict["Recall"]))
        full_out_dict["F1"].append(float(out_dict["F1"]))
        full_out_dict["tn"].append(int(out_dict["tn"]))
        full_out_dict["fp"].append(int(out_dict["fp"]))
        full_out_dict["fn"].append(int(out_dict["fn"]))
        full_out_dict["tp"].append(int(out_dict["tp"]))
        pd.DataFrame(full_out_dict).to_csv(dataset + "/csv_files/variant_check.csv",
                                           index_label="row_index", sep=";")
        print("-" * 100)
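# The per-client hit counting above re-sorts the full prediction vector once
# per (top_n, client, item) triple. A vectorized sketch of the same top-n
# hit-rate, assuming `predictions` and the binary matrix `KPM` shaped as in
# the loop above:
def top_n_hit_rate(predictions, KPM, top_n):
    # indices of the top_n highest-scored products per client
    top_idx = np.argsort(-predictions, axis=1)[:, :top_n]
    n_hits = sum(
        len(np.intersect1d(top_idx[c], np.flatnonzero(KPM[c] == 1)))
        for c in range(len(predictions)))
    return n_hits / np.sum(KPM, axis=None)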
def eval_model(cls, master_gpu_id, model, eval_dataset, eval_batch_size=1, use_cuda=False, num_workers=1):
    model.eval()
    eval_dataloader = DataLoader(dataset=eval_dataset,
                                 pin_memory=use_cuda,
                                 batch_size=eval_batch_size,
                                 num_workers=num_workers,
                                 shuffle=False)
    predicted_probs = []
    true_labels = []
    batch_count = 1
    for batch in tqdm(eval_dataloader, unit="batch", ncols=100, desc="Evaluating process: "):
        labels = batch["label"].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch["label"]
        tokens = batch["tokens"].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch["tokens"]
        segment_ids = batch["segment_ids"].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch["segment_ids"]
        attention_mask = batch["attention_mask"].cuda(master_gpu_id) if use_cuda and master_gpu_id is not None else batch["attention_mask"]
        with torch.no_grad():
            main_output, asr_output = model(tokens, segment_ids, attention_mask)
        # Convert the model output to a list
        main_output = torch.softmax(main_output, dim=1).cpu().tolist()
        # Take the positive-class probabilities
        prob = np.array(main_output)[:, 1]
        # Append this batch's positive-class predictions to the global prediction list
        predicted_probs.extend(prob.tolist())
        # Append the batch's true labels to the global label list
        true_labels.extend(labels.tolist())
        LoggerHelper.info("Batch: " + str(batch_count))
        batch_count += 1
    predicted_probs = [round(prob, 2) for prob in predicted_probs]
    precision, recall, _thresholds = precision_recall_curve(true_labels, predicted_probs)
    auc = roc_auc_score(true_labels, predicted_probs)
    logloss = log_loss(true_labels, predicted_probs)
    for i in range(len(_thresholds)):
        # f1_score here is presumably a local harmonic-mean helper taking
        # (precision, recall), not sklearn's f1_score(y_true, y_pred)
        log_str_th = 'VAL => Thresholds: {0:>2}, Precision: {1:>7.2%}, Recall: {2:>7.2%}, F1: {3:>7.2%}'.format(
            _thresholds[i], precision[i], recall[i], f1_score(precision[i], recall[i]))
        LoggerHelper.info(log_str_th)
    LoggerHelper.info("AUC: " + str(auc))
    LoggerHelper.info("Logloss: " + str(logloss))
    return
from sklearn.metrics import accuracy_score
print("Accuracy : ", accuracy_score(y_test, predictions) * 100)

# MAE (L1 loss) - should be close to 0
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test, predictions)  # y_target, y_pred

# MSE (L2 loss) - should be close to 0
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, predictions)  # y_target, y_pred

# Log Loss - should be close to 0 - only for classification models
# (note: log_loss expects predicted probabilities; feeding it the hard class
# predictions makes every mistake maximally expensive)
from sklearn.metrics import log_loss
log_loss(y_test, predictions)

# Get ROC curve for Logistic Regression
get_roc(y_test, predictions)
get_prec_recall(y_test, predictions)

"""Logistic Regression Model evaluation based on K-fold cross-validation using cross_validate() function"""
from sklearn.model_selection import cross_validate
scoring = {'accuracy': 'accuracy', 'log_loss': 'neg_log_loss', 'auc': 'roc_auc'}
results = cross_validate(logmodel, X, y, cv=10, scoring=list(scoring.values()), return_train_score=False)
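# A short sketch of reading the cross_validate() output above; the result
# keys follow sklearn's 'test_<scorer>' naming for the scorers passed in:
for name, key in zip(scoring.keys(), ['test_accuracy', 'test_neg_log_loss', 'test_roc_auc']):
    print('%s: %.4f' % (name, results[key].mean()))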
# optimizer = Nadam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
# optimizer = Lookahead(optimizer=optimizer, k=10, alpha=0.5)
scheduler = None
# scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=1e3,
#                                                 max_lr=1e-2, epochs=EPOCHS, steps_per_epoch=len(train_loader))
# loss_fn = nn.BCEWithLogitsLoss()
loss_fn = SmoothBCEwLogits(smoothing=0.005)

model_weights = f"{CACHE_PATH}/online_model{_fold}.pth"
es = EarlyStopping(patience=EARLYSTOP_NUM, mode="max")
for epoch in range(EPOCHS):
    train_loss = train_fn(model, optimizer, scheduler, loss_fn, train_loader, device)
    valid_pred = inference_fn(model, valid_loader, device)
    valid_auc = roc_auc_score(valid[target_cols].values, valid_pred)
    valid_logloss = log_loss(valid[target_cols].values, valid_pred)
    valid_pred = np.median(valid_pred, axis=1)
    valid_pred = np.where(valid_pred >= 0.5, 1, 0).astype(int)
    valid_u_score = utility_score_bincount(date=valid.date.values,
                                           weight=valid.weight.values,
                                           resp=valid.resp.values,
                                           action=valid_pred)
    print(f"FOLD{_fold} EPOCH:{epoch:3} train_loss={train_loss:.5f} "
          f"valid_u_score={valid_u_score:.5f} valid_auc={valid_auc:.5f} "
          f"valid_logloss={valid_logloss:.5f} "
          f"time: {(time.time() - start_time) / 60:.2f}min")
    es(valid_auc, model, model_path=model_weights)
    # It would probably be better to early-stop on train_loss or the utility
    # score rather than on AUC
    if es.early_stop:
        print("Early stopping")
        break
# torch.save(model.state_dict(), model_weights)
if True:
    valid_pred = np.zeros((len(valid), len(target_cols)))
    for _fold in range(NFOLDS):
def compare_models(X_train, y_train):
    # Split between train and cross-validation sets
    X_train, X_cv, y_train, y_cv = train_test_split(X_train, y_train, test_size=0.2, random_state=4)

    ### KNN
    K = 10
    accuracy = np.zeros((K - 1))
    for n in range(1, K):
        # Train model
        KNN = KNeighborsClassifier(n_neighbors=n)
        KNN.fit(X_train, y_train)
        # Predict
        yhat_KNN = KNN.predict(X_cv)
        accuracy[n - 1] = metrics.accuracy_score(y_cv, yhat_KNN)
    # Display results
    # plt.plot(range(1,K),accuracy,'g')
    # plt.legend('Accuracy')
    # plt.ylabel('Accuracy ')
    # plt.xlabel('Number of Neighbours (K)')
    # plt.tight_layout()
    # plt.show()
    # print( "KNearestNeighbour's accuracy (with k =", accuracy.argmax()+1, ") :", accuracy.max())

    # Train model with the best k (X_test and y_test are assumed globals here)
    k_KNN = accuracy.argmax() + 1
    KNN = KNeighborsClassifier(n_neighbors=k_KNN)
    KNN.fit(X_train, y_train)
    yhat_KNN = KNN.predict(X_test)
    Jaccard_KNN = metrics.jaccard_score(y_test, yhat_KNN, pos_label='PAIDOFF')
    F1Score_KNN = f1_score(y_test, yhat_KNN, average='weighted')
    KNN_validity = sum(yhat_KNN != 'PAIDOFF') / len(yhat_KNN)
    if KNN_validity < 0.1:
        KNN_validity = False
    else:
        KNN_validity = True
    print("KNN\n", (classification_report(y_test, yhat_KNN)))

    ### Decision Tree
    K = 20
    accuracy = np.zeros((K - 3))
    for depth in range(3, K):
        # Train model
        LoanTree = DecisionTreeClassifier(criterion="entropy", max_depth=depth)
        LoanTree.fit(X_train, y_train)
        # Predict
        yhat_Tree = LoanTree.predict(X_cv)
        accuracy[depth - 3] = metrics.accuracy_score(y_cv, yhat_Tree)
    # Display results
    # plt.plot(range(3,K),accuracy,'g')
    # plt.legend('Accuracy')
    # plt.ylabel('Accuracy ')
    # plt.xlabel('Max depth')
    # plt.tight_layout()
    # plt.show()
    # print( "Decision Tree's accuracy (with max depth =", accuracy.argmax()+3, ") :", accuracy.max())

    # Train model with the best max_depth (the depth grid starts at 3,
    # so the best depth is argmax + 3, not argmax + 1)
    max_depth = accuracy.argmax() + 3
    LoanTree = DecisionTreeClassifier(criterion="entropy", max_depth=max_depth)
    LoanTree.fit(X_train, y_train)
    yhat_Tree = LoanTree.predict(X_test)
    Jaccard_Tree = metrics.jaccard_score(y_test, yhat_Tree, pos_label='PAIDOFF')
    F1Score_Tree = f1_score(y_test, yhat_Tree, average='weighted')
    Tree_validity = sum(yhat_Tree != 'PAIDOFF') / len(yhat_Tree)
    if Tree_validity < 0.1:
        Tree_validity = False
    else:
        Tree_validity = True
    print("Decision Tree\n", classification_report(y_test, yhat_Tree))

    ### SVM
    # Train model
    SVM = svm.SVC()
    param_grid_SVM = [{'C': [0.01, 0.1, 0.3, 1, 10], 'gamma': [0.001],
                       'kernel': ['linear', 'rbf', 'sigmoid']}]
    gridSVM = GridSearchCV(SVM, param_grid=param_grid_SVM, cv=5)
    gridSVM.fit(X_train, y_train)
    #print('Accuracy :', gridSVM.best_score_)
    SVM_params = gridSVM.best_params_
    #print('Best parameters :', gridSVM.best_params_)
    # Train model with best parameters
    SVM = svm.SVC(C=SVM_params['C'], gamma=SVM_params['gamma'], kernel=SVM_params['kernel'])
    SVM.fit(X_train, y_train)
    yhat_SVM = SVM.predict(X_test)
    Jaccard_SVM = jaccard_score(y_test, yhat_SVM, pos_label='PAIDOFF')
    F1Score_SVM = f1_score(y_test, yhat_SVM, average='weighted')
    SVM_validity = sum(yhat_SVM != 'PAIDOFF') / len(yhat_SVM)
    if SVM_validity < 0.1:
        SVM_validity = False
    else:
        SVM_validity = True
    print("SVM\n", classification_report(y_test, yhat_SVM))

    ### Logistic regression
    reg_parameter = [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1]
    accuracy = np.zeros(len(reg_parameter))
    for i, c in enumerate(reg_parameter):
        # Train model
        LR = LogisticRegression(C=c, solver='liblinear')
        LR.fit(X_train, y_train)
        # Predict
        yhat_LR = LR.predict(X_cv)
        accuracy[i] = metrics.accuracy_score(y_cv, yhat_LR)
    # Display results
    # plt.semilogx(reg_parameter,accuracy,'g')
    # plt.legend('Accuracy')
    # plt.ylabel('Accuracy ')
    # plt.xlabel('C')
    # plt.tight_layout()
    # plt.show()
    # print( "Logistic regression's accuracy (with C =", reg_parameter[accuracy.argmax()], ") :", accuracy.max())

    # Train model with the best C
    C = reg_parameter[accuracy.argmax()]
    LR = LogisticRegression(C=C, solver='liblinear')
    LR.fit(X_train, y_train)
    yhat_LR = LR.predict(X_test)
    yhat_prob = LR.predict_proba(X_test)
    Jaccard_LR = jaccard_score(y_test, yhat_LR, pos_label='PAIDOFF')
    F1Score_LR = f1_score(y_test, yhat_LR, average='weighted')
    Log_LR = log_loss(y_test, yhat_prob)
    LR_validity = sum(yhat_LR != 'PAIDOFF') / len(yhat_LR)
    if LR_validity < 0.1:
        LR_validity = False
    else:
        LR_validity = True
    print("Logistic regression\n", classification_report(y_test, yhat_LR))

    #####################
    ### Final results ###
    #####################
    Table = pd.DataFrame()
    Table['Algorithm'] = ["KNN", "Decision Tree", "SVM", "Logistic Regression"]
    Table['Jaccard'] = [Jaccard_KNN, Jaccard_Tree, Jaccard_SVM, Jaccard_LR]
    Table['F1-score'] = [F1Score_KNN, F1Score_Tree, F1Score_SVM, F1Score_LR]
    Table['Log Loss'] = ["NA", "NA", "NA", Log_LR]
    Table['Valid'] = [KNN_validity, Tree_validity, SVM_validity, LR_validity]
    print("Results table\n", Table.head())
    return Table
# Offline scoring
pre_score = model.predict(test_all, num_iteration=model.best_iteration)
#pre_score = model.predict(test_all)
score_data = test_all.copy()
score_data['label'] = pre_score
score_data = score_data[["order_id", "product_id", "label"]].copy()
# Save the fold predictions for stacking later on
if st == 0:
    stacking_data = score_data
else:
    stacking_data = stacking_data.append(score_data)
st += 1
#test_all[:10000].to_csv("save_little_2_%s.csv"%st)
#pd.DataFrame(y_test).to_csv("save_y_2_%s.csv"%st)
logloss = log_loss(y_test, pre_score)
logloss_list.append(logloss)
print(logloss_list)
pred = model.predict(test_data, num_iteration=model.best_iteration)
preds[:, j] = pred
j += 1
del model
gc.collect()
stacking_data.to_csv("stacking_data_v12_shuff_10000.csv", index=None)
with open("score_note.txt", "a") as f:
    f.write(str(train_x.shape[1]) + "\n" + str(score_list) + "=====>" +
def train(self, data, labels, epochs=30, cv_split_num=None, validation_data=None, savebest=False, filepath=None):
    """
    train network on given data

    parameters:
      - data: numpy array
        2d numpy array (doc x word ids) of input data
      - labels: numpy array
        2d numpy array of one-hot-encoded labels
      - epochs: int (default: 30)
        number of epochs to train for
      - cv_split_num: int (optional)
        index of the current cross-validation split, recorded in the results csv
      - validation_data: tuple (optional)
        tuple of numpy arrays (X,y) representing validation data
      - savebest: boolean (default: False)
        set to True to save the best model based on validation score per epoch
      - filepath: string (optional)
        path to save model if savebest is set to True

    outputs:
        None
    """
    if savebest == True and filepath == None:
        raise Exception("Please enter a path to save the network")

    if validation_data:
        validation_size = len(validation_data[0])
    else:
        validation_size = len(data)

    print('training network on %i documents, validating on %i documents' \
          % (len(data), validation_size))

    # Removing: with self.sess as sess:
    #with self.sess as sess:
    # Output directory for models and summaries
    timestamp = str(int(time.time()))
    # Track best model for saving.
    prevbest = 0
    for i in range(epochs):
        # TODO FEATURE Add gathering of stats for confusion matrix.
        correct = 0
        y_pred = []
        y_true = []
        start = time.time()
        # Train.
        counter = 0
        for doc in range(len(data)):
            counter += 1
            inputval = self._list_to_numpy(data[doc])
            feed_dict = {self.doc_input: inputval,
                         self.labels: labels[doc],
                         self.dropout: self.dropout_keep}
            pred, cost, _ = self.sess.run(
                [self.prediction, self.loss, self.optimizer],
                feed_dict=feed_dict)
            # Collect raw stats for calculating metrics.
            if np.argmax(pred) == np.argmax(labels[doc]):
                correct += 1
            # Collect predictions for calculating metrics with sklearn:
            # insert each prediction at the same index as its label in the
            # y_true array.
            y_pred.insert(doc, np.argmax(pred))
            y_true.insert(doc, np.argmax(labels[doc]))
            sys.stdout.write("epoch %i, sample %i of %i, loss: %f \r"
                             % (i + 1, doc + 1, len(data), cost))
            sys.stdout.flush()
            if (doc + 1) % 50000 == 0:
                score = self.score(validation_data[0], validation_data[1])
                print("iteration %i validation accuracy: %.4f%%" % (doc + 1, score * 100))
        print()
        # print("training time: %.2f" % (time.time()-start))
        trainscore = correct / len(data)
        print("epoch %i (Gao's) training accuracy: %.4f" % (i + 1, trainscore))

        # Log metrics per epoch.
        # TODO Print a clean, well-organized report.
        # (logging.debug(print(...)) would log None, since print returns
        # None; pass the values to the logger directly instead.)
        logging.debug('correct: %s', correct)
        logging.debug('total: %s', counter)
        # Get results from confusion matrix.
        conf_matrix_arr = confusion_matrix(y_true, y_pred)
        logging.debug(conf_matrix_arr)
        TP = conf_matrix_arr[1][1]
        FP = conf_matrix_arr[0][1]
        TN = conf_matrix_arr[0][0]
        FN = conf_matrix_arr[1][0]
        logging.debug(classification_report(y_true, y_pred))
        logging.debug('accuracy: %s', accuracy_score(y_true, y_pred))
        logging.debug('precision: %s', precision_score(y_true, y_pred))
        logging.debug('recall: %s', recall_score(y_true, y_pred))
        logging.debug('f1: %s', f1_score(y_true, y_pred))
        # Note: feeding hard 0/1 predictions to log_loss treats them as
        # probabilities, so every error is maximally penalized.
        logging.debug('log loss: %s', log_loss(y_true, y_pred))

        # Validate.
        if validation_data:
            score = self.score(validation_data[0], validation_data[1])
            print("epoch %i validation accuracy: %.4f%%" % (i + 1, score * 100))

        # Write results to file.
        results_dir = Path('results') / str(self.run_id)
        # Create directory for run within results directory.
        try:
            os.makedirs(results_dir)
        except FileExistsError:
            logging.info('Run directory already exists.')
        # Build path to which to write results.
results_filename = str(self.run_id) + '.csv' results_path = results_dir / results_filename # Check for existence of csv to determine if header is needed. results_file_exists = os.path.isfile(results_path) # Open file, write/append results. with open(str(results_path), mode='a') as csv_file: fieldnames = [ 'cv_split', 'epoch', 'num_recs', 'tp', 'fp', 'tn', 'fn', 'skl_acc', 'skl_prec', 'skl_recall', 'skl_f1', 'skl_f1_micro_avg', 'skl_f1_macro_avg', 'skl_f1_weighted_avg', 'skl_log_loss', 'skl_auc', 'gao_train_acc', 'gao_val_acc' ] writer = csv.DictWriter(csv_file, fieldnames=fieldnames) # Write header only if results csv did not exist at beginning # of this trip through. if not results_file_exists: writer.writeheader() # Write row for each epoch. csv_row = { 'cv_split': cv_split_num, 'epoch': i + 1, 'num_recs': counter, 'tp': TP, 'fp': FP, 'tn': TN, 'fn': FN, 'skl_acc': accuracy_score(y_true, y_pred), 'skl_prec': precision_score(y_true, y_pred), 'skl_recall': recall_score(y_true, y_pred), 'skl_f1': f1_score(y_true, y_pred), 'skl_f1_micro_avg': f1_score(y_true, y_pred, average='micro'), 'skl_f1_macro_avg': f1_score(y_true, y_pred, average='macro'), 'skl_f1_weighted_avg': f1_score(y_true, y_pred, average='weighted'), 'skl_log_loss': log_loss(y_true, y_pred), 'skl_auc': roc_auc_score(y_true, y_pred), 'gao_train_acc': trainscore, 'gao_val_acc': score } writer.writerow(csv_row) # Plot ROC Curve and AUC score for last epoch. if i == epochs - 1: fpr, tpr, thresholds = roc_curve(y_true, y_pred) auc = roc_auc_score(y_true, y_pred) plt.clf() plt.plot(fpr, tpr, 'r-', label='CNN: %.3f' % auc) plt.plot([0, 1], [0, 1], 'k-', label='Random') plt.plot([0, 0, 1, 1], [0, 1, 1, 1], 'g-', label='Perfect') plt.legend() plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') fig_filename = 'ROC_CV_Split_' + str(cv_split_num) fig_path = results_dir / fig_filename plt.savefig(fig_path) # Save if performance better than previous best. if savebest and score >= prevbest: prevbest = score self.save(filepath)
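# Several of the metrics above (skl_log_loss, skl_auc, the ROC curve) are fed
# hard argmax labels rather than class probabilities, which makes log loss an
# upper bound and the ROC curve degenerate. A tiny self-contained illustration
# of the difference, with made-up values:
from sklearn.metrics import log_loss

y_true = [0, 1, 1]
hard = [0, 1, 0]         # hard labels act as 0/1 "probabilities"
soft = [0.2, 0.9, 0.4]   # calibrated probabilities for the same predictions
print('hard-label log loss:  %.4f' % log_loss(y_true, hard))
print('probability log loss: %.4f' % log_loss(y_true, soft))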
def train_linear_classifier(learning_rate, steps, batch_size, df_train, df_validate, features, target, threshold):
    """Trains a linear classification model.

    In addition to training, this function also prints training progress
    information, a plot of the training and validation loss over time, and a
    confusion matrix.

    Args:
      learning_rate: An `int`, the learning rate to use.
      steps: A non-zero `int`, the total number of training steps. A training
        step consists of a forward and backward pass using a single batch.
      batch_size: A non-zero `int`, the batch size.
      df_train: A `DataFrame` containing the training features and labels.
      df_validate: A `DataFrame` containing the validation features and labels.
      features: A list of the feature column names.
      target: The name of the label column.
      threshold: A `float`, the classification threshold used for the error metric.

    Returns:
      The trained `LinearClassifier` object.
    """
    periods = 10
    steps_per_period = steps / periods

    # prepare features and targets
    train_features = df_train[features]
    train_targets = df_train[target]
    validate_features = df_validate[features]
    validate_targets = df_validate[target]

    # create the input functions.
    train_fn = lambda: train_input_fn(features=train_features, targets=train_targets,
                                      batch_size=batch_size, shuffle=True, num_epochs=None)
    train_pred_fn = lambda: train_input_fn(features=train_features, targets=train_targets,
                                           batch_size=1, shuffle=False, num_epochs=1)
    validate_pred_fn = lambda: train_input_fn(features=validate_features, targets=validate_targets,
                                              batch_size=1, shuffle=False, num_epochs=1)

    # Create a LinearClassifier object.
    my_optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
    classifier = tf.estimator.LinearClassifier(
        feature_columns=construct_feature_columns(train_features),
        optimizer=my_optimizer,
        config=tf.estimator.RunConfig(keep_checkpoint_max=1))

    # Train the model, but do so inside a loop so that we can periodically
    # assess loss metrics.
    print("Training model...")
    train_validate_metrics = pd.DataFrame()
    for period in range(0, periods):
        # Train the model, starting from the prior state.
        classifier.train(input_fn=train_fn, steps=steps_per_period)
        # Take a break and compute probabilities.
        train_pred = list(classifier.predict(input_fn=train_pred_fn))
        train_prob = np.array([item['probabilities'] for item in train_pred])
        validate_pred = list(classifier.predict(input_fn=validate_pred_fn))
        validate_prob = np.array([item['probabilities'] for item in validate_pred])
        # Compute training and validation errors.
        train_metrics = {
            'train-logloss': [metrics.log_loss(train_targets, train_prob)],
            'test-logloss': [metrics.log_loss(validate_targets, validate_prob)],
            'train-error': [calc_err_at_threshold([p[1] for p in train_prob], train_targets, threshold)],
            'test-error': [calc_err_at_threshold([p[1] for p in validate_prob], validate_targets, threshold)],
        }
        # Occasionally print the current loss.
        print("  period %02d (%d samples): LogLoss: %0.2f/%0.2f, Error: %0.2f/%0.2f"
              % (period, (period + 1) * steps_per_period * batch_size,
                 train_metrics['train-logloss'][0], train_metrics['test-logloss'][0],
                 train_metrics['train-error'][0], train_metrics['test-error'][0]))
        # Add the loss metrics from this period to our list.
        train_validate_metrics = train_validate_metrics.append(train_metrics, ignore_index=True)
    print("Model training finished.")

    # Remove event files to save disk space.
    _ = map(os.remove,
            glob.glob(os.path.join(classifier.model_dir, 'events.out.tfevents*')))

    # Output a graph of loss metrics over periods.
    plt.figure(figsize=(10, 5))
    plt.subplot(1, 2, 1)
    plt.ylabel("LogLoss")
    plt.xlabel("Periods")
    plt.title("LogLoss vs. Periods")
    plt.plot(list(train_validate_metrics['train-logloss']), label="training")
    plt.plot(list(train_validate_metrics['test-logloss']), label="validation")
    plt.legend()

    # Output a graph of error metrics over periods.
    plt.subplot(1, 2, 2)
    plt.ylabel("Error")
    plt.xlabel("Periods")
    plt.title("Error vs. Periods")
    plt.plot(list(train_validate_metrics['train-error']), label="training")
    plt.plot(list(train_validate_metrics['test-error']), label="validation")
    plt.legend()
    plt.tight_layout()

    return classifier
# Let's look at what Hydra sees
y_valid_preds_df = hydra_model_df.predict(prepare_input_data(hydra_model_df, X_valid), verbose=0)
base_class = np.argmax(y_valid, axis=1)
preds = np.argmax(y_valid_preds_df, axis=1)
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
sns.heatmap(pd.DataFrame(confusion_matrix(base_class, preds)), annot=True, linewidths=.5, fmt="d")
print("f1: {:0.6f} log loss: {:0.6f}".format(
    f1_score(base_class, preds, average='macro'),
    log_loss(y_valid, y_valid_preds_df)))
print(timer(start_time))
sub_preds_df = hydra_model_df.predict(prepare_input_data(hydra_model_df, test), verbose=0)
predictions_df = pd.DataFrame(sub_preds_df, columns=["Class_1", "Class_2", "Class_3", "Class_4"])
blend_l1 = pd.read_csv("/kaggle/input/tps05blender-v2/tps05-remek-blender_v2.csv")
output = predictions_df.copy()
output["Class_1"] = (predictions_df.Class_1 * 0.3 + blend_l1.Class_1 * 0.7)
output["Class_2"] = (predictions_df.Class_2 * 0.3 + blend_l1.Class_2 * 0.7)
output["Class_3"] = (predictions_df.Class_3 * 0.3 + blend_l1.Class_3 * 0.7)
output["Class_4"] = (predictions_df.Class_4 * 0.3 + blend_l1.Class_4 * 0.7)
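# Since every class column gets the same 0.3/0.7 convex blend, the blended
# rows should still sum to 1 when both inputs do. A quick sanity check on the
# output above (a sketch, assuming the four Class_* columns):
row_sums = output[["Class_1", "Class_2", "Class_3", "Class_4"]].sum(axis=1)
print(row_sums.describe())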
if (log < min):
    min = log
    a = (n, learning_rate)
test_loss = np.hstack((test_loss, np.array([log])))
y_pred_gen = gbc.staged_decision_function(X_train)
for i in y_pred_gen:
    #print(i)
    # sigmoid of the staged decision function gives class probabilities
    y_pred = 1 / (1 + np.exp(-i))
    train_loss = np.hstack((train_loss, np.array([log_loss(y_train, y_pred)])))

import matplotlib.pyplot as plt
#%matplotlib inline
plt.figure()
plt.plot(test_loss, 'r', linewidth=2)
plt.plot(train_loss, 'g', linewidth=2)
plt.legend(['test', 'train'])
'''
clf = RandomForestClassifier(n_estimators=37, random_state=241)
clf.fit(X_train, y_train)
predictions = clf.predict_proba(X_test)
# predict_proba already returns probabilities, so no sigmoid is needed here
print(log_loss(y_test, predictions))
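# sklearn's GradientBoostingClassifier also exposes staged_predict_proba,
# which avoids applying the sigmoid to the staged decision function by hand.
# A one-line sketch of the same training-loss curve (assumes gbc, X_train,
# y_train as above):
train_loss = [log_loss(y_train, p) for p in gbc.staged_predict_proba(X_train)]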
def buildKB15(): ## data # read the training/test data print('Importing Data') xtrain = pd.read_csv('../input/train.csv') xtest = pd.read_csv('../input/test.csv') xtrain.fillna(-1, inplace=True) xtest.fillna(-1, inplace=True) # separate id_train = xtrain.ID xtrain.drop('ID', axis=1, inplace=True) ytrain = xtrain.target xtrain.drop('target', axis=1, inplace=True) id_test = xtest.ID xtest.drop('ID', axis=1, inplace=True) # drop v22 - categorical with 18211 possible values xtrain.drop('v22', axis=1, inplace=True) xtest.drop('v22', axis=1, inplace=True) # folds for cv xfolds = pd.read_csv('../input/xfolds.csv') fold_index = xfolds.fold5 fold_index = np.array(fold_index) - 1 n_folds = len(np.unique(fold_index)) ## processing # identify columns classes categorical_cols = [ f for f in xtrain.columns if xtrain[f].dtype not in ['float64', 'int64'] ] numerical_cols = [ f for f in xtrain.columns if xtrain[f].dtype in ['float64'] ] # number of unique values # headcounts = [len(np.unique(xtrain[f])) for f in categorical_cols] # convert all categoricals: expand into binary indicators, use as features # fed into NaiveBayes, drop the original for col in categorical_cols: print(col) newname = 'nb_' + col # transform the joint set into dummies xloc = pd.concat((xtrain[col], xtest[col]), axis=0, ignore_index=True) xloc = pd.get_dummies(xloc) # separate back into training and test xtr = xloc.ix[range(0, xtrain.shape[0])] xte = xloc.ix[range(xtrain.shape[0], xloc.shape[0])] # storage vector for the new features (train and test) newvar_train = np.zeros((xtrain.shape[0])) # build a stacked version along the training set for j in range(0, n_folds): idx0 = np.where(fold_index != j) idx1 = np.where(fold_index == j) x0 = np.array(xtr)[idx0, :][0] x1 = np.array(xtr)[idx1, :][0] y0 = np.array(ytrain)[idx0] y1 = np.array(ytrain)[idx1] nb = BernoulliNB() nb.fit(x0, y0) newvar_train[idx1] = nb.predict_proba(x1)[:, 1] print(log_loss(y1, newvar_train[idx1])) # build a stacked version along the test set nb.fit(xtr, ytrain) newvar_test = nb.predict_proba(xte)[:, 1] # park into training and test sets xtrain[newname] = newvar_train xtest[newname] = newvar_test xtrain.drop(col, axis=1, inplace=True) xtest.drop(col, axis=1, inplace=True) ## store the results # add indices etc xtrain = pd.DataFrame(xtrain) xtrain['ID'] = id_train xtrain['target'] = ytrain # xtest = pd.DataFrame(xtest) xtest['ID'] = id_test # # # # save the files xtrain.to_csv('../input/xtrain_kb15.csv', index=False, header=True) xtest.to_csv('../input/xtest_kb15.csv', index=False, header=True) return
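# The per-column loop above is out-of-fold target encoding: fit a model on
# k-1 folds, predict the held-out fold, and use the predictions as a new
# feature. A compact sketch of the same idea with cross_val_predict, ignoring
# the custom fold_index used above (an equivalent formulation, not the
# original code; xtr and ytrain as in the loop):
from sklearn.model_selection import cross_val_predict
from sklearn.naive_bayes import BernoulliNB

oof_feature = cross_val_predict(BernoulliNB(), xtr, ytrain, cv=5,
                                method='predict_proba')[:, 1]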
from sklearn.externals import joblib def load_train_data(): train = pd.read_csv('train.csv') labels = train.target.values lbl_enc = preprocessing.LabelEncoder() labels = lbl_enc.fit_transform(labels) train = train.drop('id', axis=1) train = train.drop('target', axis=1) return train.values, labels.astype('int32') def train_model_random_forest(train, labels): # train a random forest classifier model = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=1000) model.fit(train, labels) joblib.dump(model, 'rf_model2.model') return model X, y = load_train_data() X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3253) model = train_model_random_forest(X_train, y_train) preds = model.predict_proba(X_test) print "MLogloss: ", log_loss(y_test, preds)
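# Reloading the persisted forest later with joblib (sklearn.externals.joblib
# was the idiom at the time; modern sklearn uses the standalone joblib package
# instead). A minimal sketch under the same file name and test split:
model = joblib.load('rf_model2.model')
preds = model.predict_proba(X_test)
print("MLogloss (reloaded): %f" % log_loss(y_test, preds))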
y_train, y_valid = y[train_index], y[valid_index] if count == 0: actual = y_valid else: actual = np.append(actual, y_valid, axis=0) list_result = Parallel(n_jobs=6)(delayed(train_class)(X_train, y_train, X_valid, i) for i in range(num_classes)) preds_fold = pd.concat(list_result, axis = 1) if count == 0: preds_epoch = preds_fold.copy() else: preds_epoch = preds_epoch.append(preds_fold, ignore_index=True) count += 1 print "logloss", log_loss(actual, preds_epoch.as_matrix()) if cv == 0: preds_epoch['id'] = ids_test.astype(float).astype(int) preds_epoch.to_csv('../data/output-py/test_raw/' + os.path.splitext(pred_file)[0] + '.epoch' + str(e) + '.csv', index=False) preds_epoch = preds_epoch.drop('id', axis=1) else: preds_epoch['id'] = ids_train_folds.astype(float).astype(int) preds_epoch.to_csv('../data/output-py/train_raw/' + os.path.splitext(pred_file)[0] + '.epoch' + str(e) + '.csv', index=False) preds_epoch = preds_epoch.drop('id', axis=1) if e == 0: preds = preds_epoch.copy() else: preds = preds.add(preds_epoch, fill_value=0) if cv == 1: preds_epoch = preds.copy()