def test_auc(self):
    self.assertAlmostEqual(metrics.auc([1, 0, 1, 1], [.32, .52, .26, .86]), 1.0 / 3)
    self.assertAlmostEqual(
        metrics.auc([1, 0, 1, 0, 1], [.9, .1, .8, .1, .7]), 1)
    self.assertAlmostEqual(metrics.auc([0, 1, 1, 0], [.2, .1, .3, .4]), 1.0 / 4)
    self.assertAlmostEqual(
        metrics.auc([1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
                    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), 1.0 / 2)
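The toy vectors above also make a handy sanity check outside the test suite. A minimal sketch, assuming both ml_metrics and scikit-learn are installed (scikit-learn already appears in several snippets below); on binary labels the two implementations should agree:

# Hedged sketch: cross-check ml_metrics.auc against scikit-learn's ROC AUC.
import ml_metrics
from sklearn.metrics import roc_auc_score

actual = [1, 0, 1, 1]
posterior = [.32, .52, .26, .86]

print(ml_metrics.auc(actual, posterior))   # 0.3333...
print(roc_auc_score(actual, posterior))    # should give the same value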
def run(self, hyper_classifier, training_data, training_target, testing_data, testing_target):
    ''' TODO DOCUMENTATION '''
    results = {'name': self.name,
               'parameterization': self.parameterization,
               'exception': None}
    try:
        self.classifier = hyper_classifier.make_classifier(training_data, training_target,
                                                           **self.parameterization)
        self.classifier.fit(training_data, training_target)
        results['predicted'] = self.classifier.predict(testing_data)
    except MemoryError as e:
        raise e
    except Exception as e:
        print(repr(e))
        results['exception'] = e
    else:
        # attempt to save memory
        del(self.classifier)
        self.classifier = None
        results['ml_metric_ce'] = ml_metrics.ce(testing_target, results['predicted'])
        results['ml_metric_rmse'] = ml_metrics.rmse(testing_target, results['predicted'])
        results['sklearn_metric_accuracy'] = sklearn.metrics.accuracy_score(testing_target, results['predicted'])
        results['sklearn_metric_f1'] = sklearn.metrics.f1_score(testing_target, results['predicted'])
        results['sklearn_metric_precision'] = sklearn.metrics.precision_score(testing_target, results['predicted'])
        results['sklearn_metric_recall'] = sklearn.metrics.recall_score(testing_target, results['predicted'])
        results['ml_metric_auc'] = {}
        results['sklearn_metric_auc'] = {}
        for label in set(testing_target):
            binary_testing_target = np.array(map(lambda x: 1 if x == label else 0, testing_target))
            binary_predicted = np.array(map(lambda x: 1 if x == label else 0, results['predicted']))
            results['ml_metric_auc'][label] = ml_metrics.auc(binary_testing_target, binary_predicted)
            results['sklearn_metric_auc'][label] = sklearn.metrics.auc_score(binary_testing_target, binary_predicted)
    return results
def score():
    gold = pandas.read_table(insults.DataFile('Inputs', 'test_with_solutions.csv'), sep=',')
    private = gold[gold.Usage == 'PrivateTest'].Insult
    public = gold[gold.Usage == 'PublicTest'].Insult
    data = []
    for fn in os.listdir(insults.DataDirectory('Submissions')):
        if fn[-4:] == ".csv":
            guess = pandas.read_table(insults.DataFile('submissions', fn), sep=',')
            pub_guess = guess.Insult[public.index]
            priv_guess = guess.Insult[private.index]
            data.append({"fn": fn[:-4],
                         "score": ml_metrics.auc(gold.Insult, guess.Insult),
                         "public": ml_metrics.auc(np.array(public), np.array(pub_guess)),
                         "private": ml_metrics.auc(np.array(private), np.array(priv_guess)),
                         })
    print pandas.DataFrame(data, columns=("fn", "score", "public", "private")).sort('score')
def score(pred, y):
    '''Score the final test results. The scoring criterion differs from task to
    task, so this needs to be changed each time.'''
    print(y, pred)
    metric = metrics.auc(y, pred)
    print(metric)
    return -metric
def first_test():
    from ml_metrics import auc
    import random
    from sklearn import datasets

    b = BasicLogisticRegression(4)
    iris = datasets.load_iris()
    train_data = iris.data[:75]
    train_y = iris.target[:75]
    test_x = iris.data[75:100]
    tmp = iris.target[:100]
    random.shuffle(tmp)
    test_y = tmp[:50]

    def to_dict(x):
        return {i: k for i, k in enumerate(x, start=1)}

    for z in xrange(50):
        # random.shuffle works in place and returns None, so shuffle first,
        # then iterate over the shuffled pairs.
        pairs = zip(train_data, train_y)
        random.shuffle(pairs)
        for x, y in pairs:
            # print x, y
            b.sgd_fit_one(to_dict(x), y)
    print "fit done"

    rst_y = map(b.predict_raw, map(to_dict, test_x))
    print b.weights
    print test_y
    print rst_y
    print auc(test_y, rst_y)
    # print len(iris.data)

    # another implementation
    from sgd import log_reg_sgd, h
    theta, err = log_reg_sgd(train_data, train_y, 0.001, max_iter=100)
    pred = [h(i, theta) for i in test_x]
    print "theta,", theta
    print auc(test_y, pred)
def staged_auc(self, X, y):
    """
    Calculate the AUC after each of the stages.

    returns: ns   -- list of iteration numbers
             aucs -- list of corresponding areas under the curve.
    """
    y = np.array(y)
    results = [(n, ml_metrics.auc(y, p)) for n, p in self.staged_predict(X)]
    return zip(*results)  # Python idiom unzips list into two parallel ones.
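A hypothetical usage sketch for the method above (the fitted classifier `clf` and the held-out `X_test`/`y_test` names are assumptions, not from the original project): the two parallel lists it returns can be plotted to see where the held-out AUC flattens out.

import matplotlib.pyplot as plt

# Illustrative only: assumes `clf` is a fitted model exposing staged_auc()
# as defined above, and that X_test / y_test are a held-out split.
ns, aucs = clf.staged_auc(X_test, y_test)
plt.plot(ns, aucs)
plt.xlabel('iteration')
plt.ylabel('held-out AUC')
plt.show()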
def score():
    gold = pandas.read_table(insults.DataFile("Inputs", "test_with_solutions.csv"), sep=",")
    private = gold[gold.Usage == "PrivateTest"].Insult
    public = gold[gold.Usage == "PublicTest"].Insult
    data = []
    for fn in os.listdir(insults.DataDirectory("Submissions")):
        if fn[-4:] == ".csv":
            guess = pandas.read_table(insults.DataFile("submissions", fn), sep=",")
            pub_guess = guess.Insult[public.index]
            priv_guess = guess.Insult[private.index]
            data.append(
                {
                    "fn": fn[:-4],
                    "score": ml_metrics.auc(gold.Insult, guess.Insult),
                    "public": ml_metrics.auc(np.array(public), np.array(pub_guess)),
                    "private": ml_metrics.auc(np.array(private), np.array(priv_guess)),
                }
            )
    print pandas.DataFrame(data, columns=("fn", "score", "public", "private")).sort("score")
def objective(df, selector, trial):
    selector.set_trial(trial)
    # NOTE: Use validation set for practical usage.
    df_trn, df_tst = train_test_split(df, test_size=0.5)
    X_trn = selector.fit_transform(df_trn).values
    X_tst = selector.transform(df_tst).values
    model = LogisticRegression(solver="lbfgs", max_iter=10000)
    model.fit(X_trn, df_trn["target"])
    y_pred = model.predict_proba(X_tst)[:, 1]
    score = auc(df_tst["target"].values, y_pred)
    return score
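The objective above takes an extra `trial` argument, which suggests an Optuna-style search. A hedged driver sketch, assuming Optuna is the optimizer in use and that `df` and `selector` are already constructed as in the calling project:

import functools

import optuna

# Illustrative only: binds df and selector so the study passes in the trial.
study = optuna.create_study(direction="maximize")
study.optimize(functools.partial(objective, df, selector), n_trials=20)
print(study.best_value, study.best_params)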
def main():
    markdown = PagedownToHtml()

    print("Reading the private leaderboard file")
    test = data_io.get_test_df()
    for i in test.index:
        test["BodyMarkdown"][i] = markdown.convert(test["BodyMarkdown"][i])

    print("Loading the trained model")
    classifier = data_io.load_model("model.pickle")

    print("Making predictions")
    probs = classifier.predict_proba(test)

    solution = data_io.get_private_leaderboard_solution_df()
    print("Open AUC: %0.6f" % metrics.auc(solution["open"], probs[:, 1]))
def tune_one_fold(i, train_i, test_i):
    """
    Tune one fold of the data.
    """
    global train
    clf = make_clf(args)
    ftrain = train[train_i]
    logging.info('fold %d' % i)
    clf.fit(ftrain.Comment, ftrain.Insult)
    ypred = clf.predict(ftrain.Comment)
    logging.info("%d train auc=%f" % (i, ml_metrics.auc(np.array(ftrain.Insult), ypred)))
    ypred = clf.predict(train[test_i].Comment)
    # record information about the auc at each stage of training.
    xs, ys = clf.staged_auc(train[test_i].Comment, train[test_i].Insult)
    xs = np.array(xs)
    ys = np.array(ys)
    return pandas.DataFrame({('auc%d' % i): ys}, index=xs)
def classification_model(model, m, predictors, outcome):
    # Fit the model:
    model.fit(m[predictors], m[outcome])

    # Make predictions on training set:
    predictions = model.predict(m[predictors])

    # Print accuracy
    accuracy = metrics.accuracy_score(predictions, m[outcome])
    print "Accuracy : %s" % "{0:.3%}".format(accuracy)

    auc = metrics.auc(predictions, m[outcome])
    print "Auc : %s" % "{0:.3%}".format(auc)

    recall = metrics.recall_score(predictions, m[outcome])
    print "Recall : %s" % "{0:.3%}".format(recall)

    # Fit the model again so that it can be referred to outside the function:
    model.fit(m[predictors], m[outcome])
models.append(RandomForestClassifier(n_estimators=165, max_depth=4, criterion='entropy'))
models.append(GradientBoostingClassifier(max_depth=4))
models.append(KNeighborsClassifier(n_neighbors=20))
models.append(GaussianNB())

TRNtrain, TRNtest, TARtrain, TARtest = train_test_split(train, target, test_size=0.3, random_state=0)

plt.figure(figsize=(10, 10))
for model in models:
    model.fit(TRNtrain, TARtrain)
    pred_scr = model.predict_proba(TRNtest)[:, 1]
    fpr, tpr, thresholds = roc_curve(TARtest, pred_scr)
    roc_auc = ml_metrics.auc(TARtest, pred_scr)
    md = str(model)
    md = md[:md.find('(')]
    pl.plot(fpr, tpr, label='ROC fold %s (auc = %0.2f)' % (md, roc_auc))

pl.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6))
pl.xlim([0, 1])
pl.ylim([0, 1])
pl.xlabel('False Positive Rate')
pl.ylabel('True Positive Rate')
pl.title('Receiver operating characteristic example')
pl.legend(loc="lower right")
pl.show()

# convert the test set to the required format
FieldDrop.append('bad')
def calculate_ftrl_features(train, test, fnames, ftablename, ftrl_type='', optional_date_ftrl3='', optional_condition_ftrl4=''): folds = [x for x in range(1, nfold+1)] global_mean = np.mean(train.is_screener) pred_file = '../data/output-py/ftrl/pred_ftrl.csv' ftrl_all = pd.DataFrame() count = 0 for L in range(1, len(folds)+1): for train_folds in itertools.combinations(folds, L): count = count + 1 print train_folds test_folds = [x for x in folds if not x in list(train_folds)] if len(test_folds) == 0: test_folds = [0] print test_folds if False: store = pd.HDFStore('../data/output-py/ftrl/ftrl_feats' + str(count) + '.h5') ftrl_feats = store.get('ftrl_feats') store.close() else: train_file = save_ftrl_data('train', fnames, ftablename, test_folds, list(train_folds), ftrl_type, optional_date_ftrl3, optional_condition_ftrl4) if 0 in test_folds: test_file = save_ftrl_data('test', fnames, ftablename, test_folds, list(train_folds), ftrl_type, optional_date_ftrl3, optional_condition_ftrl4) else: test_file = save_ftrl_data('val', fnames, ftablename, test_folds, list(train_folds), ftrl_type, optional_date_ftrl3, optional_condition_ftrl4) non_factor_cols = "''" non_feature_cols = "''" text_cols = "'diagnosis_description'" os.system('pypy ftrl' + ftrl_type + '.py' + ' --alpha ' + str(0.07) + ' --beta ' + str(1.0) + ' --L1 ' + str(0.01) + ' --L2 ' + str(1.0) + ' --epoch ' + str(1) + ' --train ' + train_file + ' --test ' + test_file + ' --submission ' + pred_file + ' --non_feature_cols ' + non_feature_cols + ' --non_factor_cols ' + non_factor_cols + ' --text_cols ' + text_cols) ftrl_feats = pd.read_csv(pred_file) ftrl_feats = ftrl_feats.groupby('patient_id')['is_screener_pred'].max().reset_index() for x in folds: if x in list(train_folds): ftrl_feats['fold'+str(x)] = 1 else: ftrl_feats['fold'+str(x)] = 0 store = pd.HDFStore('../data/output-py/ftrl/ftrl_feats' + str(count) + '.h5') store.append('ftrl_feats', ftrl_feats) store.close() os.system('rm -R ' + train_file) os.system('rm -R ' + test_file) os.system('rm -R ' + pred_file) ftrl_all = ftrl_all.append(ftrl_feats, ignore_index=True) ftrl_feats = pd.merge(ftrl_feats, train[['patient_id', 'is_screener']], on='patient_id', how='inner') if len(ftrl_feats)>0: print "Pearson correlation: " + str(pearsonr(ftrl_feats.is_screener, ftrl_feats.is_screener_pred)) print "AUC: " + str(auc(ftrl_feats.is_screener, ftrl_feats.is_screener_pred)) del ftrl_feats feats_all = train[['patient_id']].append(test[['patient_id']], ignore_index=True) for test_fold in ([0] + folds): train_folds = [x for x in folds if (x != test_fold) and (x != 0)] if len(train_folds) == len(folds): pd_query = ' and '.join(['fold'+str(x)+'==1' for x in train_folds]) else: pd_query = ' and '.join(['fold'+str(x)+'==1' for x in train_folds]) + ' and ' + ' and '.join(['fold'+str(x)+'==0' for x in folds if not x in train_folds]) print pd_query ftrl_feats = ftrl_all.query(pd_query).copy().reset_index(drop=True) for x in folds: ftrl_feats.drop('fold'+str(x), axis=1, inplace=True) if test_fold == 0: feats_fold = test[['patient_id']].copy() else: feats_fold = train.query('cv_index==@test_fold')[['patient_id']].copy() feats_fold = pd.merge(feats_fold, ftrl_feats, on='patient_id', how='left') del ftrl_feats for val_fold in [x for x in folds if (x != test_fold) and (x != 0)]: train_folds = [x for x in folds if (x != test_fold) and (x != val_fold) and (x != 0)] pd_query = ' and '.join(['fold'+str(x)+'==1' for x in train_folds]) + ' and ' + ' and '.join(['fold'+str(x)+'==0' for x in folds if not x in train_folds]) 
ftrl_feats = ftrl_all.query(pd_query).copy().reset_index(drop=True) for x in folds: ftrl_feats.drop('fold'+str(x), axis=1, inplace=True) feats_val_fold = train.query('cv_index==@val_fold')[['patient_id']].copy() feats_val_fold = pd.merge(feats_val_fold, ftrl_feats, on='patient_id', how='left') del ftrl_feats feats_fold = feats_fold.append(feats_val_fold, ignore_index=True) feats_fold = feats_fold.reset_index(drop=True) feats_fold['is_screener_pred'].fillna(global_mean, inplace=True) feats_fold = feats_fold.rename(columns={'is_screener_pred' : '_'.join(fnames) + '_' + ftablename + '_ftrl' + ftrl_type + '_fold_'+str(test_fold)}) feats_all = pd.merge(feats_all, feats_fold, on='patient_id', how='left') print "Writing to HDF5 store..." store = pd.HDFStore('../data/output-py/' + '_'.join(fnames) + '_' + ftablename + '_ftrl' + ftrl_type + '.h5') store.append('feats_all', feats_all) print 'Feature ' + '_'.join(fnames) + '_' + ftablename + '_ftrl' + ftrl_type + ' is saved in file.' store.close() return '_'.join(fnames) + '_' + ftablename + '_ftrl' + ftrl_type
def reverse_auc(labels, predictions):
    target_neg_one = [1 if x == -1 else 0 for x in labels]
    neg_predictions = [-x for x in predictions]
    score = ml_metrics.auc(target_neg_one, neg_predictions)
    return score
X, y = make_classification(1000000)
t_X, t_y = map(torch.FloatTensor, (X, y))

net = LogsticRegression(20, 2)
loss_func = torch.nn.modules.loss.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters())

bar_epochs = tqdm_notebook(range(epochs))
for e in bar_epochs:
    bar_epochs.set_description(f"Epoch {e}:")
    t = tqdm_notebook(range(0, t_X.size(0), batch_size))
    for b in t:  # for each training step
        # train your data...
        b_X = t_X[b:b + batch_size]
        b_y = t_y[b:b + batch_size]
        output = net(b_X)  # rnn output
        loss = loss_func(
            output,
            b_y.long().view(-1))  # cross entropy loss and y is not one-hotted
        optimizer.zero_grad()  # clear gradients for this training step
        loss.backward()  # backpropagation, compute gradients
        optimizer.step()
        if b % 10000 == 0:
            t.set_description(
                f"Epoch {e}:"
                f"Loss: {loss.data.numpy():.5f} | "
                f"Auc: {auc(b_y.numpy(), output.data.numpy()[:, 1]):.5}")

_net = net.eval()
auc(y, _net(t_X).data.numpy()[:, -1])
        y_pred = train_predict_adaboost_classifier(X_train, y_train, X_test)
    if 'ftrl' in MODEL:
        y_pred = train_predict_ftrl(X_train, y_train, X_test)

    preds = pd.DataFrame()
    preds['ID'] = test_split['ID'].values
    preds['FOLD'] = fold
    preds['ITER'] = it
    preds[MODEL] = y_pred
    preds_model = preds_model.append(preds, ignore_index=True)

    preds = preds.loc[preds['ID'].isin(ids_val)].copy()
    preds = pd.merge(preds, train[['ID', 'TARGET']], on='ID', how='left')
    fold_auc = auc(preds['TARGET'], preds[MODEL])
    aucs.append(fold_auc)

print np.mean(aucs), np.std(aucs)

preds_model.loc[preds_model[MODEL] < 0, MODEL] = 0.0
preds_model.loc[preds_model[MODEL] > 1, MODEL] = 1.0
preds_model = preds_model.groupby(['ID', 'ITER'])[MODEL].mean().reset_index()
for it in range(1, 21):
    preds_model.loc[preds_model['ITER'] == it, MODEL] = preds_model.loc[preds_model['ITER'] == it, MODEL].rank()
preds_model = preds_model.groupby('ID')[MODEL].mean().reset_index()
preds_model.columns = ['ID', 'dmitry_' + MODEL]
preds_all = pd.merge(preds_all, preds_model, on='ID', how='left')
preds_all.to_csv('all_models_temp.csv', index=False)

preds_train = pd.merge(train[['ID']], preds_all, on='ID', how='left')
preds_train.to_csv(OUTPUT_PATH + 'train/' + 'dmitry_train.csv', index=False)
def forward_auc(labels, predictions):
    target_one = [1 if x == 1 else 0 for x in labels]
    score = ml_metrics.auc(target_one, predictions)
    return score
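Not taken from the source project: `forward_auc` and the earlier `reverse_auc` are typically combined into one score when labels take values in {-1, 0, 1}. A minimal sketch of that combination, assuming a simple average is wanted:

# Illustrative only: averages the forward and reverse AUCs defined above,
# assuming labels are in {-1, 0, 1} and larger predictions mean "more positive".
def combined_auc(labels, predictions):
    return 0.5 * (forward_auc(labels, predictions) + reverse_auc(labels, predictions))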
'compute AUC from VW validation and predictions file'

import sys, csv, math
from ml_metrics import auc

test_file = sys.argv[1]
predictions_file = sys.argv[2]

test_reader = csv.reader(open(test_file), delimiter=" ")
p_reader = csv.reader(open(predictions_file), delimiter="\n")

ys = []
ps = []

for p_line in p_reader:
    test_line = test_reader.next()

    p = float(p_line[0])
    p = math.tanh(p)
    ps.append(p)

    y = float(test_line[0])
    ys.append(y)

AUC = auc(ys, ps)
print "AUC: %s" % (AUC)
print
'compute AUC from VW validation and predictions file'

import sys, csv, math
from ml_metrics import auc

test_file = sys.argv[1]
predictions_file = sys.argv[2]

test_reader = csv.reader( open( test_file ), delimiter = " " )
p_reader = csv.reader( open( predictions_file ), delimiter = "\n" )

ys = []
ps = []

for p_line in p_reader:
    test_line = test_reader.next()

    p = float( p_line[0] )
    p = math.tanh( p )
    ps.append( p )

    y = float( test_line[0] )
    ys.append( y )

AUC = auc( ys, ps )
print "AUC: %s" % ( AUC )
print
def stacking_model_sk_svc(task='together'): nfold = 5 #task='together' train_df = None test_df = None if task == 'together': train_df = pd.read_csv('./data/train_df_day_night_together.csv') test_df = pd.read_csv('./data/test_df_day_night_together.csv') from together_fn_param import list_param elif task == 'split': train_df = pd.read_csv('./data/train_df_day_night_split.csv') test_df = pd.read_csv('./data/test_df_day_night_split.csv') from split_fn_param import list_param train_df = train_df.fillna(-1) test_df = test_df.fillna(-1) print("Data loading Done!") target = 'label' predictors = train_df.columns.values.tolist()[1:-1] categorical = None X_train = train_df[predictors].values X_test = test_df[predictors].values labels = train_df['label'] scaler = MinMaxScaler() X_train = scaler.fit_transform(X_train) X_test = scaler.transform(X_test) seeds = np.random.randint(5000, 10000, size=10).tolist() auc_lst = [] auc_lst1 = [] n_estimators_lst = [] stratified = True debug = True param = list_param('sk_svc') oof_preds_folds = np.zeros((train_df.shape[0], len(seeds))) sub_preds_folds = np.zeros((test_df.shape[0], len(seeds))) sub_preds_folds_vote = np.zeros((test_df.shape[0], len(seeds))) oof_preds_folds_vote = np.zeros((train_df.shape[0], len(seeds))) feature_importance_df_folds = pd.DataFrame() list_thresholds_global = [] for seed_id in range(len(seeds)): if stratified: folds = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=seeds[seed_id]) else: folds = KFold(n_splits=nfold, shuffle=True, random_state=1001) oof_preds = np.zeros(train_df.shape[0]) sub_preds = np.zeros(test_df.shape[0]) oof_preds_local_vote = np.zeros(train_df.shape[0]) sub_preds_local_vote = np.zeros((test_df.shape[0], nfold)) feature_importance_df = pd.DataFrame() gfold_Id = list(folds.split(X_train, labels)) params_iter = {'random_state': seeds[seed_id]} param.update(params_iter) clf = SVC( C=param['C'], kernel='rbf', gamma=param['gamma'], shrinking=True, probability=True, tol=param['tol'], # 0.001,#may be 0.0001 for stoping criteria max_iter=int(param['max_iter']), verbose=False, decision_function_shape='ovr', random_state=seeds[seed_id]) for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X_train, labels)): xtrain, xtest = X_train[train_idx, :], X_train[valid_idx, :] ytrain, ytest = labels[train_idx], labels[valid_idx] clf.fit(xtrain, ytrain) oof_preds[valid_idx] = clf.predict_proba(xtest)[:, 1] pred = clf.predict_proba(X_test)[:, 1] sub_preds += pred / folds.n_splits fpr, tpr, thresholds = metrics.roc_curve( train_df[target].iloc[valid_idx], oof_preds[valid_idx]) optimal_idx = np.argmax(tpr - fpr) optimal_thresholds = thresholds[optimal_idx] list_thresholds_global.append(optimal_thresholds) sub_preds_local_vote[:, n_fold] = [ 1 if y_cont > optimal_thresholds else 0 for y_cont in pred ] oof_preds_local_vote[valid_idx] = [ 1 if y_cont > optimal_thresholds else 0 for y_cont in oof_preds[valid_idx] ] print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(ytest, oof_preds[valid_idx]))) del xtrain, xtest, ytrain, ytest gc.collect() oof_preds_folds[:, seed_id] = oof_preds sub_preds_folds[:, seed_id] = sub_preds from scipy import stats a, b = stats.mode(sub_preds_local_vote, axis=1) oof_preds_folds_vote[:, seed_id] = oof_preds_local_vote sub_preds_folds_vote[:, seed_id] = a.reshape(-1) feature_importance_df_folds = pd.concat( [feature_importance_df_folds, feature_importance_df], axis=0) auc_lst.append(ml_metrics.auc(train_df[target], oof_preds)) auc_lst1.append(roc_auc_score(train_df[target], oof_preds)) 
print('Full AUC score %.6f' % roc_auc_score(train_df[target], oof_preds)) print("auc_lst1") print(auc_lst1) print(list_thresholds_global) #oof_preds_folds = pd.DataFrame(oof_preds_folds,columns=['sk_svc_seed_' + str(seeds[l]) for l in range(len(seeds))]) #sub_preds_folds = pd.DataFrame(sub_preds_folds,columns=['sk_svc_seed_' + str(seeds[l]) for l in range(len(seeds))]) oof_preds_folds_vote = pd.DataFrame( oof_preds_folds_vote, columns=['sk_svc_seed_' + str(seeds[l]) for l in range(len(seeds))]) sub_preds_folds_vote = pd.DataFrame( sub_preds_folds_vote, columns=['sk_svc_seed_' + str(seeds[l]) for l in range(len(seeds))]) #oof_preds_folds.to_csv("../" + task + "_train_stack/sk_svc.csv", index=False) #sub_preds_folds.to_csv("../" + task + "_test_stack/sk_svc.csv", index=False) oof_preds_folds_vote.to_csv("./output/" + task + "_train_stack_vote/sk_svc.csv", index=False) sub_preds_folds_vote.to_csv("./output/" + task + "_test_stack_vote/sk_svc.csv", index=False)
def calculate_likelihoods(train, test, fnames, ftablename, function_type='max', query_type='', optional_filter_feature_likeli6='', optional_filter_value_likeli6=''): global_mean = np.mean(train.is_screener) folds = [x for x in range(1, nfold+1)] likeli_all = pd.DataFrame() for L in range(1, len(folds)+1): for train_folds in itertools.combinations(folds, L): print train_folds sql_query = open('genentech-sql/pattern_likeli_multiple' + query_type + '.sql').read() sql_query = sql_query.replace('FEATURE_TABLE_NAME', ftablename) sql_query = sql_query.replace('GENERIC_FEATURE_NAME', '_'.join(fnames)) sql_query = sql_query.replace('FEATURE_NAMES_COMMA_SEPARATED', ','.join(fnames)) sql_query = sql_query.replace('T1_COMMA_SEPARATED', ','.join(['t1.'+x for x in fnames])) sql_query = sql_query.replace('T3_T4_CONDITION', ' AND '.join(['t3.'+x+'=t4.'+x for x in fnames])) sql_query = sql_query.replace('OPTIONAL_CV_EXPRESSION', 'WHERE ' + ' OR '.join(['cv_index='+str(x) for x in list(train_folds)])) sql_query = sql_query.replace('GROUP_FUNCTION', function_type) sql_query = sql_query.replace('OPTIONAL_CONDITION_LIKELI6', 'WHERE ' + optional_filter_feature_likeli6 + "='" + optional_filter_value_likeli6 + "'") #sql_query = sql_query.replace('OPTIONAL_CONDITION_LIKELI6', 'WHERE ' + optional_filter_feature_likeli6 + ">=" + optional_filter_value_likeli6) if len(list(train_folds)) == len(folds): choosing_patients_expression = 'patients_test2' else: choosing_patients_expression = 'train_cv_indices ' + 'WHERE ' + ' OR '.join(['cv_index='+str(x) for x in folds if not x in list(train_folds)]) sql_query = sql_query.replace('CHOOSING_PATIENTS_EXPRESSION', choosing_patients_expression) conn = utils.connect_to_database() cur = conn.cursor() cur.execute(sql_query) if (query_type == '3') or (query_type == '4') or (query_type == '5'): conn.commit() sql_query = open('genentech-sql/pattern_likeli_multiple' + query_type + '_2.sql').read() sql_query = sql_query.replace('GENERIC_FEATURE_NAME', '_'.join(fnames)) sql_query = sql_query.replace('FEATURE_TABLE_NAME', ftablename) cur.execute(sql_query) likeli = pd.DataFrame(cur.fetchall()) likeli.columns = [x.name for x in cur.description] cur.execute('DROP TABLE patient_likeli_table;') conn.commit() else: likeli = pd.DataFrame(cur.fetchall()) likeli.columns = [x.name for x in cur.description] for x in folds: if x in list(train_folds): likeli['fold'+str(x)] = 1 else: likeli['fold'+str(x)] = 0 cur.close() conn.close() likeli_all = likeli_all.append(likeli, ignore_index=True) col = likeli.columns[1] likeli = pd.merge(likeli, train[['patient_id', 'is_screener']], on='patient_id', how='inner') if len(likeli)>0: print "Pearson correlation: " + str(pearsonr(likeli.is_screener, likeli[col])) print "AUC: " + str(auc(likeli.is_screener, likeli[col])) del likeli feats_all = train[['patient_id']].append(test[['patient_id']], ignore_index=True) for test_fold in ([0] + folds): train_folds = [x for x in folds if (x != test_fold) and (x != 0)] if len(train_folds) == len(folds): pd_query = ' and '.join(['fold'+str(x)+'==1' for x in train_folds]) else: pd_query = ' and '.join(['fold'+str(x)+'==1' for x in train_folds]) + ' and ' + ' and '.join(['fold'+str(x)+'==0' for x in folds if not x in train_folds]) print pd_query likeli = likeli_all.query(pd_query).copy().reset_index(drop=True) for x in folds: likeli.drop('fold'+str(x), axis=1, inplace=True) if test_fold == 0: feats_fold = test[['patient_id']].copy() else: feats_fold = train.query('cv_index==@test_fold')[['patient_id']].copy() feats_fold = 
pd.merge(feats_fold, likeli, on='patient_id', how='left') del likeli for val_fold in [x for x in folds if (x != test_fold) and (x != 0)]: train_folds = [x for x in folds if (x != test_fold) and (x != val_fold) and (x != 0)] pd_query = ' and '.join(['fold'+str(x)+'==1' for x in train_folds]) + ' and ' + ' and '.join(['fold'+str(x)+'==0' for x in folds if not x in train_folds]) likeli = likeli_all.query(pd_query).copy().reset_index(drop=True) for x in folds: likeli.drop('fold'+str(x), axis=1, inplace=True) feats_val_fold = train.query('cv_index==@val_fold')[['patient_id']].copy() feats_val_fold = pd.merge(feats_val_fold, likeli, on='patient_id', how='left') del likeli feats_fold = feats_fold.append(feats_val_fold, ignore_index=True) col = feats_fold.columns[1] feats_fold = feats_fold.reset_index(drop=True) feats_fold[col].fillna(global_mean, inplace=True) #feats_fold[fname_w_likeli].fillna(global_mean, inplace=True) feats_fold = feats_fold.rename(columns={col : col+'_fold_'+str(test_fold)}) #feats_fold = feats_fold.rename(columns={fname_w_likeli : fname_w_likeli+'_fold_'+str(test_fold)}) feats_all = pd.merge(feats_all, feats_fold, on='patient_id', how='left') print "Writing to HDF5 store..." store = pd.HDFStore('../data/output-py/' + col + '.h5') store.append('feats_all', feats_all) store.close() conn.close() print "Feature " + col + " is saved in file." return col
def calculate_likelihoods2(train, test, fnames_list, ftablename): global_mean = np.mean(train.is_screener) folds = [x for x in range(1, nfold+1)] likeli_all = pd.DataFrame() for L in range(1, len(folds)+1): for train_folds in itertools.combinations(folds, L): print train_folds test_folds = [x for x in folds if not x in list(train_folds)] if len(test_folds) == 0: test_folds = [0] print test_folds for fnames in fnames_list: likeli_table_name = '_'.join(fnames) + '_likeli_table' generate_likelihood_table(likeli_table_name, fnames, ftablename, train_folds) likeli = merge_likelihood_tables(fnames_list, ftablename, train_folds) for fnames in fnames_list: likeli_table_name = '_'.join(fnames) + '_likeli_table' drop_likelihood_table(likeli_table_name) for x in folds: if x in list(train_folds): likeli['fold'+str(x)] = 1 else: likeli['fold'+str(x)] = 0 likeli_all = likeli_all.append(likeli, ignore_index=True) col = likeli.columns[1] likeli = pd.merge(likeli, train[['patient_id', 'is_screener']], on='patient_id', how='inner') if len(likeli)>0: print "Pearson correlation: " + str(pearsonr(likeli.is_screener, likeli[col])) print "AUC: " + str(auc(likeli.is_screener, likeli[col])) del likeli file_name = likeli_all.columns[1] feats_all = train[['patient_id']].append(test[['patient_id']], ignore_index=True) for test_fold in ([0] + folds): train_folds = [x for x in folds if (x != test_fold) and (x != 0)] if len(train_folds) == len(folds): pd_query = ' and '.join(['fold'+str(x)+'==1' for x in train_folds]) else: pd_query = ' and '.join(['fold'+str(x)+'==1' for x in train_folds]) + ' and ' + ' and '.join(['fold'+str(x)+'==0' for x in folds if not x in train_folds]) print pd_query likeli = likeli_all.query(pd_query).copy().reset_index(drop=True) for x in folds: likeli.drop('fold'+str(x), axis=1, inplace=True) if test_fold == 0: feats_fold = test[['patient_id']].copy() else: feats_fold = train.query('cv_index==@test_fold')[['patient_id']].copy() feats_fold = pd.merge(feats_fold, likeli, on='patient_id', how='left') del likeli for val_fold in [x for x in folds if (x != test_fold) and (x != 0)]: train_folds = [x for x in folds if (x != test_fold) and (x != val_fold) and (x != 0)] pd_query = ' and '.join(['fold'+str(x)+'==1' for x in train_folds]) + ' and ' + ' and '.join(['fold'+str(x)+'==0' for x in folds if not x in train_folds]) likeli = likeli_all.query(pd_query).copy().reset_index(drop=True) for x in folds: likeli.drop('fold'+str(x), axis=1, inplace=True) feats_val_fold = train.query('cv_index==@val_fold')[['patient_id']].copy() feats_val_fold = pd.merge(feats_val_fold, likeli, on='patient_id', how='left') del likeli feats_fold = feats_fold.append(feats_val_fold, ignore_index=True) feats_fold = feats_fold.reset_index(drop=True) for cols in [x for x in feats_fold.columns if x != 'patient_id']: feats_fold[cols].fillna(global_mean*len(fnames), inplace=True) feats_fold = feats_fold.rename(columns={cols : cols+'_fold_'+str(test_fold)}) feats_all = pd.merge(feats_all, feats_fold, on='patient_id', how='left') print "Writing to HDF5 store..." store = pd.HDFStore('../data/output-py/' + file_name + '.h5') store.append('feats_all', feats_all) print "Feature " + file_name + " is saved in file." store.close() return file_name
max_iter = 10
# create a synthetic data set
x, y = datasets.make_classification(EX)
print "sample", x[251]
print "feature num ", x.shape[1]
# append a 1 column at index 0 in x
x = np.hstack((np.ones((x.shape[0], 1)), x))
print x[251]

from sgd import log_reg_sgd, h
theta = log_reg_sgd(x[:EX / 2], y[:EX / 2], a, max_iter=max_iter)
pred = [h(x[i], theta) for i in xrange(EX / 2, EX)]
print "weights ", theta
# print "err ",err
print auc(y[EX / 2:], pred)


def to_dict(x):
    # print x
    return {i: k for i, k in enumerate(x[1:], start=1)}


b = BasicLogisticRegression(x.shape[1] - 1, a)
for z in xrange(max_iter):
    for i in xrange(EX / 2):
        b.sgd_fit_one(to_dict(x[i]), y[i])

rst_y = map(b.predict_raw, map(to_dict, x[EX / 2:]))
print rst_y
print b.weights
def auc(self,X,y):
    yhat = self.predict(X)
    return ml_metrics.auc(np.array(y),yhat)
def auc_score(self, y, y_pred):
    sorted_y_pred = sorted(y_pred,reverse=True)
    return auc(y,sorted_y_pred)
def auc(self, X, y):
    yhat = self.predict(X)
    return ml_metrics.auc(np.array(y), yhat)
def auc_score(self, y, y_pred):
    sorted_y_pred = sorted(y_pred, reverse=True)
    return auc(y, sorted_y_pred)
def reverse_auc(labels, predictions):
    target_neg_one = [1 if x==-1 else 0 for x in labels]
    neg_predictions = [-x for x in predictions]
    score = metrics.auc(target_neg_one, neg_predictions)
    return score
def test_auc(self):
    self.assertAlmostEqual(metrics.auc([1,0,1,1], [.32,.52,.26,.86]), 1.0/3)
    self.assertAlmostEqual(metrics.auc([1,0,1,0,1], [.9,.1,.8,.1,.7]), 1)
    self.assertAlmostEqual(metrics.auc([0,1,1,0], [.2,.1,.3,.4]), 1.0/4)
    self.assertAlmostEqual(metrics.auc([1,1,1,1,0,0,0,0,0,0], [1,1,1,1,1,1,1,1,1,1]), 1.0/2)
def stacking_model_cat(task='together'): nfold = 5 #task='together' train_df = None test_df = None if task == 'together': train_df = pd.read_csv('./data/train_df_day_night_together.csv') test_df = pd.read_csv('./data/test_df_day_night_together.csv') from together_fn_param import list_param elif task == 'split': train_df = pd.read_csv('./data/train_df_day_night_split.csv') test_df = pd.read_csv('./data/test_df_day_night_split.csv') from split_fn_param import list_param train_df = train_df.fillna(-1) test_df = test_df.fillna(-1) print("Data loading Done!") target = 'label' predictors = train_df.columns.values.tolist()[1:-1] categorical = None X_train = train_df.drop(['bird_id', 'label'], axis=1) labels = train_df['label'] #cat seeds = np.random.randint(5000, 10000, size=10).tolist() auc_lst = [] auc_lst1 = [] n_estimators_lst = [] stratified = True debug = True param = list_param('cat') oof_preds_folds = np.zeros((train_df.shape[0], len(seeds))) sub_preds_folds = np.zeros((test_df.shape[0], len(seeds))) sub_preds_folds_vote = np.zeros((test_df.shape[0], len(seeds))) oof_preds_folds_vote = np.zeros((train_df.shape[0], len(seeds))) feature_importance_df_folds = pd.DataFrame() list_thresholds_global = [] for seed_id in range(len(seeds)): if stratified: folds = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=seeds[seed_id]) else: folds = KFold(n_splits=nfold, shuffle=True, random_state=1001) oof_preds = np.zeros(train_df.shape[0]) sub_preds = np.zeros(test_df.shape[0]) oof_preds_local_vote = np.zeros(train_df.shape[0]) sub_preds_local_vote = np.zeros((test_df.shape[0], nfold)) feature_importance_df = pd.DataFrame() gfold_Id = list(folds.split(X_train, labels)) params_iter = { 'iterations': 5000, # int 'border_count': 128, # (128) 1 - 255 'bootstrap_type': 'Bernoulli', 'loss_function': 'Logloss', 'eval_metric': 'F1', # 'AUC', 'od_type': 'Iter', 'allow_writing_files': False, 'early_stopping_rounds': 50, 'custom_metric': ['AUC'], 'random_seed': seeds[seed_id], 'use_best_model': True } param.update(params_iter) pool = ctb.Pool(train_df[predictors], train_df[target]) bst1 = ctb.cv(pool=pool, params=param, fold_count=10, partition_random_seed=seeds[seed_id], stratified=True) res0 = pd.DataFrame(bst1) n_estimators = res0['test-F1-mean'].argmax() + 1 params_iter2 = { 'iterations': n_estimators, } param.update(params_iter2) for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X_train, labels)): if 'use_best_model' in param: param.__delitem__("use_best_model") pool_0 = ctb.Pool(train_df[predictors].iloc[train_idx], train_df[target].iloc[train_idx]) clf = ctb.train(pool=pool_0, params=param) #oof_preds[valid_idx] = clf.predict(train_df[predictors].iloc[valid_idx], prediction_type='Probability')[:, 1] #sub_preds += (clf.predict(test_df[predictors], prediction_type='Probability')[:, 1]) / folds.n_splits oof_preds[valid_idx] = clf.predict( train_df[predictors].iloc[valid_idx], prediction_type='Probability')[:, 1] pred = clf.predict(test_df[predictors], prediction_type='Probability')[:, 1] sub_preds += pred / folds.n_splits fpr, tpr, thresholds = metrics.roc_curve( train_df[target].iloc[valid_idx], oof_preds[valid_idx]) optimal_idx = np.argmax(tpr - fpr) optimal_thresholds = thresholds[optimal_idx] list_thresholds_global.append(optimal_thresholds) sub_preds_local_vote[:, n_fold] = [ 1 if y_cont > optimal_thresholds else 0 for y_cont in pred ] oof_preds_local_vote[valid_idx] = [ 1 if y_cont > optimal_thresholds else 0 for y_cont in oof_preds[valid_idx] ] fold_importance_df = pd.DataFrame( 
list( zip(train_df[predictors].iloc[train_idx].dtypes.index, clf.get_feature_importance(pool_0))), columns=['feature', 'importance']) fold_importance_df = fold_importance_df.sort_values( by='importance', ascending=False, inplace=False, kind='quicksort', na_position='last') fold_importance_df["fold"] = n_fold + 1 fold_importance_df["seed"] = 'seed_' + str(seeds[seed_id]) feature_importance_df = pd.concat( [feature_importance_df, fold_importance_df], axis=0) print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(train_df[target].iloc[valid_idx], oof_preds[valid_idx]))) del clf, pool_0 gc.collect() oof_preds_folds[:, seed_id] = oof_preds sub_preds_folds[:, seed_id] = sub_preds from scipy import stats a, b = stats.mode(sub_preds_local_vote, axis=1) oof_preds_folds_vote[:, seed_id] = oof_preds_local_vote sub_preds_folds_vote[:, seed_id] = a.reshape(-1) feature_importance_df_folds = pd.concat( [feature_importance_df_folds, feature_importance_df], axis=0) auc_lst.append(ml_metrics.auc(train_df[target], oof_preds)) auc_lst1.append(roc_auc_score(train_df[target], oof_preds)) print('Full AUC score %.6f' % roc_auc_score(train_df[target], oof_preds)) print("auc_lst1") print(auc_lst1) print(list_thresholds_global) #oof_preds_folds = pd.DataFrame(oof_preds_folds,columns=['cat_seed_' + str(seeds[l]) for l in range(len(seeds))]) #sub_preds_folds = pd.DataFrame(sub_preds_folds,columns=['cat_seed_' + str(seeds[l]) for l in range(len(seeds))]) oof_preds_folds_vote = pd.DataFrame( oof_preds_folds_vote, columns=['cat_seed_' + str(seeds[l]) for l in range(len(seeds))]) sub_preds_folds_vote = pd.DataFrame( sub_preds_folds_vote, columns=['cat_seed_' + str(seeds[l]) for l in range(len(seeds))]) #oof_preds_folds.to_csv("./output/" + task + "_train_stack/cat.csv", index=False) #sub_preds_folds.to_csv("./output/" + task + "_test_stack/cat.csv", index=False) oof_preds_folds_vote.to_csv("./output/" + task + "_train_stack_vote/cat.csv", index=False) sub_preds_folds_vote.to_csv("./output/" + task + "_test_stack_vote/cat.csv", index=False) feature_importance_df_folds = feature_importance_df_folds.sort_values( 'importance', ascending=False) feature_importance_df_folds.to_csv("./output/" + task + "_feature/cat.csv", index=False)
def stacking_model_xgb_rank(task='together'): nfold = 5 #task='together' train_df = None test_df = None if task == 'together': train_df = pd.read_csv('./data/train_df_day_night_together.csv') test_df = pd.read_csv('./data/test_df_day_night_together.csv') from together_fn_param import list_param elif task == 'split': train_df = pd.read_csv('./data/train_df_day_night_split.csv') test_df = pd.read_csv('./data/test_df_day_night_split.csv') from split_fn_param import list_param train_df = train_df.fillna(-1) test_df = test_df.fillna(-1) print("Data loading Done!") X_train = train_df.drop(['bird_id', 'label'], axis=1) features = X_train.columns labels = train_df['label'] X_train = X_train.fillna(-1) y_train = np.int32(labels) X_test = test_df.drop(['bird_id', 'label'], axis=1) X_test = X_test.fillna(-1) #xgboost xg_train = xgb.DMatrix(X_train, label=y_train, missing=-1.0) xg_test = xgb.DMatrix(X_test, missing=-1.0) def xg_f1(yhat, dtrain): y = dtrain.get_label() pre, rec, th = metrics.precision_recall_curve(y, yhat) f1_all = 2 / ((1 / rec) + (1 / pre)) optimal_idx = np.argmax(f1_all) optimal_thresholds = th[optimal_idx] y_bin = [1. if y_cont > optimal_thresholds else 0. for y_cont in yhat] # binaryzing your output tn, fp, fn, tp = confusion_matrix(y, y_bin).ravel() specificity = tn / (tn + fp) sensitivity = tp / (tp + fn) optimal_f1 = np.nanmax(f1_all) return 'f1', -optimal_f1 seeds = np.random.randint(5000, 10000, size=10).tolist() auc_lst = [] auc_lst1 = [] n_estimators_lst = [] stratified = True debug = True param = list_param('xgb_rank') oof_preds_folds = np.zeros((train_df.shape[0], len(seeds))) sub_preds_folds = np.zeros((test_df.shape[0], len(seeds))) sub_preds_folds_vote = np.zeros((test_df.shape[0], len(seeds))) oof_preds_folds_vote = np.zeros((train_df.shape[0], len(seeds))) feature_importance_df_folds = pd.DataFrame() list_thresholds_global = [] for seed_id in range(len(seeds)): if stratified: folds = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=seeds[seed_id]) else: folds = KFold(n_splits=nfold, shuffle=True, random_state=1001) oof_preds = np.zeros(train_df.shape[0]) sub_preds = np.zeros(test_df.shape[0]) oof_preds_local_vote = np.zeros(train_df.shape[0]) sub_preds_local_vote = np.zeros((test_df.shape[0], nfold)) feature_importance_df = pd.DataFrame() gfold_Id = list(folds.split(X_train, labels)) params_iter = { 'seed': seeds[seed_id], 'objective': 'rank:pairwise', 'silent': False, } param.update(params_iter) res = xgb.cv(param, xg_train, num_boost_round=5000, folds=gfold_Id, feval=xg_f1, metrics={'auc'}, stratified=True, maximize=False, verbose_eval=50, callbacks=[ xgb.callback.print_evaluation(show_stdv=True), xgb.callback.early_stop(50) ]) n_estimators = res.shape[0] for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X_train, labels)): xgg_train = X_train.iloc[train_idx] xgg_valid = X_train.iloc[valid_idx] ygg_train = labels[train_idx] ygg_valid = labels[valid_idx] xgg_train = xgb.DMatrix(xgg_train, label=ygg_train, missing=-1.0) xgg_valid = xgb.DMatrix(xgg_valid, missing=-1.0) #xg_test = xgb.DMatrix(X_test, missing=-1.0) clf = xgb.train(param, xgg_train, num_boost_round=n_estimators, verbose_eval=1) oof_preds[valid_idx] = clf.predict(xgg_valid) pred = clf.predict(xg_test) sub_preds += pred / folds.n_splits fpr, tpr, thresholds = metrics.roc_curve(ygg_valid, oof_preds[valid_idx]) optimal_idx = np.argmax(tpr - fpr) optimal_thresholds = thresholds[optimal_idx] list_thresholds_global.append(optimal_thresholds) sub_preds_local_vote[:, n_fold] = [ 1 if y_cont > 
optimal_thresholds else 0 for y_cont in pred ] oof_preds_local_vote[valid_idx] = [ 1 if y_cont > optimal_thresholds else 0 for y_cont in oof_preds[valid_idx] ] fold_raw_importance = pd.DataFrame( list(clf.get_score(importance_type='gain').items()), columns=['feature', 'importance']).sort_values('importance', ascending=False) fold_importance_df = pd.DataFrame() fold_importance_df["feature"] = features fold_importance_df = pd.merge(fold_importance_df, fold_raw_importance, on='feature', how='left') fold_importance_df = fold_importance_df.fillna(value=0) fold_importance_df = fold_importance_df.sort_values( 'importance', ascending=False) fold_importance_df["fold"] = n_fold + 1 fold_importance_df["seed"] = 'seed_' + str(seeds[seed_id]) feature_importance_df = pd.concat( [feature_importance_df, fold_importance_df], axis=0) print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(ygg_valid, oof_preds[valid_idx]))) del clf, xgg_train, xgg_valid, ygg_train, ygg_valid gc.collect() oof_preds_folds[:, seed_id] = oof_preds sub_preds_folds[:, seed_id] = sub_preds from scipy import stats a, b = stats.mode(sub_preds_local_vote, axis=1) oof_preds_folds_vote[:, seed_id] = oof_preds_local_vote sub_preds_folds_vote[:, seed_id] = a.reshape(-1) feature_importance_df_folds = pd.concat( [feature_importance_df_folds, feature_importance_df], axis=0) auc_lst.append(ml_metrics.auc(y_train, oof_preds)) auc_lst1.append(roc_auc_score(y_train, oof_preds)) print('Full AUC score %.6f' % roc_auc_score(y_train, oof_preds)) print("auc_lst1") print(auc_lst1) print(list_thresholds_global) #oof_preds_folds = pd.DataFrame(oof_preds_folds,columns=['xgb_rank_seed_' + str(seeds[l]) for l in range(len(seeds))]) #sub_preds_folds = pd.DataFrame(sub_preds_folds,columns=['xgb_rank_seed_' + str(seeds[l]) for l in range(len(seeds))]) oof_preds_folds_vote = pd.DataFrame( oof_preds_folds_vote, columns=['xgb_rank_seed_' + str(seeds[l]) for l in range(len(seeds))]) sub_preds_folds_vote = pd.DataFrame( sub_preds_folds_vote, columns=['xgb_rank_seed_' + str(seeds[l]) for l in range(len(seeds))]) #oof_preds_folds.to_csv("./output/" + task + "_train_stack/xgb_rank.csv", index=False) #sub_preds_folds.to_csv("./output/" + task + "_test_stack/xgb_rank.csv", index=False) oof_preds_folds_vote.to_csv("./output/" + task + "_train_stack_vote/xgb_rank.csv", index=False) sub_preds_folds_vote.to_csv("./output/" + task + "_test_stack_vote/xgb_rank.csv", index=False) feature_importance_df_folds = feature_importance_df_folds.sort_values( 'importance', ascending=False) feature_importance_df_folds.to_csv("./output/" + task + "_feature/xgb_rank.csv", index=False)
def calculate_ftrl_features(train, test, fnames, ftablename, ftrl_type='', optional_date_ftrl3='', optional_condition_ftrl4=''): folds = [x for x in range(1, nfold + 1)] global_mean = np.mean(train.is_screener) pred_file = '../data/output-py/ftrl/pred_ftrl.csv' ftrl_all = pd.DataFrame() count = 0 for L in range(1, len(folds) + 1): for train_folds in itertools.combinations(folds, L): count = count + 1 print train_folds test_folds = [x for x in folds if not x in list(train_folds)] if len(test_folds) == 0: test_folds = [0] print test_folds if False: store = pd.HDFStore('../data/output-py/ftrl/ftrl_feats' + str(count) + '.h5') ftrl_feats = store.get('ftrl_feats') store.close() else: train_file = save_ftrl_data('train', fnames, ftablename, test_folds, list(train_folds), ftrl_type, optional_date_ftrl3, optional_condition_ftrl4) if 0 in test_folds: test_file = save_ftrl_data('test', fnames, ftablename, test_folds, list(train_folds), ftrl_type, optional_date_ftrl3, optional_condition_ftrl4) else: test_file = save_ftrl_data('val', fnames, ftablename, test_folds, list(train_folds), ftrl_type, optional_date_ftrl3, optional_condition_ftrl4) non_factor_cols = "''" non_feature_cols = "''" text_cols = "'diagnosis_description'" os.system('pypy ftrl' + ftrl_type + '.py' + ' --alpha ' + str(0.07) + ' --beta ' + str(1.0) + ' --L1 ' + str(0.01) + ' --L2 ' + str(1.0) + ' --epoch ' + str(1) + ' --train ' + train_file + ' --test ' + test_file + ' --submission ' + pred_file + ' --non_feature_cols ' + non_feature_cols + ' --non_factor_cols ' + non_factor_cols + ' --text_cols ' + text_cols) ftrl_feats = pd.read_csv(pred_file) ftrl_feats = ftrl_feats.groupby( 'patient_id')['is_screener_pred'].max().reset_index() for x in folds: if x in list(train_folds): ftrl_feats['fold' + str(x)] = 1 else: ftrl_feats['fold' + str(x)] = 0 store = pd.HDFStore('../data/output-py/ftrl/ftrl_feats' + str(count) + '.h5') store.append('ftrl_feats', ftrl_feats) store.close() os.system('rm -R ' + train_file) os.system('rm -R ' + test_file) os.system('rm -R ' + pred_file) ftrl_all = ftrl_all.append(ftrl_feats, ignore_index=True) ftrl_feats = pd.merge(ftrl_feats, train[['patient_id', 'is_screener']], on='patient_id', how='inner') if len(ftrl_feats) > 0: print "Pearson correlation: " + str( pearsonr(ftrl_feats.is_screener, ftrl_feats.is_screener_pred)) print "AUC: " + str( auc(ftrl_feats.is_screener, ftrl_feats.is_screener_pred)) del ftrl_feats feats_all = train[['patient_id']].append(test[['patient_id']], ignore_index=True) for test_fold in ([0] + folds): train_folds = [x for x in folds if (x != test_fold) and (x != 0)] if len(train_folds) == len(folds): pd_query = ' and '.join( ['fold' + str(x) + '==1' for x in train_folds]) else: pd_query = ' and '.join( ['fold' + str(x) + '==1' for x in train_folds]) + ' and ' + ' and '.join([ 'fold' + str(x) + '==0' for x in folds if not x in train_folds ]) print pd_query ftrl_feats = ftrl_all.query(pd_query).copy().reset_index(drop=True) for x in folds: ftrl_feats.drop('fold' + str(x), axis=1, inplace=True) if test_fold == 0: feats_fold = test[['patient_id']].copy() else: feats_fold = train.query('cv_index==@test_fold')[['patient_id' ]].copy() feats_fold = pd.merge(feats_fold, ftrl_feats, on='patient_id', how='left') del ftrl_feats for val_fold in [x for x in folds if (x != test_fold) and (x != 0)]: train_folds = [ x for x in folds if (x != test_fold) and (x != val_fold) and (x != 0) ] pd_query = ' and '.join( ['fold' + str(x) + '==1' for x in train_folds]) + ' and ' + ' and '.join([ 'fold' + str(x) + 
'==0' for x in folds if not x in train_folds ]) ftrl_feats = ftrl_all.query(pd_query).copy().reset_index(drop=True) for x in folds: ftrl_feats.drop('fold' + str(x), axis=1, inplace=True) feats_val_fold = train.query('cv_index==@val_fold')[['patient_id' ]].copy() feats_val_fold = pd.merge(feats_val_fold, ftrl_feats, on='patient_id', how='left') del ftrl_feats feats_fold = feats_fold.append(feats_val_fold, ignore_index=True) feats_fold = feats_fold.reset_index(drop=True) feats_fold['is_screener_pred'].fillna(global_mean, inplace=True) feats_fold = feats_fold.rename( columns={ 'is_screener_pred': '_'.join(fnames) + '_' + ftablename + '_ftrl' + ftrl_type + '_fold_' + str(test_fold) }) feats_all = pd.merge(feats_all, feats_fold, on='patient_id', how='left') print "Writing to HDF5 store..." store = pd.HDFStore('../data/output-py/' + '_'.join(fnames) + '_' + ftablename + '_ftrl' + ftrl_type + '.h5') store.append('feats_all', feats_all) print 'Feature ' + '_'.join( fnames) + '_' + ftablename + '_ftrl' + ftrl_type + ' is saved in file.' store.close() return '_'.join(fnames) + '_' + ftablename + '_ftrl' + ftrl_type
def stacking_model_lgb_gbt(task='together'): nfold = 5 #task='together' train_df = None test_df = None if task == 'together': train_df = pd.read_csv('./data/train_df_day_night_together.csv') test_df = pd.read_csv('./data/test_df_day_night_together.csv') from together_fn_param import list_param elif task == 'split': train_df = pd.read_csv('./data/train_df_day_night_split.csv') test_df = pd.read_csv('./data/test_df_day_night_split.csv') from split_fn_param import list_param train_df = train_df.fillna(-1) test_df = test_df.fillna(-1) print("Data loading Done!") target = 'label' predictors = train_df.columns.values.tolist()[1:-1] categorical = None gc.collect() #lightgbm X_train = train_df[predictors].values labels = train_df['label'] def xg_f1(preds, train_data): yhat = preds dtrain = train_data y = dtrain.get_label() pre, rec, th = metrics.precision_recall_curve(y, yhat) f1_all = 2 / ((1 / rec) + (1 / pre)) optimal_idx = np.argmax(f1_all) optimal_thresholds = th[optimal_idx] y_bin = [1. if y_cont > optimal_thresholds else 0. for y_cont in yhat] # binaryzing your output tn, fp, fn, tp = confusion_matrix(y, y_bin).ravel() specificity = tn / (tn + fp) sensitivity = tp / (tp + fn) optimal_f1 = np.nanmax(f1_all) return 'f1', -optimal_f1, False xg_train = lgb.Dataset(train_df[predictors].values, label=train_df[target].values, feature_name=predictors) seeds = np.random.randint(5000, 10000, size=10).tolist() auc_lst = [] auc_lst1 = [] n_estimators_lst = [] stratified = True debug = True param = list_param('lgb_gbdt') oof_preds_folds = np.zeros((train_df.shape[0], len(seeds))) sub_preds_folds = np.zeros((test_df.shape[0], len(seeds))) sub_preds_folds_vote = np.zeros((test_df.shape[0], len(seeds))) oof_preds_folds_vote = np.zeros((train_df.shape[0], len(seeds))) feature_importance_df_folds = pd.DataFrame() list_thresholds_global = [] for seed_id in range(len(seeds)): if stratified: folds = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=seeds[seed_id]) else: folds = KFold(n_splits=nfold, shuffle=True, random_state=1001) oof_preds = np.zeros(train_df.shape[0]) sub_preds = np.zeros(test_df.shape[0]) oof_preds_local_vote = np.zeros(train_df.shape[0]) sub_preds_local_vote = np.zeros((test_df.shape[0], nfold)) feature_importance_df = pd.DataFrame() gfold_Id = list(folds.split(X_train, labels)) params_iter = { 'max_bin': 63, # fixed #int 'save_binary': True, # fixed 'seed': seeds[seed_id], 'feature_fraction_seed': seeds[seed_id], 'bagging_seed': seeds[seed_id], 'drop_seed': seeds[seed_id], 'data_random_seed': seeds[seed_id], 'objective': 'binary', 'boosting_type': 'gbdt', 'verbose': 1, 'metric': 'auc', } param.update(params_iter) bst1 = lgb.cv(param, xg_train, num_boost_round=5000, early_stopping_rounds=50, folds=gfold_Id) res0 = pd.DataFrame(bst1) n_estimators = res0.shape[0] for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X_train, labels)): xgg_train = lgb.Dataset(data=train_df[predictors].iloc[train_idx], label=train_df[target].iloc[train_idx], free_raw_data=False, silent=True) xgg_valid = lgb.Dataset(data=train_df[predictors].iloc[valid_idx], label=train_df[target].iloc[valid_idx], free_raw_data=False, silent=True) clf = lgb.train( param, xgg_train, num_boost_round=n_estimators, # fobj=loglikelood, # feval=binary_error, verbose_eval=1, ) oof_preds[valid_idx] = clf.predict(xgg_valid.data) pred = clf.predict(test_df[predictors]) sub_preds += pred / folds.n_splits fpr, tpr, thresholds = metrics.roc_curve(xgg_valid.label, oof_preds[valid_idx]) optimal_idx = np.argmax(tpr - fpr) 
optimal_thresholds = thresholds[optimal_idx] list_thresholds_global.append(optimal_thresholds) sub_preds_local_vote[:, n_fold] = [ 1 if y_cont > optimal_thresholds else 0 for y_cont in pred ] oof_preds_local_vote[valid_idx] = [ 1 if y_cont > optimal_thresholds else 0 for y_cont in oof_preds[valid_idx] ] fold_importance_df = pd.DataFrame() fold_importance_df["feature"] = clf.feature_name() fold_importance_df["importance"] = clf.feature_importance( importance_type='gain') fold_importance_df = fold_importance_df.fillna(value=0) fold_importance_df = fold_importance_df.sort_values( 'importance', ascending=False) fold_importance_df["fold"] = n_fold + 1 fold_importance_df["seed"] = 'seed_' + str(seeds[seed_id]) feature_importance_df = pd.concat( [feature_importance_df, fold_importance_df], axis=0) print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(xgg_valid.label, oof_preds[valid_idx]))) del clf, xgg_train, xgg_valid gc.collect() oof_preds_folds[:, seed_id] = oof_preds sub_preds_folds[:, seed_id] = sub_preds from scipy import stats a, b = stats.mode(sub_preds_local_vote, axis=1) oof_preds_folds_vote[:, seed_id] = oof_preds_local_vote sub_preds_folds_vote[:, seed_id] = a.reshape(-1) feature_importance_df_folds = pd.concat( [feature_importance_df_folds, feature_importance_df], axis=0) auc_lst.append(ml_metrics.auc(xg_train.label, oof_preds)) auc_lst1.append(roc_auc_score(xg_train.label, oof_preds)) print('Full AUC score %.6f' % roc_auc_score(xg_train.label, oof_preds)) print("auc_lst1") print(auc_lst1) print(list_thresholds_global) #oof_preds_folds = pd.DataFrame(oof_preds_folds,columns=['lgb_gbt_seed_' + str(seeds[l]) for l in range(len(seeds))]) #sub_preds_folds = pd.DataFrame(sub_preds_folds,columns=['lgb_gbt_seed_' + str(seeds[l]) for l in range(len(seeds))]) oof_preds_folds_vote = pd.DataFrame( oof_preds_folds_vote, columns=['lgb_gbt_seed_' + str(seeds[l]) for l in range(len(seeds))]) sub_preds_folds_vote = pd.DataFrame( sub_preds_folds_vote, columns=['lgb_gbt_seed_' + str(seeds[l]) for l in range(len(seeds))]) #oof_preds_folds.to_csv("./output/" + task + "_train_stack/lgb_gbt.csv", index=False) #sub_preds_folds.to_csv("./output/" + task + "_test_stack/lgb_gbt.csv", index=False) oof_preds_folds_vote.to_csv("./output/" + task + "_train_stack_vote/lgb_gbt.csv", index=False) sub_preds_folds_vote.to_csv("./output/" + task + "_test_stack_vote/lgb_gbt.csv", index=False) feature_importance_df_folds = feature_importance_df_folds.sort_values( 'importance', ascending=False) feature_importance_df_folds.to_csv("./output/" + task + "_feature/lgb_gbt.csv", index=False)
def calculate_likelihoods(train, test, fnames, ftablename, function_type='max', query_type='', optional_filter_feature_likeli6='', optional_filter_value_likeli6=''): global_mean = np.mean(train.is_screener) folds = [x for x in range(1, nfold + 1)] likeli_all = pd.DataFrame() for L in range(1, len(folds) + 1): for train_folds in itertools.combinations(folds, L): print train_folds sql_query = open('genentech-sql/pattern_likeli_multiple' + query_type + '.sql').read() sql_query = sql_query.replace('FEATURE_TABLE_NAME', ftablename) sql_query = sql_query.replace('GENERIC_FEATURE_NAME', '_'.join(fnames)) sql_query = sql_query.replace('FEATURE_NAMES_COMMA_SEPARATED', ','.join(fnames)) sql_query = sql_query.replace( 'T1_COMMA_SEPARATED', ','.join(['t1.' + x for x in fnames])) sql_query = sql_query.replace( 'T3_T4_CONDITION', ' AND '.join(['t3.' + x + '=t4.' + x for x in fnames])) sql_query = sql_query.replace( 'OPTIONAL_CV_EXPRESSION', 'WHERE ' + ' OR '.join(['cv_index=' + str(x) for x in list(train_folds)])) sql_query = sql_query.replace('GROUP_FUNCTION', function_type) sql_query = sql_query.replace( 'OPTIONAL_CONDITION_LIKELI6', 'WHERE ' + optional_filter_feature_likeli6 + "='" + optional_filter_value_likeli6 + "'") #sql_query = sql_query.replace('OPTIONAL_CONDITION_LIKELI6', 'WHERE ' + optional_filter_feature_likeli6 + ">=" + optional_filter_value_likeli6) if len(list(train_folds)) == len(folds): choosing_patients_expression = 'patients_test2' else: choosing_patients_expression = 'train_cv_indices ' + 'WHERE ' + ' OR '.join( [ 'cv_index=' + str(x) for x in folds if not x in list(train_folds) ]) sql_query = sql_query.replace('CHOOSING_PATIENTS_EXPRESSION', choosing_patients_expression) conn = utils.connect_to_database() cur = conn.cursor() cur.execute(sql_query) if (query_type == '3') or (query_type == '4') or (query_type == '5'): conn.commit() sql_query = open('genentech-sql/pattern_likeli_multiple' + query_type + '_2.sql').read() sql_query = sql_query.replace('GENERIC_FEATURE_NAME', '_'.join(fnames)) sql_query = sql_query.replace('FEATURE_TABLE_NAME', ftablename) cur.execute(sql_query) likeli = pd.DataFrame(cur.fetchall()) likeli.columns = [x.name for x in cur.description] cur.execute('DROP TABLE patient_likeli_table;') conn.commit() else: likeli = pd.DataFrame(cur.fetchall()) likeli.columns = [x.name for x in cur.description] for x in folds: if x in list(train_folds): likeli['fold' + str(x)] = 1 else: likeli['fold' + str(x)] = 0 cur.close() conn.close() likeli_all = likeli_all.append(likeli, ignore_index=True) col = likeli.columns[1] likeli = pd.merge(likeli, train[['patient_id', 'is_screener']], on='patient_id', how='inner') if len(likeli) > 0: print "Pearson correlation: " + str( pearsonr(likeli.is_screener, likeli[col])) print "AUC: " + str(auc(likeli.is_screener, likeli[col])) del likeli feats_all = train[['patient_id']].append(test[['patient_id']], ignore_index=True) for test_fold in ([0] + folds): train_folds = [x for x in folds if (x != test_fold) and (x != 0)] if len(train_folds) == len(folds): pd_query = ' and '.join( ['fold' + str(x) + '==1' for x in train_folds]) else: pd_query = ' and '.join( ['fold' + str(x) + '==1' for x in train_folds]) + ' and ' + ' and '.join([ 'fold' + str(x) + '==0' for x in folds if not x in train_folds ]) print pd_query likeli = likeli_all.query(pd_query).copy().reset_index(drop=True) for x in folds: likeli.drop('fold' + str(x), axis=1, inplace=True) if test_fold == 0: feats_fold = test[['patient_id']].copy() else: feats_fold = 
train.query('cv_index==@test_fold')[['patient_id' ]].copy() feats_fold = pd.merge(feats_fold, likeli, on='patient_id', how='left') del likeli for val_fold in [x for x in folds if (x != test_fold) and (x != 0)]: train_folds = [ x for x in folds if (x != test_fold) and (x != val_fold) and (x != 0) ] pd_query = ' and '.join( ['fold' + str(x) + '==1' for x in train_folds]) + ' and ' + ' and '.join([ 'fold' + str(x) + '==0' for x in folds if not x in train_folds ]) likeli = likeli_all.query(pd_query).copy().reset_index(drop=True) for x in folds: likeli.drop('fold' + str(x), axis=1, inplace=True) feats_val_fold = train.query('cv_index==@val_fold')[['patient_id' ]].copy() feats_val_fold = pd.merge(feats_val_fold, likeli, on='patient_id', how='left') del likeli feats_fold = feats_fold.append(feats_val_fold, ignore_index=True) col = feats_fold.columns[1] feats_fold = feats_fold.reset_index(drop=True) feats_fold[col].fillna(global_mean, inplace=True) #feats_fold[fname_w_likeli].fillna(global_mean, inplace=True) feats_fold = feats_fold.rename( columns={col: col + '_fold_' + str(test_fold)}) #feats_fold = feats_fold.rename(columns={fname_w_likeli : fname_w_likeli+'_fold_'+str(test_fold)}) feats_all = pd.merge(feats_all, feats_fold, on='patient_id', how='left') print "Writing to HDF5 store..." store = pd.HDFStore('../data/output-py/' + col + '.h5') store.append('feats_all', feats_all) store.close() conn.close() print "Feature " + col + " is saved in file." return col
def forward_auc(labels, predictions):
    target_one = [1 if x==1 else 0 for x in labels]
    score = metrics.auc(target_one, predictions)
    return score
def calculate_likelihoods2(train, test, fnames_list, ftablename): global_mean = np.mean(train.is_screener) folds = [x for x in range(1, nfold + 1)] likeli_all = pd.DataFrame() for L in range(1, len(folds) + 1): for train_folds in itertools.combinations(folds, L): print train_folds test_folds = [x for x in folds if not x in list(train_folds)] if len(test_folds) == 0: test_folds = [0] print test_folds for fnames in fnames_list: likeli_table_name = '_'.join(fnames) + '_likeli_table' generate_likelihood_table(likeli_table_name, fnames, ftablename, train_folds) likeli = merge_likelihood_tables(fnames_list, ftablename, train_folds) for fnames in fnames_list: likeli_table_name = '_'.join(fnames) + '_likeli_table' drop_likelihood_table(likeli_table_name) for x in folds: if x in list(train_folds): likeli['fold' + str(x)] = 1 else: likeli['fold' + str(x)] = 0 likeli_all = likeli_all.append(likeli, ignore_index=True) col = likeli.columns[1] likeli = pd.merge(likeli, train[['patient_id', 'is_screener']], on='patient_id', how='inner') if len(likeli) > 0: print "Pearson correlation: " + str( pearsonr(likeli.is_screener, likeli[col])) print "AUC: " + str(auc(likeli.is_screener, likeli[col])) del likeli file_name = likeli_all.columns[1] feats_all = train[['patient_id']].append(test[['patient_id']], ignore_index=True) for test_fold in ([0] + folds): train_folds = [x for x in folds if (x != test_fold) and (x != 0)] if len(train_folds) == len(folds): pd_query = ' and '.join( ['fold' + str(x) + '==1' for x in train_folds]) else: pd_query = ' and '.join( ['fold' + str(x) + '==1' for x in train_folds]) + ' and ' + ' and '.join([ 'fold' + str(x) + '==0' for x in folds if not x in train_folds ]) print pd_query likeli = likeli_all.query(pd_query).copy().reset_index(drop=True) for x in folds: likeli.drop('fold' + str(x), axis=1, inplace=True) if test_fold == 0: feats_fold = test[['patient_id']].copy() else: feats_fold = train.query('cv_index==@test_fold')[['patient_id' ]].copy() feats_fold = pd.merge(feats_fold, likeli, on='patient_id', how='left') del likeli for val_fold in [x for x in folds if (x != test_fold) and (x != 0)]: train_folds = [ x for x in folds if (x != test_fold) and (x != val_fold) and (x != 0) ] pd_query = ' and '.join( ['fold' + str(x) + '==1' for x in train_folds]) + ' and ' + ' and '.join([ 'fold' + str(x) + '==0' for x in folds if not x in train_folds ]) likeli = likeli_all.query(pd_query).copy().reset_index(drop=True) for x in folds: likeli.drop('fold' + str(x), axis=1, inplace=True) feats_val_fold = train.query('cv_index==@val_fold')[['patient_id' ]].copy() feats_val_fold = pd.merge(feats_val_fold, likeli, on='patient_id', how='left') del likeli feats_fold = feats_fold.append(feats_val_fold, ignore_index=True) feats_fold = feats_fold.reset_index(drop=True) for cols in [x for x in feats_fold.columns if x != 'patient_id']: feats_fold[cols].fillna(global_mean * len(fnames), inplace=True) feats_fold = feats_fold.rename( columns={cols: cols + '_fold_' + str(test_fold)}) feats_all = pd.merge(feats_all, feats_fold, on='patient_id', how='left') print "Writing to HDF5 store..." store = pd.HDFStore('../data/output-py/' + file_name + '.h5') store.append('feats_all', feats_all) print "Feature " + file_name + " is saved in file." store.close() return file_name
        y_pred = train_predict_ftrl(X_train, y_train, X_test)

    preds = pd.DataFrame()
    preds['ID'] = test_split['ID'].values
    preds['FOLD'] = fold
    preds['ITER'] = it
    preds[MODEL] = y_pred
    preds_model = preds_model.append(preds, ignore_index=True)

    preds = preds.loc[preds['ID'].isin(ids_val)].copy()
    preds = pd.merge(preds, train[['ID', 'TARGET']], on='ID', how='left')
    fold_auc = auc(preds['TARGET'], preds[MODEL])
    aucs.append(fold_auc)

print np.mean(aucs), np.std(aucs)

preds_model.loc[preds_model[MODEL] < 0, MODEL] = 0.0
preds_model.loc[preds_model[MODEL] > 1, MODEL] = 1.0
preds_model = preds_model.groupby(['ID', 'ITER'])[MODEL].mean().reset_index()
for it in range(1, 21):
    preds_model.loc[preds_model['ITER'] == it, MODEL] = preds_model.loc[preds_model['ITER'] == it, MODEL].rank()
preds_model = preds_model.groupby('ID')[MODEL].mean().reset_index()
preds_model.columns = ['ID', 'dmitry_' + MODEL]
preds_all = pd.merge(preds_all, preds_model, on='ID', how='left')
preds_all.to_csv('all_models_temp.csv', index=False)
train = coder.fit_transform(train)

models = []
models.append(RandomForestClassifier(n_estimators=165, max_depth=4, criterion='entropy'))
models.append(GradientBoostingClassifier(max_depth=4))
models.append(KNeighborsClassifier(n_neighbors=20))
models.append(GaussianNB())

TRNtrain, TRNtest, TARtrain, TARtest = train_test_split(train, target, test_size=0.3, random_state=0)

plt.figure(figsize=(10, 10))
for model in models:
    model.fit(TRNtrain, TARtrain)
    pred_scr = model.predict_proba(TRNtest)[:, 1]
    fpr, tpr, thresholds = roc_curve(TARtest, pred_scr)
    roc_auc = ml_metrics.auc(TARtest, pred_scr)
    md = str(model)
    md = md[:md.find('(')]
    pl.plot(fpr, tpr, label='ROC fold %s (auc = %0.2f)' % (md, roc_auc))

pl.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6))
pl.xlim([0, 1])
pl.ylim([0, 1])
pl.xlabel('False Positive Rate')
pl.ylabel('True Positive Rate')
pl.title('Receiver operating characteristic example')
pl.legend(loc="lower right")
pl.show()

# convert the test set to the required format