Example #1
 def test_auc(self):
     self.assertAlmostEqual(metrics.auc([1, 0, 1, 1], [.32, .52, .26, .86]),
                            1.0 / 3)
     self.assertAlmostEqual(
         metrics.auc([1, 0, 1, 0, 1], [.9, .1, .8, .1, .7]), 1)
     self.assertAlmostEqual(metrics.auc([0, 1, 1, 0], [.2, .1, .3, .4]),
                            1.0 / 4)
     self.assertAlmostEqual(
         metrics.auc([1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
                     [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), 1.0 / 2)
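Note: ml_metrics.auc computes a rank-based ROC AUC, so for binary labels it should agree with scikit-learn's roc_auc_score. A minimal self-contained check (assuming both packages are installed; the expected value comes from the test above):

import ml_metrics
from sklearn.metrics import roc_auc_score

labels = [1, 0, 1, 1]
scores = [.32, .52, .26, .86]
print(ml_metrics.auc(labels, scores))  # 0.3333...
print(roc_auc_score(labels, scores))   # 0.3333...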
Example #2
	def run(self, hyper_classifier, training_data, training_target, testing_data, testing_target):
		'''
		Fit a classifier built by hyper_classifier on the training data,
		predict on the test data, and return a dict of evaluation metrics.
		MemoryError is re-raised; any other exception is recorded in
		results['exception'] instead of aborting the run.
		'''
		results = {'name': self.name, 'parameterization': self.parameterization, 'exception': None}
		try:
			self.classifier = hyper_classifier.make_classifier(training_data, training_target, **self.parameterization)
			self.classifier.fit(training_data, training_target)
			results['predicted'] = self.classifier.predict(testing_data)
		except MemoryError:
			raise
		except Exception as e:
			print(repr(e))
			results['exception'] = e
		else:
			# attempt to save memory
			del self.classifier
			self.classifier = None

			results['ml_metric_ce'] = ml_metrics.ce(testing_target, results['predicted'])
			results['ml_metric_rmse'] = ml_metrics.rmse(testing_target, results['predicted'])
			results['sklearn_metric_accuracy'] = sklearn.metrics.accuracy_score(testing_target, results['predicted'])
			results['sklearn_metric_f1'] = sklearn.metrics.f1_score(testing_target, results['predicted'])
			results['sklearn_metric_precision'] = sklearn.metrics.precision_score(testing_target, results['predicted'])
			results['sklearn_metric_recall'] = sklearn.metrics.recall_score(testing_target, results['predicted'])

			results['ml_metric_auc'] = {}
			results['sklearn_metric_auc'] = {}
			for label in set(testing_target):
				# one-vs-rest: treat the current label as the positive class
				binary_testing_target = np.array([1 if x == label else 0 for x in testing_target])
				binary_predicted = np.array([1 if x == label else 0 for x in results['predicted']])
				results['ml_metric_auc'][label] = ml_metrics.auc(binary_testing_target, binary_predicted)
				# sklearn.metrics.auc_score was removed; roc_auc_score is its modern equivalent
				results['sklearn_metric_auc'][label] = sklearn.metrics.roc_auc_score(binary_testing_target, binary_predicted)

		return results
Example #3
def score():
    gold = pandas.read_table(insults.DataFile('Inputs', 'test_with_solutions.csv'), sep=',')
    private = gold[gold.Usage == 'PrivateTest'].Insult
    public = gold[gold.Usage == 'PublicTest'].Insult
    data = []
    for fn in os.listdir(insults.DataDirectory('Submissions')):
        if fn[-4:] == ".csv":
            guess = pandas.read_table(insults.DataFile('submissions', fn), sep=',')
            pub_guess = guess.Insult[public.index]
            priv_guess = guess.Insult[private.index]
            data.append({"fn": fn[:-4],
                         "score": ml_metrics.auc(gold.Insult, guess.Insult),
                         "public": ml_metrics.auc(np.array(public), np.array(pub_guess)),
                         "private": ml_metrics.auc(np.array(private), np.array(priv_guess)),
                         })

    print pandas.DataFrame(data, columns=("fn", "score", "public", "private")).sort('score')
Example #4
def score(pred, y):
    '''
    Score the final test result; which metric applies depends on the evaluation standard, so this needs to be changed each time.
    '''
    print(y, pred)

    metric = metrics.auc(y, pred)
    print(metric)
    return -metric
Example #5
def first_test():
    from ml_metrics import auc
    import random
    from sklearn import datasets

    b = BasicLogisticRegression(4)

    iris = datasets.load_iris()
    train_data = iris.data[:75]
    train_y = iris.target[:75]

    test_x = iris.data[75:100]
    tmp = iris.target[:100]
    random.shuffle(tmp)
    test_y = tmp[:25]  # 25 labels to pair with the 25 test samples

    def to_dict(x):
        return {i: k for i, k in enumerate(x, start=1)}

    for z in xrange(50):
        pairs = zip(train_data, train_y)
        random.shuffle(pairs)  # shuffles in place; random.shuffle returns None
        for x, y in pairs:
            # print x, y
            b.sgd_fit_one(to_dict(x), y)
    print "fit done"

    rst_y = map(b.predict_raw, map(to_dict, test_x))
    print b.weights
    print test_y
    print rst_y
    print auc(test_y, rst_y)
    # print len(iris.data)

    # another implementation
    from sgd import log_reg_sgd, h

    theta, err = log_reg_sgd(train_data, train_y, 0.001, max_iter=100)
    pred = [h(i, theta) for i in test_x]
    print "theta,", theta
    print auc(test_y, pred)
Example #6
	def staged_auc(self,X,y):
		"""
		calculate the AUC after each of the stages.

		returns: ns   -- list of iteration numbers
		         aucs -- list of corresponding areas under the curve.
		"""
		y = np.array(y)
		results = [ (n, ml_metrics.auc(y,p)) for n,p in self.staged_predict(X)]

		return zip(*results) # Python idiom unzips list into two parallel ones.
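The same staged-evaluation pattern can be reproduced with a stock scikit-learn booster; this is a self-contained sketch (synthetic data, GradientBoostingClassifier), not the project's own classifier:

import ml_metrics
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier

X, y = make_classification(200, random_state=0)
clf = GradientBoostingClassifier(n_estimators=50).fit(X, y)

# AUC after each boosting stage, mirroring staged_auc above
aucs = [ml_metrics.auc(y, p[:, 1]) for p in clf.staged_predict_proba(X)]
print(list(enumerate(aucs, start=1))[:5])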
Example #9
def objective(df, selector, trial):
    selector.set_trial(trial)

    # NOTE: in practical use, tune against a separate validation set.
    df_trn, df_tst = train_test_split(df, test_size=0.5)
    X_trn = selector.fit_transform(df_trn).values
    X_tst = selector.transform(df_tst).values

    model = LogisticRegression(solver="lbfgs", max_iter=10000)
    model.fit(X_trn, df_trn["target"])
    y_pred = model.predict_proba(X_tst)[:, 1]

    score = auc(df_tst["target"].values, y_pred)
    return score
Example #10
def main():
    markdown = PagedownToHtml()

    print("Reading the private leaderboard file")
    test = data_io.get_test_df()
    for i in test.index:
        test["BodyMarkdown"][i] = markdown.convert(test["BodyMarkdown"][i])

    print("Loading the trained model")
    classifier = data_io.load_model("model.pickle")

    print("Making predictions")
    probs = classifier.predict_proba(test)

    solution = data_io.get_private_leaderboard_solution_df()
    print("Open AUC: %0.6f" % metrics.auc(solution["open"], probs[:,1]))
Example #11
def tune_one_fold(i,train_i,test_i):
	"""
	Tune one fold of the data.
	"""
	global train
	clf = make_clf(args)
	ftrain = train[train_i]
	logging.info('fold %d' % i)
	clf.fit(ftrain.Comment,ftrain.Insult)
	ypred = clf.predict(ftrain.Comment) 
	logging.info("%d train auc=%f" % (i, ml_metrics.auc(np.array(ftrain.Insult),ypred)))
	ypred = clf.predict(train[test_i].Comment)
	# record information about the auc at each stage of training.
	xs,ys = clf.staged_auc(train[test_i].Comment,train[test_i].Insult)
	xs = np.array(xs)
	ys = np.array(ys)		
	return pandas.DataFrame({ ('auc%d' % i):ys},index=xs)
Example #13
def classification_model(model, m, predictors, outcome):
    #Fit the model:
    model.fit(m[predictors], m[outcome])

    #Make predictions on training set:
    predictions = model.predict(m[predictors])

    #Print accuracy
    accuracy = metrics.accuracy_score(predictions, m[outcome])
    print "Accuracy : %s" % "{0:.3%}".format(accuracy)

    auc = metrics.auc(predictions, m[outcome])
    print "Auc : %s" % "{0:.3%}".format(auc)

    recall = metrics.recall_score(predictions, m[outcome])
    print "Recall : %s" % "{0:.3%}".format(recall)

    #Fit the model again so that it can be referred to outside the function:
    model.fit(m[predictors], m[outcome])
Example #14
models = []
models.append(RandomForestClassifier(n_estimators=165, max_depth=4, criterion='entropy'))
models.append(GradientBoostingClassifier(max_depth=4))
models.append(KNeighborsClassifier(n_neighbors=20))
models.append(GaussianNB())

TRNtrain, TRNtest, TARtrain, TARtest = train_test_split(train,
                                                        target,
                                                        test_size=0.3,
                                                        random_state=0)

plt.figure(figsize=(10, 10))
for model in models:
    model.fit(TRNtrain, TARtrain)
    pred_scr = model.predict_proba(TRNtest)[:, 1]
    fpr, tpr, thresholds = roc_curve(TARtest, pred_scr)
    roc_auc = ml_metrics.auc(TARtest, pred_scr)
    md = str(model)
    md = md[:md.find('(')]
    plt.plot(fpr, tpr, label='ROC %s (auc = %0.2f)' % (md, roc_auc))

plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6))
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

# convert the test set to the required format
FieldDrop.append('bad')
Example #15
def calculate_ftrl_features(train, test, fnames, ftablename, ftrl_type='', optional_date_ftrl3='', optional_condition_ftrl4=''):
    folds = [x for x in range(1, nfold+1)]
    global_mean = np.mean(train.is_screener)
    pred_file = '../data/output-py/ftrl/pred_ftrl.csv'

    ftrl_all = pd.DataFrame()
    count = 0
    for L in range(1, len(folds)+1):
        for train_folds in itertools.combinations(folds, L):
            count = count + 1
            print train_folds
            test_folds = [x for x in folds if not x in list(train_folds)]
            if len(test_folds) == 0:
                test_folds = [0]
            print test_folds

            if False:
                store = pd.HDFStore('../data/output-py/ftrl/ftrl_feats' + str(count) + '.h5')
                ftrl_feats = store.get('ftrl_feats')
                store.close()
            else:
                train_file = save_ftrl_data('train', fnames, ftablename, test_folds, list(train_folds), ftrl_type, optional_date_ftrl3, optional_condition_ftrl4)
                if 0 in test_folds:
                    test_file = save_ftrl_data('test', fnames, ftablename, test_folds, list(train_folds), ftrl_type, optional_date_ftrl3, optional_condition_ftrl4)
                else:
                    test_file = save_ftrl_data('val', fnames, ftablename, test_folds, list(train_folds), ftrl_type, optional_date_ftrl3, optional_condition_ftrl4)

                non_factor_cols = "''"
                non_feature_cols = "''"
                text_cols = "'diagnosis_description'"

                os.system('pypy ftrl' + ftrl_type + '.py' +
                          ' --alpha ' + str(0.07) +
                          ' --beta ' + str(1.0) +
                          ' --L1 ' + str(0.01) +
                          ' --L2 ' + str(1.0) +
                          ' --epoch ' + str(1) +
                          ' --train ' + train_file +
                          ' --test ' + test_file +
                          ' --submission ' + pred_file +
                          ' --non_feature_cols ' + non_feature_cols +
                          ' --non_factor_cols ' + non_factor_cols + 
                          ' --text_cols ' + text_cols)

                ftrl_feats = pd.read_csv(pred_file)
                ftrl_feats = ftrl_feats.groupby('patient_id')['is_screener_pred'].max().reset_index()

                for x in folds:
                    if x in list(train_folds):
                        ftrl_feats['fold'+str(x)] = 1
                    else:
                        ftrl_feats['fold'+str(x)] = 0
                store = pd.HDFStore('../data/output-py/ftrl/ftrl_feats' + str(count) + '.h5')
                store.append('ftrl_feats', ftrl_feats)
                store.close()
                os.system('rm -R ' + train_file)
                os.system('rm -R ' + test_file)
                os.system('rm -R ' + pred_file)

            ftrl_all = ftrl_all.append(ftrl_feats, ignore_index=True)

            ftrl_feats = pd.merge(ftrl_feats, train[['patient_id', 'is_screener']], on='patient_id', how='inner')
            if len(ftrl_feats)>0:
                print "Pearson correlation: " + str(pearsonr(ftrl_feats.is_screener, ftrl_feats.is_screener_pred))
                print "AUC: " + str(auc(ftrl_feats.is_screener, ftrl_feats.is_screener_pred))
            del ftrl_feats

    feats_all = train[['patient_id']].append(test[['patient_id']], ignore_index=True)
    for test_fold in ([0] + folds):
        train_folds = [x for x in folds if (x != test_fold) and (x != 0)]

        if len(train_folds) == len(folds):
            pd_query = ' and '.join(['fold'+str(x)+'==1' for x in train_folds])
        else:
            pd_query = ' and '.join(['fold'+str(x)+'==1' for x in train_folds]) + ' and ' + ' and '.join(['fold'+str(x)+'==0' for x in folds if not x in train_folds])
        print pd_query

        ftrl_feats = ftrl_all.query(pd_query).copy().reset_index(drop=True)
        for x in folds:
            ftrl_feats.drop('fold'+str(x), axis=1, inplace=True)

        if test_fold == 0:
            feats_fold = test[['patient_id']].copy()
        else:
            feats_fold = train.query('cv_index==@test_fold')[['patient_id']].copy()
        feats_fold = pd.merge(feats_fold, ftrl_feats, on='patient_id', how='left')
        del ftrl_feats

        for val_fold in [x for x in folds if (x != test_fold) and (x != 0)]:
            train_folds = [x for x in folds if (x != test_fold) and (x != val_fold) and (x != 0)]
            pd_query = ' and '.join(['fold'+str(x)+'==1' for x in train_folds]) + ' and ' + ' and '.join(['fold'+str(x)+'==0' for x in folds if not x in train_folds])
            
            ftrl_feats = ftrl_all.query(pd_query).copy().reset_index(drop=True)
            for x in folds:
                ftrl_feats.drop('fold'+str(x), axis=1, inplace=True)

            feats_val_fold = train.query('cv_index==@val_fold')[['patient_id']].copy()
            feats_val_fold = pd.merge(feats_val_fold, ftrl_feats, on='patient_id', how='left')
            del ftrl_feats
            feats_fold = feats_fold.append(feats_val_fold, ignore_index=True)

        feats_fold = feats_fold.reset_index(drop=True)
        feats_fold['is_screener_pred'].fillna(global_mean, inplace=True)
        feats_fold = feats_fold.rename(columns={'is_screener_pred' : '_'.join(fnames) + '_' + ftablename + '_ftrl' + ftrl_type + '_fold_'+str(test_fold)})
        feats_all = pd.merge(feats_all, feats_fold, on='patient_id', how='left')

    print "Writing to HDF5 store..."
    store = pd.HDFStore('../data/output-py/' + '_'.join(fnames) + '_' + ftablename + '_ftrl' + ftrl_type + '.h5')
    store.append('feats_all', feats_all)
    print 'Feature ' + '_'.join(fnames) + '_' + ftablename + '_ftrl' + ftrl_type + ' is saved in file.'
    store.close()
    return '_'.join(fnames) + '_' + ftablename + '_ftrl' + ftrl_type
Example #16
def reverse_auc(labels, predictions):
    target_neg_one = [1 if x == -1 else 0 for x in labels]
    neg_predictions = [-x for x in predictions]
    score = ml_metrics.auc(target_neg_one, neg_predictions)
    return score
Example #17
import torch
from ml_metrics import auc
from sklearn.datasets import make_classification
from tqdm import tqdm_notebook  # assumed imports; not shown in the original excerpt

X, y = make_classification(1000000)
t_X, t_y = map(torch.FloatTensor, (X, y))

net = LogsticRegression(20, 2)  # custom module defined elsewhere in the project (name sic)
loss_func = torch.nn.modules.loss.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters())

epochs = 2         # assumed value; not defined in the excerpt
batch_size = 1024  # assumed value; not defined in the excerpt

bar_epochs = tqdm_notebook(range(epochs))
for e in bar_epochs:
    bar_epochs.set_description(f"Epoch {e}:")
    t = tqdm_notebook(range(0, t_X.size(0), batch_size))
    for b in t:  # for each training step
        # train your data...
        b_X = t_X[b:b + batch_size]
        b_y = t_y[b:b + batch_size]
        output = net(b_X)  # model output
        loss = loss_func(
            output,
            b_y.long().view(-1))  # cross entropy loss and y is not one-hotted
        optimizer.zero_grad()  # clear gradients for this training step
        loss.backward()  # backpropagation, compute gradients
        optimizer.step()
        if b % 10000 == 0:
            t.set_description(
                f"Epoch {e}:"
                f"Loss: {loss.data.numpy():.5f} | "
                f"Auc: {auc(b_y.numpy(), output.data.numpy()[:, 1]):.5}")

_net = net.eval()
auc(y, _net(t_X).data.numpy()[:, -1])
Example #18
                y_pred = train_predict_adaboost_classifier(X_train, y_train, X_test)

            if 'ftrl' in MODEL:
                y_pred = train_predict_ftrl(X_train, y_train, X_test)
            
            preds = pd.DataFrame()
            preds['ID'] = test_split['ID'].values
            preds['FOLD'] = fold
            preds['ITER'] = it
            preds[MODEL] = y_pred
            preds_model = preds_model.append(preds, ignore_index=True)

            preds = preds.loc[preds['ID'].isin(ids_val)].copy()
            preds = pd.merge(preds, train[['ID', 'TARGET']], on='ID', how='left')

            fold_auc = auc(preds['TARGET'], preds[MODEL])
            aucs.append(fold_auc)
        print np.mean(aucs), np.std(aucs)

    preds_model.loc[preds_model[MODEL]<0, MODEL] = 0.0
    preds_model.loc[preds_model[MODEL]>1, MODEL] = 1.0
    preds_model = preds_model.groupby(['ID', 'ITER'])[MODEL].mean().reset_index()
    for it in range(1, 21):
        preds_model.loc[preds_model['ITER']==it, MODEL] = preds_model.loc[preds_model['ITER']==it, MODEL].rank()
    preds_model = preds_model.groupby('ID')[MODEL].mean().reset_index()
    preds_model.columns = ['ID', 'dmitry_'+MODEL]
    preds_all = pd.merge(preds_all, preds_model, on='ID', how='left')
    preds_all.to_csv('all_models_temp.csv', index=False)

preds_train = pd.merge(train[['ID']], preds_all, on='ID', how='left')
preds_train.to_csv(OUTPUT_PATH + 'train/' + 'dmitry_train.csv', index=False)
Example #19
def forward_auc(labels, predictions):
    target_one = [1 if x == 1 else 0 for x in labels]
    score = ml_metrics.auc(target_one, predictions)
    return score
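forward_auc (Example #19) and reverse_auc (Example #16) score the +1 and -1 classes one-vs-rest. A hedged sketch of combining them into one symmetric score for labels in {-1, 1} (the 0.5 weighting is an assumption, not something shown in these examples):

import ml_metrics

def combined_auc(labels, predictions):
    # forward: treat +1 as positive; reverse: treat -1 as positive and negate scores
    forward = ml_metrics.auc([1 if x == 1 else 0 for x in labels], predictions)
    reverse = ml_metrics.auc([1 if x == -1 else 0 for x in labels],
                             [-p for p in predictions])
    return 0.5 * (forward + reverse)

print(combined_auc([1, -1, 1, -1], [0.9, 0.2, 0.6, 0.4]))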
Example #20
'compute AUC from VW validation and predictions file'

import sys, csv, math
from ml_metrics import auc

test_file = sys.argv[1]
predictions_file = sys.argv[2]

test_reader = csv.reader(open(test_file), delimiter=" ")
p_reader = csv.reader(open(predictions_file), delimiter="\n")

ys = []
ps = []

for p_line in p_reader:
    test_line = test_reader.next()

    p = float(p_line[0])
    p = math.tanh(p)
    ps.append(p)

    y = float(test_line[0])
    ys.append(y)

AUC = auc(ys, ps)

print "AUC: %s" % (AUC)
print
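Usage for the script above (file names are placeholders): python auc.py test.vw predictions.txt. The label is read from the first whitespace-delimited column of the VW test file, and each line of the predictions file is a raw VW score, which is passed through tanh before the AUC is computed.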
Example #22
def stacking_model_sk_svc(task='together'):
    nfold = 5
    #task='together'

    train_df = None
    test_df = None

    if task == 'together':
        train_df = pd.read_csv('./data/train_df_day_night_together.csv')
        test_df = pd.read_csv('./data/test_df_day_night_together.csv')
        from together_fn_param import list_param
    elif task == 'split':
        train_df = pd.read_csv('./data/train_df_day_night_split.csv')
        test_df = pd.read_csv('./data/test_df_day_night_split.csv')
        from split_fn_param import list_param

    train_df = train_df.fillna(-1)
    test_df = test_df.fillna(-1)

    print("Data loading Done!")
    target = 'label'
    predictors = train_df.columns.values.tolist()[1:-1]
    categorical = None

    X_train = train_df[predictors].values
    X_test = test_df[predictors].values
    labels = train_df['label']
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    seeds = np.random.randint(5000, 10000, size=10).tolist()
    auc_lst = []
    auc_lst1 = []
    n_estimators_lst = []
    stratified = True
    debug = True
    param = list_param('sk_svc')
    oof_preds_folds = np.zeros((train_df.shape[0], len(seeds)))
    sub_preds_folds = np.zeros((test_df.shape[0], len(seeds)))
    sub_preds_folds_vote = np.zeros((test_df.shape[0], len(seeds)))
    oof_preds_folds_vote = np.zeros((train_df.shape[0], len(seeds)))
    feature_importance_df_folds = pd.DataFrame()

    list_thresholds_global = []
    for seed_id in range(len(seeds)):
        if stratified:
            folds = StratifiedKFold(n_splits=nfold,
                                    shuffle=True,
                                    random_state=seeds[seed_id])
        else:
            folds = KFold(n_splits=nfold, shuffle=True, random_state=1001)
        oof_preds = np.zeros(train_df.shape[0])
        sub_preds = np.zeros(test_df.shape[0])
        oof_preds_local_vote = np.zeros(train_df.shape[0])
        sub_preds_local_vote = np.zeros((test_df.shape[0], nfold))
        feature_importance_df = pd.DataFrame()

        gfold_Id = list(folds.split(X_train, labels))
        params_iter = {'random_state': seeds[seed_id]}
        param.update(params_iter)

        clf = SVC(
            C=param['C'],
            kernel='rbf',
            gamma=param['gamma'],
            shrinking=True,
            probability=True,
            tol=param['tol'],  # 0.001,#may be 0.0001 for stoping criteria
            max_iter=int(param['max_iter']),
            verbose=False,
            decision_function_shape='ovr',
            random_state=seeds[seed_id])

        for n_fold, (train_idx,
                     valid_idx) in enumerate(folds.split(X_train, labels)):
            xtrain, xtest = X_train[train_idx, :], X_train[valid_idx, :]
            ytrain, ytest = labels[train_idx], labels[valid_idx]

            clf.fit(xtrain, ytrain)

            oof_preds[valid_idx] = clf.predict_proba(xtest)[:, 1]
            pred = clf.predict_proba(X_test)[:, 1]
            sub_preds += pred / folds.n_splits

            fpr, tpr, thresholds = metrics.roc_curve(
                train_df[target].iloc[valid_idx], oof_preds[valid_idx])
            optimal_idx = np.argmax(tpr - fpr)
            optimal_thresholds = thresholds[optimal_idx]

            list_thresholds_global.append(optimal_thresholds)

            sub_preds_local_vote[:, n_fold] = [
                1 if y_cont > optimal_thresholds else 0 for y_cont in pred
            ]
            oof_preds_local_vote[valid_idx] = [
                1 if y_cont > optimal_thresholds else 0
                for y_cont in oof_preds[valid_idx]
            ]

            print('Fold %2d AUC : %.6f' %
                  (n_fold + 1, roc_auc_score(ytest, oof_preds[valid_idx])))

            del xtrain, xtest, ytrain, ytest
            gc.collect()

        oof_preds_folds[:, seed_id] = oof_preds
        sub_preds_folds[:, seed_id] = sub_preds
        from scipy import stats

        a, b = stats.mode(sub_preds_local_vote, axis=1)
        oof_preds_folds_vote[:, seed_id] = oof_preds_local_vote
        sub_preds_folds_vote[:, seed_id] = a.reshape(-1)
        feature_importance_df_folds = pd.concat(
            [feature_importance_df_folds, feature_importance_df], axis=0)
        auc_lst.append(ml_metrics.auc(train_df[target], oof_preds))
        auc_lst1.append(roc_auc_score(train_df[target], oof_preds))
        print('Full AUC score %.6f' %
              roc_auc_score(train_df[target], oof_preds))
        print("auc_lst1")
        print(auc_lst1)

    print(list_thresholds_global)
    #oof_preds_folds = pd.DataFrame(oof_preds_folds,columns=['sk_svc_seed_' + str(seeds[l]) for l in range(len(seeds))])
    #sub_preds_folds = pd.DataFrame(sub_preds_folds,columns=['sk_svc_seed_' + str(seeds[l]) for l in range(len(seeds))])
    oof_preds_folds_vote = pd.DataFrame(
        oof_preds_folds_vote,
        columns=['sk_svc_seed_' + str(seeds[l]) for l in range(len(seeds))])
    sub_preds_folds_vote = pd.DataFrame(
        sub_preds_folds_vote,
        columns=['sk_svc_seed_' + str(seeds[l]) for l in range(len(seeds))])

    #oof_preds_folds.to_csv("../" + task + "_train_stack/sk_svc.csv", index=False)
    #sub_preds_folds.to_csv("../" + task + "_test_stack/sk_svc.csv", index=False)
    oof_preds_folds_vote.to_csv("./output/" + task +
                                "_train_stack_vote/sk_svc.csv",
                                index=False)
    sub_preds_folds_vote.to_csv("./output/" + task +
                                "_test_stack_vote/sk_svc.csv",
                                index=False)
Example #23
def calculate_likelihoods(train, test, fnames, ftablename, function_type='max', query_type='', optional_filter_feature_likeli6='', optional_filter_value_likeli6=''):
    global_mean = np.mean(train.is_screener)
    folds = [x for x in range(1, nfold+1)]

    likeli_all = pd.DataFrame()
    for L in range(1, len(folds)+1):
        for train_folds in itertools.combinations(folds, L):
            print train_folds
            sql_query = open('genentech-sql/pattern_likeli_multiple' + query_type + '.sql').read()
            sql_query = sql_query.replace('FEATURE_TABLE_NAME', ftablename)
            sql_query = sql_query.replace('GENERIC_FEATURE_NAME', '_'.join(fnames))
            sql_query = sql_query.replace('FEATURE_NAMES_COMMA_SEPARATED', ','.join(fnames))
            sql_query = sql_query.replace('T1_COMMA_SEPARATED', ','.join(['t1.'+x for x in fnames]))
            sql_query = sql_query.replace('T3_T4_CONDITION', ' AND '.join(['t3.'+x+'=t4.'+x for x in fnames]))
            sql_query = sql_query.replace('OPTIONAL_CV_EXPRESSION', 'WHERE ' + ' OR '.join(['cv_index='+str(x) for x in list(train_folds)]))
            sql_query = sql_query.replace('GROUP_FUNCTION', function_type)
            sql_query = sql_query.replace('OPTIONAL_CONDITION_LIKELI6', 'WHERE ' + optional_filter_feature_likeli6 + "='" + optional_filter_value_likeli6 + "'")
            #sql_query = sql_query.replace('OPTIONAL_CONDITION_LIKELI6', 'WHERE ' + optional_filter_feature_likeli6 + ">=" + optional_filter_value_likeli6)
            if len(list(train_folds)) == len(folds):
                choosing_patients_expression = 'patients_test2'
            else:
                choosing_patients_expression = 'train_cv_indices ' + 'WHERE ' + ' OR '.join(['cv_index='+str(x) for x in folds if not x in list(train_folds)])
            sql_query = sql_query.replace('CHOOSING_PATIENTS_EXPRESSION', choosing_patients_expression)

            conn = utils.connect_to_database()
            cur = conn.cursor()
            cur.execute(sql_query)
            if (query_type == '3') or (query_type == '4') or (query_type == '5'):
                conn.commit()
                sql_query = open('genentech-sql/pattern_likeli_multiple' + query_type + '_2.sql').read()
                sql_query = sql_query.replace('GENERIC_FEATURE_NAME', '_'.join(fnames))
                sql_query = sql_query.replace('FEATURE_TABLE_NAME', ftablename)
                cur.execute(sql_query)
                likeli = pd.DataFrame(cur.fetchall())
                likeli.columns = [x.name for x in cur.description]
                cur.execute('DROP TABLE patient_likeli_table;')
                conn.commit()
            else:
                likeli = pd.DataFrame(cur.fetchall())
                likeli.columns = [x.name for x in cur.description]

            for x in folds:
                if x in list(train_folds):
                    likeli['fold'+str(x)] = 1
                else:
                    likeli['fold'+str(x)] = 0
            cur.close()
            conn.close()
            
            likeli_all = likeli_all.append(likeli, ignore_index=True)
            col = likeli.columns[1]
            likeli = pd.merge(likeli, train[['patient_id', 'is_screener']], on='patient_id', how='inner')
            if len(likeli)>0:
                print "Pearson correlation: " + str(pearsonr(likeli.is_screener, likeli[col]))
                print "AUC: " + str(auc(likeli.is_screener, likeli[col]))
            del likeli

    feats_all = train[['patient_id']].append(test[['patient_id']], ignore_index=True)
    for test_fold in ([0] + folds):
        train_folds = [x for x in folds if (x != test_fold) and (x != 0)]

        if len(train_folds) == len(folds):
            pd_query = ' and '.join(['fold'+str(x)+'==1' for x in train_folds])
        else:
            pd_query = ' and '.join(['fold'+str(x)+'==1' for x in train_folds]) + ' and ' + ' and '.join(['fold'+str(x)+'==0' for x in folds if not x in train_folds])
        print pd_query

        likeli = likeli_all.query(pd_query).copy().reset_index(drop=True)
        for x in folds:
            likeli.drop('fold'+str(x), axis=1, inplace=True)

        if test_fold == 0:
            feats_fold = test[['patient_id']].copy()
        else:
            feats_fold = train.query('cv_index==@test_fold')[['patient_id']].copy()
        feats_fold = pd.merge(feats_fold, likeli, on='patient_id', how='left')
        del likeli

        for val_fold in [x for x in folds if (x != test_fold) and (x != 0)]:
            train_folds = [x for x in folds if (x != test_fold) and (x != val_fold) and (x != 0)]
            pd_query = ' and '.join(['fold'+str(x)+'==1' for x in train_folds]) + ' and ' + ' and '.join(['fold'+str(x)+'==0' for x in folds if not x in train_folds])
            
            likeli = likeli_all.query(pd_query).copy().reset_index(drop=True)
            for x in folds:
                likeli.drop('fold'+str(x), axis=1, inplace=True)

            feats_val_fold = train.query('cv_index==@val_fold')[['patient_id']].copy()
            feats_val_fold = pd.merge(feats_val_fold, likeli, on='patient_id', how='left')
            del likeli
            feats_fold = feats_fold.append(feats_val_fold, ignore_index=True)

        col = feats_fold.columns[1]
        feats_fold = feats_fold.reset_index(drop=True)
        feats_fold[col].fillna(global_mean, inplace=True)
        #feats_fold[fname_w_likeli].fillna(global_mean, inplace=True)
        feats_fold = feats_fold.rename(columns={col : col+'_fold_'+str(test_fold)})
        #feats_fold = feats_fold.rename(columns={fname_w_likeli : fname_w_likeli+'_fold_'+str(test_fold)})
        feats_all = pd.merge(feats_all, feats_fold, on='patient_id', how='left')

    print "Writing to HDF5 store..."
    store = pd.HDFStore('../data/output-py/' + col + '.h5')
    store.append('feats_all', feats_all)
    store.close()
    conn.close()
    print "Feature " + col + " is saved in file."
    return col
Example #24
def calculate_likelihoods2(train, test, fnames_list, ftablename):
    global_mean = np.mean(train.is_screener)
    folds = [x for x in range(1, nfold+1)]

    likeli_all = pd.DataFrame()
    for L in range(1, len(folds)+1):
        for train_folds in itertools.combinations(folds, L):
            print train_folds
            test_folds = [x for x in folds if not x in list(train_folds)]
            if len(test_folds) == 0:
                test_folds = [0]
            print test_folds

            for fnames in fnames_list:
                likeli_table_name = '_'.join(fnames) + '_likeli_table'
                generate_likelihood_table(likeli_table_name, fnames, ftablename, train_folds)
            likeli = merge_likelihood_tables(fnames_list, ftablename, train_folds)
            for fnames in fnames_list:
                likeli_table_name = '_'.join(fnames) + '_likeli_table'
                drop_likelihood_table(likeli_table_name)

            for x in folds:
                if x in list(train_folds):
                    likeli['fold'+str(x)] = 1
                else:
                    likeli['fold'+str(x)] = 0
            likeli_all = likeli_all.append(likeli, ignore_index=True)
            
            col = likeli.columns[1]
            likeli = pd.merge(likeli, train[['patient_id', 'is_screener']], on='patient_id', how='inner')
            if len(likeli)>0:
                print "Pearson correlation: " + str(pearsonr(likeli.is_screener, likeli[col]))
                print "AUC: " + str(auc(likeli.is_screener, likeli[col]))
            del likeli

    file_name = likeli_all.columns[1]
    feats_all = train[['patient_id']].append(test[['patient_id']], ignore_index=True)
    for test_fold in ([0] + folds):
        train_folds = [x for x in folds if (x != test_fold) and (x != 0)]

        if len(train_folds) == len(folds):
            pd_query = ' and '.join(['fold'+str(x)+'==1' for x in train_folds])
        else:
            pd_query = ' and '.join(['fold'+str(x)+'==1' for x in train_folds]) + ' and ' + ' and '.join(['fold'+str(x)+'==0' for x in folds if not x in train_folds])
        print pd_query

        likeli = likeli_all.query(pd_query).copy().reset_index(drop=True)
        for x in folds:
            likeli.drop('fold'+str(x), axis=1, inplace=True)

        if test_fold == 0:
            feats_fold = test[['patient_id']].copy()
        else:
            feats_fold = train.query('cv_index==@test_fold')[['patient_id']].copy()
        feats_fold = pd.merge(feats_fold, likeli, on='patient_id', how='left')
        del likeli

        for val_fold in [x for x in folds if (x != test_fold) and (x != 0)]:
            train_folds = [x for x in folds if (x != test_fold) and (x != val_fold) and (x != 0)]
            pd_query = ' and '.join(['fold'+str(x)+'==1' for x in train_folds]) + ' and ' + ' and '.join(['fold'+str(x)+'==0' for x in folds if not x in train_folds])
            
            likeli = likeli_all.query(pd_query).copy().reset_index(drop=True)
            for x in folds:
                likeli.drop('fold'+str(x), axis=1, inplace=True)

            feats_val_fold = train.query('cv_index==@val_fold')[['patient_id']].copy()
            feats_val_fold = pd.merge(feats_val_fold, likeli, on='patient_id', how='left')
            del likeli
            feats_fold = feats_fold.append(feats_val_fold, ignore_index=True)

        feats_fold = feats_fold.reset_index(drop=True)
        for cols in [x for x in feats_fold.columns if x != 'patient_id']:
            feats_fold[cols].fillna(global_mean*len(fnames), inplace=True)            
            feats_fold = feats_fold.rename(columns={cols : cols+'_fold_'+str(test_fold)})
        feats_all = pd.merge(feats_all, feats_fold, on='patient_id', how='left')

    print "Writing to HDF5 store..."
    store = pd.HDFStore('../data/output-py/' + file_name + '.h5')
    store.append('feats_all', feats_all)
    print "Feature " + file_name + " is saved in file."
    store.close()
    return file_name
Example #25
    import numpy as np              # assumed imports; not shown in this excerpt
    from sklearn import datasets
    from ml_metrics import auc

    EX = 1000   # assumed value; not defined in this excerpt
    a = 0.001   # assumed learning rate; not defined in this excerpt
    max_iter = 10
    # create a synthetic data set
    x, y = datasets.make_classification(EX)
    print "sample", x[251]
    print "feature num ", x.shape[1]
    # append a 1 column at index 0 in x
    x = np.hstack((np.ones((x.shape[0], 1)), x))
    print x[251]
    from sgd import log_reg_sgd, h


    theta = log_reg_sgd(x[:EX / 2], y[:EX / 2], a, max_iter=max_iter)
    pred = [h(x[i], theta) for i in xrange(EX / 2, EX)]
    print "weights ",theta
    # print "err ",err
    print auc(y[EX / 2:], pred)


    def to_dict(x):
        # print x
        return {i: k for i, k in enumerate(x[1:], start=1)}


    b = BasicLogisticRegression(x.shape[1]-1, a)
    for z in xrange(max_iter ):
        for i in xrange(EX / 2):
            b.sgd_fit_one(to_dict(x[i]), y[i])

    rst_y = map(b.predict_raw, map(to_dict, x[EX / 2:]))
    print rst_y
    print b.weights
Example #26
	def auc(self,X,y):
		yhat = self.predict(X)
		return ml_metrics.auc(np.array(y),yhat)
Example #27
 def auc_score(self, y, y_pred):
     sorted_y_pred = sorted(y_pred, reverse=True)
     return auc(y, sorted_y_pred)
Example #30
def reverse_auc(labels, predictions):
    target_neg_one = [1 if x == -1 else 0 for x in labels]
    neg_predictions = [-x for x in predictions]
    score = metrics.auc(target_neg_one, neg_predictions)
    return score
Example #32
def stacking_model_cat(task='together'):
    nfold = 5
    #task='together'

    train_df = None
    test_df = None

    if task == 'together':
        train_df = pd.read_csv('./data/train_df_day_night_together.csv')
        test_df = pd.read_csv('./data/test_df_day_night_together.csv')
        from together_fn_param import list_param
    elif task == 'split':
        train_df = pd.read_csv('./data/train_df_day_night_split.csv')
        test_df = pd.read_csv('./data/test_df_day_night_split.csv')
        from split_fn_param import list_param

    train_df = train_df.fillna(-1)
    test_df = test_df.fillna(-1)

    print("Data loading Done!")
    target = 'label'
    predictors = train_df.columns.values.tolist()[1:-1]
    categorical = None

    X_train = train_df.drop(['bird_id', 'label'], axis=1)
    labels = train_df['label']
    #cat

    seeds = np.random.randint(5000, 10000, size=10).tolist()
    auc_lst = []
    auc_lst1 = []
    n_estimators_lst = []
    stratified = True
    debug = True
    param = list_param('cat')
    oof_preds_folds = np.zeros((train_df.shape[0], len(seeds)))
    sub_preds_folds = np.zeros((test_df.shape[0], len(seeds)))
    sub_preds_folds_vote = np.zeros((test_df.shape[0], len(seeds)))
    oof_preds_folds_vote = np.zeros((train_df.shape[0], len(seeds)))
    feature_importance_df_folds = pd.DataFrame()

    list_thresholds_global = []
    for seed_id in range(len(seeds)):
        if stratified:
            folds = StratifiedKFold(n_splits=nfold,
                                    shuffle=True,
                                    random_state=seeds[seed_id])
        else:
            folds = KFold(n_splits=nfold, shuffle=True, random_state=1001)
        oof_preds = np.zeros(train_df.shape[0])
        sub_preds = np.zeros(test_df.shape[0])
        oof_preds_local_vote = np.zeros(train_df.shape[0])
        sub_preds_local_vote = np.zeros((test_df.shape[0], nfold))
        feature_importance_df = pd.DataFrame()

        gfold_Id = list(folds.split(X_train, labels))
        params_iter = {
            'iterations': 5000,  # int
            'border_count': 128,  # (128) 1 - 255
            'bootstrap_type': 'Bernoulli',
            'loss_function': 'Logloss',
            'eval_metric': 'F1',  # 'AUC',
            'od_type': 'Iter',
            'allow_writing_files': False,
            'early_stopping_rounds': 50,
            'custom_metric': ['AUC'],
            'random_seed': seeds[seed_id],
            'use_best_model': True
        }
        param.update(params_iter)

        pool = ctb.Pool(train_df[predictors], train_df[target])

        bst1 = ctb.cv(pool=pool,
                      params=param,
                      fold_count=10,
                      partition_random_seed=seeds[seed_id],
                      stratified=True)

        res0 = pd.DataFrame(bst1)

        n_estimators = res0['test-F1-mean'].argmax() + 1

        params_iter2 = {
            'iterations': n_estimators,
        }
        param.update(params_iter2)

        for n_fold, (train_idx,
                     valid_idx) in enumerate(folds.split(X_train, labels)):
            if 'use_best_model' in param:
                param.__delitem__("use_best_model")

            pool_0 = ctb.Pool(train_df[predictors].iloc[train_idx],
                              train_df[target].iloc[train_idx])

            clf = ctb.train(pool=pool_0, params=param)

            #oof_preds[valid_idx] = clf.predict(train_df[predictors].iloc[valid_idx], prediction_type='Probability')[:, 1]
            #sub_preds += (clf.predict(test_df[predictors], prediction_type='Probability')[:, 1]) / folds.n_splits

            oof_preds[valid_idx] = clf.predict(
                train_df[predictors].iloc[valid_idx],
                prediction_type='Probability')[:, 1]
            pred = clf.predict(test_df[predictors],
                               prediction_type='Probability')[:, 1]
            sub_preds += pred / folds.n_splits

            fpr, tpr, thresholds = metrics.roc_curve(
                train_df[target].iloc[valid_idx], oof_preds[valid_idx])
            optimal_idx = np.argmax(tpr - fpr)
            optimal_thresholds = thresholds[optimal_idx]

            list_thresholds_global.append(optimal_thresholds)

            sub_preds_local_vote[:, n_fold] = [
                1 if y_cont > optimal_thresholds else 0 for y_cont in pred
            ]
            oof_preds_local_vote[valid_idx] = [
                1 if y_cont > optimal_thresholds else 0
                for y_cont in oof_preds[valid_idx]
            ]

            fold_importance_df = pd.DataFrame(
                list(
                    zip(train_df[predictors].iloc[train_idx].dtypes.index,
                        clf.get_feature_importance(pool_0))),
                columns=['feature', 'importance'])

            fold_importance_df = fold_importance_df.sort_values(
                by='importance',
                ascending=False,
                inplace=False,
                kind='quicksort',
                na_position='last')
            fold_importance_df["fold"] = n_fold + 1
            fold_importance_df["seed"] = 'seed_' + str(seeds[seed_id])

            feature_importance_df = pd.concat(
                [feature_importance_df, fold_importance_df], axis=0)
            print('Fold %2d AUC : %.6f' %
                  (n_fold + 1,
                   roc_auc_score(train_df[target].iloc[valid_idx],
                                 oof_preds[valid_idx])))

            del clf, pool_0
            gc.collect()

        oof_preds_folds[:, seed_id] = oof_preds
        sub_preds_folds[:, seed_id] = sub_preds
        from scipy import stats

        a, b = stats.mode(sub_preds_local_vote, axis=1)
        oof_preds_folds_vote[:, seed_id] = oof_preds_local_vote
        sub_preds_folds_vote[:, seed_id] = a.reshape(-1)
        feature_importance_df_folds = pd.concat(
            [feature_importance_df_folds, feature_importance_df], axis=0)
        auc_lst.append(ml_metrics.auc(train_df[target], oof_preds))
        auc_lst1.append(roc_auc_score(train_df[target], oof_preds))
        print('Full AUC score %.6f' %
              roc_auc_score(train_df[target], oof_preds))
        print("auc_lst1")
        print(auc_lst1)

    print(list_thresholds_global)
    #oof_preds_folds = pd.DataFrame(oof_preds_folds,columns=['cat_seed_' + str(seeds[l]) for l in range(len(seeds))])
    #sub_preds_folds = pd.DataFrame(sub_preds_folds,columns=['cat_seed_' + str(seeds[l]) for l in range(len(seeds))])
    oof_preds_folds_vote = pd.DataFrame(
        oof_preds_folds_vote,
        columns=['cat_seed_' + str(seeds[l]) for l in range(len(seeds))])
    sub_preds_folds_vote = pd.DataFrame(
        sub_preds_folds_vote,
        columns=['cat_seed_' + str(seeds[l]) for l in range(len(seeds))])

    #oof_preds_folds.to_csv("./output/" + task + "_train_stack/cat.csv", index=False)
    #sub_preds_folds.to_csv("./output/" + task + "_test_stack/cat.csv", index=False)
    oof_preds_folds_vote.to_csv("./output/" + task +
                                "_train_stack_vote/cat.csv",
                                index=False)
    sub_preds_folds_vote.to_csv("./output/" + task +
                                "_test_stack_vote/cat.csv",
                                index=False)
    feature_importance_df_folds = feature_importance_df_folds.sort_values(
        'importance', ascending=False)
    feature_importance_df_folds.to_csv("./output/" + task + "_feature/cat.csv",
                                       index=False)
Example #33
def stacking_model_xgb_rank(task='together'):
    nfold = 5
    #task='together'

    train_df = None
    test_df = None

    if task == 'together':
        train_df = pd.read_csv('./data/train_df_day_night_together.csv')
        test_df = pd.read_csv('./data/test_df_day_night_together.csv')
        from together_fn_param import list_param
    elif task == 'split':
        train_df = pd.read_csv('./data/train_df_day_night_split.csv')
        test_df = pd.read_csv('./data/test_df_day_night_split.csv')
        from split_fn_param import list_param

    train_df = train_df.fillna(-1)
    test_df = test_df.fillna(-1)

    print("Data loading Done!")

    X_train = train_df.drop(['bird_id', 'label'], axis=1)
    features = X_train.columns
    labels = train_df['label']
    X_train = X_train.fillna(-1)
    y_train = np.int32(labels)

    X_test = test_df.drop(['bird_id', 'label'], axis=1)
    X_test = X_test.fillna(-1)

    #xgboost

    xg_train = xgb.DMatrix(X_train, label=y_train, missing=-1.0)
    xg_test = xgb.DMatrix(X_test, missing=-1.0)

    def xg_f1(yhat, dtrain):
        y = dtrain.get_label()

        pre, rec, th = metrics.precision_recall_curve(y, yhat)

        f1_all = 2 / ((1 / rec) + (1 / pre))
        optimal_idx = np.argmax(f1_all)
        optimal_thresholds = th[optimal_idx]
        y_bin = [1. if y_cont > optimal_thresholds else 0.
                 for y_cont in yhat]  # binaryzing your output
        tn, fp, fn, tp = confusion_matrix(y, y_bin).ravel()
        specificity = tn / (tn + fp)
        sensitivity = tp / (tp + fn)
        optimal_f1 = np.nanmax(f1_all)

        return 'f1', -optimal_f1

    seeds = np.random.randint(5000, 10000, size=10).tolist()
    auc_lst = []
    auc_lst1 = []
    n_estimators_lst = []
    stratified = True
    debug = True
    param = list_param('xgb_rank')
    oof_preds_folds = np.zeros((train_df.shape[0], len(seeds)))
    sub_preds_folds = np.zeros((test_df.shape[0], len(seeds)))
    sub_preds_folds_vote = np.zeros((test_df.shape[0], len(seeds)))
    oof_preds_folds_vote = np.zeros((train_df.shape[0], len(seeds)))
    feature_importance_df_folds = pd.DataFrame()

    list_thresholds_global = []
    for seed_id in range(len(seeds)):
        if stratified:
            folds = StratifiedKFold(n_splits=nfold,
                                    shuffle=True,
                                    random_state=seeds[seed_id])
        else:
            folds = KFold(n_splits=nfold, shuffle=True, random_state=1001)
        oof_preds = np.zeros(train_df.shape[0])
        sub_preds = np.zeros(test_df.shape[0])
        oof_preds_local_vote = np.zeros(train_df.shape[0])
        sub_preds_local_vote = np.zeros((test_df.shape[0], nfold))
        feature_importance_df = pd.DataFrame()

        gfold_Id = list(folds.split(X_train, labels))
        params_iter = {
            'seed': seeds[seed_id],
            'objective': 'rank:pairwise',
            'silent': False,
        }
        param.update(params_iter)

        res = xgb.cv(param,
                     xg_train,
                     num_boost_round=5000,
                     folds=gfold_Id,
                     feval=xg_f1,
                     metrics={'auc'},
                     stratified=True,
                     maximize=False,
                     verbose_eval=50,
                     callbacks=[
                         xgb.callback.print_evaluation(show_stdv=True),
                         xgb.callback.early_stop(50)
                     ])

        n_estimators = res.shape[0]

        for n_fold, (train_idx,
                     valid_idx) in enumerate(folds.split(X_train, labels)):
            xgg_train = X_train.iloc[train_idx]
            xgg_valid = X_train.iloc[valid_idx]
            ygg_train = labels[train_idx]
            ygg_valid = labels[valid_idx]

            xgg_train = xgb.DMatrix(xgg_train, label=ygg_train, missing=-1.0)
            xgg_valid = xgb.DMatrix(xgg_valid, missing=-1.0)
            #xg_test = xgb.DMatrix(X_test, missing=-1.0)

            clf = xgb.train(param,
                            xgg_train,
                            num_boost_round=n_estimators,
                            verbose_eval=1)

            oof_preds[valid_idx] = clf.predict(xgg_valid)
            pred = clf.predict(xg_test)
            sub_preds += pred / folds.n_splits

            fpr, tpr, thresholds = metrics.roc_curve(ygg_valid,
                                                     oof_preds[valid_idx])
            optimal_idx = np.argmax(tpr - fpr)
            optimal_thresholds = thresholds[optimal_idx]

            list_thresholds_global.append(optimal_thresholds)

            sub_preds_local_vote[:, n_fold] = [
                1 if y_cont > optimal_thresholds else 0 for y_cont in pred
            ]
            oof_preds_local_vote[valid_idx] = [
                1 if y_cont > optimal_thresholds else 0
                for y_cont in oof_preds[valid_idx]
            ]

            fold_raw_importance = pd.DataFrame(
                list(clf.get_score(importance_type='gain').items()),
                columns=['feature', 'importance']).sort_values('importance',
                                                               ascending=False)
            fold_importance_df = pd.DataFrame()
            fold_importance_df["feature"] = features
            fold_importance_df = pd.merge(fold_importance_df,
                                          fold_raw_importance,
                                          on='feature',
                                          how='left')
            fold_importance_df = fold_importance_df.fillna(value=0)
            fold_importance_df = fold_importance_df.sort_values(
                'importance', ascending=False)
            fold_importance_df["fold"] = n_fold + 1
            fold_importance_df["seed"] = 'seed_' + str(seeds[seed_id])
            feature_importance_df = pd.concat(
                [feature_importance_df, fold_importance_df], axis=0)
            print('Fold %2d AUC : %.6f' %
                  (n_fold + 1, roc_auc_score(ygg_valid, oof_preds[valid_idx])))

            del clf, xgg_train, xgg_valid, ygg_train, ygg_valid
            gc.collect()

        oof_preds_folds[:, seed_id] = oof_preds
        sub_preds_folds[:, seed_id] = sub_preds
        from scipy import stats

        a, b = stats.mode(sub_preds_local_vote, axis=1)
        oof_preds_folds_vote[:, seed_id] = oof_preds_local_vote
        sub_preds_folds_vote[:, seed_id] = a.reshape(-1)
        feature_importance_df_folds = pd.concat(
            [feature_importance_df_folds, feature_importance_df], axis=0)
        auc_lst.append(ml_metrics.auc(y_train, oof_preds))
        auc_lst1.append(roc_auc_score(y_train, oof_preds))
        print('Full AUC score %.6f' % roc_auc_score(y_train, oof_preds))
        print("auc_lst1")
        print(auc_lst1)

    print(list_thresholds_global)
    #oof_preds_folds = pd.DataFrame(oof_preds_folds,columns=['xgb_rank_seed_' + str(seeds[l]) for l in range(len(seeds))])
    #sub_preds_folds = pd.DataFrame(sub_preds_folds,columns=['xgb_rank_seed_' + str(seeds[l]) for l in range(len(seeds))])
    oof_preds_folds_vote = pd.DataFrame(
        oof_preds_folds_vote,
        columns=['xgb_rank_seed_' + str(seeds[l]) for l in range(len(seeds))])
    sub_preds_folds_vote = pd.DataFrame(
        sub_preds_folds_vote,
        columns=['xgb_rank_seed_' + str(seeds[l]) for l in range(len(seeds))])

    #oof_preds_folds.to_csv("./output/" + task + "_train_stack/xgb_rank.csv", index=False)
    #sub_preds_folds.to_csv("./output/" + task + "_test_stack/xgb_rank.csv", index=False)
    oof_preds_folds_vote.to_csv("./output/" + task +
                                "_train_stack_vote/xgb_rank.csv",
                                index=False)
    sub_preds_folds_vote.to_csv("./output/" + task +
                                "_test_stack_vote/xgb_rank.csv",
                                index=False)
    feature_importance_df_folds = feature_importance_df_folds.sort_values(
        'importance', ascending=False)
    feature_importance_df_folds.to_csv("./output/" + task +
                                       "_feature/xgb_rank.csv",
                                       index=False)
Example #34
0
def calculate_ftrl_features(train,
                            test,
                            fnames,
                            ftablename,
                            ftrl_type='',
                            optional_date_ftrl3='',
                            optional_condition_ftrl4=''):
    folds = [x for x in range(1, nfold + 1)]
    global_mean = np.mean(train.is_screener)
    pred_file = '../data/output-py/ftrl/pred_ftrl.csv'

    ftrl_all = pd.DataFrame()
    count = 0
    for L in range(1, len(folds) + 1):
        for train_folds in itertools.combinations(folds, L):
            count = count + 1
            print train_folds
            test_folds = [x for x in folds if not x in list(train_folds)]
            if len(test_folds) == 0:
                test_folds = [0]
            print test_folds

            if False:
                store = pd.HDFStore('../data/output-py/ftrl/ftrl_feats' +
                                    str(count) + '.h5')
                ftrl_feats = store.get('ftrl_feats')
                store.close()
            else:
                train_file = save_ftrl_data('train', fnames,
                                            ftablename, test_folds,
                                            list(train_folds), ftrl_type,
                                            optional_date_ftrl3,
                                            optional_condition_ftrl4)
                if 0 in test_folds:
                    test_file = save_ftrl_data('test', fnames,
                                               ftablename, test_folds,
                                               list(train_folds), ftrl_type,
                                               optional_date_ftrl3,
                                               optional_condition_ftrl4)
                else:
                    test_file = save_ftrl_data('val', fnames,
                                               ftablename, test_folds,
                                               list(train_folds), ftrl_type,
                                               optional_date_ftrl3,
                                               optional_condition_ftrl4)

                non_factor_cols = "''"
                non_feature_cols = "''"
                text_cols = "'diagnosis_description'"

                os.system('pypy ftrl' + ftrl_type + '.py' + ' --alpha ' +
                          str(0.07) + ' --beta ' + str(1.0) + ' --L1 ' +
                          str(0.01) + ' --L2 ' + str(1.0) + ' --epoch ' +
                          str(1) + ' --train ' + train_file + ' --test ' +
                          test_file + ' --submission ' + pred_file +
                          ' --non_feature_cols ' + non_feature_cols +
                          ' --non_factor_cols ' + non_factor_cols +
                          ' --text_cols ' + text_cols)

                ftrl_feats = pd.read_csv(pred_file)
                ftrl_feats = ftrl_feats.groupby(
                    'patient_id')['is_screener_pred'].max().reset_index()

                for x in folds:
                    if x in list(train_folds):
                        ftrl_feats['fold' + str(x)] = 1
                    else:
                        ftrl_feats['fold' + str(x)] = 0
                store = pd.HDFStore('../data/output-py/ftrl/ftrl_feats' +
                                    str(count) + '.h5')
                store.append('ftrl_feats', ftrl_feats)
                store.close()
                os.system('rm -R ' + train_file)
                os.system('rm -R ' + test_file)
                os.system('rm -R ' + pred_file)

            ftrl_all = ftrl_all.append(ftrl_feats, ignore_index=True)

            ftrl_feats = pd.merge(ftrl_feats,
                                  train[['patient_id', 'is_screener']],
                                  on='patient_id',
                                  how='inner')
            if len(ftrl_feats) > 0:
                print "Pearson correlation: " + str(
                    pearsonr(ftrl_feats.is_screener,
                             ftrl_feats.is_screener_pred))
                print "AUC: " + str(
                    auc(ftrl_feats.is_screener, ftrl_feats.is_screener_pred))
            del ftrl_feats

    feats_all = train[['patient_id']].append(test[['patient_id']],
                                             ignore_index=True)
    for test_fold in ([0] + folds):
        train_folds = [x for x in folds if (x != test_fold) and (x != 0)]

        if len(train_folds) == len(folds):
            pd_query = ' and '.join(
                ['fold' + str(x) + '==1' for x in train_folds])
        else:
            pd_query = ' and '.join(
                ['fold' + str(x) + '==1'
                 for x in train_folds]) + ' and ' + ' and '.join([
                     'fold' + str(x) + '==0'
                     for x in folds if not x in train_folds
                 ])
        print(pd_query)

        ftrl_feats = ftrl_all.query(pd_query).copy().reset_index(drop=True)
        for x in folds:
            ftrl_feats.drop('fold' + str(x), axis=1, inplace=True)

        if test_fold == 0:
            feats_fold = test[['patient_id']].copy()
        else:
            feats_fold = train.query('cv_index==@test_fold')[['patient_id']].copy()
        feats_fold = pd.merge(feats_fold,
                              ftrl_feats,
                              on='patient_id',
                              how='left')
        del ftrl_feats

        for val_fold in [x for x in folds if (x != test_fold) and (x != 0)]:
            train_folds = [
                x for x in folds
                if (x != test_fold) and (x != val_fold) and (x != 0)
            ]
            pd_query = ' and '.join(
                ['fold' + str(x) + '==1'
                 for x in train_folds]) + ' and ' + ' and '.join([
                     'fold' + str(x) + '==0'
                     for x in folds if not x in train_folds
                 ])

            ftrl_feats = ftrl_all.query(pd_query).copy().reset_index(drop=True)
            for x in folds:
                ftrl_feats.drop('fold' + str(x), axis=1, inplace=True)

            feats_val_fold = train.query('cv_index==@val_fold')[['patient_id']].copy()
            feats_val_fold = pd.merge(feats_val_fold,
                                      ftrl_feats,
                                      on='patient_id',
                                      how='left')
            del ftrl_feats
            feats_fold = feats_fold.append(feats_val_fold, ignore_index=True)

        feats_fold = feats_fold.reset_index(drop=True)
        feats_fold['is_screener_pred'].fillna(global_mean, inplace=True)
        feats_fold = feats_fold.rename(
            columns={
                'is_screener_pred':
                '_'.join(fnames) + '_' + ftablename + '_ftrl' + ftrl_type +
                '_fold_' + str(test_fold)
            })
        feats_all = pd.merge(feats_all,
                             feats_fold,
                             on='patient_id',
                             how='left')

    print "Writing to HDF5 store..."
    store = pd.HDFStore('../data/output-py/' + '_'.join(fnames) + '_' +
                        ftablename + '_ftrl' + ftrl_type + '.h5')
    store.append('feats_all', feats_all)
    print('Feature ' + '_'.join(fnames) + '_' + ftablename + '_ftrl' +
          ftrl_type + ' is saved in file.')
    store.close()
    return '_'.join(fnames) + '_' + ftablename + '_ftrl' + ftrl_type
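
A minimal sketch of how calculate_ftrl_features might be invoked, assuming train/test DataFrames that carry patient_id, is_screener, and cv_index columns, and that the module-level nfold and the save_ftrl_data helper (both defined outside this excerpt) are in scope; the paths, feature name, and table name below are hypothetical:

import pandas as pd

train = pd.read_csv('../data/input/train_patients.csv')  # hypothetical path
test = pd.read_csv('../data/input/test_patients.csv')    # hypothetical path

feat_name = calculate_ftrl_features(train, test,
                                    fnames=['diagnosis_code'],     # assumed feature column
                                    ftablename='diagnosis_feats')  # assumed source table
print('Generated feature set: ' + feat_name)
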
Example #35
0
def stacking_model_lgb_gbt(task='together'):
    nfold = 5
    #task='together'

    train_df = None
    test_df = None

    if task == 'together':
        train_df = pd.read_csv('./data/train_df_day_night_together.csv')
        test_df = pd.read_csv('./data/test_df_day_night_together.csv')
        from together_fn_param import list_param
    elif task == 'split':
        train_df = pd.read_csv('./data/train_df_day_night_split.csv')
        test_df = pd.read_csv('./data/test_df_day_night_split.csv')
        from split_fn_param import list_param

    train_df = train_df.fillna(-1)
    test_df = test_df.fillna(-1)

    print("Data loading Done!")

    target = 'label'
    predictors = train_df.columns.values.tolist()[1:-1]
    categorical = None

    gc.collect()

    #lightgbm
    X_train = train_df[predictors].values

    labels = train_df['label']

    def xg_f1(preds, train_data):
        # Custom eval in LightGBM's feval format (name, value, is_higher_better);
        # defined here but not hooked into lgb.train below. It finds the best F1
        # over all precision-recall thresholds and returns it negated.
        y = train_data.get_label()
        pre, rec, th = metrics.precision_recall_curve(y, preds)

        with np.errstate(divide='ignore'):  # recall is 0 at the last PR point
            f1_all = 2 / ((1 / rec) + (1 / pre))
        optimal_idx = np.argmax(f1_all)
        optimal_thresholds = th[optimal_idx]
        y_bin = [1. if y_cont > optimal_thresholds else 0.
                 for y_cont in preds]  # binarizing the output
        # Diagnostics at the chosen threshold (computed but not returned).
        tn, fp, fn, tp = confusion_matrix(y, y_bin).ravel()
        specificity = tn / (tn + fp)
        sensitivity = tp / (tp + fn)
        optimal_f1 = np.nanmax(f1_all)

        return 'f1', -optimal_f1, False

    xg_train = lgb.Dataset(train_df[predictors].values,
                           label=train_df[target].values,
                           feature_name=predictors)

    seeds = np.random.randint(5000, 10000, size=10).tolist()
    auc_lst = []
    auc_lst1 = []
    n_estimators_lst = []
    stratified = True
    debug = True
    param = list_param('lgb_gbdt')
    oof_preds_folds = np.zeros((train_df.shape[0], len(seeds)))
    sub_preds_folds = np.zeros((test_df.shape[0], len(seeds)))
    sub_preds_folds_vote = np.zeros((test_df.shape[0], len(seeds)))
    oof_preds_folds_vote = np.zeros((train_df.shape[0], len(seeds)))
    feature_importance_df_folds = pd.DataFrame()

    list_thresholds_global = []
    for seed_id in range(len(seeds)):
        if stratified:
            folds = StratifiedKFold(n_splits=nfold,
                                    shuffle=True,
                                    random_state=seeds[seed_id])
        else:
            folds = KFold(n_splits=nfold, shuffle=True, random_state=1001)
        oof_preds = np.zeros(train_df.shape[0])
        sub_preds = np.zeros(test_df.shape[0])
        oof_preds_local_vote = np.zeros(train_df.shape[0])
        sub_preds_local_vote = np.zeros((test_df.shape[0], nfold))
        feature_importance_df = pd.DataFrame()

        gfold_Id = list(folds.split(X_train, labels))

        params_iter = {
            'max_bin': 63,  # fixed #int
            'save_binary': True,  # fixed
            'seed': seeds[seed_id],
            'feature_fraction_seed': seeds[seed_id],
            'bagging_seed': seeds[seed_id],
            'drop_seed': seeds[seed_id],
            'data_random_seed': seeds[seed_id],
            'objective': 'binary',
            'boosting_type': 'gbdt',
            'verbose': 1,
            'metric': 'auc',
        }
        param.update(params_iter)

        bst1 = lgb.cv(param,
                      xg_train,
                      num_boost_round=5000,
                      early_stopping_rounds=50,
                      folds=gfold_Id)

        res0 = pd.DataFrame(bst1)

        n_estimators = res0.shape[0]

        for n_fold, (train_idx,
                     valid_idx) in enumerate(folds.split(X_train, labels)):
            xgg_train = lgb.Dataset(data=train_df[predictors].iloc[train_idx],
                                    label=train_df[target].iloc[train_idx],
                                    free_raw_data=False,
                                    silent=True)
            xgg_valid = lgb.Dataset(data=train_df[predictors].iloc[valid_idx],
                                    label=train_df[target].iloc[valid_idx],
                                    free_raw_data=False,
                                    silent=True)

            clf = lgb.train(
                param,
                xgg_train,
                num_boost_round=n_estimators,
                # fobj=loglikelood,
                # feval=binary_error,
                verbose_eval=1,
            )

            oof_preds[valid_idx] = clf.predict(xgg_valid.data)
            pred = clf.predict(test_df[predictors])
            sub_preds += pred / folds.n_splits

            fpr, tpr, thresholds = metrics.roc_curve(xgg_valid.label,
                                                     oof_preds[valid_idx])
            optimal_idx = np.argmax(tpr - fpr)
            optimal_thresholds = thresholds[optimal_idx]

            list_thresholds_global.append(optimal_thresholds)

            sub_preds_local_vote[:, n_fold] = [
                1 if y_cont > optimal_thresholds else 0 for y_cont in pred
            ]
            oof_preds_local_vote[valid_idx] = [
                1 if y_cont > optimal_thresholds else 0
                for y_cont in oof_preds[valid_idx]
            ]

            fold_importance_df = pd.DataFrame()
            fold_importance_df["feature"] = clf.feature_name()
            fold_importance_df["importance"] = clf.feature_importance(
                importance_type='gain')
            fold_importance_df = fold_importance_df.fillna(value=0)
            fold_importance_df = fold_importance_df.sort_values(
                'importance', ascending=False)
            fold_importance_df["fold"] = n_fold + 1
            fold_importance_df["seed"] = 'seed_' + str(seeds[seed_id])
            feature_importance_df = pd.concat(
                [feature_importance_df, fold_importance_df], axis=0)
            print('Fold %2d AUC : %.6f' %
                  (n_fold + 1,
                   roc_auc_score(xgg_valid.label, oof_preds[valid_idx])))

            del clf, xgg_train, xgg_valid
            gc.collect()

        oof_preds_folds[:, seed_id] = oof_preds
        sub_preds_folds[:, seed_id] = sub_preds
        from scipy import stats

        a, b = stats.mode(sub_preds_local_vote, axis=1)
        oof_preds_folds_vote[:, seed_id] = oof_preds_local_vote
        sub_preds_folds_vote[:, seed_id] = a.reshape(-1)
        feature_importance_df_folds = pd.concat(
            [feature_importance_df_folds, feature_importance_df], axis=0)
        auc_lst.append(ml_metrics.auc(xg_train.label, oof_preds))
        auc_lst1.append(roc_auc_score(xg_train.label, oof_preds))
        print('Full AUC score %.6f' % roc_auc_score(xg_train.label, oof_preds))
        print("auc_lst1")
        print(auc_lst1)

    print(list_thresholds_global)
    #oof_preds_folds = pd.DataFrame(oof_preds_folds,columns=['lgb_gbt_seed_' + str(seeds[l]) for l in range(len(seeds))])
    #sub_preds_folds = pd.DataFrame(sub_preds_folds,columns=['lgb_gbt_seed_' + str(seeds[l]) for l in range(len(seeds))])
    oof_preds_folds_vote = pd.DataFrame(
        oof_preds_folds_vote,
        columns=['lgb_gbt_seed_' + str(seeds[l]) for l in range(len(seeds))])
    sub_preds_folds_vote = pd.DataFrame(
        sub_preds_folds_vote,
        columns=['lgb_gbt_seed_' + str(seeds[l]) for l in range(len(seeds))])

    #oof_preds_folds.to_csv("./output/" + task + "_train_stack/lgb_gbt.csv", index=False)
    #sub_preds_folds.to_csv("./output/" + task + "_test_stack/lgb_gbt.csv", index=False)
    oof_preds_folds_vote.to_csv("./output/" + task +
                                "_train_stack_vote/lgb_gbt.csv",
                                index=False)
    sub_preds_folds_vote.to_csv("./output/" + task +
                                "_test_stack_vote/lgb_gbt.csv",
                                index=False)
    feature_importance_df_folds = feature_importance_df_folds.sort_values(
        'importance', ascending=False)
    feature_importance_df_folds.to_csv("./output/" + task +
                                       "_feature/lgb_gbt.csv",
                                       index=False)
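
The per-fold voting above binarizes each fold's predictions at the ROC operating point that maximizes tpr - fpr, i.e. Youden's J statistic. A self-contained sketch of that thresholding step on synthetic data (the arrays are illustrative only, not from the pipeline):

import numpy as np
from sklearn import metrics

y_true = np.array([0, 0, 1, 1, 1, 0, 1, 0])
y_score = np.array([0.10, 0.40, 0.35, 0.80, 0.70, 0.20, 0.90, 0.50])

fpr, tpr, thresholds = metrics.roc_curve(y_true, y_score)
optimal_threshold = thresholds[np.argmax(tpr - fpr)]  # Youden's J
votes = (y_score > optimal_threshold).astype(int)     # hard 0/1 votes, as in the fold loop
print(optimal_threshold, votes)
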
Example #36
0
def calculate_likelihoods(train,
                          test,
                          fnames,
                          ftablename,
                          function_type='max',
                          query_type='',
                          optional_filter_feature_likeli6='',
                          optional_filter_value_likeli6=''):
    global_mean = np.mean(train.is_screener)
    folds = [x for x in range(1, nfold + 1)]

    likeli_all = pd.DataFrame()
    for L in range(1, len(folds) + 1):
        for train_folds in itertools.combinations(folds, L):
            print(train_folds)
            sql_query = open('genentech-sql/pattern_likeli_multiple' +
                             query_type + '.sql').read()
            sql_query = sql_query.replace('FEATURE_TABLE_NAME', ftablename)
            sql_query = sql_query.replace('GENERIC_FEATURE_NAME',
                                          '_'.join(fnames))
            sql_query = sql_query.replace('FEATURE_NAMES_COMMA_SEPARATED',
                                          ','.join(fnames))
            sql_query = sql_query.replace(
                'T1_COMMA_SEPARATED', ','.join(['t1.' + x for x in fnames]))
            sql_query = sql_query.replace(
                'T3_T4_CONDITION',
                ' AND '.join(['t3.' + x + '=t4.' + x for x in fnames]))
            sql_query = sql_query.replace(
                'OPTIONAL_CV_EXPRESSION', 'WHERE ' +
                ' OR '.join(['cv_index=' + str(x) for x in list(train_folds)]))
            sql_query = sql_query.replace('GROUP_FUNCTION', function_type)
            sql_query = sql_query.replace(
                'OPTIONAL_CONDITION_LIKELI6',
                'WHERE ' + optional_filter_feature_likeli6 + "='" +
                optional_filter_value_likeli6 + "'")
            #sql_query = sql_query.replace('OPTIONAL_CONDITION_LIKELI6', 'WHERE ' + optional_filter_feature_likeli6 + ">=" + optional_filter_value_likeli6)
            if len(list(train_folds)) == len(folds):
                choosing_patients_expression = 'patients_test2'
            else:
                choosing_patients_expression = 'train_cv_indices ' + 'WHERE ' + ' OR '.join(
                    [
                        'cv_index=' + str(x)
                        for x in folds if not x in list(train_folds)
                    ])
            sql_query = sql_query.replace('CHOOSING_PATIENTS_EXPRESSION',
                                          choosing_patients_expression)

            conn = utils.connect_to_database()
            cur = conn.cursor()
            cur.execute(sql_query)
            if query_type in ('3', '4', '5'):
                conn.commit()
                sql_query = open('genentech-sql/pattern_likeli_multiple' +
                                 query_type + '_2.sql').read()
                sql_query = sql_query.replace('GENERIC_FEATURE_NAME',
                                              '_'.join(fnames))
                sql_query = sql_query.replace('FEATURE_TABLE_NAME', ftablename)
                cur.execute(sql_query)
                likeli = pd.DataFrame(cur.fetchall())
                likeli.columns = [x.name for x in cur.description]
                cur.execute('DROP TABLE patient_likeli_table;')
                conn.commit()
            else:
                likeli = pd.DataFrame(cur.fetchall())
                likeli.columns = [x.name for x in cur.description]

            for x in folds:
                if x in list(train_folds):
                    likeli['fold' + str(x)] = 1
                else:
                    likeli['fold' + str(x)] = 0
            cur.close()
            conn.close()

            likeli_all = likeli_all.append(likeli, ignore_index=True)
            col = likeli.columns[1]
            likeli = pd.merge(likeli,
                              train[['patient_id', 'is_screener']],
                              on='patient_id',
                              how='inner')
            if len(likeli) > 0:
                print "Pearson correlation: " + str(
                    pearsonr(likeli.is_screener, likeli[col]))
                print "AUC: " + str(auc(likeli.is_screener, likeli[col]))
            del likeli

    feats_all = train[['patient_id']].append(test[['patient_id']],
                                             ignore_index=True)
    for test_fold in ([0] + folds):
        train_folds = [x for x in folds if (x != test_fold) and (x != 0)]

        if len(train_folds) == len(folds):
            pd_query = ' and '.join(
                ['fold' + str(x) + '==1' for x in train_folds])
        else:
            pd_query = ' and '.join(
                ['fold' + str(x) + '==1'
                 for x in train_folds]) + ' and ' + ' and '.join([
                     'fold' + str(x) + '==0'
                     for x in folds if not x in train_folds
                 ])
        print(pd_query)

        likeli = likeli_all.query(pd_query).copy().reset_index(drop=True)
        for x in folds:
            likeli.drop('fold' + str(x), axis=1, inplace=True)

        if test_fold == 0:
            feats_fold = test[['patient_id']].copy()
        else:
            feats_fold = train.query('cv_index==@test_fold')[['patient_id']].copy()
        feats_fold = pd.merge(feats_fold, likeli, on='patient_id', how='left')
        del likeli

        for val_fold in [x for x in folds if (x != test_fold) and (x != 0)]:
            train_folds = [
                x for x in folds
                if (x != test_fold) and (x != val_fold) and (x != 0)
            ]
            pd_query = ' and '.join(
                ['fold' + str(x) + '==1'
                 for x in train_folds]) + ' and ' + ' and '.join([
                     'fold' + str(x) + '==0'
                     for x in folds if not x in train_folds
                 ])

            likeli = likeli_all.query(pd_query).copy().reset_index(drop=True)
            for x in folds:
                likeli.drop('fold' + str(x), axis=1, inplace=True)

            feats_val_fold = train.query('cv_index==@val_fold')[['patient_id']].copy()
            feats_val_fold = pd.merge(feats_val_fold,
                                      likeli,
                                      on='patient_id',
                                      how='left')
            del likeli
            feats_fold = feats_fold.append(feats_val_fold, ignore_index=True)

        col = feats_fold.columns[1]
        feats_fold = feats_fold.reset_index(drop=True)
        feats_fold[col].fillna(global_mean, inplace=True)
        #feats_fold[fname_w_likeli].fillna(global_mean, inplace=True)
        feats_fold = feats_fold.rename(
            columns={col: col + '_fold_' + str(test_fold)})
        #feats_fold = feats_fold.rename(columns={fname_w_likeli : fname_w_likeli+'_fold_'+str(test_fold)})
        feats_all = pd.merge(feats_all,
                             feats_fold,
                             on='patient_id',
                             how='left')

    print "Writing to HDF5 store..."
    store = pd.HDFStore('../data/output-py/' + col + '.h5')
    store.append('feats_all', feats_all)
    store.close()
    conn.close()
    print "Feature " + col + " is saved in file."
    return col
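
Stripped of the SQL plumbing, the feature built here is an out-of-fold likelihood (target-mean) encoding: each patient's value is the screener rate of their category computed on the other folds, with the global mean as the fallback for unseen categories. A minimal pandas sketch of the same idea, under assumed toy column names:

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

df = pd.DataFrame({'category': list('aabbabba'),        # assumed categorical feature
                   'is_screener': [1, 0, 1, 1, 0, 1, 0, 1]})
global_mean = df['is_screener'].mean()
df['category_likeli'] = np.nan

for fit_idx, enc_idx in KFold(n_splits=4, shuffle=True, random_state=0).split(df):
    # target mean per category, estimated only on the fitting folds
    fold_means = df.iloc[fit_idx].groupby('category')['is_screener'].mean()
    df.loc[df.index[enc_idx], 'category_likeli'] = \
        df.iloc[enc_idx]['category'].map(fold_means).values
df['category_likeli'] = df['category_likeli'].fillna(global_mean)
print(df)
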
Example #37
0
def forward_auc(labels, predictions):
    target_one = [1 if x==1 else 0 for x in labels]
    score = metrics.auc(target_one, predictions)
    return score
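
A quick usage check for forward_auc, assuming ml_metrics is imported as metrics (which the function body expects); class 1 is treated as the positive target and every other label as negative:

import ml_metrics as metrics

labels = [1, 2, 1, 3, 1]
predictions = [0.9, 0.2, 0.8, 0.1, 0.7]
print(forward_auc(labels, predictions))  # expect 1.0: all class-1 scores outrank the rest
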
Example #38
0
def calculate_likelihoods2(train, test, fnames_list, ftablename):
    global_mean = np.mean(train.is_screener)
    folds = [x for x in range(1, nfold + 1)]

    likeli_all = pd.DataFrame()
    for L in range(1, len(folds) + 1):
        for train_folds in itertools.combinations(folds, L):
            print(train_folds)
            test_folds = [x for x in folds if not x in list(train_folds)]
            if len(test_folds) == 0:
                test_folds = [0]
            print(test_folds)

            for fnames in fnames_list:
                likeli_table_name = '_'.join(fnames) + '_likeli_table'
                generate_likelihood_table(likeli_table_name, fnames,
                                          ftablename, train_folds)
            likeli = merge_likelihood_tables(fnames_list, ftablename,
                                             train_folds)
            for fnames in fnames_list:
                likeli_table_name = '_'.join(fnames) + '_likeli_table'
                drop_likelihood_table(likeli_table_name)

            for x in folds:
                if x in list(train_folds):
                    likeli['fold' + str(x)] = 1
                else:
                    likeli['fold' + str(x)] = 0
            likeli_all = likeli_all.append(likeli, ignore_index=True)

            col = likeli.columns[1]
            likeli = pd.merge(likeli,
                              train[['patient_id', 'is_screener']],
                              on='patient_id',
                              how='inner')
            if len(likeli) > 0:
                print "Pearson correlation: " + str(
                    pearsonr(likeli.is_screener, likeli[col]))
                print "AUC: " + str(auc(likeli.is_screener, likeli[col]))
            del likeli

    file_name = likeli_all.columns[1]
    feats_all = train[['patient_id']].append(test[['patient_id']],
                                             ignore_index=True)
    for test_fold in ([0] + folds):
        train_folds = [x for x in folds if (x != test_fold) and (x != 0)]

        if len(train_folds) == len(folds):
            pd_query = ' and '.join(
                ['fold' + str(x) + '==1' for x in train_folds])
        else:
            pd_query = ' and '.join(
                ['fold' + str(x) + '==1'
                 for x in train_folds]) + ' and ' + ' and '.join([
                     'fold' + str(x) + '==0'
                     for x in folds if not x in train_folds
                 ])
        print(pd_query)

        likeli = likeli_all.query(pd_query).copy().reset_index(drop=True)
        for x in folds:
            likeli.drop('fold' + str(x), axis=1, inplace=True)

        if test_fold == 0:
            feats_fold = test[['patient_id']].copy()
        else:
            feats_fold = train.query('cv_index==@test_fold')[['patient_id']].copy()
        feats_fold = pd.merge(feats_fold, likeli, on='patient_id', how='left')
        del likeli

        for val_fold in [x for x in folds if (x != test_fold) and (x != 0)]:
            train_folds = [
                x for x in folds
                if (x != test_fold) and (x != val_fold) and (x != 0)
            ]
            pd_query = ' and '.join(
                ['fold' + str(x) + '==1'
                 for x in train_folds]) + ' and ' + ' and '.join([
                     'fold' + str(x) + '==0'
                     for x in folds if not x in train_folds
                 ])

            likeli = likeli_all.query(pd_query).copy().reset_index(drop=True)
            for x in folds:
                likeli.drop('fold' + str(x), axis=1, inplace=True)

            feats_val_fold = train.query('cv_index==@val_fold')[['patient_id']].copy()
            feats_val_fold = pd.merge(feats_val_fold,
                                      likeli,
                                      on='patient_id',
                                      how='left')
            del likeli
            feats_fold = feats_fold.append(feats_val_fold, ignore_index=True)

        feats_fold = feats_fold.reset_index(drop=True)
        for cols in [x for x in feats_fold.columns if x != 'patient_id']:
            feats_fold[cols].fillna(global_mean * len(fnames), inplace=True)
            feats_fold = feats_fold.rename(
                columns={cols: cols + '_fold_' + str(test_fold)})
        feats_all = pd.merge(feats_all,
                             feats_fold,
                             on='patient_id',
                             how='left')

    print "Writing to HDF5 store..."
    store = pd.HDFStore('../data/output-py/' + file_name + '.h5')
    store.append('feats_all', feats_all)
    print "Feature " + file_name + " is saved in file."
    store.close()
    return file_name
                y_pred = train_predict_ftrl(X_train, y_train, X_test)

            preds = pd.DataFrame()
            preds['ID'] = test_split['ID'].values
            preds['FOLD'] = fold
            preds['ITER'] = it
            preds[MODEL] = y_pred
            preds_model = preds_model.append(preds, ignore_index=True)

            preds = preds.loc[preds['ID'].isin(ids_val)].copy()
            preds = pd.merge(preds,
                             train[['ID', 'TARGET']],
                             on='ID',
                             how='left')

            fold_auc = auc(preds['TARGET'], preds[MODEL])
            aucs.append(fold_auc)
    print(np.mean(aucs), np.std(aucs))

    preds_model.loc[preds_model[MODEL] < 0, MODEL] = 0.0
    preds_model.loc[preds_model[MODEL] > 1, MODEL] = 1.0
    preds_model = preds_model.groupby(['ID',
                                       'ITER'])[MODEL].mean().reset_index()
    for it in range(1, 21):
        preds_model.loc[preds_model['ITER'] == it,
                        MODEL] = preds_model.loc[preds_model['ITER'] == it,
                                                 MODEL].rank()
    preds_model = preds_model.groupby('ID')[MODEL].mean().reset_index()
    preds_model.columns = ['ID', 'dmitry_' + MODEL]
    preds_all = pd.merge(preds_all, preds_model, on='ID', how='left')
    preds_all.to_csv('all_models_temp.csv', index=False)
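
The tail of this snippet clips predictions to [0, 1], averages duplicates per (ID, ITER), converts each iteration's predictions to ranks, and averages the ranks across iterations; rank averaging puts differently calibrated models on a common scale before blending. A standalone sketch of the rank-average step on a toy frame (column names assumed):

import pandas as pd

preds = pd.DataFrame({'ID':   [1, 2, 3, 1, 2, 3],
                      'ITER': [1, 1, 1, 2, 2, 2],
                      'pred': [0.2, 0.9, 0.5, 0.1, 0.8, 0.6]})
for it in preds['ITER'].unique():
    mask = preds['ITER'] == it
    preds.loc[mask, 'pred'] = preds.loc[mask, 'pred'].rank()  # rank within iteration
blend = preds.groupby('ID')['pred'].mean().reset_index()      # mean rank per ID
print(blend)
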
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import ml_metrics

train = coder.fit_transform(train)

models = []
models.append(RandomForestClassifier(n_estimators=165, max_depth=4, criterion='entropy'))
models.append(GradientBoostingClassifier(max_depth=4))
models.append(KNeighborsClassifier(n_neighbors=20))
models.append(GaussianNB())

TRNtrain, TRNtest, TARtrain, TARtest = train_test_split(train, target, test_size=0.3, random_state=0)

plt.figure(figsize=(10, 10))
for model in models:
    model.fit(TRNtrain, TARtrain)
    pred_scr = model.predict_proba(TRNtest)[:, 1]
    fpr, tpr, thresholds = roc_curve(TARtest, pred_scr)
    roc_auc = ml_metrics.auc(TARtest, pred_scr)
    md = str(model)
    md = md[:md.find('(')]
    plt.plot(fpr, tpr, label='ROC fold %s (auc = %0.2f)' % (md, roc_auc))

plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6))
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
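
As a sanity check on the metric used throughout these examples, ml_metrics.auc should agree with sklearn's roc_auc_score on binary labels; a small comparison on toy values:

import ml_metrics
from sklearn.metrics import roc_auc_score

y_true = [0, 0, 1, 1]
y_score = [0.1, 0.4, 0.35, 0.8]
print(ml_metrics.auc(y_true, y_score))   # 0.75
print(roc_auc_score(y_true, y_score))    # 0.75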


# convert the test set to the required format