import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, NuSVC

import utils as ut  # project helpers (module name assumed): report_oof, general_processing


def train_nusvc(df_train, df_test, n_splits=25, pca=False):
    train = df_train.copy()
    test = df_test.copy()
    oof = np.zeros(len(train))
    preds = np.zeros(len(test))
    cols = [c for c in train.columns
            if c not in ['id', 'target', 'wheezy-copper-turtle-magic']]
    # Fit one model per value of the 512-level 'wheezy-copper-turtle-magic' feature.
    for i in range(512):
        train2 = train[train['wheezy-copper-turtle-magic'] == i].copy()
        test2 = test[test['wheezy-copper-turtle-magic'] == i].copy()
        idx1 = train2.index
        idx2 = test2.index
        train2.reset_index(drop=True, inplace=True)
        if pca:
            # Fit PCA on the combined train+test partition, then standardize.
            data = pd.concat([pd.DataFrame(train2[cols]), pd.DataFrame(test2[cols])])
            #data2 = StandardScaler().fit_transform(PCA(n_components=40, random_state=51).fit_transform(data[cols]))
            data2 = StandardScaler().fit_transform(
                PCA(svd_solver='full', n_components='mle').fit_transform(data[cols]))
            train3 = data2[:train2.shape[0]]
            test3 = data2[train2.shape[0]:]
        else:
            # Keep only the high-variance (informative) columns.
            sel = VarianceThreshold(threshold=1.5).fit(train2[cols])
            train3 = sel.transform(train2[cols])
            test3 = sel.transform(test2[cols])
        # shuffle=True is required when random_state is set.
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=15)
        for train_index, test_index in skf.split(train3, train2['target']):
            clf = Pipeline([('scaler', StandardScaler()),
                            ('nusvc', NuSVC(probability=True, kernel='poly',
                                            degree=4, gamma='auto', random_state=745,
                                            nu=0.59, coef0=0.053))])
            clf.fit(train3[train_index, :], train2.loc[train_index]['target'])
            oof[idx1[test_index]] = clf.predict_proba(train3[test_index, :])[:, 1]
            preds[idx2] += clf.predict_proba(test3)[:, 1] / skf.n_splits
    ut.report_oof(df_train, oof)
    return oof, preds
def train_svc(df_train, df_test):
    train = df_train.copy()
    test = df_test.copy()
    oof = np.zeros(len(train))
    preds = np.zeros(len(test))
    cols = [c for c in train.columns
            if c not in ['id', 'target', 'wheezy-copper-turtle-magic']]
    for i in range(512):
        # .copy() avoids SettingWithCopyWarning on the in-place reset_index below.
        train2 = train[train['wheezy-copper-turtle-magic'] == i].copy()
        test2 = test[test['wheezy-copper-turtle-magic'] == i].copy()
        idx1 = train2.index
        idx2 = test2.index
        train2.reset_index(drop=True, inplace=True)
        sel = VarianceThreshold(threshold=1.5).fit(train2[cols])
        train3 = sel.transform(train2[cols])
        test3 = sel.transform(test2[cols])
        skf = StratifiedKFold(n_splits=25, shuffle=True, random_state=15)
        for train_index, test_index in skf.split(train3, train2['target']):
            clf = Pipeline([('scaler', StandardScaler()),
                            ('svc', SVC(probability=True, kernel='poly',
                                        degree=4, gamma='auto'))])
            clf.fit(train3[train_index, :], train2.loc[train_index]['target'])
            oof[idx1[test_index]] = clf.predict_proba(train3[test_index, :])[:, 1]
            preds[idx2] += clf.predict_proba(test3)[:, 1] / skf.n_splits
        if i % 25 == 0:
            print(i)  # progress indicator
    ut.report_oof(df_train, oof)
    return oof, preds
def train_logit(df_train, df_test, C=1, pca=False):
    # The defaults (C=1, pca=False) give the plain L1-logit setup; C and pca
    # expose the regularization strength and the PCA preprocessing path.
    train = df_train.copy()
    test = df_test.copy()
    oof = np.zeros(len(train))
    preds = np.zeros(len(test))
    cols = [c for c in train.columns
            if c not in ['id', 'target', 'wheezy-copper-turtle-magic']]
    for i in range(512):
        train2 = train[train['wheezy-copper-turtle-magic'] == i].copy()
        test2 = test[test['wheezy-copper-turtle-magic'] == i].copy()
        idx1 = train2.index
        idx2 = test2.index
        train2.reset_index(drop=True, inplace=True)
        if pca:
            data = pd.concat([pd.DataFrame(train2[cols]), pd.DataFrame(test2[cols])])
            #data2 = StandardScaler().fit_transform(PCA(n_components=40, random_state=51).fit_transform(data[cols]))
            data2 = StandardScaler().fit_transform(
                PCA(svd_solver='full', n_components='mle').fit_transform(data[cols]))
            train3 = data2[:train2.shape[0]]
            test3 = data2[train2.shape[0]:]
        else:
            sel = VarianceThreshold(threshold=1.5).fit(train2[cols])
            train3 = sel.transform(train2[cols])
            test3 = sel.transform(test2[cols])
        skf = StratifiedKFold(n_splits=25, shuffle=True, random_state=15)
        for train_index, test_index in skf.split(train3, train2['target']):
            clf = Pipeline([('scaler', StandardScaler()),
                            ('logit', LogisticRegression(solver='saga',
                                                         penalty='l1', C=C))])
            clf.fit(train3[train_index, :], train2.loc[train_index]['target'])
            oof[idx1[test_index]] = clf.predict_proba(train3[test_index, :])[:, 1]
            preds[idx2] += clf.predict_proba(test3)[:, 1] / skf.n_splits
    ut.report_oof(df_train, oof)
    return oof, preds
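# Usage sketch for the per-magic models above (illustrative variable names,
# not part of the module; assumes the frames carry 'id', 'target' and
# 'wheezy-copper-turtle-magic' columns):
#
#     oof_nusvc, preds_nusvc = train_nusvc(train_df, test_df, n_splits=25)
#     oof_logit, preds_logit = train_logit(train_df, test_df, C=0.5, pca=True)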
def train_lgb(df_train, df_test, kfolds):
    train = df_train.copy()
    test = df_test.copy()
    target = train.target.copy()
    sub = test[['id']].copy()
    train, test = ut.general_processing(train, test)

    # model
    oof = np.zeros(len(train))
    predictions = np.zeros(len(test))
    feature_importance_df = pd.DataFrame()
    for fold_, (trn_idx, val_idx) in enumerate(kfolds.split(train.values, target.values)):
        print("fold n°{}".format(fold_))
        trn_data = lgb.Dataset(train.iloc[trn_idx], label=target.iloc[trn_idx])
        val_data = lgb.Dataset(train.iloc[val_idx], label=target.iloc[val_idx])
        param = {
            'bagging_freq': 3,
            'bagging_fraction': 0.8,
            'boost_from_average': 'false',
            'boost': 'gbdt',
            'feature_fraction': 0.9,
            'learning_rate': 0.01,
            'max_depth': 10,
            'metric': 'auc',
            'min_data_in_leaf': 82,
            'min_sum_hessian_in_leaf': 10.0,
            'num_leaves': 20,
            'objective': 'binary',
            'verbosity': 1,
        }
        # param = {  # this is for v9
        #     'bagging_freq': 3,
        #     'bagging_fraction': 0.8,
        #     'boost_from_average': 'false',
        #     'boost': 'gbdt',
        #     'feature_fraction': 0.8,
        #     'learning_rate': 0.001,
        #     'max_depth': 10,
        #     'metric': 'auc',
        #     'min_data_in_leaf': 100,
        #     'num_leaves': 30,
        #     'objective': 'binary',
        #     'verbosity': 1,
        #     'n_jobs': -1
        # }
        num_round = 1000000  # effectively unbounded; early stopping ends training
        # verbose_eval/early_stopping_rounds are lgb.train kwargs in lightgbm < 4.0;
        # lightgbm 4.x replaced them with callbacks.
        clf = lgb.train(param, trn_data, num_round,
                        valid_sets=[trn_data, val_data],
                        verbose_eval=500, early_stopping_rounds=100)
        oof[val_idx] = clf.predict(train.iloc[val_idx], num_iteration=clf.best_iteration)
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = train.columns
        fold_importance_df["importance"] = clf.feature_importance()
        fold_importance_df["fold"] = fold_ + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)
        predictions += clf.predict(test, num_iteration=clf.best_iteration) / kfolds.n_splits
    ut.report_oof(df_train, oof)
    sub['target'] = predictions
    return oof, sub
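# Unlike the functions above, train_lgb expects an already-constructed
# splitter. A minimal, illustrative call (variable names are examples,
# not module API):
#
#     from sklearn.model_selection import StratifiedKFold
#     kfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=15)
#     oof_lgb, sub = train_lgb(train_df, test_df, kfolds)
#     sub.to_csv('submission.csv', index=False)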
def train_all(df_train, df_test, n_folds, pca=False):
    train = df_train.copy()
    test = df_test.copy()
    oof_svc = np.zeros(len(train))
    oof_nusvc = np.zeros(len(train))
    oof_logit = np.zeros(len(train))
    oof_knn = np.zeros(len(train))
    oof_qda = np.zeros(len(train))
    preds_svc = np.zeros(len(test))
    preds_nusvc = np.zeros(len(test))
    preds_logit = np.zeros(len(test))
    preds_knn = np.zeros(len(test))
    preds_qda = np.zeros(len(test))
    cols = [c for c in train.columns
            if c not in ['id', 'target', 'wheezy-copper-turtle-magic']]
    for i in range(512):
        train2 = train[train['wheezy-copper-turtle-magic'] == i].copy()
        test2 = test[test['wheezy-copper-turtle-magic'] == i].copy()
        if len(train2) == 0:
            continue
        idx1 = train2.index
        idx2 = test2.index
        train2.reset_index(drop=True, inplace=True)
        if pca:
            data = pd.concat([pd.DataFrame(train2[cols]), pd.DataFrame(test2[cols])])
            #data2 = StandardScaler().fit_transform(PCA(n_components=40, random_state=51).fit_transform(data[cols]))
            data2 = StandardScaler().fit_transform(
                PCA(svd_solver='full', n_components='mle').fit_transform(data[cols]))
            train3 = data2[:train2.shape[0]]
            test3 = data2[train2.shape[0]:]
        else:
            sel = VarianceThreshold(threshold=1.5).fit(train2[cols])
            train3 = sel.transform(train2[cols])
            test3 = sel.transform(test2[cols])
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=15)
        for train_index, test_index in skf.split(train3, train2['target']):
            y_tr = train2.loc[train_index]['target']

            clf = Pipeline([('scaler', StandardScaler()),
                            ('svc', SVC(probability=True, kernel='poly',
                                        degree=4, gamma='auto'))])
            clf.fit(train3[train_index, :], y_tr)
            oof_svc[idx1[test_index]] = clf.predict_proba(train3[test_index, :])[:, 1]
            preds_svc[idx2] += clf.predict_proba(test3)[:, 1] / skf.n_splits

            clf = Pipeline([('scaler', StandardScaler()),
                            ('nusvc', NuSVC(probability=True, kernel='poly',
                                            degree=4, gamma='auto', random_state=745,
                                            nu=0.59, coef0=0.053))])
            clf.fit(train3[train_index, :], y_tr)
            oof_nusvc[idx1[test_index]] = clf.predict_proba(train3[test_index, :])[:, 1]
            preds_nusvc[idx2] += clf.predict_proba(test3)[:, 1] / skf.n_splits

            clf = Pipeline([('scaler', StandardScaler()),
                            ('logit', LogisticRegression(solver='saga',
                                                         penalty='l1', C=0.5))])
            clf.fit(train3[train_index, :], y_tr)
            oof_logit[idx1[test_index]] = clf.predict_proba(train3[test_index, :])[:, 1]
            preds_logit[idx2] += clf.predict_proba(test3)[:, 1] / skf.n_splits

            clf = Pipeline([('scaler', StandardScaler()),
                            ('knn', KNeighborsClassifier(n_neighbors=17, p=2.9))])
            clf.fit(train3[train_index, :], y_tr)
            oof_knn[idx1[test_index]] = clf.predict_proba(train3[test_index, :])[:, 1]
            preds_knn[idx2] += clf.predict_proba(test3)[:, 1] / skf.n_splits

            clf = QuadraticDiscriminantAnalysis(reg_param=0.6)
            clf.fit(train3[train_index, :], y_tr)
            oof_qda[idx1[test_index]] = clf.predict_proba(train3[test_index, :])[:, 1]
            preds_qda[idx2] += clf.predict_proba(test3)[:, 1] / skf.n_splits
    ut.report_oof(df_train, oof_svc)
    ut.report_oof(df_train, oof_nusvc)
    ut.report_oof(df_train, oof_logit)
    ut.report_oof(df_train, oof_knn)
    ut.report_oof(df_train, oof_qda)
    return (oof_svc, preds_svc, oof_nusvc, preds_nusvc, oof_logit, preds_logit,
            oof_knn, preds_knn, oof_qda, preds_qda)
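# One way to combine the five models returned by train_all is a weighted
# average of their probabilities, scored on the out-of-fold predictions.
# This is a sketch under assumed names: blend_oof and its weights are
# illustrative, not tuned values from this project.
from sklearn.metrics import roc_auc_score


def blend_oof(df_train, oofs, preds, weights):
    """Weighted average of model probabilities.

    oofs/preds: lists of OOF/test arrays as returned by train_all;
    weights: matching list of floats (ideally summing to 1).
    """
    oof_blend = sum(w * o for w, o in zip(weights, oofs))
    pred_blend = sum(w * p for w, p in zip(weights, preds))
    print('blend AUC:', roc_auc_score(df_train['target'], oof_blend))
    return oof_blend, pred_blend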