def train_cv(label):
    """Grid-search SVC hyperparameters with 5-fold CV on a resampled set.

    :param label: name of the target column in ``train.csv``
    :return: fitted ``GridSearchCV`` wrapping the SVC
    """
    train_df = pd.read_csv('../input/train.csv', encoding='utf-8')
    svm = SVC(C=0.8, kernel='rbf', class_weight='balanced')

    # Feature composition: handcrafted statistics plus three tf-idf views.
    handcrafted = pd.read_csv('../input/train_features.csv')[features]
    feature_blocks = (
        handcrafted,
        train_tfidf_unigram_features(),
        train_tfidf_bigram_features(),
        train_tfidf_char_features(),
    )
    X = hstack(feature_blocks)
    y = train_df[label]

    search_space = {
        'C': np.arange(0.6, 1.0, 0.1),
        'kernel': ('linear', 'rbf', 'poly', 'sigmoid'),
    }

    # Re-sample the data set before running the search.
    X_resampled, y_resampled = resample(X, y)

    searcher = GridSearchCV(estimator=svm, param_grid=search_space,
                            verbose=1, scoring='roc_auc', cv=5)
    searcher.fit(X_resampled, y_resampled)
    return searcher
def train_cv(label):
    """Grid-search RandomForest hyperparameters with 5-fold CV.

    :param label: name of the target column in ``train.csv``
    :return: fitted ``GridSearchCV`` wrapping the RandomForestClassifier
    """
    df_train = pd.read_csv('../input/train.csv', encoding='utf-8')
    clf = RandomForestClassifier(n_estimators=1000,
                                 class_weight='balanced', verbose=1)

    # Feature composition: handcrafted statistics plus three tf-idf views.
    df_handcraft_train = pd.read_csv('../input/train_features.csv')[features]
    tfidf_unigram_train = train_tfidf_unigram_features()
    tfidf_bigram_train = train_tfidf_bigram_features()
    tfidf_char_train = train_tfidf_char_features()
    # BUG FIX: hstack takes a single sequence of blocks; the original passed
    # the four matrices as separate positional arguments, which raises a
    # TypeError in both scipy.sparse.hstack and numpy.hstack.
    X_train = hstack((df_handcraft_train, tfidf_unigram_train,
                      tfidf_bigram_train, tfidf_char_train))
    y_train = df_train[label]

    grid_params = {
        'n_estimators': range(1000, 4000, 100),
        'min_samples_leaf': range(2, 10, 1),
    }

    grid_clf = GridSearchCV(estimator=clf, param_grid=grid_params,
                            verbose=1, scoring='roc_auc', cv=5)
    grid_clf.fit(X_train, y_train)
    return grid_clf
def train_cv(label):
    """Grid-search LogisticRegression hyperparameters with 5-fold CV.

    Better params found so far:
    - C=4
    - solver='lbfgs'

    :param label: name of the target column in ``train.csv``
    :return: fitted ``GridSearchCV`` wrapping the LogisticRegression
    """
    lr = LogisticRegression(class_weight='balanced', solver='sag',
                            random_state=22, verbose=1, max_iter=6000, C=4)
    df_train = pd.read_csv('../input/train.csv', encoding='utf-8')

    # Feature composition: handcrafted statistics plus three tf-idf views.
    # FIX: DataFrame.as_matrix() was removed in pandas 1.0; use to_numpy().
    df_handcraft_train = pd.read_csv('../input/train_features.csv',
                                     encoding='utf-8')[features].to_numpy()
    tfidf_unigram_train = train_tfidf_unigram_features()
    tfidf_bigram_train = train_tfidf_bigram_features()
    tfidf_char_train = train_tfidf_char_features()
    print(df_handcraft_train.shape)
    X_train = features_merge(df_handcraft_train, tfidf_unigram_train,
                             tfidf_bigram_train, tfidf_char_train)
    y_train = df_train[label]

    params = {
        # 'penalty': ('l1', 'l2'),
        # 'C': np.arange(1, 5, 1),
        'solver': ('liblinear', 'sag', 'saga', 'newton-cg', 'lbfgs'),
        # 'max_iter': range(6000, 7000, 100)
    }

    clf = GridSearchCV(estimator=lr, param_grid=params,
                       scoring='roc_auc', cv=5, verbose=1)
    clf.fit(X_train, y_train)
    return clf
def train(label):
    """Train an XGBoost classifier for one label with a held-out validation split.

    :param label: name of the target column in ``train.csv``
    :return: fitted ``XGBClassifier``
    """
    df_train = pd.read_csv('../input/train.csv', encoding='utf-8')
    clf = XGBClassifier(
        learning_rate=0.1,
        n_estimators=2000,
        max_depth=4,
        silent=False,
        # scale_pos_weight=_toxic_count / _clean_count,
        colsample_bytree=0.8,
        colsample_bylevel=0.6,
        gamma=2,
        objective='binary:logistic')

    '''feature composition'''
    # FIX: DataFrame.as_matrix() was removed in pandas 1.0; use to_numpy().
    df_handcraft_train = pd.read_csv('../input/train_features.csv',
                                     encoding='utf-8')[features].to_numpy()
    tfidf_unigram_train = train_tfidf_unigram_features()
    tfidf_bigram_train = train_tfidf_bigram_features()
    tfidf_char_train = train_tfidf_char_features()
    X_train = features_merge(df_handcraft_train, tfidf_unigram_train,
                             tfidf_bigram_train, tfidf_char_train)
    # BUG FIX: the original read the literal column 'label' instead of the
    # column named by the `label` parameter (the print below uses `label`).
    y_train = df_train[label]

    '''feature selection'''
    # BUG FIX: SelectFromModel must be fitted before it can transform; the
    # original called transform() on an unfitted selector.
    model = SelectFromModel(estimator=clf)
    X_train = model.fit_transform(X_train, y_train)

    '''train test split'''
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_train, y_train, test_size=0.2, random_state=2)
    clf.fit(X_train, y_train, eval_metric='auc',
            eval_set=[(X_valid, y_valid)], early_stopping_rounds=20)
    # FIX: score with positive-class probabilities — ROC AUC computed on
    # hard 0/1 predictions understates the true ranking quality.
    y_score = clf.predict_proba(X_valid)[:, 1]
    print(label + ' roc auc score is ' + str(roc_auc_score(y_valid, y_score)))
    return clf
def train_cv(label):
    """Train one classifier per stratified fold and collect them.

    :param label: name of the target column in ``train.csv``
    :return: list of classifiers, one per CV fold
    """
    df_train = pd.read_csv('../input/train.csv', encoding='utf-8')
    num_fold = 5
    skf = StratifiedKFold(n_splits=num_fold, shuffle=True)
    count = 0
    clf_list = []

    '''feature composition'''
    # FIX: DataFrame.as_matrix() was removed in pandas 1.0; use to_numpy().
    df_handcraft_train = pd.read_csv('../input/train_features.csv',
                                     encoding='utf-8')[features].to_numpy()
    tfidf_unigram_train = train_tfidf_unigram_features()
    tfidf_bigram_train = train_tfidf_bigram_features()
    tfidf_char_train = train_tfidf_char_features()
    word_embedding_train = pd.read_csv(
        '../feature_engineering/word_embedding/w2v_train_embedding.csv',
        encoding='utf-8')
    X_train = features_merge(df_handcraft_train, tfidf_unigram_train,
                             tfidf_bigram_train, tfidf_char_train,
                             word_embedding_train)
    y_train = df_train[label]

    # split() only uses its first argument for the number of samples, so
    # passing y_train for both X and y is sufficient here.
    for idx_train, idx_valid in skf.split(y_train, y_train):
        print("fitting fold " + str(count))
        clf = _train(label, x_train=X_train, y_train=y_train,
                     idx_train=idx_train, idx_valid=idx_valid, index=count)
        count += 1
        clf_list.append(clf)
    return clf_list
def train_grid_search(label):
    """Grid-search XGBoost hyperparameters with 5-fold CV.

    Better params found so far:
    - colsample_bytree=0.8
    - colsample_bylevel=0.6
    - gamma=2
    - max_depth=4

    :param label: name of the target column in ``train.csv``
    :return: fitted ``GridSearchCV`` wrapping the XGBClassifier
    """
    df_train = pd.read_csv('../input/train.csv', encoding='utf-8')
    # BUG FIX: rows whose label columns are ALL ZERO are the clean comments,
    # not the toxic ones — the original assigned that count to _toxic_count,
    # which would invert the (commented) scale_pos_weight ratio below.
    _clean_count = len(df_train[df_train.max(axis=1) == 0])
    _toxic_count = len(df_train) - _clean_count
    print(_toxic_count)
    clf = XGBClassifier(
        learning_rate=0.1,
        n_estimators=2000,
        max_depth=4,
        silent=False,
        # scale_pos_weight=_toxic_count / _clean_count,
        colsample_bytree=0.8,
        colsample_bylevel=0.6,
        gamma=2,
        objective='binary:logistic')

    # Feature composition: statistics + tf-idf views + word embeddings.
    df_handcraft_train = pd.read_csv(
        '../feature_engineering/statistics/statics_train.csv')
    tfidf_unigram_train = train_tfidf_unigram_features()
    tfidf_bigram_train = train_tfidf_bigram_features()
    tfidf_char_train = train_tfidf_char_features()
    word_embedding_train = pd.read_csv(
        '../feature_engineering/word_embedding/w2v_train_embedding.csv',
        encoding='utf-8')
    X_train = features_merge(df_handcraft_train, tfidf_unigram_train,
                             tfidf_bigram_train, tfidf_char_train,
                             word_embedding_train)
    y_train = df_train[label]

    grid_params = {
        # 'learning_rate': np.arange(0.08, 0.2, 0.01),
        # 'n_estimators': range(1000, 4000, 100),
        # 'gamma': range(0, 5, 1),
        # 'max_depth': range(4, 5, 1),
        # 'colsample_bytree': np.arange(0.6, 1, 0.1),
        'colsample_bylevel': np.arange(0.5, 1.0, 0.1)
    }

    # Hold out a validation set for early stopping during each fit.
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_train, y_train, test_size=0.2, random_state=2)
    print('grid search construct')
    grid_clf = GridSearchCV(estimator=clf, param_grid=grid_params,
                            verbose=1, scoring='roc_auc', cv=5)
    # Extra kwargs are forwarded to XGBClassifier.fit as fit parameters.
    grid_clf.fit(X=X_train, y=y_train, eval_metric='auc',
                 eval_set=[(X_valid, y_valid)], early_stopping_rounds=50)
    return grid_clf