Ejemplo n.º 1
0
def train_cv(label):
    """
    Tune SVC hyperparameters for one target label via grid-search
    cross validation.

    :param label: name of the target column in the training data
    :return: fitted GridSearchCV wrapping an SVC
    """
    df_train = pd.read_csv('../input/train.csv', encoding='utf-8')
    base_clf = SVC(C=0.8, kernel='rbf', class_weight='balanced')

    # assemble features: handcrafted statistics + word/char tf-idf matrices
    handcraft = pd.read_csv('../input/train_features.csv')[features]
    unigram = train_tfidf_unigram_features()
    bigram = train_tfidf_bigram_features()
    chars = train_tfidf_char_features()
    X_train = hstack((handcraft, unigram, bigram, chars))
    y_train = df_train[label]

    search_space = {
        'C': np.arange(0.6, 1.0, 0.1),
        'kernel': ('linear', 'rbf', 'poly', 'sigmoid'),
    }

    # re-sample the data set before fitting
    X_resampled, y_resampled = resample(X_train, y_train)

    searcher = GridSearchCV(estimator=base_clf,
                            param_grid=search_space,
                            verbose=1,
                            scoring='roc_auc',
                            cv=5)
    searcher.fit(X_resampled, y_resampled)
    return searcher
Ejemplo n.º 2
0
def train(label):
    """
    Train a single SVC model for one target label.

    :param label: name of the target column in the training data
    :return: fitted SVC classifier (probability estimates enabled)
    """
    clf = SVC(C=0.8, kernel='rbf', class_weight='balanced', probability=True)
    df_train = pd.read_csv('../input/train_clean.csv', encoding='utf-8')

    # feature composition: handcrafted statistics + bigram tf-idf only
    # (unigram / char tf-idf variants were tried and dropped here)
    # NOTE: DataFrame.as_matrix() was removed in pandas 1.0;
    # to_numpy() is the drop-in replacement.
    df_handcraft_train = pd.read_csv('../input/train_features.csv',
                                     encoding='utf-8')[features].to_numpy()
    tfidf_bigram_train = train_tfidf_bigram_features()
    X_train = features_merge(df_handcraft_train, tfidf_bigram_train)
    y_train = df_train[label]

    clf.fit(X_train, y_train)
    return clf
Ejemplo n.º 3
0
def train_cv(label):
    """
    Grid-search cross validation to tune random-forest hyperparameters.

    :param label: name of the target column in the training data
    :return: fitted GridSearchCV wrapping a RandomForestClassifier
    """
    df_train = pd.read_csv('../input/train.csv', encoding='utf-8')
    clf = RandomForestClassifier(n_estimators=1000,
                                 class_weight='balanced',
                                 verbose=1)

    # feature composition: handcrafted statistics + word/char tf-idf
    df_handcraft_train = pd.read_csv('../input/train_features.csv')[features]
    tfidf_unigram_train = train_tfidf_unigram_features()
    tfidf_bigram_train = train_tfidf_bigram_features()
    tfidf_char_train = train_tfidf_char_features()
    # BUG FIX: scipy.sparse.hstack takes a single sequence of blocks;
    # passing the matrices as separate positional args makes the second
    # one be interpreted as the `format` argument and fails.
    X_train = hstack((df_handcraft_train, tfidf_unigram_train,
                      tfidf_bigram_train, tfidf_char_train))
    y_train = df_train[label]

    grid_params = {
        'n_estimators': range(1000, 4000, 100),
        'min_samples_leaf': range(2, 10, 1)
    }

    grid_clf = GridSearchCV(estimator=clf,
                            param_grid=grid_params,
                            verbose=1,
                            scoring='roc_auc',
                            cv=5)
    grid_clf.fit(X_train, y_train)
    return grid_clf
Ejemplo n.º 4
0
def train_cv(label):
    """
    Grid search to tune logistic-regression hyperparameters.

    Best params found so far:
    -   C=4
    -   solver='lbfgs'

    :param label: name of the target column in the training data
    :return: fitted GridSearchCV wrapping a LogisticRegression
    """
    lr = LogisticRegression(class_weight='balanced',
                            solver='sag',
                            random_state=22,
                            verbose=1,
                            max_iter=6000,
                            C=4)

    df_train = pd.read_csv('../input/train.csv', encoding='utf-8')

    # feature composition: handcrafted statistics + word/char tf-idf
    # NOTE: DataFrame.as_matrix() was removed in pandas 1.0;
    # to_numpy() is the drop-in replacement.
    df_handcraft_train = pd.read_csv('../input/train_features.csv',
                                     encoding='utf-8')[features].to_numpy()
    tfidf_unigram_train = train_tfidf_unigram_features()
    tfidf_bigram_train = train_tfidf_bigram_features()
    tfidf_char_train = train_tfidf_char_features()
    print(df_handcraft_train.shape)
    X_train = features_merge(df_handcraft_train, tfidf_unigram_train,
                             tfidf_bigram_train, tfidf_char_train)
    y_train = df_train[label]

    # only the solver is searched currently; C / penalty / max_iter were
    # explored in earlier runs (see "best params" above)
    params = {
        'solver': ('liblinear', 'sag', 'saga', 'newton-cg', 'lbfgs'),
    }

    clf = GridSearchCV(estimator=lr,
                       param_grid=params,
                       scoring='roc_auc',
                       cv=5,
                       verbose=1)
    clf.fit(X_train, y_train)
    return clf
Ejemplo n.º 5
0
def train(label):
    """
    Train an XGBoost model for one target label, with model-based feature
    selection, a held-out validation split and early stopping on AUC.

    :param label: name of the target column in the training data
    :return: fitted XGBClassifier
    """
    df_train = pd.read_csv('../input/train.csv', encoding='utf-8')
    clf = XGBClassifier(
        learning_rate=0.1,
        n_estimators=2000,
        max_depth=4,
        silent=False,
        colsample_bytree=0.8,
        colsample_bylevel=0.6,
        gamma=2,
        objective='binary:logistic')

    # feature composition: handcrafted statistics + word/char tf-idf
    # NOTE: DataFrame.as_matrix() was removed in pandas 1.0;
    # to_numpy() is the drop-in replacement.
    df_handcraft_train = pd.read_csv('../input/train_features.csv',
                                     encoding='utf-8')[features].to_numpy()
    tfidf_unigram_train = train_tfidf_unigram_features()
    tfidf_bigram_train = train_tfidf_bigram_features()
    tfidf_char_train = train_tfidf_char_features()
    X_train = features_merge(df_handcraft_train, tfidf_unigram_train,
                             tfidf_bigram_train, tfidf_char_train)
    # BUG FIX: was df_train['label'] (a literal column name); every other
    # trainer in this file selects the column named by the `label` argument.
    y_train = df_train[label]

    # feature selection
    # BUG FIX: SelectFromModel must be fitted before transform();
    # fit_transform() fits the wrapped estimator and selects in one step.
    model = SelectFromModel(estimator=clf)
    X_train = model.fit_transform(X_train, y_train)

    # train/validation split used for early stopping
    X_train, X_valid, y_train, y_valid = train_test_split(X_train,
                                                          y_train,
                                                          test_size=0.2,
                                                          random_state=2)
    clf.fit(X_train,
            y_train,
            eval_metric='auc',
            eval_set=[(X_valid, y_valid)],
            early_stopping_rounds=20)
    y_predict = clf.predict(X_valid)
    print(label + ' roc auc score is ' +
          str(roc_auc_score(y_valid, y_predict)))
    return clf
Ejemplo n.º 6
0
def train_cv(label):
    """
    Stratified k-fold cross-validation training process.

    :param label: name of the target column in the training data
    :return: list of classifiers, one fitted per fold
    """
    df_train = pd.read_csv('../input/train.csv', encoding='utf-8')
    num_fold = 5
    skf = StratifiedKFold(n_splits=num_fold, shuffle=True)
    count = 0
    clf_list = []

    # feature composition: handcrafted statistics + tf-idf + w2v embeddings
    # NOTE: DataFrame.as_matrix() was removed in pandas 1.0;
    # to_numpy() is the drop-in replacement.
    df_handcraft_train = pd.read_csv('../input/train_features.csv',
                                     encoding='utf-8')[features].to_numpy()
    tfidf_unigram_train = train_tfidf_unigram_features()
    tfidf_bigram_train = train_tfidf_bigram_features()
    tfidf_char_train = train_tfidf_char_features()
    word_embedding_train = pd.read_csv(
        '../feature_engineering/word_embedding/w2v_train_embedding.csv',
        encoding='utf-8')
    X_train = features_merge(df_handcraft_train, tfidf_unigram_train,
                             tfidf_bigram_train, tfidf_char_train,
                             word_embedding_train)
    y_train = df_train[label]

    # BUG FIX: split(X, y) — pass the feature matrix as X, not the labels;
    # the original skf.split(y_train, y_train) only worked incidentally
    # because StratifiedKFold uses X just for its length.
    for idx_train, idx_valid in skf.split(X_train, y_train):
        print("fitting fold " + str(count))
        clf = _train(label,
                     x_train=X_train,
                     y_train=y_train,
                     idx_train=idx_train,
                     idx_valid=idx_valid,
                     index=count)
        count += 1
        clf_list.append(clf)
    return clf_list
Ejemplo n.º 7
0
def train_grid_search(label):
    """
    Grid-search cross validation to tune XGBoost hyperparameters.

    Better params found so far:
    -   colsample_bytree=0.8
    -   colsample_bylevel=0.6
    -   gamma=2
    -   max_depth=4

    :param label: name of the target column in the training data
    :return: fitted GridSearchCV wrapping an XGBClassifier
    """

    df_train = pd.read_csv('../input/train.csv', encoding='utf-8')

    # BUG FIX: rows whose label columns are all zero are the *clean*
    # examples, not the toxic ones — the two counts were swapped.
    # NOTE(review): assumes df_train.max(axis=1) reduces over the numeric
    # label columns only — confirm against the columns of train.csv.
    _clean_count = len(df_train[df_train.max(axis=1) == 0])
    print(_clean_count)
    _toxic_count = len(df_train) - _clean_count

    clf = XGBClassifier(
        learning_rate=0.1,
        n_estimators=2000,
        max_depth=4,
        silent=False,
        # scale_pos_weight=_clean_count / _toxic_count,
        colsample_bytree=0.8,
        colsample_bylevel=0.6,
        gamma=2,
        objective='binary:logistic')

    # feature composition: handcrafted statistics + tf-idf + w2v embeddings
    df_handcraft_train = pd.read_csv(
        '../feature_engineering/statistics/statics_train.csv')
    tfidf_unigram_train = train_tfidf_unigram_features()
    tfidf_bigram_train = train_tfidf_bigram_features()
    tfidf_char_train = train_tfidf_char_features()
    word_embedding_train = pd.read_csv(
        '../feature_engineering/word_embedding/w2v_train_embedding.csv',
        encoding='utf-8')
    X_train = features_merge(df_handcraft_train, tfidf_unigram_train,
                             tfidf_bigram_train, tfidf_char_train,
                             word_embedding_train)
    y_train = df_train[label]

    # only colsample_bylevel is searched currently; the other ranges were
    # explored in earlier runs (see "better params" above)
    grid_params = {
        'colsample_bylevel': np.arange(0.5, 1.0, 0.1)
    }

    # held-out split used as the early-stopping evaluation set
    X_train, X_valid, y_train, y_valid = train_test_split(X_train,
                                                          y_train,
                                                          test_size=0.2,
                                                          random_state=2)
    print('grid search construct')
    grid_clf = GridSearchCV(estimator=clf,
                            param_grid=grid_params,
                            verbose=1,
                            scoring='roc_auc',
                            cv=5)
    grid_clf.fit(X=X_train,
                 y=y_train,
                 eval_metric='auc',
                 eval_set=[(X_valid, y_valid)],
                 early_stopping_rounds=50)
    return grid_clf