# Imports assumed for this excerpt (the full script header is not shown):
from datetime import datetime

from sklearn.model_selection import KFold
from xgboost import XGBClassifier

import c_vars  # project-local helper module

print(str(datetime.now()) + ' Reading Data Complete')

# param_dict = {'learning_rate' : [0.05, 1, 3],
#               'max_depth' : [5, 10, 50, 100, 200],
#               'n_estimators' : [50, 100, 200]}

param_dict = {
    'max_depth': [5, 18, 15],
    'min_child_weight': [1, 3, 5, 7],
    'subsample': [0.6, 0.8, 1],
    'colsample_bytree': [0.6, 0.8, 1]
}
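
# c_vars.get_param_space is not shown in this excerpt; it is assumed to
# expand the grid into every combination (a Cartesian product) plus a
# name -> position map, roughly like this sketch:
#
#   from itertools import product
#   def get_param_space(param_dict):
#       keys = list(param_dict)
#       space = [list(combo) for combo in product(*param_dict.values())]
#       return space, {k: i for i, k in enumerate(keys)}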

model_list = []

param_space, param_to_int_dict = c_vars.get_param_space(param_dict)
# print (param_space)
# print (param_to_int_dict)
# Override the generated grid: evaluate a single hand-picked combination.
param_space = [[0.6, 5, 1, 0.6]]
# param_list = [0.05, 5, 200]
for param_list in param_space:
    # train_columns = c_vars.col_index_training[:c_vars.num_features_for_model]
    train_columns = list(range(X[0].shape[1]))  # X is built earlier in the script (not shown)
    print(
        str(datetime.now()) + ' Training XGBoost classifier, ' +
        str(param_list))
    kf = [KFold(n_splits=4, shuffle=True) for _ in range(n_models)]
    clf = [None] * n_models  # one slot per model; n_models is defined earlier (not shown)
    clf[0] = XGBClassifier(
        max_depth=param_list[param_to_int_dict['max_depth']],
        min_child_weight=param_list[param_to_int_dict['min_child_weight']],
        # remaining keys completed from param_dict above; the original
        # excerpt breaks off mid-call here
        subsample=param_list[param_to_int_dict['subsample']],
        colsample_bytree=param_list[param_to_int_dict['colsample_bytree']])
Example #2

# Imports assumed for this example (the full module header is not shown):
import gc
from datetime import datetime

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.utils import safe_mask

import c_vars  # project-local helpers (not shown)
# parallelize_dataframe and parallel_func_to_apply are also project-local
# helpers (see the sketch inside main below).


def main():

    df = pd.read_csv(c_vars.train_file)
    # df = pd.read_csv(c_vars.train_file_processed, encoding = "cp1252")
    # df['Description_Clean'].fillna('', inplace = True, axis = 0)
    # df['Description_Clean_Adj'].fillna('', inplace = True, axis = 0)

    df.drop(['User_ID'], axis=1, inplace=True)
    df['Is_Response'] = df['Is_Response'].apply(
        lambda x: 1 if x == 'happy' else 0)
    df['Browser_Used'] = df['Browser_Used'].apply(
        lambda x: c_vars.browser_dict[x])
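    # Above: 'Is_Response' is binarised (happy -> 1, otherwise 0) and the
    # browser name is mapped to an integer id via the project's browser_dict.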

    # print ('Cleaning started at ' + str(datetime.now()))
    df = parallelize_dataframe(df, parallel_func_to_apply)
    # print ('Cleaning complete at ' + str(datetime.now()))
    print(df.head(5))
    df.to_csv(c_vars.train_file_processed, index=False)
    df_submit = pd.read_csv(c_vars.test_file)
    df_submit['Browser_Used'] = df_submit['Browser_Used'].apply(
        lambda x: c_vars.browser_dict[x])
    df_submit = parallelize_dataframe(df_submit, parallel_func_to_apply)
    df_submit.to_csv(c_vars.test_file_processed, index=False)
    # sys.exit()
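    # parallelize_dataframe is project-local and not shown here; a typical
    # implementation (assumed) splits the frame into chunks, applies the
    # cleaning function in worker processes, and re-concatenates:
    #
    #   from multiprocessing import Pool
    #   def parallelize_dataframe(df, func, n_cores=4):
    #       step = -(-len(df) // n_cores)  # ceiling division
    #       chunks = [df.iloc[i:i + step] for i in range(0, len(df), step)]
    #       with Pool(n_cores) as pool:
    #           return pd.concat(pool.map(func, chunks))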

    df['text_length'] = df['Description_Clean'].apply(len)
    df['word_count'] = df['Description_Clean'].apply(
        lambda x: len(x.split(' ')))

    # create more copies of the unhappy/bad reviews to identify those words
    # df = pd.concat([df, df.loc[df['Is_Response'] == 0,], df.loc[df['Is_Response'] == 0,]])

    # as_matrix() was removed in pandas 1.0; to_numpy() is the replacement
    df_train, df_dev = train_test_split(df.to_numpy(),
                                        test_size=0.1,
                                        random_state=442)
    frame_columns = c_vars.header_useful + [
        'Description_Clean', 'Description_Clean_Adj', 'text_length',
        'word_count'
    ]
    df_train = pd.DataFrame(df_train, columns=frame_columns)
    df_dev = pd.DataFrame(df_dev, columns=frame_columns)

    df_device = df_train.groupby('Device_Used')['Is_Response'].agg(
        ['count', np.sum])
    df_device.reset_index(inplace=True)
    # print (df_device.columns.values)
    # df_device.columns = df_device.columns.get_level_values(0)
    df_device['target_rate_dev'] = df_device['sum'] / df_device['count']
    df_device = df_device[['Device_Used', 'target_rate_dev']]

    df_train = pd.merge(df_train,
                        df_device,
                        how='left',
                        on='Device_Used',
                        suffixes=('', ''))
    df_dev = pd.merge(df_dev,
                      df_device,
                      how='left',
                      on='Device_Used',
                      suffixes=('', ''))
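
    # The block above is a target (mean) encoding: each device category is
    # replaced by the training-set mean of the binary response, computed on
    # df_train only so nothing leaks from the dev split. A more direct
    # equivalent for reference:
    #
    #   rates = df_train.groupby('Device_Used')['Is_Response'].mean()
    #   df_train['target_rate_dev'] = df_train['Device_Used'].map(rates)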

    df_browser = df_train.groupby('Browser_Used')['Is_Response'].agg(
        ['count', np.sum])
    df_browser.reset_index(inplace=True)
    # print (df_browser.columns.values)
    # df_browser.columns = df_browser.columns.get_level_values(0)
    df_browser['target_rate_browser'] = df_browser['sum'] / df_browser['count']
    df_browser = df_browser[['Browser_Used', 'target_rate_browser']]

    df_train = pd.merge(df_train,
                        df_browser,
                        how='left',
                        on='Browser_Used',
                        suffixes=('', ''))
    df_dev = pd.merge(df_dev,
                      df_browser,
                      how='left',
                      on='Browser_Used',
                      suffixes=('', ''))

    model_features = [
        'Description_Clean', 'Description_Clean_Adj', 'text_length',
        'word_count', 'target_rate_dev', 'target_rate_browser'
    ]
    X_train = df_train[model_features].to_numpy()
    X_dev = df_dev[model_features].to_numpy()
    y_train_tfidf = df_train['Is_Response'].to_numpy().astype(np.int64)
    y_dev_tfidf = df_dev['Is_Response'].to_numpy().astype(np.int64)

    # 3600, 4000, 3500
    # tfVect1 = TfidfVectorizer(max_features = 3600, ngram_range = (1,2))
    tfVect1 = TfidfVectorizer(max_features=3000, ngram_range=(1, 1))
    tfVect2 = TfidfVectorizer(max_features=4000, ngram_range=(2, 2))
    tfVect3 = TfidfVectorizer(max_features=3000, ngram_range=(1, 1))
    # tfVect3 = CountVectorizer()
    countVect = CountVectorizer()
    # tfVect = TfidfVectorizer(min_df = 5)
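
    # Above: three views of the text are vectorised and later stacked
    # side by side:
    # tfVect1 - unigram tf-idf on 'Description_Clean' (column 0),
    # tfVect2 - bigram tf-idf on the same column,
    # tfVect3 - unigram tf-idf on the 'Description_Clean_Adj' variant (column 1).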

    tfVect1.fit(X_train[:, 0])
    tfVect2.fit(X_train[:, 0])
    tfVect3.fit(X_train[:, 1])
    # countVect.fit(X_train[:, 0])
    # X_train_tfidf = countVect.transform(X_train[:, 0])
    # X_train_tfidf = hstack((tfVect2.transform(X_train[:, 1]), countVect.transform(X_train[:, 0])))
    X_train_tfidf = hstack(
        (tfVect1.transform(X_train[:, 0]), tfVect2.transform(X_train[:, 0]),
         tfVect3.transform(X_train[:, 1])))
    # X_train_tfidf = hstack((tfVect1.transform(X_train[:, 0]), tfVect3.transform(X_train[:, 1])))
    # X_train_tfidf = tfVect1.transform(X_train[:, 0])
    # X_train_tfidf = hstack((tfVect1.transform(X_train[:, 0]), tfVect2.transform(X_train[:, 0]), tfVect3.transform(X_train[:, 1]), countVect.transform(X_train[:, 1])))
    # truncatedsvd = TruncatedSVD(n_components = 100, random_state = 42)
    # truncatedsvd.fit(X_train_tfidf)
    X_train_tfidf = c_vars.add_feature(X_train_tfidf,
                                       X_train[:, 2].astype(np.float64))
    X_train_tfidf = c_vars.add_feature(X_train_tfidf,
                                       X_train[:, 3].astype(np.int64))
    # columns 4 and 5 are the target-rate fractions in [0, 1]; casting them
    # to int64 would zero them out, so they are kept as floats
    X_train_tfidf = c_vars.add_feature(X_train_tfidf,
                                       X_train[:, 4].astype(np.float64))
    X_train_tfidf = c_vars.add_feature(X_train_tfidf,
                                       X_train[:, 5].astype(np.float64))
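    # c_vars.add_feature is assumed to append one dense column to the sparse
    # feature matrix, along the lines of:
    #
    #   def add_feature(X, feature_to_add):
    #       return hstack([X, csr_matrix(feature_to_add).T], format='csr')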

    # X_train_tfidf, s_train, vt =  svds(X_train_tfidf, k = 600)
    # X_train_tfidf = X_train_tfidf * s_train
    # v = vt.T

    # write the vectors to a file
    '''
    with open('../analysis/svd_vectors.csv', 'w') as f:
        for i in range(truncatedsvd.explained_variance_.shape[0]):
            f.write(str(truncatedsvd.explained_variance_[i]) + ',' + str(truncatedsvd.explained_variance_ratio_[i]) + '\n')
    # sys.exit()
    '''

    # X_dev_tfidf = countVect.transform(X_dev[:, 0])
    # X_dev_tfidf = hstack((tfVect2.transform(X_dev[:, 1]), countVect.transform(X_dev[:, 0])))
    X_dev_tfidf = hstack(
        (tfVect1.transform(X_dev[:, 0]), tfVect2.transform(X_dev[:, 0]),
         tfVect3.transform(X_dev[:, 1])))
    # X_dev_tfidf = tfVect1.transform(X_dev[:, 0])
    # X_dev_tfidf = hstack((tfVect1.transform(X_dev[:, 0]), tfVect3.transform(X_dev[:, 1])))
    # X_dev_tfidf = hstack((tfVect1.transform(X_dev[:, 0]), tfVect2.transform(X_dev[:, 0]), tfVect3.transform(X_dev[:, 1]), countVect.transform(X_dev[:, 1])))
    # X_dev_tfidf = truncatedsvd.transform(X_dev_tfidf)
    X_dev_tfidf = c_vars.add_feature(X_dev_tfidf,
                                     X_dev[:, 2].astype(np.float64))
    X_dev_tfidf = c_vars.add_feature(X_dev_tfidf,
                                     X_dev[:, 3].astype(np.int64))
    # as above, keep the target-rate fractions as floats
    X_dev_tfidf = c_vars.add_feature(X_dev_tfidf,
                                     X_dev[:, 4].astype(np.float64))
    X_dev_tfidf = c_vars.add_feature(X_dev_tfidf,
                                     X_dev[:, 5].astype(np.float64))

    # X_dev_tfidf = X_dev_tfidf * v

    # scaler = StandardScaler(with_mean = False)
    # scaler.fit(X_train_tfidf)

    # X_train_tfidf = scaler.transform(X_train_tfidf)
    # X_dev_tfidf = scaler.transform(X_dev_tfidf)

    # print (X_train_tfidf)

    del X_train
    del df_train
    del X_dev
    del df_dev
    # del s_train
    gc.collect()

    param_dict = {
        'max_depth': [5, 18, 15],
        'n_estimators': [120, 300, 500],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 5, 10]
    }
    param_dict = {
        'colsample_bytree': [0.9],
        'learning_rate': [0.01, 0.025, 0.1],
        'max_depth': [3, 5],
        'min_child_weight': [5],
        'n_estimators': [10, 50],
        'reg_alpha': [0],
        'reg_lambda': [10],
        'subsample': [0.9]
    }
    param_dict = {'penalty': ['l2'], 'C': [1]}
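    # Each assignment above replaces the previous dict, so only this last
    # (logistic-regression) grid is actually searched; the random-forest and
    # XGBoost grids are kept for reference only.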
    param_space, param_to_int_dict = c_vars.get_param_space(param_dict)

    for param_list in param_space:
        # print (param_list)
        # clf = RandomForestClassifier(max_depth=param_list[param_to_int_dict['max_depth']],
        # n_estimators=param_list[param_to_int_dict['n_estimators']],
        # min_samples_split=param_list[param_to_int_dict['min_samples_split']],
        # min_samples_leaf=param_list[param_to_int_dict['min_samples_leaf']],
        # random_state = 42)
        # clf = XGBClassifier(colsample_bytree      = param_list[param_to_int_dict['colsample_bytree']],
        # learning_rate         = param_list[param_to_int_dict['learning_rate']],
        # max_depth             = param_list[param_to_int_dict['max_depth']],
        # min_child_weight      = param_list[param_to_int_dict['min_child_weight']],
        # n_estimators          = param_list[param_to_int_dict['n_estimators']],
        # reg_alpha             = param_list[param_to_int_dict['reg_alpha']],
        # reg_lambda            = param_list[param_to_int_dict['reg_lambda']],
        # subsample             = param_list[param_to_int_dict['subsample']])
        kf = StratifiedKFold(n_splits=4, shuffle=True)
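        # StratifiedKFold preserves the happy/unhappy class ratio in each of
        # the 4 folds, unlike the plain KFold used in the first example.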
        # clf = RandomForestClassifier(max_depth=6, n_estimators=100, min_samples_split=5, min_samples_leaf=5, random_state = 42)
        # clf = XGBClassifier(
        # colsample_bytree      = 0.6,
        # learning_rate         = 0.05,
        # max_depth             = 4,
        # min_child_weight      = 1,
        # n_estimators          = 20,
        # reg_alpha             = 0,
        # reg_lambda            = 10,
        # subsample             = 0.8,
        # )
        # clf = GradientBoostingClassifier(
        # max_depth        = 5,
        # n_estimators     = 40,
        # learning_rate    = 0.1,
        # random_state = 42)
        # clf = MLPClassifier(activation         = 'logistic',
        # hidden_layer_sizes = (200, 50, 10),
        # learning_rate      = 'invscaling',
        # max_iter           = 200,
        # solver             = 'adam',
        # random_state = 42)
        # clf = MultinomialNB(alpha = i)
        # clf = GaussianNB()
        # clf = SVC(C = i)
        clf = LogisticRegression(
            penalty=param_list[param_to_int_dict['penalty']],
            C=param_list[param_to_int_dict['C']])
        # if type(X_train_tfidf) is not np.ndarray:
        # X_train_tfidf = X_train_tfidf.toarray()
        # X_dev_tfidf = X_dev_tfidf.toarray()
        # clf = LogisticRegression(penalty = 'l2', C = i)
        # clf = CatBoostClassifier(iterations=50, learning_rate=0.03, depth=4, loss_function='Logloss', class_weights = [1/0.67,1])
        # if type(X_train_tfidf) is not np.ndarray:
        # X_train_tfidf = X_train_tfidf.toarray()
        # X_dev_tfidf = X_dev_tfidf.toarray()
        kf_index = 0
        for train_indices, test_indices in kf.split(X_train_tfidf,
                                                    y_train_tfidf):
            kf_index += 1
            # print (kf_index)
            # sparse output from hstack is COO, which cannot be row-sliced;
            # convert to CSR once, then index the fold rows
            if type(X_train_tfidf) is np.ndarray:
                X_train = X_train_tfidf[train_indices]
                X_val = X_train_tfidf[test_indices]
            else:
                X_csr = csr_matrix(X_train_tfidf)
                X_train = X_csr[safe_mask(X_csr, train_indices), :]
                X_val = X_csr[safe_mask(X_csr, test_indices), :]
            y_train = y_train_tfidf[train_indices]
            y_val = y_train_tfidf[test_indices]

            clf.fit(X_train, y_train)

            # val set
            y_pred = clf.predict(X_val)
            y_pred_proba = clf.predict_proba(X_val)[:, 1]
            y_pred_train = clf.predict(X_train)
            y_pred_proba_train = clf.predict_proba(X_train)[:, 1]
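            # predict_proba(...)[:, 1] is the positive-class probability,
            # which is what roc_auc_score expects as the score input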

            print('Train ' + 'CV ' + str(kf_index) + ' ,' + str(param_list) +
                  ' ,' + str(accuracy_score(y_train, y_pred_train)) + ',' +
                  str(roc_auc_score(y_train, y_pred_proba_train)))
            print('Val   ' + 'CV ' + str(kf_index) + ' ,' + str(param_list) +
                  ' ,' + str(accuracy_score(y_val, y_pred)) + ',' +
                  str(roc_auc_score(y_val, y_pred_proba)))

            # dev set
            y_pred = clf.predict(X_dev_tfidf)
            y_pred_proba = clf.predict_proba(X_dev_tfidf)[:, 1]

            print('Dev   ' + 'CV ' + str(kf_index) + ' ,' + str(param_list) +
                  ' ,' + str(accuracy_score(y_dev_tfidf, y_pred)) + ',' +
                  str(roc_auc_score(y_dev_tfidf, y_pred_proba)))

        # train on the whole set

        clf.fit(X_train_tfidf, y_train_tfidf)
        # dev set
        y_pred_train = clf.predict(X_train_tfidf)
        y_pred_proba_train = clf.predict_proba(X_train_tfidf)[:, 1]
        y_pred = clf.predict(X_dev_tfidf)
        y_pred_proba = clf.predict_proba(X_dev_tfidf)[:, 1]

        print('Train ' + str(param_list) + ' ,' +
              str(accuracy_score(y_train_tfidf, y_pred_train)) + ',' +
              str(roc_auc_score(y_train_tfidf, y_pred_proba_train)))
        print('Dev   ' + str(param_list) + ' ,' +
              str(accuracy_score(y_dev_tfidf, y_pred)) + ',' +
              str(roc_auc_score(y_dev_tfidf, y_pred_proba)))
        '''
        df_dev['y_dev'] = y_dev
        df_dev['y_pred'] = y_pred
        df_dev['y_pred_proba'] = y_pred_proba
        df_dev.to_csv('../analysis/dev_analysis.csv', index = False)
        '''
        '''
        values = X_train_tfidf.max(0).toarray()[0]
        # feature_names = np.hstack((np.array(tfVect1.get_feature_names()), np.array(tfVect2.get_feature_names()), np.array(tfVect3.get_feature_names())))
        feature_names = np.hstack((np.array(tfVect1.get_feature_names()), np.array(tfVect2.get_feature_names())))
        print (feature_names.shape)
        features_series = pd.DataFrame(values, index = feature_names)
        features_series['coefs'] = clf.coef_.reshape(-1, 1)
        features_series.to_csv('../analysis/features_series.csv')
        '''
        '''
        f = open('../analysis/corr.csv', 'w')
        for i in range(X_train_tfidf.shape[1]):
            f.write (str(features_series.index[i]) + ',' + str(features_series[i]) + ',' + str(i) + ',' + str(pearsonr(X_train_tfidf.toarray()[:,i], y_train)[0]) + '\n')
        f.close()
        '''
        # top_20 = features_series.nlargest(20)
        # bot_20 = features_series.nsmallest(20)
        # print ('bot_20')
        # print (bot_20)
        # print ('top_20')
        # print (top_20)

    # delete training data to clean up memory
    del X_train_tfidf
    del X_dev_tfidf
    del y_train
    del y_pred
    del y_pred_proba
    del y_pred_train
    del y_pred_proba_train
    gc.collect()