Beispiel #1
0
print(len(feat), 25)

# Width of the pre-trained GloVe vectors to load. Available files:
# 50 (171.4 MB), 100 (347.1 MB), 200 (693.4 MB), or 300 (1 GB).
GloveDimOption = '50'
# Integer form of the option above. Every hand-built vector below is sized
# from it, so changing GloveDimOption no longer leaves 50-wide entries behind.
glove_dim = int(GloveDimOption)
embeddings_index = loadGloveModel('data/glove.6B.' + GloveDimOption + 'd.txt')

# Special tokens: the empty string maps to all zeros, '*root' to all ones.
embeddings_index[''] = np.zeros(glove_dim)
embeddings_index['*root'] = np.ones(glove_dim)

# One-hot encode the 17 tag labels in X_pos (they look like Universal POS
# tags -- confirm against the tagger that produces them).
enc = CategoricalEncoder(encoding='onehot')
X_pos = [['ADJ'], ['ADP'], ['ADV'], ['AUX'], ['CCONJ'], ['DET'], ['INTJ'],
         ['NOUN'], ['NUM'], ['PART'], ['PRON'], ['PROPN'], ['PUNCT'],
         ['SCONJ'], ['SYM'], ['VERB'], ['X']]
enc.fit(X_pos)

# Encode all tags in one call instead of one transform per tag, zero-pad each
# row at the end to glove_dim so it matches the special-token vectors above,
# and store the row under its tag name.
pos_vectors = pad_sequences(enc.transform(X_pos).toarray(),
                            maxlen=glove_dim,
                            padding='post')
for i, pos_vector in zip(X_pos, pos_vectors):
    embeddings_index[i[0]] = pos_vector

# Two accumulators, both starting as empty lists (an earlier np.array-based
# initialisation is kept commented out below).
# NOTE(review): how they are filled is not visible in this excerpt.
feat_vect, transit_vect = [], []
# feat_vect = np.array(())
# transit_vect = np.array(())
for i in feat:
    #print(i)
    # Start every item with a fresh, empty array.
    # NOTE(review): the loop body appears to continue beyond this excerpt.
    sd = np.array(())
Beispiel #2
0
        X_categorical = X[:, :idx_end_categorical + 1]

        # Select only the numerical columns of X (including ft_embedding if present)
        X_numerical = X[:, idx_end_categorical + 1:]

        return X_categorical, X_numerical

    else:
        return np.zeros((X.shape[0], 0)), X


if __name__ == "__main__":
    # Smoke test: fit a one-hot encoder on the categorical columns of the
    # training data and print the resulting one-hot width.
    # nrows=10000 presumably caps the number of rows read -- confirm in
    # load_and_preprocess.
    df_train = load_and_preprocess('TrainingSet(3).csv', nrows=10000)

    # Run preprocess_categ_series over the categorical columns.
    X = df_train[colnames_categ].apply(preprocess_categ_series)

    enc = CategoricalEncoder(handle_unknown='ignore')
    enc.fit(X)

    # Encode a single row only to read off the number of one-hot columns.
    # The probe row comes from X (the preprocessed frame the encoder was
    # fitted on) rather than from the raw df_train columns, so fit and
    # transform see the same kind of values.
    len_onehot = enc.transform(X.iloc[:1]).toarray().shape[1]
    print(len_onehot)
# --- Random trees embedding -> logistic regression, chained in one pipeline.
rt = RandomTreesEmbedding(
    max_depth=3, n_estimators=n_estimator, random_state=0)
rt_lm = LogisticRegression()
pipeline = make_pipeline(rt, rt_lm)
pipeline.fit(X_train, y_train)

# Column 1 of predict_proba is used as the score for the ROC curve.
y_pred_rt = pipeline.predict_proba(X_test)[:, 1]
fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)

# --- Random forest leaf indices -> one-hot -> logistic regression.
# The forest and the encoder are fitted on X_train; the linear model is
# fitted on the separate X_train_lr / y_train_lr data.
rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
rf.fit(X_train, y_train)

rf_enc = CategoricalEncoder()
rf_enc.fit(rf.apply(X_train))

rf_lm = LogisticRegression()
rf_leaves_lr = rf.apply(X_train_lr)
rf_lm.fit(rf_enc.transform(rf_leaves_lr), y_train_lr)

rf_leaves_test = rf.apply(X_test)
y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf_leaves_test))[:, 1]
fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm)

# --- Gradient boosting leaf indices -> one-hot -> logistic regression.
# grd.apply() output is indexed with [:, :, 0], i.e. only the first entry of
# its last axis is kept before encoding.
grd = GradientBoostingClassifier(n_estimators=n_estimator)
grd.fit(X_train, y_train)

grd_enc = CategoricalEncoder()
grd_enc.fit(grd.apply(X_train)[:, :, 0])

grd_lm = LogisticRegression()
grd_leaves_lr = grd.apply(X_train_lr)[:, :, 0]
grd_lm.fit(grd_enc.transform(grd_leaves_lr), y_train_lr)

grd_leaves_test = grd.apply(X_test)[:, :, 0]
y_pred_grd_lm = grd_lm.predict_proba(grd_enc.transform(grd_leaves_test))[:, 1]
fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm)
# Unsupervised transformation based on totally random trees: the embedding
# and a logistic regression are chained in a pipeline and fitted together.
rt = RandomTreesEmbedding(max_depth=3,
                          n_estimators=n_estimator,
                          random_state=0)
rt_lm = LogisticRegression()
pipeline = make_pipeline(rt, rt_lm).fit(X_train, y_train)
y_pred_rt = pipeline.predict_proba(X_test)[:, 1]
fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)

# Supervised transformation based on random forests: leaf indices from the
# fitted forest are one-hot encoded and fed to a logistic regression that is
# trained on the separate X_train_lr / y_train_lr data.
rf = RandomForestClassifier(
    max_depth=3, n_estimators=n_estimator).fit(X_train, y_train)
rf_enc = CategoricalEncoder()
rf_enc.fit(rf.apply(X_train))
rf_lm = LogisticRegression().fit(
    rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)

y_pred_rf_lm = rf_lm.predict_proba(
    rf_enc.transform(rf.apply(X_test)))[:, 1]
fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm)

# Same scheme with gradient boosting; the apply() output is indexed with
# [:, :, 0] before it is encoded.
grd = GradientBoostingClassifier(
    n_estimators=n_estimator).fit(X_train, y_train)
grd_enc = CategoricalEncoder()
grd_enc.fit(grd.apply(X_train)[:, :, 0])
grd_lm = LogisticRegression().fit(
    grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)

y_pred_grd_lm = grd_lm.predict_proba(
    grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1]
fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm)