Example no. 1
enc = CategoricalEncoder(encoding='onehot-dense')

X_2 = np.array(X[:, 0].reshape(-1, 1))
Xq_2 = np.array(X_q[:, 0].reshape(-1, 1))
attributes = [dataset['attributes'][0][0]]

for i, (name, relation) in enumerate(dataset['attributes'][1:-1]):
    if relation == 'NUMERIC':
        X_2 = np.hstack((X_2, X[:, i + 1].reshape(-1, 1)))
        Xq_2 = np.hstack((Xq_2, X_q[:, i + 1].reshape(-1, 1)))
        attributes.append(name)
        continue

    X_2 = np.hstack((X_2, enc.fit_transform(X[:, i + 1].reshape(-1, 1))))
    Xq_2 = np.hstack((Xq_2, enc.transform(X_q[:, i + 1].reshape(-1, 1))))

    for category in enc.categories_[0]:
        attributes.append(category)

X = X_2.astype(float)
X_q = Xq_2.astype(float)

print('Num features: %d' % len(attributes))
print(attributes)

# We now have 51 features; for example, the feature 'entrepreneur' can take the value 0 or 1, 0 meaning the person is not an entrepreneur and 1 meaning they are.
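
# A minimal illustrative sketch of the 0/1 encoding described above, using
# OneHotEncoder as a stand-in for the dev-only CategoricalEncoder (assumption:
# a released scikit-learn is available); the toy 'job' column is made up.
import numpy as np
from sklearn.preprocessing import OneHotEncoder

job = np.array([['entrepreneur'], ['student'], ['entrepreneur'], ['retired']])
toy_enc = OneHotEncoder(handle_unknown='ignore')
print(toy_enc.fit_transform(job).toarray())   # one 0/1 indicator column per category
print(toy_enc.categories_[0])                 # ['entrepreneur' 'retired' 'student']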

# ### Most informative features
# Before we use PCA to remove some features, we will see which features are considered most informative by a Logistic Regression classifier.
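
# Hedged sketch (not the author's code) of ranking features by the magnitude of
# LogisticRegression coefficients; assumes `y` holds the class labels aligned with X.
import numpy as np
from sklearn.linear_model import LogisticRegression

lr_probe = LogisticRegression(max_iter=1000)
lr_probe.fit(X, y)                                      # X, attributes built above; y assumed
top = np.argsort(np.abs(lr_probe.coef_[0]))[::-1][:10]  # indices of the 10 largest |coef|
for idx in top:
    print(attributes[idx], lr_probe.coef_[0][idx])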
Example no. 2
GloveDimOption = '50'  # this  could be 50 (171.4 MB), 100 (347.1 MB), 200 (693.4 MB), or 300 (1 GB)
embeddings_index = loadGloveModel('data/glove.6B.' + GloveDimOption + 'd.txt')

# print(embeddings_index['apple'])
# print(embeddings_index['mango'])
embeddings_index[''] = np.zeros(50)
embeddings_index['*root'] = np.ones(50)
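# Assumption: '' acts as a padding token (all-zeros vector) and '*root' as a
# dedicated root token (all-ones vector), both in the same 50-dim space as the
# GloVe word vectors loaded above.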

enc = CategoricalEncoder(encoding='onehot')
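# The list below is the 17-tag Universal Dependencies POS tagset; each tag gets
# its own one-hot indicator column.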
X_pos = [['ADJ'], ['ADP'], ['ADV'], ['AUX'], ['CCONJ'], ['DET'], ['INTJ'],
         ['NOUN'], ['NUM'], ['PART'], ['PRON'], ['PROPN'], ['PUNCT'],
         ['SCONJ'], ['SYM'], ['VERB'], ['X']]
enc.fit(X_pos)

for i in X_pos:
    embeddings_index[i[0]] = pad_sequences(enc.transform([[i[0]]]).toarray(),
                                           maxlen=50,
                                           padding='post')[0]
    #embeddings_index[i[0]] = pad_sequences(enc.transform([[i[0]]]).toarray(), maxlen=18, padding='post')[0]
    # print(embeddings_index[i[0]])
    # print(embeddings_index['apple'])
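
# Hedged sketch of what the pad_sequences call above does: right-pad the 17-dim
# one-hot POS vector with zeros so it matches the 50-dim GloVe vectors
# (the toy vector below is illustrative only).
import numpy as np

onehot_example = np.zeros(17)
onehot_example[3] = 1.0                                                 # pretend one tag fired
padded_example = np.pad(onehot_example, (0, 50 - onehot_example.size))  # 'post'-style zero padding
print(padded_example.shape)                                             # (50,)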

feat_vect, transit_vect = [], []
# feat_vect = np.array(())
# transit_vect = np.array(())
for i in feat:
    #print(i)
    sd = np.array(())
    for w in i:
        # if(w in embeddings_index.keys()):
        # 	sd = np.concatenate(sd,embeddings_index[w])
Example no. 3

rt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator,
                          random_state=0)

rt_lm = LogisticRegression()
pipeline = make_pipeline(rt, rt_lm)
pipeline.fit(X_train, y_train)
y_pred_rt = pipeline.predict_proba(X_test)[:, 1]
fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)

# Supervised transformation based on random forests
rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
rf_enc = CategoricalEncoder()
rf_lm = LogisticRegression()
rf.fit(X_train, y_train)
rf_enc.fit(rf.apply(X_train))
rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)

y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]
fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm)

grd = GradientBoostingClassifier(n_estimators=n_estimator)
grd_enc = CategoricalEncoder()
grd_lm = LogisticRegression()
grd.fit(X_train, y_train)
grd_enc.fit(grd.apply(X_train)[:, :, 0])
grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)

y_pred_grd_lm = grd_lm.predict_proba(
    grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1]
fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm)
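
# Hedged sketch of what the .apply() calls above return: for every sample, the index
# of the leaf it lands in within each tree, shape (n_samples, n_estimators). One-hot
# encoding those leaf indices gives the binary features fed to LogisticRegression.
# The toy data and OneHotEncoder (stand-in for the dev-only CategoricalEncoder) are
# illustrative assumptions, not the author's setup.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder

X_toy, y_toy = make_classification(n_samples=100, random_state=0)
rf_toy = RandomForestClassifier(max_depth=3, n_estimators=5, random_state=0).fit(X_toy, y_toy)
leaves = rf_toy.apply(X_toy)                         # (100, 5) integer leaf indices
print(leaves.shape)
print(OneHotEncoder().fit_transform(leaves).shape)   # (100, total number of distinct leaves)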
Example no. 4
def extract_features(df_train,
                     df_inference,
                     selected_feature_names_categ,
                     selected_feature_names_interval,
                     shuffle=True,
                     fuzzy_matching=True,
                     use_onehot=True,
                     use_sentence_vec=False):
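    """Build feature matrices for the train and inference DataFrames.

    Categorical columns are either one-hot encoded (use_onehot=True; the encoder
    is fit with handle_unknown='ignore') or label-encoded over the combined
    train+inference vocabulary. Interval columns are cleaned, coerced to numeric
    and z-score normalised. Optionally, sentence embeddings are stacked on.
    Returns train_X, train_y, inference_row_id, inference_X, vocab_size,
    variable_types and features_to_use.
    """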

    features_to_use = []
    variable_types = []

    if not (use_onehot):
        for feature in selected_feature_names_categ:
            features_to_use.append(feature + '_encoded')
            variable_types.append('categorical_nominal')

    # Append interval AFTER categorical!!
    for feature in selected_feature_names_interval:
        features_to_use.append(feature + '_normed')
        variable_types.append('numerical')

    # Check to ensure all cols exist (avoid keyerrors)
    for df in [df_train, df_inference]:
        df[selected_feature_names_categ + selected_feature_names_interval]
        print(df['combined_str'])

    # for feature in selected_feature_names_categ:
    #     le = preprocessing.LabelEncoder()
    #     print(print_attr_overview(df[feature], True, topn=10))
    #     df[feature + '_encoded'] = le.fit_transform(df[feature])
    #     features_to_use.append(feature + '_encoded')

    if use_onehot:
        # Each Feature has its own vocab...
        vocabs = defaultdict(list)

        # X = pd.concat([df_train[colnames_categ], df_inference[colnames_categ]])
        X = df_train[colnames_categ]
        X = X.apply(preprocess_categ_series)

        enc = CategoricalEncoder(handle_unknown='ignore')
        enc.fit_transform(X)

        # pprint(enc.categories_)

    else:
        le = preprocessing.LabelEncoder()
        all_unique = []

        # FIT LABEL_ENCODER (combine vocabs for train and inference)
        for df in [df_train, df_inference]:
            for feature in selected_feature_names_categ:
                # print(print_attr_overview(df[feature]))

                s = df[feature]

                # Remove categorical entries with fewer than 12 occurrences
                a = s.value_counts()
                s[s.isin(a.index[a < 12])] = np.nan

                s[s.isnull()] = "EMPTY_PLACEHOLDER"
                s = s.map(lambda x: x.lower() if type(x) == str else x)
                # print(np.unique(df[feature]))
                all_unique.extend(np.unique(s))

        le.fit(all_unique)

        # TRANSFORM LABEL_ENCODER
        for df in [df_train, df_inference]:
            for feature in selected_feature_names_categ:
                print(feature)
                # print(df[feature])
                s = df[feature]

                s = s.map(lambda x: x.lower() if type(x) == str else x)
                df[feature + '_encoded'] = le.transform(s)
                print(feature, len(np.unique(s)))

    for df in [df_train, df_inference]:
        for feature in selected_feature_names_interval:
            s = df[feature]
            s = s.map(lambda x: x.replace(',', '') if type(x) == str else x)
            # print(s)
            s = pd.to_numeric(s, errors='coerce')

            # Set null values to zero
            # TODO: try set nan to the mean instead of zero
            # TODO: try different types of normalisation
            s[np.logical_not(s.notnull())] = 0.0

            df[feature + '_normed'] = norm_zscore(s)

    # features_to_use.append('sentence_vec')
    # variable_types.append('embedding')

    if use_sentence_vec:
        from ft_embedding import get_sentence_vec
        print('Computing sentence vectors for dataset')
        train_embedding_mat = np.asarray(
            [get_sentence_vec(x) for x in df_train['combined_str']])
        inference_embedding_mat = np.asarray(
            [get_sentence_vec(x) for x in df_inference['combined_str']])
        variable_types.append('ft_embedding')

    if use_onehot:
        print(features_to_use)

        # One-Hot Categorical Encoding
        train_X_onehot = enc.transform(df_train[colnames_categ]).toarray()
        train_X_interval = df_train[features_to_use].as_matrix()
        print(train_X_onehot.shape)
        print(train_X_interval.shape)
        train_X = np.hstack([train_X_onehot, train_X_interval])

        inference_X_onehot = enc.transform(
            df_inference[colnames_categ]).toarray()
        inference_X_interval = df_inference[features_to_use].as_matrix()
        print(inference_X_onehot.shape)
        print(inference_X_interval.shape)
        inference_X = np.hstack([inference_X_onehot, inference_X_interval])

        # Prepend a 'numerical' marker for each one-hot column so features_to_use
        # lines up with the column layout of the stacked matrix
        len_onehot = train_X_onehot.shape[1]
        print(len_onehot)
        features_to_use = ['numerical'
                           for i in range(len_onehot)] + features_to_use

    else:
        # Index Categorical Encoding (integer)
        train_X = df_train[features_to_use].as_matrix()
        inference_X = df_inference[features_to_use].as_matrix()

    train_y = df_train['case_status'].as_matrix()

    if use_sentence_vec:
        # Stack with sentence embedding
        train_X = np.hstack([train_X.copy(), train_embedding_mat])
        inference_X = np.hstack([inference_X.copy(), inference_embedding_mat])
        print(train_embedding_mat.shape)
        print(inference_embedding_mat.shape)

    print(train_X.shape)
    print(inference_X.shape)
    # exit()
    inference_row_id = df_inference['row ID']

    if shuffle:
        train_X, train_y = skl_shuffle(train_X, train_y)

    # print(X.shape)
    # print(y.shape)

    if use_onehot:
        vocab_size = 0
    else:
        vocab_size = len(list(le.classes_))

    return train_X, train_y, inference_row_id, inference_X, vocab_size, variable_types, features_to_use
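
# Hedged, self-contained sketch (not the author's pipeline): handle_unknown='ignore'
# makes categories unseen at fit time encode as all-zero rows instead of raising,
# which is why the encoder above can be fit on the training data alone. OneHotEncoder
# stands in for the dev-only CategoricalEncoder; the toy 'color' column is made up.
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

toy_train = pd.DataFrame({'color': ['red', 'blue', 'red']})
toy_infer = pd.DataFrame({'color': ['blue', 'green']})     # 'green' never seen in training
toy_enc = OneHotEncoder(handle_unknown='ignore').fit(toy_train)
print(toy_enc.transform(toy_infer).toarray())              # [[1. 0.] [0. 0.]] <- unknown row is all zeros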
Example no. 5
        X_categorical = X[:, :idx_end_categorical + 1]

        # Select only the numerical columns of X (including ft_embedding if present)
        X_numerical = X[:, idx_end_categorical + 1:]

        return X_categorical, X_numerical

    else:
        return np.zeros((X.shape[0], 0)), X


if __name__ == "__main__":
    df_train = load_and_preprocess('TrainingSet(3).csv', nrows=10000)
    # print(df_train.combined_str)

    # X = pd.concat([df_train[colnames_categ], df_inference[colnames_categ]])
    X = df_train[colnames_categ]
    X = X.apply(preprocess_categ_series)

    enc = CategoricalEncoder(handle_unknown='ignore')
    enc.fit(X)

    len_onehot = enc.transform(
        df_train[colnames_categ].iloc[:1]).toarray().shape[1]
    print(len_onehot)
    # train_X_onehot = enc.transform(df_train[colnames_categ]).toarray()
    # # inference_X_onehot = enc.transform(df_train[colnames_categ]).toarray()
    # print(train_X_onehot.shape)
    # print(train_X_onehot[0])
    # pprint(enc.categories_)
rt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator,
                          random_state=0)

rt_lm = LogisticRegression()
pipeline = make_pipeline(rt, rt_lm)
pipeline.fit(X_train, y_train)
y_pred_rt = pipeline.predict_proba(X_test)[:, 1]
fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)

# Supervised transformation based on random forests
rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
rf_enc = CategoricalEncoder()
rf_lm = LogisticRegression()
rf.fit(X_train, y_train)
rf_enc.fit(rf.apply(X_train))
rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)

y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]
fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm)

grd = GradientBoostingClassifier(n_estimators=n_estimator)
grd_enc = CategoricalEncoder()
grd_lm = LogisticRegression()
grd.fit(X_train, y_train)
grd_enc.fit(grd.apply(X_train)[:, :, 0])
grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)

y_pred_grd_lm = grd_lm.predict_proba(
    grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1]
fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm)
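
# Hedged sketch (assumption: matplotlib is available): plotting the ROC curves
# computed above; this snippet closely follows scikit-learn's "feature
# transformations with ensembles of trees" example, which ends with such a plot.
import matplotlib.pyplot as plt

plt.figure()
plt.plot([0, 1], [0, 1], 'k--', label='chance')
plt.plot(fpr_rt_lm, tpr_rt_lm, label='RT embedding + LR')
plt.plot(fpr_rf_lm, tpr_rf_lm, label='RF leaves + LR')
plt.plot(fpr_grd_lm, tpr_grd_lm, label='GBT leaves + LR')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='best')
plt.show()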