Example #1
def imbalance_preprocessing(data_df, label_df):
    # Oversample the minority class until positives and negatives are
    # roughly balanced, then shuffle the combined frame.
    data_df = pd.concat([data_df, label_df], axis=1)
    positive_instances = data_df[data_df['is_risk'] == 1]
    negative_instances = data_df[data_df['is_risk'] == 0]
    print('positive/negative instance counts:',
          len(positive_instances), len(negative_instances))

    # n is the integer imbalance ratio (at least 1)
    if len(positive_instances) > len(negative_instances):
        majority, minority = positive_instances, negative_instances
    else:
        majority, minority = negative_instances, positive_instances
    n = max(len(majority) // len(minority), 1)

    # Replicate the minority class n times (DataFrame.append was removed
    # in pandas 2.0, so build the result with pd.concat)
    all_instances = pd.concat([majority] + [minority] * n)
    print('all_instances len is', len(all_instances),
          'shape is', all_instances.shape)
    all_instances = skl_shuffle(all_instances)
    return all_instances.iloc[:, :-1], all_instances.iloc[:, -1]
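
A minimal usage sketch for this example, on hypothetical toy data (the imports are assumptions; the snippet above relies on the same two):

import pandas as pd
from sklearn.utils import shuffle as skl_shuffle

data_df = pd.DataFrame({'f1': range(10), 'f2': range(10, 20)})
label_df = pd.DataFrame({'is_risk': [1, 0, 0, 0, 0, 0, 0, 0, 0, 1]})

X, y = imbalance_preprocessing(data_df, label_df)
print(y.value_counts())  # the two classes are now roughly balanced (8 vs 8)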
Example #2
def randomize(self, table):
    return skl_shuffle(table, random_state=self.rand_seed)
Example #3
def shuffle(self, random_state=None):
    self._data, self._target = skl_shuffle(self._data, self._target,
                                           random_state=random_state)
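
Examples #2 and #3 both forward a random_state so the permutation is reproducible, and sklearn.utils.shuffle accepts several arrays at once and permutes them with the same index order, which is what keeps rows and labels aligned in Example #3. A small sketch (variable names are illustrative):

import numpy as np
from sklearn.utils import shuffle as skl_shuffle

data = np.arange(10).reshape(5, 2)
target = np.array([0, 1, 0, 1, 0])

# Both arrays receive the same permutation, so (row, label) pairs stay
# aligned; passing random_state fixes the permutation across runs.
data_s, target_s = skl_shuffle(data, target, random_state=42)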
Example #4
def extract_features(df_train,
                     df_inference,
                     selected_feature_names_categ,
                     selected_feature_names_interval,
                     shuffle=True,
                     fuzzy_matching=True,
                     use_onehot=True,
                     use_sentence_vec=False):

    features_to_use = []
    variable_types = []

    if not use_onehot:
        for feature in selected_feature_names_categ:
            features_to_use.append(feature + '_encoded')
            variable_types.append('categorical_nominal')

    # Append interval AFTER categorical!!
    for feature in selected_feature_names_interval:
        features_to_use.append(feature + '_normed')
        variable_types.append('numerical')

    # Fail fast: indexing raises a KeyError now if any required column
    # is missing from either frame
    for df in [df_train, df_inference]:
        _ = df[selected_feature_names_categ + selected_feature_names_interval]
        _ = df['combined_str']

    if use_onehot:
        # Fit the encoder on the combined train + inference vocabulary so
        # categories that only occur at inference time are still known
        X = pd.concat([df_train[selected_feature_names_categ],
                       df_inference[selected_feature_names_categ]])
        X = X.apply(preprocess_categ_series)

        # sklearn's pre-release CategoricalEncoder was split into
        # OneHotEncoder / OrdinalEncoder before release; OneHotEncoder is
        # the drop-in equivalent here
        enc = preprocessing.OneHotEncoder(handle_unknown='ignore')
        enc.fit(X)

    else:
        le = preprocessing.LabelEncoder()
        all_unique = []

        # FIT LABEL_ENCODER (combine vocabularies of train and inference)
        for df in [df_train, df_inference]:
            for feature in selected_feature_names_categ:
                s = df[feature].copy()

                # Replace categories with fewer than 12 occurrences by NaN
                counts = s.value_counts()
                s[s.isin(counts.index[counts < 12])] = np.nan

                s[s.isnull()] = "EMPTY_PLACEHOLDER"
                s = s.map(lambda x: x.lower() if isinstance(x, str) else x)

                # Write the cleaned column back so the transform step below
                # sees exactly the values the encoder was fitted on
                df[feature] = s
                all_unique.extend(np.unique(s))

        le.fit(all_unique)

        # TRANSFORM WITH LABEL_ENCODER
        for df in [df_train, df_inference]:
            for feature in selected_feature_names_categ:
                s = df[feature]  # already cleaned and lowercased above
                df[feature + '_encoded'] = le.transform(s)
                print(feature, 'unique values:', len(np.unique(s)))

    for df in [df_train, df_inference]:
        for feature in selected_feature_names_interval:
            s = df[feature]
            # Strip thousands separators before numeric conversion
            s = s.map(lambda x: x.replace(',', '') if isinstance(x, str) else x)
            s = pd.to_numeric(s, errors='coerce')

            # Set null values to zero
            # TODO: try setting NaN to the mean instead of zero
            # TODO: try different types of normalisation
            s = s.fillna(0.0)

            df[feature + '_normed'] = norm_zscore(s)

    if use_sentence_vec:
        from ft_embedding import get_sentence_vec
        print('Computing sentence vectors for dataset')
        train_embedding_mat = np.asarray(
            [get_sentence_vec(x) for x in df_train['combined_str']])
        inference_embedding_mat = np.asarray(
            [get_sentence_vec(x) for x in df_inference['combined_str']])
        variable_types.append('ft_embedding')

    if use_onehot:
        print(features_to_use)

        # One-hot categorical encoding; apply the same preprocessing as at
        # fit time so the category vocabularies match
        train_X_onehot = enc.transform(
            df_train[selected_feature_names_categ].apply(
                preprocess_categ_series)).toarray()
        train_X_interval = df_train[features_to_use].to_numpy()
        print(train_X_onehot.shape)
        print(train_X_interval.shape)
        train_X = np.hstack([train_X_onehot, train_X_interval])

        inference_X_onehot = enc.transform(
            df_inference[selected_feature_names_categ].apply(
                preprocess_categ_series)).toarray()
        inference_X_interval = df_inference[features_to_use].to_numpy()
        print(inference_X_onehot.shape)
        print(inference_X_interval.shape)
        inference_X = np.hstack([inference_X_onehot, inference_X_interval])

        # Record the one-hot columns (treated as numerical 0/1 features) in
        # variable_types so it stays aligned with the columns of train_X
        len_onehot = train_X_onehot.shape[1]
        print(len_onehot)
        variable_types = ['numerical'
                          for _ in range(len_onehot)] + variable_types

    else:
        # Index categorical encoding (integer codes); DataFrame.as_matrix
        # was removed in pandas 1.0, so use to_numpy instead
        train_X = df_train[features_to_use].to_numpy()
        inference_X = df_inference[features_to_use].to_numpy()

    train_y = df_train['case_status'].to_numpy()

    if use_sentence_vec:
        # Stack with the sentence embeddings computed above
        train_X = np.hstack([train_X, train_embedding_mat])
        inference_X = np.hstack([inference_X, inference_embedding_mat])
        print(train_embedding_mat.shape)
        print(inference_embedding_mat.shape)

    print(train_X.shape)
    print(inference_X.shape)
    inference_row_id = df_inference['row ID']

    if shuffle:
        train_X, train_y = skl_shuffle(train_X, train_y)

    if use_onehot:
        vocab_size = 0
    else:
        vocab_size = len(le.classes_)

    return (train_X, train_y, inference_row_id, inference_X, vocab_size,
            variable_types, features_to_use)
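
Example #4 has many moving parts, but the core pattern is: fit a one-hot encoder on the combined train + inference vocabulary, z-score the numeric columns, horizontally stack the results, and shuffle features and labels in unison. A condensed, self-contained sketch of that pattern (column names are hypothetical, and OneHotEncoder stands in for the pre-release CategoricalEncoder; inference_X would be built the same way from df_inference):

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import shuffle as skl_shuffle

df_train = pd.DataFrame({'color': ['red', 'blue', 'red'],
                         'amount': [1.0, 2.0, 4.0],
                         'case_status': [0, 1, 0]})
df_inference = pd.DataFrame({'color': ['blue', 'green'],
                             'amount': [3.0, 5.0]})

# Fit on the combined vocabulary so inference-only categories are known;
# handle_unknown='ignore' keeps transform safe for anything still unseen.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(pd.concat([df_train[['color']], df_inference[['color']]]))

def zscore(s):
    return (s - s.mean()) / s.std()

# One-hot block and z-scored numeric block, stacked column-wise
train_X = np.hstack([enc.transform(df_train[['color']]).toarray(),
                     zscore(df_train['amount']).to_numpy().reshape(-1, 1)])
train_y = df_train['case_status'].to_numpy()

# Shuffle rows and labels with the same permutation
train_X, train_y = skl_shuffle(train_X, train_y, random_state=0)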