import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import shuffle as skl_shuffle


def inblance_preprocessing(data_df, label_df):
    # Oversample the minority class so the two classes are roughly balanced.
    data_df = pd.concat([data_df, label_df], axis=1)
    positive_instances = data_df[data_df['is_risk'] == 1]
    negative_instances = data_df[data_df['is_risk'] == 0]
    print('positive_instances negative_instances len is ',
          len(positive_instances), len(negative_instances))

    # Replicate the minority class by the integer class ratio (at least once).
    if len(positive_instances) > len(negative_instances):
        majority, minority = positive_instances, negative_instances
    else:
        majority, minority = negative_instances, positive_instances
    n = max(len(majority) // len(minority), 1)

    # DataFrame.append was removed in pandas 2.0; build the frame with concat.
    all_instances = pd.concat([majority] + [minority] * n)
    print('all_instances len is ', len(all_instances),
          'shape is ', all_instances.shape)
    all_instances = skl_shuffle(all_instances)
    return all_instances.iloc[:, :-1], all_instances.iloc[:, -1]
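
# --- Usage sketch (hedged) ----------------------------------------------
# A minimal demonstration of inblance_preprocessing on synthetic data. The
# feature columns 'f0'/'f1' are invented for this example; only the label
# column 'is_risk' is assumed by the function itself.
def _demo_inblance_preprocessing():
    demo_data = pd.DataFrame({'f0': range(10), 'f1': range(10, 20)})
    demo_labels = pd.DataFrame({'is_risk': [1] + [0] * 9})
    X_bal, y_bal = inblance_preprocessing(demo_data, demo_labels)
    # The single positive row is replicated 9 times, so the returned
    # labels are roughly balanced.
    print(y_bal.value_counts())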
def randomize(self, table):
    # Shuffle rows with a fixed seed so runs are reproducible.
    return skl_shuffle(table, random_state=self.rand_seed)
def shuffle(self, random_state=None):
    # Shuffle data and target together so rows stay aligned with labels.
    self._data, self._target = skl_shuffle(
        self._data, self._target, random_state=random_state)
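
# --- Reproducibility sketch (hedged) ------------------------------------
# The class that owns randomize()/shuffle() is not shown in this section;
# this standalone snippet only demonstrates the property both methods rely
# on: sklearn.utils.shuffle is deterministic for a fixed random_state.
def _demo_seeded_shuffle():
    X = np.arange(10).reshape(5, 2)
    y = np.arange(5)
    X1, y1 = skl_shuffle(X, y, random_state=42)
    X2, y2 = skl_shuffle(X, y, random_state=42)
    assert (X1 == X2).all() and (y1 == y2).all()  # same seed, same order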
def extract_features(df_train, df_inference, selected_feature_names_categ,
                     selected_feature_names_interval, shuffle=True,
                     fuzzy_matching=True, use_onehot=True,
                     use_sentence_vec=False):
    features_to_use = []
    variable_types = []
    if not use_onehot:
        for feature in selected_feature_names_categ:
            features_to_use.append(feature + '_encoded')
            variable_types.append('categorical_nominal')
    # Append interval AFTER categorical!!
    for feature in selected_feature_names_interval:
        features_to_use.append(feature + '_normed')
        variable_types.append('numerical')

    # Check that all columns exist (avoid KeyErrors later).
    for df in [df_train, df_inference]:
        df[selected_feature_names_categ + selected_feature_names_interval]
        print(df['combined_str'])

    if use_onehot:
        # Each feature has its own vocab; fit the encoder on the union of
        # train and inference so no category is unseen at transform time.
        X = pd.concat([df_train[selected_feature_names_categ],
                       df_inference[selected_feature_names_categ]])
        X = X.apply(preprocess_categ_series)
        # sklearn's experimental CategoricalEncoder was folded into
        # OneHotEncoder; handle_unknown='ignore' keeps unseen values safe.
        enc = OneHotEncoder(handle_unknown='ignore')
        enc.fit(X)
    else:
        le = preprocessing.LabelEncoder()

        def clean_categ(s):
            # Mask categorical entries with fewer than 12 occurrences,
            # fill nulls with a placeholder, and lower-case strings.
            s = s.copy()
            counts = s.value_counts()
            s[s.isin(counts.index[counts < 12])] = np.nan
            s = s.fillna('EMPTY_PLACEHOLDER')
            return s.map(lambda x: x.lower() if isinstance(x, str) else x)

        # FIT LABEL_ENCODER (combine vocabs for train and inference).
        all_unique = []
        for df in [df_train, df_inference]:
            for feature in selected_feature_names_categ:
                all_unique.extend(np.unique(clean_categ(df[feature])))
        le.fit(all_unique)

        # TRANSFORM LABEL_ENCODER (apply the same cleaning as the fit pass
        # so le.transform never sees an unknown label).
        for df in [df_train, df_inference]:
            for feature in selected_feature_names_categ:
                s = clean_categ(df[feature])
                df[feature + '_encoded'] = le.transform(s)
                print(feature, len(np.unique(s)))

    for df in [df_train, df_inference]:
        for feature in selected_feature_names_interval:
            s = df[feature]
            # Strip thousands separators, then coerce to numeric.
            s = s.map(lambda x: x.replace(',', '') if isinstance(x, str) else x)
            s = pd.to_numeric(s, errors='coerce')
            # Set null values to zero.
            # TODO: try setting NaN to the mean instead of zero.
            # TODO: try different types of normalisation.
            s = s.fillna(0.0)
            df[feature + '_normed'] = norm_zscore(s)

    if use_sentence_vec:
        from ft_embedding import get_sentence_vec
        print('Computing sentence vectors for dataset')
        train_embedding_mat = np.asarray(
            [get_sentence_vec(x) for x in df_train['combined_str']])
        inference_embedding_mat = np.asarray(
            [get_sentence_vec(x) for x in df_inference['combined_str']])
        variable_types.append('ft_embedding')

    if use_onehot:
        print(features_to_use)
        # One-hot categorical encoding, stacked before the interval features.
        # Apply the same preprocessing at transform time as at fit time.
        train_X_onehot = enc.transform(
            df_train[selected_feature_names_categ]
            .apply(preprocess_categ_series)).toarray()
        train_X_interval = df_train[features_to_use].to_numpy()
        print(train_X_onehot.shape)
        print(train_X_interval.shape)
        train_X = np.hstack([train_X_onehot, train_X_interval])

        inference_X_onehot = enc.transform(
            df_inference[selected_feature_names_categ]
            .apply(preprocess_categ_series)).toarray()
        inference_X_interval = df_inference[features_to_use].to_numpy()
        print(inference_X_onehot.shape)
        print(inference_X_interval.shape)
        inference_X = np.hstack([inference_X_onehot, inference_X_interval])

        # Add the (one-hot encoded) columns to variable_types: each one-hot
        # column is treated as a numerical input downstream.
        len_onehot = train_X_onehot.shape[1]
        print(len_onehot)
        variable_types = ['numerical' for _ in range(len_onehot)] + variable_types
    else:
        # Index categorical encoding (integer codes).
        train_X = df_train[features_to_use].to_numpy()
        inference_X = df_inference[features_to_use].to_numpy()

    train_y = df_train['case_status'].to_numpy()

    if use_sentence_vec:
        # Stack with the sentence embeddings.
        train_X = np.hstack([train_X, train_embedding_mat])
        inference_X = np.hstack([inference_X, inference_embedding_mat])
        print(train_embedding_mat.shape)
        print(inference_embedding_mat.shape)
        print(train_X.shape)
        print(inference_X.shape)

    inference_row_id = df_inference['row ID']

    if shuffle:
        train_X, train_y = skl_shuffle(train_X, train_y)

    if use_onehot:
        vocab_size = 0
    else:
        vocab_size = len(list(le.classes_))

    return (train_X, train_y, inference_row_id, inference_X,
            vocab_size, variable_types, features_to_use)
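
# --- Usage sketch (hedged) ----------------------------------------------
# Illustrates the inputs extract_features expects. The feature columns
# ('colour', 'amount') are invented for this example; only 'combined_str',
# 'case_status' and 'row ID' are names the function itself assumes. This
# runs only alongside this module's own helpers (preprocess_categ_series,
# norm_zscore), which are defined elsewhere in the repo.
def _demo_extract_features():
    rng = np.random.RandomState(0)
    n = 40

    def make_df():
        return pd.DataFrame({
            'colour': rng.choice(['red', 'blue'], size=n),
            'amount': rng.randint(0, 100, size=n).astype(str),
            'combined_str': ['placeholder text'] * n,
            'case_status': rng.choice([0, 1], size=n),
            'row ID': np.arange(n),
        })

    train_X, train_y, row_id, inference_X, vocab_size, var_types, feats = \
        extract_features(make_df(), make_df(),
                         selected_feature_names_categ=['colour'],
                         selected_feature_names_interval=['amount'],
                         use_onehot=True)
    print(train_X.shape, inference_X.shape, vocab_size, feats)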