def _sklearn2weka(self, features, labels=None):
    encoder = CategoricalEncoder(encoding='ordinal')
    labels_nominal = encoder.fit_transform(np.array(labels).reshape(-1, 1))
    if not hasattr(self, '_dict') and labels is not None:
        labels_dict = {}
        for label, nominal in zip(labels, labels_nominal):
            if nominal.item(0) not in labels_dict:
                labels_dict[nominal.item(0)] = label
        self._dict = labels_dict
    labels_column = np.reshape(labels_nominal, [labels_nominal.shape[0], 1])
    weka_dataset = ndarray_to_instances(
        np.ascontiguousarray(features, dtype=np.float_), 'weka_dataset')
    weka_dataset.insert_attribute(
        Attribute.create_nominal('tag', [str(float(i)) for i in range(len(self._dict))]),
        features.shape[1])
    if labels is not None:
        for index, inst in enumerate(weka_dataset):
            inst.set_value(features.shape[1], labels_column[index])
            weka_dataset.set_instance(index, inst)
    return weka_dataset
def encode_cat(dat):
    """Return a labeled data frame with one-hot encoding."""
    cat_encoder = CategoricalEncoder(encoding='onehot-dense')
    dat = dat.astype('str')
    dat_reshaped = dat.values.reshape(-1, 1)
    dat_1hot = cat_encoder.fit_transform(dat_reshaped)
    col_names = [dat.name + '_' + str(x) for x in list(cat_encoder.categories_[0])]
    return pd.DataFrame(dat_1hot, columns=col_names)
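# A brief usage sketch (illustrative, not from the original source): applying
# encode_cat to a hypothetical pandas Series of categories.
import pandas as pd

colors = pd.Series(['red', 'green', 'red', 'blue'], name='color')
encoded = encode_cat(colors)
# encoded has columns color_blue, color_green, color_red holding 0/1 indicators
print(encoded.head())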
def train(self, d, report_dir=None, dropout=0.5, batch_size=32, epochs=5,
          validation_split=0., **params):
    d = d.sample(frac=1)
    x = d[[c for c in d.columns if c != self.target]]
    y = d[[self.target]]
    self.preprocessor = dict(
        x=StandardScaler(),
        y=CategoricalEncoder(encoding='onehot-dense'),
    )
    x = self.preprocessor['x'].fit_transform(x)
    y = self.preprocessor['y'].fit_transform(y)
    self.build(dropout=dropout)
    self.model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    callbacks_used = [callbacks.TerminateOnNaN()]
    if validation_split:
        callbacks_used += [
            callbacks.TensorBoard(report_dir, batch_size=batch_size, histogram_freq=1, write_grads=True),
            callbacks.ModelCheckpoint(os.path.join(report_dir, 'network.h5'), verbose=0, save_best_only=True)
        ]
    report = self.model.fit(x, y, batch_size=batch_size, epochs=epochs, verbose=0,
                            callbacks=callbacks_used, validation_split=validation_split,
                            validation_data=None, shuffle=True, class_weight=None,
                            sample_weight=None, initial_epoch=0, steps_per_epoch=None,
                            validation_steps=None)
    if not validation_split:
        models.save_model(self.model, os.path.join(report_dir, 'network.h5'))
    return {k: [float(_v) for _v in v] for k, v in report.history.items()}
def create_pipelines(housing_num):
    num_attribs = list(housing_num)
    cat_attribs = ["ocean_proximity"]
    num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('imputer', Imputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])
    cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs)),
        ('cat_encoder', CategoricalEncoder(encoding="onehot-dense")),
    ])
    return num_pipeline, cat_pipeline
def make_encode_pipeline():
    numerical_cols = NUMERICAL_COLS
    categorical_cols = CATEGORICAL_COLS + ['AgeBin', 'FamilyBin']
    cat_pipe = Pipeline([
        ('cat_selector', DataFrameSelector(categorical_cols)),
        ('cat_encoder', CategoricalEncoder('onehot-dense')),
    ])
    final_pipe = FeatureUnion([
        ('num_selector', DataFrameSelector(numerical_cols)),
        ('cat_pipe', cat_pipe),
    ])
    return final_pipe
def convert_categorical_features(df):
    enc = CategoricalEncoder(encoding='ordinal')
    encoded_features = enc.fit_transform(df[[
        'dim_is_requested', 'dim_market', 'dim_room_type', 'cancel_policy',
        'dim_is_instant_bookable'
    ]])
    encoded_df = pd.DataFrame(encoded_features, index=df.index, columns=[
        'dim_is_requested', 'dim_market', 'dim_room_type', 'cancel_policy',
        'dim_is_instant_bookable'
    ])
    col = df.columns.tolist()
    col_non_cat = col[1:3] + col[5:6] + col[7:10] + col[11:]
    df_non_cat = df[col_non_cat]
    col_cat = encoded_df.columns.tolist()
    col_full = col_cat[:] + col_non_cat[:]
    stack_full = np.column_stack([encoded_df, df_non_cat])
    stack_df = pd.DataFrame(stack_full, index=df.index, columns=col_full)
    return stack_df
def make_impute_pipeline():
    categorical_cols = CATEGORICAL_COLS
    numerical_cols = NUMERICAL_COLS
    categorical_pre = Pipeline([
        ('selector', DataFrameSelector(categorical_cols)),
        ('impute', CustomImputer(strategy='mode')),
    ])
    categorical_pipeline = Pipeline([
        ('categorical_pre', categorical_pre),
        ('encoder', CategoricalEncoder(encoding='onehot-dense')),
    ])
    num_init_quantile_transformer = QuantileTransformer(output_distribution='normal')
    numerical_pipeline = Pipeline([
        ('selector', DataFrameSelector(numerical_cols)),
        ('scale', num_init_quantile_transformer),
    ])
    combined_features = FeatureUnion([
        ('numerical_pipeline', numerical_pipeline),
        ('cat_ordinal_pipeline', categorical_pipeline),
    ])
    mice_pipeline = Pipeline([
        ('combined_features', combined_features),
        ('mice_impute', MICEImputer(verbose=True)),
    ])
    impute_pipeline = Pipeline([
        ('mice_pipeline', mice_pipeline),
        ('inverse_qt', SelectiveAction(
            col=list(range(len(numerical_cols))),
            action=FunctionTransformer(
                inverse_func,
                kw_args={'transformer': num_init_quantile_transformer}))),
        ('numerical_selection', ColumnSelector(range(len(numerical_cols)))),
    ])
    final_pipeline = FeatureUnion([
        ('impute_pipeline', impute_pipeline),
        ('categorical_pre', categorical_pre),
    ])
    return final_pipeline
# In[37]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1, 1))
housing_cat_1hot

# In[38]:
housing_cat_1hot.toarray()

# In[39]:
from sklearn.preprocessing import CategoricalEncoder

cat_encoder = CategoricalEncoder()
housing_cat_reshaped = housing_cat.values.reshape(-1, 1)
housing_cat_1hot = cat_encoder.fit_transform(housing_cat_reshaped)
housing_cat_1hot

# In[40]:
cat_encoder = CategoricalEncoder(encoding="onehot-dense")
housing_cat_1hot = cat_encoder.fit_transform(housing_cat_reshaped)
housing_cat_1hot

# In[41]:
cat_encoder.categories_

# In[42]:
X_q = np.array([[
    '66', 'retired', 'married', 'primary', 'no', '206', 'no', 'no',
    'cellular', '9', 'feb', '479', '1', '-1', '0', 'unknown'
], [
    '54', 'technician', 'married', 'tertiary', 'no', '876', 'no', 'no',
    'cellular', '27', 'oct', '269', '3', '541', '3', 'success'
]])

# ### Replacing nominal relations
# To use this data we must replace the nominal attributes. We can either encode labels
# with values between 0 and n_classes-1, or add n_classes dummy columns and drop the
# original column. Here we used the second option and replaced nominal columns with
# their one-hot versions, because ordinal encoding would imply that some categories are
# closer than others: an encoded value of 5 would be "closer" to category 4 than to
# category 1. The downside of one-hot encoding is that it greatly increases the number
# of features, but we can later use PCA to reduce them, or keep only the most
# informative ones.

# In[4]:
enc = CategoricalEncoder(encoding='onehot-dense')
X_2 = np.array(X[:, 0].reshape(-1, 1))
Xq_2 = np.array(X_q[:, 0].reshape(-1, 1))
attributes = [dataset['attributes'][0][0]]
for i, (name, relation) in enumerate(dataset['attributes'][1:-1]):
    if relation == 'NUMERIC':
        X_2 = np.hstack((X_2, X[:, i + 1].reshape(-1, 1)))
        Xq_2 = np.hstack((Xq_2, X_q[:, i + 1].reshape(-1, 1)))
        attributes.append(name)
        continue
    X_2 = np.hstack((X_2, enc.fit_transform(X[:, i + 1].reshape(-1, 1))))
    Xq_2 = np.hstack((Xq_2, enc.transform(X_q[:, i + 1].reshape(-1, 1))))
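# Illustrative sketch (not part of the original script) of the contrast described
# above, on a toy column, assuming the scikit-learn development version that still
# ships CategoricalEncoder:
import numpy as np
from sklearn.preprocessing import CategoricalEncoder

toy_month = np.array([['feb'], ['oct'], ['feb'], ['jan']])
ordinal = CategoricalEncoder(encoding='ordinal').fit_transform(toy_month)
# e.g. [[0.], [2.], [0.], [1.]] -- implies a spurious ordering between months
one_hot = CategoricalEncoder(encoding='onehot-dense').fit_transform(toy_month)
# e.g. [[1., 0., 0.], [0., 0., 1.], [1., 0., 0.], [0., 1., 0.]] -- no ordering, but more columns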
# - age: float.
# - fare: float.
# Categorical Features:
# - embarked: categories encoded as strings {'C', 'S', 'Q'}.
# - sex: categories encoded as strings {'female', 'male'}.
# - pclass: ordinal integers {1, 2, 3}.
numeric_features = ['age', 'fare']
categorical_features = ['embarked', 'sex', 'pclass']

# Provisionally, use pd.fillna() to impute missing values for categorical
# features; SimpleImputer will eventually support strategy="constant".
data[categorical_features] = data[categorical_features].fillna(value='missing')

# We create the preprocessing pipelines for both numeric and categorical data.
numeric_transformer = make_pipeline(SimpleImputer(), StandardScaler())
categorical_transformer = CategoricalEncoder('onehot-dense', handle_unknown='ignore')

preprocessing_pl = make_column_transformer(
    (numeric_features, numeric_transformer),
    (categorical_features, categorical_transformer),
    remainder='drop'
)

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = make_pipeline(preprocessing_pl, LogisticRegression())

X = data.drop('survived', axis=1)
y = data.survived.values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
for t in transition:
    transit.append(t)
# feat.append(features)
# transit.append(transition)

print(len(feat), 25)

GloveDimOption = '50'  # this could be 50 (171.4 MB), 100 (347.1 MB), 200 (693.4 MB), or 300 (1 GB)
embeddings_index = loadGloveModel('data/glove.6B.' + GloveDimOption + 'd.txt')

# print(embeddings_index['apple'])
# print(embeddings_index['mango'])
embeddings_index[''] = np.zeros(50)
embeddings_index['*root'] = np.ones(50)

enc = CategoricalEncoder(encoding='onehot')
X_pos = [['ADJ'], ['ADP'], ['ADV'], ['AUX'], ['CCONJ'], ['DET'], ['INTJ'],
         ['NOUN'], ['NUM'], ['PART'], ['PRON'], ['PROPN'], ['PUNCT'],
         ['SCONJ'], ['SYM'], ['VERB'], ['X']]
enc.fit(X_pos)
for i in X_pos:
    embeddings_index[i[0]] = pad_sequences(enc.transform([[i[0]]]).toarray(),
                                           maxlen=50, padding='post')[0]
    # embeddings_index[i[0]] = pad_sequences(enc.transform([[i[0]]]).toarray(), maxlen=18, padding='post')[0]
    # print(embeddings_index[i[0]])

# print(embeddings_index['apple'])
feat_vect, transit_vect = [], []
# feat_vect = np.array(())
def kfold_validation(self, k=10):
    sem.acquire()
    available_ram = psutil.virtual_memory()[1]
    available_ram = int(int(available_ram) * .9 * 1e-9)
    if available_ram > 5:
        jvm.start(max_heap_size='5g')
    else:
        jvm.start(max_heap_size=str(available_ram)+'g')
    ###
    print('\nLoading '+self.input_file+' with options -f'+str(self.features_number)+' -c'+self.classifier_name+'\n')
    # load .arff file
    dataset = arff.load(open(self.input_file, 'r'))
    data = np.array(dataset['data'])
    self.features_names = [x[0] for x in dataset['attributes']]
    self.attributes_number = data.shape[1]
    self.dataset_features_number = self.attributes_number - self.levels_number
    # Factorization of nominal features
    encoder = CategoricalEncoder(encoding='ordinal')
    nominal_features_index = [i for i in range(len(dataset['attributes'][:-self.levels_number]))
                              if dataset['attributes'][i][1] != u'NUMERIC']
    if len(nominal_features_index) > 0:
        data[:, nominal_features_index] = encoder.fit_transform(data[:, nominal_features_index])
    # Impute missing values by fitting over the training set and transforming both sets
    imp = SimpleImputer(missing_values='NaN', strategy='most_frequent')
    data[:, :self.dataset_features_number] = imp.fit_transform(data[:, :self.dataset_features_number])
    classifiers_per_fold = []
    oracles_per_fold = []
    predictions_per_fold = []
    predictions_per_fold_all = []
    print('\n***\nStart testing with '+str(k)+'-fold cross-validation -f'+str(self.features_number)+' -c'+self.classifier_name+'\n***\n')
    bar = progressbar.ProgressBar(maxval=k, widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
    bar.start()
    skf = StratifiedKFold(n_splits=k, shuffle=True)
    bar_cnt = 0
    for train_index, test_index in skf.split(data, data[:, self.attributes_number-1]):
        self.classifiers = []
        self.training_set = data[train_index, :self.dataset_features_number]
        self.testing_set = data[test_index, :self.dataset_features_number]
        self.ground_through = data[train_index, self.dataset_features_number:]
        self.oracle = data[test_index, self.dataset_features_number:]
        self.prediction = np.ndarray(shape=[len(test_index), self.levels_number], dtype='<U24')
        self.prediction_all = np.ndarray(shape=[len(test_index), self.levels_number], dtype='<U24')
        root = Tree()
        root.train_index = [i for i in range(self.training_set.shape[0])]
        root.test_index = [i for i in range(self.testing_set.shape[0])]
        root.test_index_all = root.test_index
        root.children_tags = list(set(self.ground_through[root.train_index, root.level]))
        root.children_number = len(root.children_tags)
        if self.has_config:
            if 'f' in config[root.tag + '_' + str(root.level + 1)]:
                root.features_number = config[root.tag + '_' + str(root.level + 1)]['f']
            elif 'p' in config[root.tag + '_' + str(root.level + 1)]:
                root.packets_number = config[root.tag + '_' + str(root.level + 1)]['p']
            root.classifier_name = config[root.tag + '_' + str(root.level + 1)]['c']
            print('config', 'tag', root.tag, 'level', root.level, 'f', root.features_number, 'c', root.classifier_name)
        else:
            root.features_number = self.features_number
            root.packets_number = self.packets_number
            root.classifier_name = self.classifier_name
        self.classifiers.append(root)
        if root.children_number > 1:
            classifier_to_call = getattr(self, supported_classifiers[root.classifier_name])
            classifier_to_call(node=root)
        else:
            self.unary_class_results_inferring(root)
        # Creating hierarchy recursively
        if root.level < self.levels_number-1 and root.children_number > 0:
            self.recursive(root)
        classifiers_per_fold.append(self.classifiers)
        oracles_per_fold.append(self.oracle)
        predictions_per_fold.append(self.prediction)
        predictions_per_fold_all.append(self.prediction_all)
        bar_cnt += 1
        bar.update(bar_cnt)
    bar.finish()

    folder_discriminator = self.classifier_name
    if self.has_config:
        folder_discriminator = self.config_name
    material_folder = './data_'+folder_discriminator+'/material/'
    if not os.path.exists('./data_'+folder_discriminator):
        os.makedirs('./data_'+folder_discriminator)
        os.makedirs(material_folder)
    elif not os.path.exists(material_folder):
        os.makedirs(material_folder)
    type_discr = 'flow'
    feat_discr = '_f_' + str(self.features_number)
    if not self.has_config and self.packets_number != 0:
        type_discr = 'early'
        feat_discr = '_p_' + str(self.packets_number)
    elif self.has_config:
        if 'p' in self.config:
            type_discr = 'early'
        feat_discr = '_c_' + self.config_name
    material_features_folder = './data_'+folder_discriminator+'/material/features/'
    if not os.path.exists(material_folder):
        os.makedirs(material_folder)
        os.makedirs(material_features_folder)
    elif not os.path.exists(material_features_folder):
        os.makedirs(material_features_folder)
    for i in range(self.levels_number):
        file = open(material_folder + 'multi_' + type_discr + '_level_' + str(i+1) + feat_discr + '.dat', 'w+')
        file.close()
        for j in range(k):
            file = open(material_folder + 'multi_' + type_discr + '_level_' + str(i+1) + feat_discr + '.dat', 'a')
            file.write('@fold\n')
            for o, p in zip(oracles_per_fold[j][:, i], predictions_per_fold[j][:, i]):
                file.write(str(o)+' '+str(p)+'\n')
            file.close()
    # Inferring NW metrics per classifier
    for classifier in classifiers_per_fold[0]:
        file = open(material_folder + 'multi_' + type_discr + '_level_' + str(classifier.level+1) + feat_discr + '_tag_' + str(classifier.tag) + '.dat', 'w+')
        file.close()
        file = open(material_folder + 'multi_' + type_discr + '_level_' + str(classifier.level+1) + feat_discr + '_tag_' + str(classifier.tag) + '_all.dat', 'w+')
        file.close()
        file = open(material_features_folder + 'multi_' + type_discr + '_level_' + str(classifier.level+1) + feat_discr + '_tag_' + str(classifier.tag) + '_features.dat', 'w+')
        file.close()
    for fold_n, classifiers in enumerate(classifiers_per_fold):
        for classifier in classifiers:
            file = open(material_folder + 'multi_' + type_discr + '_level_' + str(classifier.level+1) + feat_discr + '_tag_' + str(classifier.tag) + '.dat', 'a')
            if classifier.level > 0:
                index = []
                for pred_n, prediction in enumerate(predictions_per_fold[fold_n][classifier.test_index, classifier.level-1]):
                    if prediction == oracles_per_fold[fold_n][classifier.test_index[pred_n], classifier.level-1]:
                        index.append(classifier.test_index[pred_n])
                prediction_nw = predictions_per_fold[fold_n][index, classifier.level]
                oracle_nw = oracles_per_fold[fold_n][index, classifier.level]
            else:
                prediction_nw = predictions_per_fold[fold_n][classifier.test_index, classifier.level]
                oracle_nw = oracles_per_fold[fold_n][classifier.test_index, classifier.level]
            file.write('@fold\n')
            for o, p in zip(oracle_nw, prediction_nw):
                file.write(str(o)+' '+str(p)+'\n')
            file.close()
            file = open(material_folder + 'multi_' + type_discr + '_level_' + str(classifier.level+1) + feat_discr + '_tag_' + str(classifier.tag) + '_all.dat', 'a')
            if classifier.level > 0:
                index = []
                for pred_n, prediction in enumerate(predictions_per_fold_all[fold_n][classifier.test_index, classifier.level-1]):
                    if prediction == oracles_per_fold[fold_n][classifier.test_index[pred_n], classifier.level-1]:
                        index.append(classifier.test_index[pred_n])
                prediction_all = predictions_per_fold_all[fold_n][index, classifier.level]
                oracle_all = oracles_per_fold[fold_n][index, classifier.level]
            else:
                prediction_all = predictions_per_fold_all[fold_n][classifier.test_index_all, classifier.level]
                oracle_all = oracles_per_fold[fold_n][classifier.test_index_all, classifier.level]
            file.write('@fold\n')
            for o, p in zip(oracle_all, prediction_all):
                file.write(str(o)+' '+str(p)+'\n')
            file.close()
            file = open(material_features_folder + 'multi_' + type_discr + '_level_' + str(classifier.level+1) + feat_discr + '_tag_' + str(classifier.tag) + '_features.dat', 'a')
            file.write('@fold\n')
            file.write(self.features_names[classifier.features_index[0]])
            for feature_index in classifier.features_index[1:]:
                file.write(','+self.features_names[feature_index])
            file.write('\n')
            file.close()

    graph_folder = './data_'+folder_discriminator+'/graph/'
    if not os.path.exists('./data_'+folder_discriminator):
        os.makedirs('./data_'+folder_discriminator)
        os.makedirs(graph_folder)
    elif not os.path.exists(graph_folder):
        os.makedirs(graph_folder)
    # Graph plot
    G = nx.DiGraph()
    for info in classifiers_per_fold[0]:
        G.add_node(str(info.level)+' '+info.tag, level=info.level, tag=info.tag, children_tags=info.children_tags)
    for node_parent, data_parent in G.nodes.items():
        for node_child, data_child in G.nodes.items():
            if data_child['level']-data_parent['level'] == 1 and any(data_child['tag'] in s for s in data_parent['children_tags']):
                G.add_edge(node_parent, node_child)
    nx.write_gpickle(G, graph_folder+'multi_' + type_discr + feat_discr + '_graph.gml')
    ###
    jvm.stop()
    sem.release()
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Imputer
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import CategoricalEncoder

train = pd.read_csv('train.csv')
# Cabin allocations https://www.encyclopedia-titanica.org/cabins.html
# label = LabelBinarizer()
# hot = OneHotEncoder(handle_unknown='ignore', n_values=['male', 'female'])
categorical = CategoricalEncoder()
pipe = Pipeline([('Categorical Encoder', categorical)])
# pipe = Pipeline([('Label Binarizer', label)])
# train['Sex'] = train['Sex'].replace({'male': 0, 'female': 1})
# train.head()
# train.describe()
train = train[['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Age', 'Survived']]
pipe.fit(train)
imputer = Imputer()
train.describe()
encoding_methods = ['one-hot', 'target', 'similarity']
dirty_column = 'Employee Position Title'
#########################################################################

#########################################################################
# Creating a learning pipeline
# ----------------------------
# The encoders for both clean and dirty data are first imported:
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import CategoricalEncoder
from dirty_cat import SimilarityEncoder, TargetEncoder

encoders_dict = {
    'one-hot': CategoricalEncoder(handle_unknown='ignore', encoding='onehot-dense'),
    'similarity': SimilarityEncoder(similarity='ngram', handle_unknown='ignore'),
    'target': TargetEncoder(handle_unknown='ignore'),
    'numerical': FunctionTransformer(None)}

#########################################################################
# We then create a function that takes one key of our ``encoders_dict``,
# returns a pipeline object with the associated encoder,
# as well as a Scaler and a RidgeCV regressor:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


def make_pipeline(encoding_method):
    # static transformers from the other columns
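    # (Sketch continuation -- the original body is truncated at this point. The lines
    # below are an assumption of how it might proceed: they rely on a hypothetical
    # `clean_columns` dict mapping the remaining column names to keys of
    # `encoders_dict`, and use StandardScaler + RidgeCV as the scaler and regressor
    # named in the comment above.)
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import RidgeCV

    transformers = [(enc + '_' + col, encoders_dict[enc], [col])
                    for col, enc in clean_columns.items()]
    # the dirty column gets the encoder selected by `encoding_method`
    transformers += [(encoding_method, encoders_dict[encoding_method], [dirty_column])]
    return Pipeline([
        ('union', ColumnTransformer(transformers=transformers, remainder='drop')),
        ('scaler', StandardScaler(with_mean=False)),
        ('clf', RidgeCV()),
    ])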
                                                            y_train, test_size=0.5)

# Unsupervised transformation based on totally random trees
rt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator, random_state=0)
rt_lm = LogisticRegression()
pipeline = make_pipeline(rt, rt_lm)
pipeline.fit(X_train, y_train)
y_pred_rt = pipeline.predict_proba(X_test)[:, 1]
fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)

# Supervised transformation based on random forests
rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
rf_enc = CategoricalEncoder()
rf_lm = LogisticRegression()
rf.fit(X_train, y_train)
rf_enc.fit(rf.apply(X_train))
rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)

y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]
fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm)

grd = GradientBoostingClassifier(n_estimators=n_estimator)
grd_enc = CategoricalEncoder()
grd_lm = LogisticRegression()
grd.fit(X_train, y_train)
grd_enc.fit(grd.apply(X_train)[:, :, 0])
grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)
def kfold_validation(self, k=10):
    sem.acquire()
    available_ram = psutil.virtual_memory()[1]
    available_ram = int(int(available_ram) * .9 * 1e-9)
    if available_ram > 5:
        jvm.start(max_heap_size='5g')
    else:
        jvm.start(max_heap_size=str(available_ram)+'g')
    ###
    print('\nLoading '+self.input_file+' with options -f'+str(self.features_number)+' -c'+self.classifier_name+'\n')
    # load .arff file
    dataset = arff.load(open(self.input_file, 'r'))
    data = np.array(dataset['data'])
    self.features_names = [x[0] for x in dataset['attributes']]
    self.attributes_number = data.shape[1]
    self.dataset_features_number = self.attributes_number - self.levels_number
    # Factorization of nominal features
    encoder = CategoricalEncoder(encoding='ordinal')
    nominal_features_index = [i for i in range(len(dataset['attributes'][:-self.levels_number]))
                              if dataset['attributes'][i][1] != u'NUMERIC']
    if len(nominal_features_index) > 0:
        data[:, nominal_features_index] = encoder.fit_transform(data[:, nominal_features_index])
    # Impute missing values by fitting over the training set and transforming both sets
    imp = SimpleImputer(missing_values='NaN', strategy='most_frequent')
    data[:, :self.dataset_features_number] = imp.fit_transform(data[:, :self.dataset_features_number])
    prediction = []
    probability = []
    oracle = []
    print('\n***\nStart testing with ' + str(k)+'-fold cross-validation -f'+str(self.features_number)+' -c'+self.classifier_name+'\n***\n')
    bar = progressbar.ProgressBar(maxval=k, widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
    bar.start()
    temp_metrics = []
    skf = StratifiedKFold(n_splits=k, shuffle=True)
    bar_cnt = 0
    for train_index, test_index in skf.split(data, data[:, self.dataset_features_number + self.tag_under_test]):
        self.training_set = data[train_index, :self.dataset_features_number]
        self.testing_set = data[test_index, :self.dataset_features_number]
        self.ground_through = data[train_index, self.dataset_features_number + self.tag_under_test]
        self.oracle = data[test_index, self.dataset_features_number + self.tag_under_test]
        self.prediction = np.ndarray(shape=[len(test_index), 1], dtype='<U24')
        self.probability = np.ndarray(shape=[len(test_index), len(set(self.ground_through))], dtype='<U24')
        classifier_to_call = getattr(self, supported_classifiers[self.classifier_name])
        classifier_to_call()
        prediction.append(self.prediction)
        probability.append(self.probability)
        oracle.append(self.oracle)
        # print(type(prediction[bar_cnt]))
        # print(type(probability[bar_cnt]))
        bar_cnt += 1
        bar.update(bar_cnt)
    bar.finish()
    relations = []
    relations.append({  # Lv2:Lv1
        u'Tor': u'Tor',
        u'TorPT': u'Tor',
        u'TorApp': u'Tor',
        u'I2PApp80BW': u'I2P',
        u'I2PApp0BW': u'I2P',
        u'I2PApp': u'I2P',
        u'JonDonym': u'JonDonym'
    })
    relations.append({  # Lv3:Lv2
        u'JonDonym': u'JonDonym',
        u'I2PSNARK_App80BW': u'I2PApp80BW',
        u'IRC_App80BW': u'I2PApp80BW',
        u'Eepsites_App80BW': u'I2PApp80BW',
        u'I2PSNARK_App0BW': u'I2PApp0BW',
        u'IRC_App0BW': u'I2PApp0BW',
        u'Eepsites_App0BW': u'I2PApp0BW',
        u'I2PSNARK_App': u'I2PApp',
        u'IRC_App': u'I2PApp',
        u'Eepsites_App': u'I2PApp',
        u'ExploratoryTunnels_App': u'I2PApp',
        u'ParticipatingTunnels_App': u'I2PApp',
        u'Tor': u'Tor',
        u'Streaming': u'TorApp',
        u'Torrent': u'TorApp',
        u'Browsing': u'TorApp',
        u'Flashproxy': u'TorPT',
        u'FTE': u'TorPT',
        u'Meek': u'TorPT',
        u'Obfs3': u'TorPT',
        u'scramblesuit': u'TorPT'
    })
    oracle_inferred = []
    prediction_inferred = []
    for i in range(self.tag_under_test):
        oracle_inferred.append(list())
        prediction_inferred.append(list())
    # Inferring superior levels
    for i in range(k):
        # Assign the prediction to a dummy to use it in consecutive label swaps
        inferred_prediction = prediction[i].copy()
        inferred_oracle = oracle[i].copy()
        for j in reversed(range(self.tag_under_test)):
            inferred_oracle = np.vectorize(relations[j].get)(list(inferred_oracle))
            inferred_prediction = np.vectorize(relations[j].get)(list(inferred_prediction))
            oracle_inferred[j].append(inferred_oracle)
            prediction_inferred[j].append(inferred_prediction)
    print('\n***\nStart testing with incremental gamma threshold\n***\n')
    bar = progressbar.ProgressBar(maxval=9, widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
    bar.start()
    oracle_gamma = []
    prediction_gamma = []
    classified_ratio = []
    for i in range(9):
        gamma = float(i+1)/10.0
        oracle_gamma.append(list())
        prediction_gamma.append(list())
        classified_ratio.append(list())
        for j in range(k):
            indexes = []
            p_cnt = 0
            for p in probability[j]:
                if max(p) < gamma:
                    indexes.append(p_cnt)
                p_cnt += 1
            gamma_oracle = np.delete(oracle[j], [indexes])
            gamma_prediction = np.delete(prediction[j], [indexes])
            oracle_gamma[i].append(gamma_oracle)
            prediction_gamma[i].append(gamma_prediction)
            classified_ratio[i].append(float(len(gamma_prediction))/float(len(prediction[j])))
        bar.update(i)
    bar.finish()
    data_folder = './data_'+self.classifier_name+'/material/'
    if not os.path.exists('./data_'+self.classifier_name):
        os.makedirs('./data_'+self.classifier_name)
        os.makedirs(data_folder)
    elif not os.path.exists(data_folder):
        os.makedirs(data_folder)
    if self.packets_number != 0:
        file = open(data_folder+'flat_early_level_'+str(self.level_target) + '_p_'+str(self.packets_number)+'.dat', 'w+')
    else:
        file = open(data_folder+'flat_flow_level_'+str(self.level_target) + '_f_'+str(self.features_number)+'.dat', 'w+')
    for i in range(k):
        file.write('@fold\n')
        for o, p in zip(oracle[i], prediction[i]):
            file.write(str(o)+' '+str(p)+'\n')
    file.close()
    for i in range(self.tag_under_test):
        if self.packets_number != 0:
            file = open(data_folder+'flat_early_level_'+str(self.level_target) + '_p_'+str(self.packets_number)+'_inferred_'+str(i+1)+'.dat', 'w+')
        else:
            file = open(data_folder+'flat_flow_level_'+str(self.level_target) + '_f_'+str(self.features_number)+'_inferred_'+str(i+1)+'.dat', 'w+')
        for j in range(k):
            file.write('@fold\n')
            for o, p in zip(oracle_inferred[i][j], prediction_inferred[i][j]):
                file.write(str(o)+' '+str(p)+'\n')
        file.close()
    for i in range(9):
        if self.packets_number != 0:
            file = open(data_folder+'flat_early_level_'+str(self.level_target)+'_p_' + str(self.packets_number)+'_gamma_'+str(float(i+1)/10.0)+'.dat', 'w+')
        else:
            file = open(data_folder+'flat_flow_level_'+str(self.level_target)+'_f_' + str(self.features_number)+'_gamma_'+str(float(i+1)/10.0)+'.dat', 'w+')
        for j in range(k):
            file.write('@fold_cr\n')
            file.write(str(classified_ratio[i][j])+'\n')
            for o, p in zip(oracle_gamma[i][j], prediction_gamma[i][j]):
                file.write(str(o)+' '+str(p)+'\n')
        file.close()
    ###
    jvm.stop()
    sem.release()
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn import datasets
import numpy as np
from sklearn.preprocessing import CategoricalEncoder

# Read the data and remove labels
mushroom_data = pd.read_csv("../data/agaricus-lepiota.data", header=None)
print(mushroom_data.keys())
target = mushroom_data[[19]]
mushroom_data = mushroom_data.drop([19], axis=1)  # axis 1 for dropping columns

# Encode the nominal mushroom attributes (the encoder expects the data in
# fit_transform, not in its constructor)
fixed_mushrooms = CategoricalEncoder().fit_transform(mushroom_data)
print(type(fixed_mushrooms))

# Make Naive Bayes classifier
nb = GaussianNB()
# y_pred = nb.fit(mushroom_data.as_matrix(), target.as_matrix()).predict(mushroom_data)
# "communication" part (not initially present in the category as a unique word) # as well as the technician part of this category. ######################################################################### # Encoding categorical data using SimilarityEncoder # ------------------------------------------------- # # A typical data-science workflow uses one-hot encoding to represent # categories. from sklearn.preprocessing import CategoricalEncoder # encoding simply a subset of the observations n_obs = 20 employee_position_titles = values['Employee Position Title'].head( n_obs).to_frame() categorical_encoder = CategoricalEncoder(encoding='onehot-dense') one_hot_encoded = categorical_encoder.fit_transform(employee_position_titles) f3, ax3 = plt.subplots(figsize=(6, 6)) ax3.matshow(one_hot_encoded) ax3.set_title('Employee Position Title values, one-hot encoded') ax3.axis('off') f3.tight_layout() ######################################################################### # The corresponding is very sparse # # SimilarityEncoder can be used to replace one-hot encoding capturing the # similarities: f4, ax4 = plt.subplots(figsize=(6, 6)) similarity_encoded = similarity_encoder.fit_transform(employee_position_titles)
        X_categorical = X[:, :idx_end_categorical + 1]
        # Select only the numerical columns of X (including ft_embedding if present)
        X_numerical = X[:, idx_end_categorical + 1:]
        return X_categorical, X_numerical
    else:
        return np.zeros((X.shape[0], 0)), X


if __name__ == "__main__":
    df_train = load_and_preprocess('TrainingSet(3).csv', nrows=10000)
    # print(df_train.combined_str)
    # X = pd.concat([df_train[colnames_categ], df_inference[colnames_categ]])
    X = df_train[colnames_categ]
    X = X.apply(preprocess_categ_series)
    enc = CategoricalEncoder(handle_unknown='ignore')
    enc.fit(X)
    len_onehot = enc.transform(df_train[colnames_categ].iloc[:1]).toarray().shape[1]
    print(len_onehot)
    # train_X_onehot = enc.transform(df_train[colnames_categ]).toarray()
    # inference_X_onehot = enc.transform(df_train[colnames_categ]).toarray()
    # print(train_X_onehot.shape)
    # print(train_X_onehot[0])
    # pprint(enc.categories_)
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier

train = titanic_src.utils.read_csv('data/interim/added_features_train.csv', index_col=0)
y = train['Survived'].as_matrix()

cat_nominal_cols = ['Title', 'IsAlone', 'Sex', 'Embarked', 'HighestDeck']
cat_ordinal_cols = ['Pclass']
numerical_cols = ['Age', 'SibSp', 'Fare', 'FamilySize', 'NumOfCabins']

cat_nominal_pipeline = Pipeline([
    ('selector', tf.DataFrameSelector(cat_nominal_cols)),
    ('impute', tf.CustomImputer(strategy='mode')),
    ('encoder', CategoricalEncoder(encoding='onehot-dense')),
])

cat_ordinal_pipeline = Pipeline([
    ('selector', tf.DataFrameSelector(cat_ordinal_cols)),
    ('impute', tf.CustomImputer(strategy='mode')),
    ('encode_scale', QuantileTransformer(output_distribution='normal')),
])

num_init_quantile_transformer = QuantileTransformer(output_distribution='normal')


def inverse_func(X):
    return num_init_quantile_transformer.inverse_transform(X)
# ---Pipelines with DataFrameSelector
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]
new_df = housing[cat_attribs]
# print(new_df)

# num_pipeline = Pipeline([
#     ('selector', DataFrameSelector(num_attribs)),
#     ('imputer', Imputer(strategy="median")),
#     ('attribs_adder', CombinedAttributesAdder()),
#     ('std_scaler', StandardScaler())
# ])

cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('cat_encoder', CategoricalEncoder(encoding="onehot-dense"))])

# ---Combining Pipelines with sklearn class FeatureUnion
# Give a list of transformers or pipelines.
from sklearn.pipeline import FeatureUnion

# full_pipeline = FeatureUnion(transformer_list=[
#     ("num_pipeline", num_pipeline),
#     ("cat_pipeline", cat_pipeline)
# ])
#
# Run entire pipeline:
# housing_prepared = full_pipeline.fit_transform(housing)

# from sklearn.linear_model import LinearRegression
#
# lin_reg = LinearRegression()
# lin_reg.fit(housing_prepared, housing_labels)
#
# data.subset(500, 50, 50)

# Step 9: Specify the features to use; this part is merely for sklearn.
features = ClassifierFeatures()
# features.add('headline', TfidfVectorizer(tokenizer=TextTokenizer.tokenizeText, lowercase=False, analyzer='word', ngram_range=(1, 1), min_df=1), 'headline'),  # , max_features=100000)),
# features.add('headline_words', TfidfVectorizer(tokenizer=TextTokenizer.tokenizeText, lowercase=False, analyzer='word', ngram_range=(1, 1), min_df=1), 'headline'),  # , max_features=100000)),
# features.add('headline1_words', TfidfVectorizer(tokenizer=TextTokenizer.tokenizeText, lowercase=False, analyzer='word', ngram_range=(1, 1), min_df=1), 'headline1'),  # , max_features=100000)),
# features.add('headline2_words', TfidfVectorizer(tokenizer=TextTokenizer.tokenizeText, lowercase=False, analyzer='word', ngram_range=(1, 1), min_df=1), 'headline2'),  # , max_features=100000)),
# features.add('keyword_words', TfidfVectorizer(tokenizer=TextTokenizer.tokenizeText, lowercase=False, analyzer='word', ngram_range=(1, 1), min_df=1), 'keyword'),  # , max_features=100000)),
# features.add('term_words', TfidfVectorizer(tokenizer=TextTokenizer.tokenizeText, lowercase=False, analyzer='word', ngram_range=(1, 2), min_df=1), 'term'),  # , max_features=100000)),
# features.add('display_url', TfidfVectorizer(tokenizer=TextTokenizer.tokenizeText, lowercase=False, analyzer='word', ngram_range=(1, 1), min_df=1), 'display_url'),  # , max_features=100000)),
features.add('quality_score', StandardScaler(), 'quality_score'),
features.add('avg_position', StandardScaler(), 'avg_position'),
features.add('avg_cost', StandardScaler(), 'avg_cost'),
features.add('device', CategoricalEncoder(), 'device'),
features.add('ad_placement', CategoricalEncoder(), 'ad_placement'),
features.add('ad_type', CategoricalEncoder(), 'ad_type'),
features.add('match_type', CategoricalEncoder(), 'match_type'),

# Step 10: Specify the classifier you want to use (additionally!)
new_classifier = LogisticRegression()
# new_classifier = SVR(kernel='linear')
# new_classifier = LinearRegression(fit_intercept=True, normalize=True)
# new_classifier = Ridge(alpha=.5)

if options.args.print_details >= 2:
    printer.labelDistribution(data.Y_train, 'Training Set')

# Step 11: Run our system.
if len(data.labels) > 1:  # otherwise, there is nothing to train
# - embarked: categories encoded as strings {'C', 'S', 'Q'}.
# - sex: categories encoded as strings {'female', 'male'}.
# - pclass: ordinal integers {1, 2, 3}.

# We create the preprocessing pipelines for both numeric and categorical data.
numeric_features = ['age', 'fare']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['embarked', 'sex', 'pclass']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', CategoricalEncoder('onehot-dense', handle_unknown='ignore'))])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
], remainder='drop')

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression())])

X = data.drop('survived', axis=1)
y = data['survived']
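# A short usage sketch (not part of the original excerpt): the full pipeline can be
# fit and scored like any other estimator. train_test_split and the random_state value
# are assumptions added here for illustration.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))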
                                                          test_size=0.5)

# Unsupervised transformation based on totally random trees
rt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator, random_state=0)
rt_lm = LogisticRegression()
pipeline = make_pipeline(rt, rt_lm)
pipeline.fit(X_train, y_train)
y_pred_rt = pipeline.predict_proba(X_test)[:, 1]
fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)

# Supervised transformation based on random forests
rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
rf_enc = CategoricalEncoder()
rf_lm = LogisticRegression()
rf.fit(X_train, y_train)
rf_enc.fit(rf.apply(X_train))
rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)

y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]
fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm)

grd = GradientBoostingClassifier(n_estimators=n_estimator)
grd_enc = CategoricalEncoder()
grd_lm = LogisticRegression()
grd.fit(X_train, y_train)
grd_enc.fit(grd.apply(X_train)[:, :, 0])
grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)
def extract_features(df_train, df_inference, selected_feature_names_categ,
                     selected_feature_names_interval, shuffle=True,
                     fuzzy_matching=True, use_onehot=True, use_sentence_vec=False):
    features_to_use = []
    variable_types = []
    if not use_onehot:
        for feature in selected_feature_names_categ:
            features_to_use.append(feature + '_encoded')
            variable_types.append('categorical_nominal')
    # Append interval AFTER categorical!!
    for feature in selected_feature_names_interval:
        features_to_use.append(feature + '_normed')
        variable_types.append('numerical')
    # Check to ensure all cols exist (avoid KeyErrors)
    for df in [df_train, df_inference]:
        df[selected_feature_names_categ + selected_feature_names_interval]
        print(df['combined_str'])
    # for feature in selected_feature_names_categ:
    #     le = preprocessing.LabelEncoder()
    #     print(print_attr_overview(df[feature], True, topn=10))
    #     df[feature + '_encoded'] = le.fit_transform(df[feature])
    #     features_to_use.append(feature + '_encoded')
    if use_onehot:
        # Each feature has its own vocab...
        vocabs = defaultdict(list)
        X = pd.concat([df_train[colnames_categ], df_inference[colnames_categ]])
        X = df_train[colnames_categ]
        X = X.apply(preprocess_categ_series)
        enc = CategoricalEncoder(handle_unknown='ignore')
        enc.fit_transform(X)
        # pprint(enc.categories_)
    else:
        le = preprocessing.LabelEncoder()
        all_unique = []
        # FIT LABEL_ENCODER (combine vocabs for train and inference)
        for df in [df_train, df_inference]:
            for feature in selected_feature_names_categ:
                # print(print_attr_overview(df[feature]))
                s = df[feature]
                # Remove categorical entries with less than 10 occurrences
                a = s.value_counts()
                s[s.isin(a.index[a < 12])] = np.nan
                s[s.isnull()] = "EMPTY_PLACEHOLDER"
                s = s.map(lambda x: x.lower() if type(x) == str else x)
                # print(np.unique(df[feature]))
                all_unique.extend(np.unique(s))
        le.fit(all_unique)
        # TRANSFORM LABEL_ENCODER
        for df in [df_train, df_inference]:
            for feature in selected_feature_names_categ:
                print(feature)
                # print(df[feature])
                s = df[feature]
                s = s.map(lambda x: x.lower() if type(x) == str else x)
                df[feature + '_encoded'] = le.transform(s)
                print(feature, len(np.unique(s)))
    for df in [df_train, df_inference]:
        for feature in selected_feature_names_interval:
            s = df[feature]
            s = s.map(lambda x: x.replace(',', '') if type(x) == str else x)
            # print(s)
            s = pd.to_numeric(s, errors='coerce')
            # Set null values to zero
            # TODO: try setting nan to the mean instead of zero
            # TODO: try different types of normalisation
            s[np.logical_not(s.notnull())] = 0.0
            df[feature + '_normed'] = norm_zscore(s)
    # features_to_use.append('sentence_vec')
    # variable_types.append('embedding')
    if use_sentence_vec:
        from ft_embedding import get_sentence_vec
        print('Computing sentence vectors for dataset')
        train_embedding_mat = np.asarray(
            [get_sentence_vec(x) for x in df_train['combined_str']])
        inference_embedding_mat = np.asarray(
            [get_sentence_vec(x) for x in df_inference['combined_str']])
        variable_types.append('ft_embedding')
    if use_onehot:
        print(features_to_use)
        # One-hot categorical encoding
        train_X_onehot = enc.transform(df_train[colnames_categ]).toarray()
        train_X_interval = df_train[features_to_use].as_matrix()
        print(train_X_onehot.shape)
        print(train_X_interval.shape)
        train_X = np.hstack([train_X_onehot, train_X_interval])
        inference_X_onehot = enc.transform(df_inference[colnames_categ]).toarray()
        inference_X_interval = df_inference[features_to_use].as_matrix()
        print(inference_X_onehot.shape)
        print(inference_X_interval.shape)
        inference_X = np.hstack([inference_X_onehot, inference_X_interval])
        # Add (one-hot encoded) numerical features to variable_types
        len_onehot = train_X_onehot.shape[1]
        print(len_onehot)
        features_to_use = ['numerical' for i in range(len_onehot)] + features_to_use
    else:
        # Index categorical encoding (integer)
        train_X = df_train[features_to_use].as_matrix()
        inference_X = df_inference[features_to_use].as_matrix()
    train_y = df_train['case_status'].as_matrix()
    if use_sentence_vec:
        # Stack with sentence embedding
        train_X = np.hstack([train_X.copy(), train_embedding_mat])
        inference_X = np.hstack([inference_X.copy(), inference_embedding_mat])
        print(train_embedding_mat.shape)
        print(inference_embedding_mat.shape)
    print(train_X.shape)
    print(inference_X.shape)
    # exit()
    inference_row_id = df_inference['row ID']
    if shuffle:
        train_X, train_y = skl_shuffle(train_X, train_y)
    # print(X.shape)
    # print(y.shape)
    if use_onehot:
        vocab_size = 0
    else:
        vocab_size = len(list(le.classes_))
    return train_X, train_y, inference_row_id, inference_X, vocab_size, variable_types, features_to_use