Example 1
	def _sklearn2weka(self, features, labels=None):

		# Encode the string labels as ordinal codes so they can be stored in a nominal Weka attribute.
		encoder = CategoricalEncoder(encoding='ordinal')
		labels_nominal = encoder.fit_transform(np.array(labels).reshape(-1, 1))

		# Remember the code -> original label mapping the first time labels are seen.
		if not hasattr(self, '_dict') and labels is not None:

			label_map = {}

			for label, nominal in zip(labels, labels_nominal):
				if nominal.item(0) not in label_map:
					label_map[nominal.item(0)] = label

			self._dict = label_map

		labels_column = np.reshape(labels_nominal,[labels_nominal.shape[0], 1])

		weka_dataset = ndarray_to_instances(np.ascontiguousarray(features, dtype=np.float_), 'weka_dataset')
		weka_dataset.insert_attribute(Attribute.create_nominal('tag', [str(float(i)) for i in range(len(self._dict))]), features.shape[1])

		if labels is not None:
			for index, inst in enumerate(weka_dataset):
				inst.set_value(features.shape[1], labels_column[index])
				weka_dataset.set_instance(index,inst)

		return weka_dataset
Example 2
def encode_cat(dat):
    cat_encoder = CategoricalEncoder(encoding='onehot-dense')
    dat = dat.astype('str')
    dat_reshaped = dat.values.reshape(-1, 1)
    dat_1hot = cat_encoder.fit_transform(dat_reshaped)
    col_names = [
        dat.name + '_' + str(x) for x in list(cat_encoder.categories_[0])
    ]
    return pd.DataFrame(dat_1hot, columns=col_names)
Example 3
def encode_cat(dat):
    """ functon to return a labeled data frame with one hot encoding """
    cat_encoder = CategoricalEncoder(encoding="onehot-dense")
    dat = dat.astype('str')
    dat_reshaped = dat.values.reshape(-1, 1)
    dat_1hot = cat_encoder.fit_transform(dat_reshaped)
    col_names = [
        dat.name + "_" + str(x) for x in list(cat_encoder.categories_[0])
    ]
    return pd.DataFrame(dat_1hot, columns=col_names)
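# A hypothetical usage sketch (not part of the original example), assuming the
# dev-branch CategoricalEncoder import used throughout these snippets:
import pandas as pd

colors = pd.Series(['red', 'blue', 'red'], name='color')
print(encode_cat(colors))
# expected columns: color_blue, color_red (one indicator column per category)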
Example 4
    def train(self,
              d,
              report_dir=None,
              dropout=0.5,
              batch_size=32,
              epochs=5,
              validation_split=0.,
              **params):
        d = d.sample(frac=1)
        x = d[[c for c in d.columns if c != self.target]]
        y = d[[self.target]]

        self.preprocessor = dict(
            x=StandardScaler(),
            y=CategoricalEncoder(encoding='onehot-dense'),
        )

        x = self.preprocessor['x'].fit_transform(x)
        y = self.preprocessor['y'].fit_transform(y)

        self.build(dropout=dropout)

        self.model.compile(optimizer='adam',
                           loss='categorical_crossentropy',
                           metrics=['accuracy'])

        callbacks_used = [callbacks.TerminateOnNaN()]

        if validation_split:
            callbacks_used += [
                callbacks.TensorBoard(report_dir,
                                      batch_size=batch_size,
                                      histogram_freq=1,
                                      write_grads=True),
                callbacks.ModelCheckpoint(os.path.join(report_dir,
                                                       'network.h5'),
                                          verbose=0,
                                          save_best_only=True)
            ]

        report = self.model.fit(x,
                                y,
                                batch_size=batch_size,
                                epochs=epochs,
                                verbose=0,
                                callbacks=callbacks_used,
                                validation_split=validation_split,
                                validation_data=None,
                                shuffle=True,
                                class_weight=None,
                                sample_weight=None,
                                initial_epoch=0,
                                steps_per_epoch=None,
                                validation_steps=None)

        if not validation_split:
            models.save_model(self.model, os.path.join(report_dir,
                                                       'network.h5'))

        return {k: [float(_v) for _v in v] for k, v in report.history.items()}
Example 5
def create_pipelines(housing_num):
    num_attribs = list(housing_num)
    cat_attribs = ["ocean_proximity"]
    num_pipeline = Pipeline([('selector', DataFrameSelector(num_attribs)),
                             ('imputer', Imputer(strategy="median")),
                             ('attribs_adder', CombinedAttributesAdder()),
                             ('std_caller', StandardScaler())])

    cat_pipeline = Pipeline([('selector', DataFrameSelector(cat_attribs)),
                             ('cat_encoder',
                              CategoricalEncoder(encoding="onehot-dense"))])
    return num_pipeline, cat_pipeline
Example 6
def make_encode_pipeline():
    numerical_cols = NUMERICAL_COLS
    categorical_cols = CATEGORICAL_COLS + ['AgeBin', 'FamilyBin']

    cat_pipe = Pipeline([
        ('cat_selector', DataFrameSelector(categorical_cols)),
        ('cat_encoder', CategoricalEncoder('onehot-dense')),
    ])
    final_pipe = FeatureUnion([
        ('num_selector', DataFrameSelector(numerical_cols)),
        ('cat_pipe', cat_pipe),
    ])
    return final_pipe
Example 7
def convert_categorical_features(df):
    enc = CategoricalEncoder(encoding='ordinal')
    encoded_features = enc.fit_transform(df[[
        'dim_is_requested', 'dim_market', 'dim_room_type', 'cancel_policy',
        'dim_is_instant_bookable'
    ]])

    encoded_df = pd.DataFrame(encoded_features,
                              index=df.index,
                              columns=[
                                  'dim_is_requested', 'dim_market',
                                  'dim_room_type', 'cancel_policy',
                                  'dim_is_instant_bookable'
                              ])

    col = df.columns.tolist()
    col_non_cat = col[1:3] + col[5:6] + col[7:10] + col[11:]
    df_non_cat = df[col_non_cat]
    col_cat = encoded_df.columns.tolist()
    col_full = col_cat[:] + col_non_cat[:]

    stack_full = np.column_stack([encoded_df, df_non_cat])
    stack_df = pd.DataFrame(stack_full, index=df.index, columns=col_full)
    return stack_df
Example 8
def make_impute_pipeline():
    categorical_cols = CATEGORICAL_COLS
    numerical_cols = NUMERICAL_COLS

    categorical_pre = Pipeline([
        ('selector', DataFrameSelector(categorical_cols)),
        ('impute', CustomImputer(strategy='mode')),
    ])

    categorical_pipeline = Pipeline([
        ('categorical_pre', categorical_pre),
        ('encoder', CategoricalEncoder(encoding='onehot-dense')),
    ])

    num_init_quantile_transformer = QuantileTransformer(
        output_distribution='normal')

    numerical_pipeline = Pipeline([
        ('selector', DataFrameSelector(numerical_cols)),
        ('scale', num_init_quantile_transformer),
    ])

    combined_features = FeatureUnion([
        ('numerical_pipeline', numerical_pipeline),
        ('cat_ordinal_pipeline', categorical_pipeline),
    ])

    mice_pipeline = Pipeline([
        ('combined_features', combined_features),
        ('mice_impute', MICEImputer(verbose=True)),
    ])

    impute_pipeline = Pipeline([
        ('mice_pipeline', mice_pipeline),
        ('inverse_qt',
         SelectiveAction(
             col=list(range(len(numerical_cols))),
             action=FunctionTransformer(
                 inverse_func,
                 kw_args={'transformer': num_init_quantile_transformer}))),
        ('numerical_selection', ColumnSelector(range(len(numerical_cols))))
    ])

    final_pipeline = FeatureUnion([('impute_pipeline', impute_pipeline),
                                   ('categorical_pre', categorical_pre)])

    return final_pipeline
Example 9
# In[37]:

from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1, 1))
housing_cat_1hot

# In[38]:

housing_cat_1hot.toarray()

# In[39]:

from sklearn.preprocessing import CategoricalEncoder
cat_encoder = CategoricalEncoder()
housing_cat_reshaped = housing_cat.values.reshape(-1, 1)
housing_cat_1hot = cat_encoder.fit_transform(housing_cat_reshaped)
housing_cat_1hot

# In[40]:

cat_encoder = CategoricalEncoder(encoding="onehot-dense")
housing_cat_1hot = cat_encoder.fit_transform(housing_cat_reshaped)
housing_cat_1hot

# In[41]:

cat_encoder.categories_

# In[42]:
Example 10
X_q = np.array([[
    '66', 'retired', 'married', 'primary', 'no', '206', 'no', 'no', 'cellular',
    '9', 'feb', '479', '1', '-1', '0', 'unknown'
],
                [
                    '54', 'technician', 'married', 'tertiary', 'no', '876',
                    'no', 'no', 'cellular', '27', 'oct', '269', '3', '541',
                    '3', 'success'
                ]])

# ### Replacing nominal relations
# To use this data we must replace the nominal relations. We can either encode the labels with values between 0 and n_classes-1, or add n_classes dummy columns and drop the original column. Here we used the second option and replaced the nominal columns with their one-hot versions, because the first option would imply that some categories are closer than others: an encoded value of 5 would be "closer" to category 4 than to category 1. The downside of one-hot encoding is that it greatly increases the number of features, but we can later use PCA to reduce them, or keep only the most informative ones.
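# A small illustrative sketch (not part of the original notebook): the same toy
# column encoded both ways, showing why ordinal codes impose an artificial order.
toy = np.array([['single'], ['married'], ['divorced'], ['married']])
print(CategoricalEncoder(encoding='ordinal').fit_transform(toy))        # e.g. [[2.], [1.], [0.], [1.]]
print(CategoricalEncoder(encoding='onehot-dense').fit_transform(toy))   # one indicator column per category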

# In[4]:

enc = CategoricalEncoder(encoding='onehot-dense')

X_2 = np.array(X[:, 0].reshape(-1, 1))
Xq_2 = np.array(X_q[:, 0].reshape(-1, 1))
attributes = [dataset['attributes'][0][0]]

for i, (name, relation) in enumerate(dataset['attributes'][1:-1]):
    if relation == 'NUMERIC':
        X_2 = np.hstack((X_2, X[:, i + 1].reshape(-1, 1)))
        Xq_2 = np.hstack((Xq_2, X_q[:, i + 1].reshape(-1, 1)))
        attributes.append(name)
        continue

    X_2 = np.hstack((X_2, enc.fit_transform(X[:, i + 1].reshape(-1, 1))))
    Xq_2 = np.hstack((Xq_2, enc.transform(X_q[:, i + 1].reshape(-1, 1))))
Example 11
# - age: float.
# - fare: float.
# Categorical Features:
# - embarked: categories encoded as strings {'C', 'S', 'Q'}.
# - sex: categories encoded as strings {'female', 'male'}.
# - pclass: ordinal integers {1, 2, 3}.
numeric_features = ['age', 'fare']
categorical_features = ['embarked', 'sex', 'pclass']

# Provisionally, use pd.fillna() to impute missing values for categorical
# features; SimpleImputer will eventually support strategy="constant".
data[categorical_features] = data[categorical_features].fillna(value='missing')
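# A sketch (not from the original script): once SimpleImputer gains
# strategy="constant", the fillna() workaround above could live inside the
# pipeline instead; the variable name below is hypothetical.
categorical_transformer_with_imputer = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='missing'),
    CategoricalEncoder('onehot-dense', handle_unknown='ignore'))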

# We create the preprocessing pipelines for both numeric and categorical data.
numeric_transformer = make_pipeline(SimpleImputer(), StandardScaler())
categorical_transformer = CategoricalEncoder('onehot-dense',
                                             handle_unknown='ignore')

preprocessing_pl = make_column_transformer(
    (numeric_features, numeric_transformer),
    (categorical_features, categorical_transformer),
    remainder='drop'
)

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = make_pipeline(preprocessing_pl, LogisticRegression())

X = data.drop('survived', axis=1)
y = data.survived.values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
Example 12
    for t in transition:
        transit.append(t)
    # feat.append(features)
    # transit.append(transition)

print(len(feat), 25)

GloveDimOption = '50'  # this  could be 50 (171.4 MB), 100 (347.1 MB), 200 (693.4 MB), or 300 (1 GB)
embeddings_index = loadGloveModel('data/glove.6B.' + GloveDimOption + 'd.txt')

# print(embeddings_index['apple'])
# print(embeddings_index['mango'])
embeddings_index[''] = np.zeros(50)
embeddings_index['*root'] = np.ones(50)

enc = CategoricalEncoder(encoding='onehot')
X_pos = [['ADJ'], ['ADP'], ['ADV'], ['AUX'], ['CCONJ'], ['DET'], ['INTJ'],
         ['NOUN'], ['NUM'], ['PART'], ['PRON'], ['PROPN'], ['PUNCT'],
         ['SCONJ'], ['SYM'], ['VERB'], ['X']]
enc.fit(X_pos)

for i in X_pos:
    embeddings_index[i[0]] = pad_sequences(enc.transform([[i[0]]]).toarray(),
                                           maxlen=50,
                                           padding='post')[0]
    #embeddings_index[i[0]] = pad_sequences(enc.transform([[i[0]]]).toarray(), maxlen=18, padding='post')[0]
    # print(embeddings_index[i[0]])
    # print(embeddings_index['apple'])

feat_vect, transit_vect = [], []
# feat_vect = np.array(())
Example 13
	def kfold_validation(self, k=10):

		sem.acquire()

		available_ram = psutil.virtual_memory()[1]
		available_ram = int(int(available_ram) * .9 * 1e-9)

		if available_ram > 5:
			jvm.start(max_heap_size='5g')
		else:
			jvm.start(max_heap_size=str(available_ram)+'g')

		###

		print('\nLoading '+self.input_file+' with opts -f'+str(self.features_number)+' -c'+self.classifier_name+'\n')
		# load .arff file
		dataset = arff.load(open(self.input_file, 'r'))
		data = np.array(dataset['data'])

		self.features_names = [x[0] for x in dataset['attributes']]

		self.attributes_number = data.shape[1]
		self.dataset_features_number = self.attributes_number - self.levels_number

		# Ordinal encoding of the nominal features
		encoder = CategoricalEncoder(encoding='ordinal')
		nominal_features_index = [i for i in range(len(dataset['attributes'][:-self.levels_number])) if dataset['attributes'][i][1] != u'NUMERIC']
		if len(nominal_features_index) > 0:
			data[:, nominal_features_index] = encoder.fit_transform(
				data[:, nominal_features_index])

		# Impute missing values with the most frequent value of each feature column
		imp = SimpleImputer(missing_values='NaN', strategy='most_frequent')
		data[:, :self.dataset_features_number] = imp.fit_transform(data[:, :self.dataset_features_number])

		classifiers_per_fold = []
		oracles_per_fold = []
		predictions_per_fold = []
		predictions_per_fold_all = []

		print('\n***\nStart testing with '+str(k)+'-fold cross-validation -f'+str(self.features_number)+' -c'+self.classifier_name+'\n***\n')

		bar = progressbar.ProgressBar(maxval=k, widgets=[progressbar.Bar(
			'=', '[', ']'), ' ', progressbar.Percentage()])
		bar.start()

		skf = StratifiedKFold(n_splits=k, shuffle=True)
		bar_cnt = 0

		for train_index, test_index in skf.split(data, data[:,self.attributes_number-1]):

			self.classifiers = []

			self.training_set = data[train_index, :self.dataset_features_number]
			self.testing_set = data[test_index, :self.dataset_features_number]
			self.ground_through = data[train_index, self.dataset_features_number:]
			self.oracle = data[test_index, self.dataset_features_number:]
			self.prediction = np.ndarray(shape=[len(test_index),self.levels_number],dtype='<U24')
			self.prediction_all = np.ndarray(shape=[len(test_index),self.levels_number],dtype='<U24')

			root = Tree()

			root.train_index = [i for i in range(self.training_set.shape[0])]
			root.test_index = [i for i in range(self.testing_set.shape[0])]
			root.test_index_all = root.test_index
			root.children_tags = list(set(self.ground_through[root.train_index, root.level]))
			root.children_number = len(root.children_tags)

			if self.has_config:
				if 'f' in config[root.tag + '_' + str(root.level + 1)]:
					root.features_number = config[root.tag + '_' + str(root.level + 1)]['f']
				elif 'p' in config[root.tag + '_' + str(root.level + 1)]:
					root.packets_number = config[root.tag + '_' + str(root.level + 1)]['p']
				root.classifier_name = config[root.tag + '_' + str(root.level + 1)]['c']

				print('config','tag',root.tag,'level',root.level,'f',root.features_number,'c',root.classifier_name)
			else:
				root.features_number = self.features_number
				root.packets_number = self.packets_number
				root.classifier_name = self.classifier_name

			self.classifiers.append(root)

			if root.children_number > 1:

				classifier_to_call = getattr(self, supported_classifiers[root.classifier_name])
				classifier_to_call(node=root)

			else:

				self.unary_class_results_inferring(root)

			# Creating hierarchy recursively
			if root.level < self.levels_number-1 and root.children_number > 0:
				self.recursive(root)

			classifiers_per_fold.append(self.classifiers)

			oracles_per_fold.append(self.oracle)
			predictions_per_fold.append(self.prediction)
			predictions_per_fold_all.append(self.prediction_all)

			bar_cnt += 1
			bar.update(bar_cnt)

		bar.finish()

		folder_discriminator = self.classifier_name

		if self.has_config:
			folder_discriminator = self.config_name

		material_folder = './data_'+folder_discriminator+'/material/'

		if not os.path.exists('./data_'+folder_discriminator):
			os.makedirs('./data_'+folder_discriminator)
			os.makedirs(material_folder)
		elif not os.path.exists(material_folder):
			os.makedirs(material_folder)

		type_discr = 'flow'
		feat_discr = '_f_' + str(self.features_number)

		if not self.has_config and self.packets_number != 0:
			type_discr = 'early'
			feat_discr = '_p_' + str(self.packets_number)
		elif self.has_config:
			if 'p' in self.config:
				type_discr = 'early'
			feat_discr = '_c_' + self.config_name

		material_features_folder = './data_'+folder_discriminator+'/material/features/'

		if not os.path.exists(material_folder):
			os.makedirs(material_folder)
			os.makedirs(material_features_folder)
		elif not os.path.exists(material_features_folder):
			os.makedirs(material_features_folder)

		for i in range(self.levels_number):

			file = open(material_folder + 'multi_' + type_discr + '_level_' + str(i+1) + feat_discr + '.dat', 'w+')
			file.close()

			for j in range(k):

				file = open(material_folder + 'multi_' + type_discr + '_level_' + str(i+1) + feat_discr + '.dat', 'a')

				file.write('@fold\n')
				for o, p in zip(oracles_per_fold[j][:,i], predictions_per_fold[j][:,i]):
					file.write(str(o)+' '+str(p)+'\n')

				file.close()

		# Inferring NW metrics per classifier

		for classifier in classifiers_per_fold[0]:

			file = open(material_folder + 'multi_' + type_discr + '_level_' + str(classifier.level+1) + feat_discr + '_tag_' + str(classifier.tag) + '.dat', 'w+')
			file.close()

			file = open(material_folder + 'multi_' + type_discr + '_level_' + str(classifier.level+1) + feat_discr + '_tag_' + str(classifier.tag) + '_all.dat', 'w+')
			file.close()

			file = open(material_features_folder + 'multi_' + type_discr + '_level_' + str(classifier.level+1) + feat_discr + '_tag_' + str(classifier.tag) + '_features.dat', 'w+')
			file.close()

		for fold_n, classifiers in enumerate(classifiers_per_fold):

			for classifier in classifiers:

				file = open(material_folder + 'multi_' + type_discr + '_level_' + str(classifier.level+1) + feat_discr + '_tag_' + str(classifier.tag) + '.dat', 'a')

				if classifier.level > 0:
					index = []

					for pred_n, prediction in enumerate(predictions_per_fold[fold_n][classifier.test_index, classifier.level-1]):
						if prediction == oracles_per_fold[fold_n][classifier.test_index[pred_n], classifier.level-1]:
							index.append(classifier.test_index[pred_n])

					prediction_nw = predictions_per_fold[fold_n][index, classifier.level]
					oracle_nw = oracles_per_fold[fold_n][index, classifier.level]
				else:
					prediction_nw = predictions_per_fold[fold_n][classifier.test_index, classifier.level]
					oracle_nw = oracles_per_fold[fold_n][classifier.test_index, classifier.level]

				file.write('@fold\n')
				for o, p in zip(oracle_nw, prediction_nw):
						file.write(str(o)+' '+str(p)+'\n')

				file.close()

				file = open(material_folder + 'multi_' + type_discr + '_level_' + str(classifier.level+1) + feat_discr + '_tag_' + str(classifier.tag) + '_all.dat', 'a')

				if classifier.level > 0:
					index = []

					for pred_n, prediction in enumerate(predictions_per_fold_all[fold_n][classifier.test_index, classifier.level-1]):
						if prediction == oracles_per_fold[fold_n][classifier.test_index[pred_n], classifier.level-1]:
							index.append(classifier.test_index[pred_n])

					prediction_all = predictions_per_fold_all[fold_n][index, classifier.level]
					oracle_all = oracles_per_fold[fold_n][index, classifier.level]
				else:
					prediction_all = predictions_per_fold_all[fold_n][classifier.test_index_all, classifier.level]
					oracle_all = oracles_per_fold[fold_n][classifier.test_index_all, classifier.level]

				file.write('@fold\n')
				for o, p in zip(oracle_all, prediction_all):
						file.write(str(o)+' '+str(p)+'\n')

				file.close()

				file = open(material_features_folder + 'multi_' + type_discr + '_level_' + str(classifier.level+1) + feat_discr + '_tag_' + str(classifier.tag) + '_features.dat', 'a')

				file.write('@fold\n')
				file.write(self.features_names[classifier.features_index[0]])

				for feature_index in classifier.features_index[1:]:
					file.write(','+self.features_names[feature_index])

				file.write('\n')

				file.close()

		graph_folder = './data_'+folder_discriminator+'/graph/'

		if not os.path.exists('./data_'+folder_discriminator):
			os.makedirs('./data_'+folder_discriminator)
			os.makedirs(graph_folder)
		elif not os.path.exists(graph_folder):
			os.makedirs(graph_folder)

		# Graph plot
		G = nx.DiGraph()
		for info in classifiers_per_fold[0]:
			G.add_node(str(info.level)+' '+info.tag, level=info.level,
					   tag=info.tag, children_tags=info.children_tags)
		for node_parent, data_parent in G.nodes.items():
			for node_child, data_child in G.nodes.items():
				if data_child['level']-data_parent['level'] == 1 and any(data_child['tag'] in s for s in data_parent['children_tags']):
					G.add_edge(node_parent, node_child)
		nx.write_gpickle(G, graph_folder+'multi_' + type_discr + feat_discr +'_graph.gml')

		###

		jvm.stop()

		sem.release()
Example 14
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Imputer
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import CategoricalEncoder

train = pd.read_csv('train.csv')

# Cabin allocations https://www.encyclopedia-titanica.org/cabins.html
# label = LabelBinarizer()
# hot = OneHotEncoder(handle_unknown = 'ignore', n_values=['male', 'female'])
categorical = CategoricalEncoder()
pipe = Pipeline([('Categorical Encoder', categorical)])

# pipe = Pipeline([('Label Binarizer', label)])

# train['Sex'] = train['Sex'].replace({'male': 0, 'female': 1})
# train.head()
# train.describe()

train = train[['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Age', 'Survived']]

pipe.fit(train)

imputer = Imputer()

train.describe()
Example 15
encoding_methods = ['one-hot', 'target', 'similarity']
dirty_column = 'Employee Position Title'
#########################################################################


#########################################################################
# Creating a learning pipeline
# ----------------------------
# The encoders for both clean and dirty data are first imported:

from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import CategoricalEncoder
from dirty_cat import SimilarityEncoder, TargetEncoder

encoders_dict = {
    'one-hot': CategoricalEncoder(handle_unknown='ignore',
                                  encoding='onehot-dense'),
    'similarity': SimilarityEncoder(similarity='ngram',
                                    handle_unknown='ignore'),
    'target': TargetEncoder(handle_unknown='ignore'),
    'numerical': FunctionTransformer(None)}

# We then create a function that takes one key of our ``encoders_dict``,
# returns a pipeline object with the associated encoder,
# as well as a Scaler and a RidgeCV regressor:

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


def make_pipeline(encoding_method):
    # static transformers from the other columns
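# The original function body is cut off above. A minimal sketch of what the
# described pipeline could look like, assuming StandardScaler/RidgeCV imports
# and a `clean_columns` list of the numerical column names (both assumptions,
# not shown in this snippet); the function name is changed so it does not
# shadow the truncated one:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV


def make_pipeline_sketch(encoding_method):
    # Encode the dirty column with the chosen encoder, pass the clean
    # numerical columns through unchanged, then scale and fit a RidgeCV.
    preprocess = ColumnTransformer([
        ('dirty', encoders_dict[encoding_method], [dirty_column]),
        ('clean', encoders_dict['numerical'], clean_columns),
    ])
    return Pipeline([
        ('preprocess', preprocess),
        ('scale', StandardScaler()),
        ('ridge', RidgeCV()),
    ])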
Example 16
                                                            y_train,
                                                            test_size=0.5)

# Unsupervised transformation based on totally random trees
rt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator,
	random_state=0)

rt_lm = LogisticRegression()
pipeline = make_pipeline(rt, rt_lm)
pipeline.fit(X_train, y_train)
y_pred_rt = pipeline.predict_proba(X_test)[:, 1]
fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)

# Supervised transformation based on random forests
rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
rf_enc = CategoricalEncoder()
rf_lm = LogisticRegression()
rf.fit(X_train, y_train)
rf_enc.fit(rf.apply(X_train))
rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)

y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]
fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm)

grd = GradientBoostingClassifier(n_estimators=n_estimator)
grd_enc = CategoricalEncoder()
grd_lm = LogisticRegression()
grd.fit(X_train, y_train)
grd_enc.fit(grd.apply(X_train)[:, :, 0])
grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)
Example 17
	def kfold_validation(self, k=10):

		sem.acquire()

		available_ram = psutil.virtual_memory()[1]
		available_ram = int(int(available_ram) * .9 * 1e-9)

		if available_ram > 5:
			jvm.start(max_heap_size='5g')
		else:
			jvm.start(max_heap_size=str(available_ram)+'g')

		###

		print('\nLoading '+self.input_file+' with opts -f'+str(self.features_number)+' -c'+self.classifier_name+'\n')
		# load .arff file
		dataset = arff.load(open(self.input_file, 'r'))
		data = np.array(dataset['data'])

		self.features_names = [x[0] for x in dataset['attributes']]

		self.attributes_number = data.shape[1]
		self.dataset_features_number = self.attributes_number - self.levels_number

		# Ordinal encoding of the nominal features
		encoder = CategoricalEncoder(encoding='ordinal')
		nominal_features_index = [i for i in range(len(dataset['attributes'][:-self.levels_number])) if dataset['attributes'][i][1] != u'NUMERIC']
		if len(nominal_features_index) > 0:
			data[:, nominal_features_index] = encoder.fit_transform(
				data[:, nominal_features_index])

		# Impute missing values with the most frequent value of each feature column
		imp = SimpleImputer(missing_values='NaN', strategy='most_frequent')
		data[:, :self.dataset_features_number] = imp.fit_transform(
			data[:, :self.dataset_features_number])

		prediction = []
		probability = []
		oracle = []

		print('\n***\nStart testing with '+str(k)+'-fold cross-validation -f'+str(self.features_number)+' -c'+self.classifier_name+'\n***\n')

		bar = progressbar.ProgressBar(maxval=k, widgets=[progressbar.Bar(
			'=', '[', ']'), ' ', progressbar.Percentage()])
		bar.start()

		temp_metrics = []

		skf = StratifiedKFold(n_splits=k, shuffle=True)
		bar_cnt = 0
		for train_index, test_index in skf.split(data, data[:, self.dataset_features_number + self.tag_under_test]):

			self.training_set = data[train_index, :self.dataset_features_number]
			self.testing_set = data[test_index, :self.dataset_features_number]
			self.ground_through = data[train_index,
									   self.dataset_features_number + self.tag_under_test]
			self.oracle = data[test_index,
							   self.dataset_features_number + self.tag_under_test]
			self.prediction = np.ndarray(
				shape=[len(test_index), 1], dtype='<U24')
			self.probability = np.ndarray(
				shape=[len(test_index), len(set(self.ground_through))], dtype='<U24')

			classifier_to_call = getattr(self, supported_classifiers[self.classifier_name])
			classifier_to_call()

			prediction.append(self.prediction)
			probability.append(self.probability)
			oracle.append(self.oracle)

			# print(type(prediction[bar_cnt]))
			# print(type(probability[bar_cnt]))

			bar_cnt += 1
			bar.update(bar_cnt)

		bar.finish()

		relations = []
		relations.append({  # Lv2:Lv1
			u'Tor': u'Tor',
			u'TorPT': u'Tor',
			u'TorApp': u'Tor',
			u'I2PApp80BW': u'I2P',
			u'I2PApp0BW': u'I2P',
			u'I2PApp': u'I2P',
			u'JonDonym': u'JonDonym'
		})

		relations.append({  # Lv3:Lv2
			u'JonDonym': u'JonDonym',
			u'I2PSNARK_App80BW': u'I2PApp80BW',
			u'IRC_App80BW': u'I2PApp80BW',
			u'Eepsites_App80BW': u'I2PApp80BW',
			u'I2PSNARK_App0BW': u'I2PApp0BW',
			u'IRC_App0BW': u'I2PApp0BW',
			u'Eepsites_App0BW': u'I2PApp0BW',
			u'I2PSNARK_App': u'I2PApp',
			u'IRC_App': u'I2PApp',
			u'Eepsites_App': u'I2PApp',
			u'ExploratoryTunnels_App': u'I2PApp',
			u'ParticipatingTunnels_App': u'I2PApp',
			u'Tor': u'Tor',
			u'Streaming': u'TorApp',
			u'Torrent': u'TorApp',
			u'Browsing': u'TorApp',
			u'Flashproxy': u'TorPT',
			u'FTE': u'TorPT',
			u'Meek': u'TorPT',
			u'Obfs3': u'TorPT',
			u'scramblesuit': u'TorPT'
		})

		oracle_inferred = []
		prediction_inferred = []

		for i in range(self.tag_under_test):
			oracle_inferred.append(list())
			prediction_inferred.append(list())

		# Inferring the upper hierarchy levels
		for i in range(k):
			# Copy predictions and oracle into temporaries used for the consecutive label swaps
			inferred_prediction = prediction[i].copy()
			inferred_oracle = oracle[i].copy()
			for j in reversed(range(self.tag_under_test)):
				inferred_oracle = np.vectorize(
					relations[j].get)(list(inferred_oracle))
				inferred_prediction = np.vectorize(
					relations[j].get)(list(inferred_prediction))
				oracle_inferred[j].append(inferred_oracle)
				prediction_inferred[j].append(inferred_prediction)
		print('\n***\nStart testing with incremental gamma threshold\n***\n')

		bar = progressbar.ProgressBar(maxval=9, widgets=[progressbar.Bar(
			'=', '[', ']'), ' ', progressbar.Percentage()])
		bar.start()

		oracle_gamma = []
		prediction_gamma = []
		classified_ratio = []

		for i in range(9):
			gamma = float(i+1)/10.0

			oracle_gamma.append(list())
			prediction_gamma.append(list())
			classified_ratio.append(list())

			for j in range(k):
				indexes = []
				p_cnt = 0
				for p in probability[j]:
					if max(p) < gamma:
						indexes.append(p_cnt)
					p_cnt += 1
				gamma_oracle = np.delete(oracle[j], [indexes])
				gamma_prediction = np.delete(prediction[j], [indexes])
				oracle_gamma[i].append(gamma_oracle)
				prediction_gamma[i].append(gamma_prediction)
				classified_ratio[i].append(
					float(len(gamma_prediction))/float(len(prediction[j])))

			bar.update(i)

		bar.finish()

		data_folder = './data_'+self.classifier_name+'/material/'

		if not os.path.exists('./data_'+self.classifier_name):
			os.makedirs('./data_'+self.classifier_name)
			os.makedirs(data_folder)
		elif not os.path.exists(data_folder):
			os.makedirs(data_folder)

		if self.packets_number != 0:
			file = open(data_folder+'flat_early_level_'+str(self.level_target) +
						'_p_'+str(self.packets_number)+'.dat', 'w+')
		else:
			file = open(data_folder+'flat_flow_level_'+str(self.level_target) +
						'_f_'+str(self.features_number)+'.dat', 'w+')

		for i in range(k):
			file.write('@fold\n')
			for o, p in zip(oracle[i], prediction[i]):
				file.write(str(o)+' '+str(p)+'\n')

		file.close()

		for i in range(self.tag_under_test):

			if self.packets_number != 0:
				file = open(data_folder+'flat_early_level_'+str(self.level_target) +
							'_p_'+str(self.packets_number)+'_inferred_'+str(i+1)+'.dat', 'w+')
			else:
				file = open(data_folder+'flat_flow_level_'+str(self.level_target) +
							'_f_'+str(self.features_number)+'_inferred_'+str(i+1)+'.dat', 'w+')

			for j in range(k):
				file.write('@fold\n')
				for o, p in zip(oracle_inferred[i][j], prediction_inferred[i][j]):
					file.write(str(o)+' '+str(p)+'\n')

			file.close()

		for i in range(9):
			if self.packets_number != 0:
				file = open(data_folder+'flat_early_level_'+str(self.level_target)+'_p_' +
							str(self.packets_number)+'_gamma_'+str(float(i+1)/10.0)+'.dat', 'w+')
			else:
				file = open(data_folder+'flat_flow_level_'+str(self.level_target)+'_f_' +
							str(self.features_number)+'_gamma_'+str(float(i+1)/10.0)+'.dat', 'w+')

			for j in range(k):
				file.write('@fold_cr\n')
				file.write(str(classified_ratio[i][j])+'\n')
				for o, p in zip(oracle_gamma[i][j], prediction_gamma[i][j]):
					file.write(str(o)+' '+str(p)+'\n')

			file.close()

		###

		jvm.stop()

		sem.release()
Example 18
import pandas as pd 
from sklearn.naive_bayes import GaussianNB
from sklearn import datasets 
import numpy as np
from sklearn.preprocessing import CategoricalEncoder



#Read the data and remove labels
mushroom_data = pd.read_csv("../data/agaricus-lepiota.data", header = None)
print(mushroom_data.keys())
target = mushroom_data[[19]]
mushroom_data = mushroom_data.drop([19],axis = 1) # axis 1 for dropping columns 
# CategoricalEncoder takes options, not data, in its constructor; fit_transform
# does the actual encoding ('onehot-dense' is an assumed choice so the result is a dense array).
encoder = CategoricalEncoder(encoding='onehot-dense')
fixed_mushrooms = encoder.fit_transform(mushroom_data)
print(type(fixed_mushrooms))
#Make Naive Bayes Classifier
nb = GaussianNB() 
#y_pred = nb.fit(mushroom_data.as_matrix(),target.as_matrix()).predict(mushroom_data)
Example 19
# "communication" part (not initially present in the category as a unique word)
# as well as the technician part of this category.

#########################################################################
# Encoding categorical data using SimilarityEncoder
# -------------------------------------------------
#
# A typical data-science workflow uses one-hot encoding to represent
# categories.
from sklearn.preprocessing import CategoricalEncoder

# encoding simply a subset of the observations
n_obs = 20
employee_position_titles = values['Employee Position Title'].head(
    n_obs).to_frame()
categorical_encoder = CategoricalEncoder(encoding='onehot-dense')
one_hot_encoded = categorical_encoder.fit_transform(employee_position_titles)
f3, ax3 = plt.subplots(figsize=(6, 6))
ax3.matshow(one_hot_encoded)
ax3.set_title('Employee Position Title values, one-hot encoded')
ax3.axis('off')
f3.tight_layout()

#########################################################################
# The corresponding representation is very sparse.
#
# SimilarityEncoder can be used to replace one-hot encoding while capturing
# the similarities between categories:
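# The plotting code below uses `similarity_encoder` without showing how it was
# built; a plausible instantiation (an assumption, following dirty_cat's usual
# setup) would be:
from dirty_cat import SimilarityEncoder
similarity_encoder = SimilarityEncoder(similarity='ngram')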

f4, ax4 = plt.subplots(figsize=(6, 6))
similarity_encoded = similarity_encoder.fit_transform(employee_position_titles)
Example 20
        X_categorical = X[:, :idx_end_categorical + 1]

        # Select only the numerical columns of X (including ft_embedding if present)
        X_numerical = X[:, idx_end_categorical + 1:]

        return X_categorical, X_numerical

    else:
        return np.zeros((X.shape[0], 0)), X


if __name__ == "__main__":
    df_train = load_and_preprocess('TrainingSet(3).csv', nrows=10000)
    # print(df_train.combined_str)

    # X = pd.concat([df_train[colnames_categ], df_inference[colnames_categ]])
    X = df_train[colnames_categ]
    X = X.apply(preprocess_categ_series)

    enc = CategoricalEncoder(handle_unknown='ignore')
    enc.fit(X)

    len_onehot = enc.transform(
        df_train[colnames_categ].iloc[:1]).toarray().shape[1]
    print(len_onehot)
    # train_X_onehot = enc.transform(df_train[colnames_categ]).toarray()
    # # inference_X_onehot = enc.transform(df_train[colnames_categ]).toarray()
    # print(train_X_onehot.shape)
    # print(train_X_onehot[0])
    # pprint(enc.categories_)
Example 21
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier

train = titanic_src.utils.read_csv('data/interim/added_features_train.csv',
                                   index_col=0)

y = train['Survived'].as_matrix()

cat_nominal_cols = ['Title', 'IsAlone', 'Sex', 'Embarked', 'HighestDeck']
cat_ordinal_cols = ['Pclass']
numerical_cols = ['Age', 'SibSp', 'Fare', 'FamilySize', 'NumOfCabins']

cat_nominal_pipeline = Pipeline([
    ('selector', tf.DataFrameSelector(cat_nominal_cols)),
    ('impute', tf.CustomImputer(strategy='mode')),
    ('encoder', CategoricalEncoder(encoding='onehot-dense')),
])

cat_ordinal_pipeline = Pipeline([
    ('selector', tf.DataFrameSelector(cat_ordinal_cols)),
    ('impute', tf.CustomImputer(strategy='mode')),
    ('encode_scale', QuantileTransformer(output_distribution='normal'))
])

num_init_quantile_transformer = QuantileTransformer(
    output_distribution='normal')


def inverse_func(X):
    return num_init_quantile_transformer.inverse_transform(X)
Example 22
# ---Pipelines with DataFrameSelector
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

new_df = housing[cat_attribs]
#print(new_df)
# num_pipeline = Pipeline([
#     ('selector', DataFrameSelector(num_attribs)),
#     ('imputer', Imputer(strategy="median")),
#     ('attribs_adder', CombinedAttributesAdder()),
#     ('std_scaler', StandardScaler())
# ])
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
     ('cat_encoder', CategoricalEncoder(encoding="onehot-dense"))])
# # ---Combining Pipelines with sklearn class FeatureUnion
# # Give a list of transformers or pipelines.
from sklearn.pipeline import FeatureUnion
#
# full_pipeline = FeatureUnion(transformer_list=[
#     ("num_pipeline", num_pipeline),
#     ("cat_pipeline", cat_pipeline)
# ])
# # Run entire pipeline:
# housing_prepared = full_pipeline.fit_transform(housing)
# from sklearn.linear_model import LinearRegression
#
# lin_reg = LinearRegression()
# lin_reg.fit(housing_prepared, housing_labels)
#
Example 23
#data.subset(500, 50, 50)

#Step 9: Specify the features to use; this part is only needed for sklearn.
features = ClassifierFeatures()
#features.add('headline', TfidfVectorizer(tokenizer=TextTokenizer.tokenizeText, lowercase=False, analyzer='word', ngram_range=(1,1), min_df=1), 'headline'),#, max_features=100000)),
# features.add('headline_words', TfidfVectorizer(tokenizer=TextTokenizer.tokenizeText, lowercase=False, analyzer='word', ngram_range=(1,1), min_df=1), 'headline'),#, max_features=100000)),
# features.add('headline1_words', TfidfVectorizer(tokenizer=TextTokenizer.tokenizeText, lowercase=False, analyzer='word', ngram_range=(1,1), min_df=1), 'headline1'),#, max_features=100000)),
# features.add('headline2_words', TfidfVectorizer(tokenizer=TextTokenizer.tokenizeText, lowercase=False, analyzer='word', ngram_range=(1,1), min_df=1), 'headline2'),#, max_features=100000)),
# features.add('keyword_words', TfidfVectorizer(tokenizer=TextTokenizer.tokenizeText, lowercase=False, analyzer='word', ngram_range=(1, 1), min_df=1), 'keyword'),#, max_features=100000)),
# #features.add('term_words', TfidfVectorizer(tokenizer=TextTokenizer.tokenizeText, lowercase=False, analyzer='word', ngram_range=(1, 2), min_df=1), 'term'),#, max_features=100000)),
#features.add('display_url', TfidfVectorizer(tokenizer=TextTokenizer.tokenizeText, lowercase=False, analyzer='word', ngram_range=(1, 1), min_df=1), 'display_url'),#, max_features=100000)),
# features.add('quality_score', StandardScaler(), 'quality_score'),

features.add('avg_position', StandardScaler(), 'avg_position'),
features.add('avg_cost', StandardScaler(), 'avg_cost'),
features.add('device', CategoricalEncoder(), 'device'),
features.add('ad_placement', CategoricalEncoder(), 'ad_placement'),
features.add('ad_type', CategoricalEncoder(), 'ad_type'),
features.add('match_type', CategoricalEncoder(), 'match_type'),

#Step 10: Specify the classifier you want to use (additionally).
new_classifier = LogisticRegression()
#new_classifier = SVR(kernel='linear')
#new_classifier = LinearRegression(fit_intercept=True, normalize=True)
#new_classifier = Ridge(alpha=.5)

if options.args.print_details >= 2:
  printer.labelDistribution(data.Y_train, 'Training Set')

#Step 11: Run our system.
if len(data.labels) > 1: #otherwise, there is nothing to train
Example 24
# - embarked: categories encoded as strings {'C', 'S', 'Q'}.
# - sex: categories encoded as strings {'female', 'male'}.
# - pclass: ordinal integers {1, 2, 3}.

# We create the preprocessing pipelines for both numeric and categorical data.
numeric_features = ['age', 'fare']
numeric_transformer = Pipeline(
    steps=[('imputer',
            SimpleImputer(strategy='median')), ('scaler', StandardScaler())])

categorical_features = ['embarked', 'sex', 'pclass']
categorical_transformer = Pipeline(
    steps=[('imputer',
            SimpleImputer(strategy='constant', fill_value='missing')),
           ('onehot',
            CategoricalEncoder('onehot-dense', handle_unknown='ignore'))])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
],
                                 remainder='drop')

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor',
                       preprocessor), ('classifier', LogisticRegression())])

X = data.drop('survived', axis=1)
y = data['survived']
Example 25
                                                            test_size=0.5)

# Unsupervised transformation based on totally random trees
rt = RandomTreesEmbedding(max_depth=3,
                          n_estimators=n_estimator,
                          random_state=0)

rt_lm = LogisticRegression()
pipeline = make_pipeline(rt, rt_lm)
pipeline.fit(X_train, y_train)
y_pred_rt = pipeline.predict_proba(X_test)[:, 1]
fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)

# Supervised transformation based on random forests
rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator)
rf_enc = CategoricalEncoder()
rf_lm = LogisticRegression()
rf.fit(X_train, y_train)
rf_enc.fit(rf.apply(X_train))
rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)

y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]
fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm)

grd = GradientBoostingClassifier(n_estimators=n_estimator)
grd_enc = CategoricalEncoder()
grd_lm = LogisticRegression()
grd.fit(X_train, y_train)
grd_enc.fit(grd.apply(X_train)[:, :, 0])
grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)
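# A usage sketch (not in the original snippet): score the GBDT + LR stack on the
# held-out data, mirroring the random-forest lines above.
y_pred_grd_lm = grd_lm.predict_proba(
    grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1]
fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm)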
Example 26
def extract_features(df_train,
                     df_inference,
                     selected_feature_names_categ,
                     selected_feature_names_interval,
                     shuffle=True,
                     fuzzy_matching=True,
                     use_onehot=True,
                     use_sentence_vec=False):

    features_to_use = []
    variable_types = []

    if not (use_onehot):
        for feature in selected_feature_names_categ:
            features_to_use.append(feature + '_encoded')
            variable_types.append('categorical_nominal')

    # Append interval AFTER categorical!!
    for feature in selected_feature_names_interval:
        features_to_use.append(feature + '_normed')
        variable_types.append('numerical')

    # Check to ensure all cols exist (avoid KeyErrors)
    for df in [df_train, df_inference]:
        df[selected_feature_names_categ + selected_feature_names_interval]
        print(df['combined_str'])

    # for feature in selected_feature_names_categ:
    #     le = preprocessing.LabelEncoder()
    #     print(print_attr_overview(df[feature], True, topn=10))
    #     df[feature + '_encoded'] = le.fit_transform(df[feature])
    #     features_to_use.append(feature + '_encoded')

    if use_onehot:
        # Each Feature has its own vocab...
        vocabs = defaultdict(list)

        X = pd.concat([df_train[colnames_categ], df_inference[colnames_categ]])
        X = df_train[colnames_categ]
        X = X.apply(preprocess_categ_series)

        enc = CategoricalEncoder(handle_unknown='ignore')
        enc.fit_transform(X)

        # pprint(enc.categories_)

    else:
        le = preprocessing.LabelEncoder()
        all_unique = []

        # FIT LABEL_ENCODER (combine vocabs for train and inference)
        for df in [df_train, df_inference]:
            for feature in selected_feature_names_categ:
                # print(print_attr_overview(df[feature]))

                s = df[feature]

                # Remove categorical values with fewer than 12 occurrences
                a = s.value_counts()
                s[s.isin(a.index[a < 12])] = np.nan

                s[s.isnull()] = "EMPTY_PLACEHOLDER"
                s = s.map(lambda x: x.lower() if type(x) == str else x)
                # print(np.unique(df[feature]))
                all_unique.extend(np.unique(s))

        le.fit(all_unique)

        # TRANSFORM LABEL_ENCODER
        for df in [df_train, df_inference]:
            for feature in selected_feature_names_categ:
                print(feature)
                # print(df[feature])
                s = df[feature]

                s = s.map(lambda x: x.lower() if type(x) == str else x)
                df[feature + '_encoded'] = le.transform(s)
                print(feature, len(np.unique(s)))

    for df in [df_train, df_inference]:
        for feature in selected_feature_names_interval:
            s = df[feature]
            s = s.map(lambda x: x.replace(',', '') if type(x) == str else x)
            # print(s)
            s = pd.to_numeric(s, errors='coerce')

            # Set null values to zero
            # TODO: try set nan to the mean instead of zero
            # TODO: try different types of normalisation
            s[np.logical_not(s.notnull())] = 0.0

            df[feature + '_normed'] = norm_zscore(s)

    # features_to_use.append('sentence_vec')
    # variable_types.append('embedding')

    if use_sentence_vec:
        from ft_embedding import get_sentence_vec
        print('Computing sentence vectors for dataset')
        train_embedding_mat = np.asarray(
            [get_sentence_vec(x) for x in df_train['combined_str']])
        inference_embedding_mat = np.asarray(
            [get_sentence_vec(x) for x in df_inference['combined_str']])
        variable_types.append('ft_embedding')

    if use_onehot:
        print(features_to_use)

        # One-Hot Categorical Encoding
        train_X_onehot = enc.transform(df_train[colnames_categ]).toarray()
        train_X_interval = df_train[features_to_use].as_matrix()
        print(train_X_onehot.shape)
        print(train_X_interval.shape)
        train_X = np.hstack([train_X_onehot, train_X_interval])

        inference_X_onehot = enc.transform(
            df_inference[colnames_categ]).toarray()
        inference_X_interval = df_inference[features_to_use].as_matrix()
        print(inference_X_onehot.shape)
        print(inference_X_interval.shape)
        inference_X = np.hstack([inference_X_onehot, inference_X_interval])

        # Add (one-hot encoded) numerical features to variable_types
        len_onehot = train_X_onehot.shape[1]
        print(len_onehot)
        features_to_use = ['numerical'
                           for i in range(len_onehot)] + features_to_use

    else:
        # Index Categorical Encoding (integer)
        train_X = df_train[features_to_use].as_matrix()
        inference_X = df_inference[features_to_use].as_matrix()

    train_y = df_train['case_status'].as_matrix()

    if use_sentence_vec:
        # Stack with sentence embedding
        train_X = np.hstack([train_X.copy(), train_embedding_mat])
        inference_X = np.hstack([inference_X.copy(), inference_embedding_mat])
        print(train_embedding_mat.shape)
        print(inference_embedding_mat.shape)

    print(train_X.shape)
    print(inference_X.shape)
    # exit()
    inference_row_id = df_inference['row ID']

    if shuffle:
        train_X, train_y = skl_shuffle(train_X, train_y)

    # print(X.shape)
    # print(y.shape)

    if use_onehot:
        vocab_size = 0
    else:
        vocab_size = len(list(le.classes_))

    return train_X, train_y, inference_row_id, inference_X, vocab_size, variable_types, features_to_use