def main(self):
    """Run the feature-engineering step and train the Multinomial NB model.

    Obtains the train/test splits from ``FeatureEngineering.main`` and
    feeds them to ``self.MNB``.
    """
    FE = FeatureEngineering()
    (features_train, labels_train, features_test,
     labels_test, clean_data_frame) = FE.main()
    # RandomforestC = self.RFC(features_train, labels_train, features_test,
    #                          labels_test, clean_data_frame)
    # BUG FIX: the original bound the result to a local named
    # `MultinomialNB`, shadowing the imported sklearn class of the same
    # name; the binding was also unused, so the call stands alone now.
    self.MNB(features_train, labels_train, features_test,
             labels_test, clean_data_frame)
class PreprocessedData:
    """Combine the feature-engineering and feature-selection classes into a
    single preprocessing step.

    Takes the cleaned data and the y_column_name (ratings).
    """

    def __init__(self, data, y_column_name):
        self.data = data
        self.y_column_name = y_column_name
        self.feature_engineering = FeatureEngineering(data, y_column_name)
        self.feature_selection = FeatureSelection(data, y_column_name)

    def preprocess_my_data(self, num_of_features_to_select):
        """Preprocess the cleaned data and keep the best n features.

        :param num_of_features_to_select: n-best features of the model
        :return: fully preprocessed data with the n selected features
        """
        # Each engineering step returns the updated frame; run them in order.
        engineering_steps = (
            self.feature_engineering.input_rare_categorical,
            self.feature_engineering.encode_categorical_features,
            self.feature_engineering.scale_features,
        )
        for step in engineering_steps:
            self.data = step()
        self.data = self.feature_selection.perform_feature_selection(
            num_of_features_to_select)
        return self.data
class FeatureSelection:
    """Select the most informative features from stacked train/test data.

    Train and test frames are concatenated so that every preprocessing step
    sees the full data; the first ``number_of_train`` rows remain the
    training split.
    """

    def __init__(self, train, test, id_column, y_column_name):
        self.train = train
        self.number_of_train = train.shape[0]  # row count of the train split
        self.y_column_name = y_column_name
        self.id_column = id_column
        self.test = test
        self.data = pd.concat([train, test], ignore_index=True)
        self.feature_engineering = FeatureEngineering(train, test, id_column,
                                                      y_column_name)
        # Keep id + target aside so they can be re-attached after selection.
        self.y = self.data[[self.id_column, self.y_column_name]]

    def preprocess_my_data(self):
        """Run the full feature-engineering pipeline on the stacked data.

        :return: the fully preprocessed data frame
        """
        self.data = self.feature_engineering.fill_na_categorical()
        self.data = self.feature_engineering.fill_na_numerical()
        self.data = self.feature_engineering.input_rare_categorical()
        self.data = self.feature_engineering.label_encoder()
        self.data = self.feature_engineering.get_scale_features()
        return self.data

    def perform_feature_selection(self, num_of_features_to_select):
        """Keep only the ``num_of_features_to_select`` most important features.

        Importance is estimated with an ExtraTreesRegressor fitted on the
        training rows.

        :param num_of_features_to_select: number of best features to keep
        :return: data restricted to id, target and the selected features
        """
        data = self.preprocess_my_data()
        train_data = data[:self.number_of_train]
        # BUG FIX: use a 1-D Series target — fitting on a single-column
        # DataFrame triggers sklearn's DataConversionWarning and an
        # implicit ravel.
        ytrain = train_data[self.y_column_name]
        xtrain = train_data.drop([self.id_column, self.y_column_name], axis=1)
        feature_sel_model = ExtraTreesRegressor().fit(xtrain, ytrain)
        feat_importances = pd.Series(feature_sel_model.feature_importances_,
                                     index=xtrain.columns)
        selected_features = feat_importances.nlargest(
            num_of_features_to_select)
        selected_features_list = selected_features.to_frame().index.tolist()
        data = self.data[selected_features_list]
        self.data = pd.concat([self.y, data], axis=1)
        return self.data
def __init__(self, *args, location_type="Location", location_type_plural="Locations"):
    """Initialise visualisation labels on top of FeatureEngineering.

    :param location_type: singular label used for locations
    :param location_type_plural: plural label used for locations
    """
    # Call the parent initialiser directly; the original deliberately
    # avoided super(), so that choice is preserved here.
    FeatureEngineering.__init__(self, *args)
    self.location_type = location_type
    self.location_type_plural = location_type_plural
def __init__(self, train, test, id_column, y_column_name):
    """Store the splits, stack them, and build the engineering helper."""
    self.train = train
    self.test = test
    self.id_column = id_column
    self.y_column_name = y_column_name
    self.number_of_train = train.shape[0]  # rows belonging to the train split
    combined = pd.concat([train, test], ignore_index=True)
    self.data = combined
    self.feature_engineering = FeatureEngineering(train, test, id_column,
                                                  y_column_name)
    # id + target kept aside for later re-attachment.
    self.y = combined[[id_column, y_column_name]]
def __init__(self, train, test, id_column, y_column_name):
    """Store the splits, stack them, and build the processing helpers."""
    self.train = train
    self.test = test
    self.id_column = id_column
    self.y_column_name = y_column_name
    self.data = pd.concat([train, test], ignore_index=True)
    self.feature_engineering = FeatureEngineering(train, test, id_column,
                                                  y_column_name)
    self.feature_selection = FeatureSelection(train, test, id_column,
                                              y_column_name)
def fit_pipeline(self, input_csv_directory_path, input_csv_file_name):
    """Run the saved model pipeline against a test CSV and report metrics.

    Reads the CSV, cleans it, engineers features, predicts with the loaded
    pipeline, then prints a classification report and saves a confusion
    matrix and ROC curve into the data directory.

    :param input_csv_directory_path: directory holding the test CSV
    :param input_csv_file_name: name of the test CSV file
    :raises Exception: re-raises whatever step of the pipeline failed
    """
    print('Start Testing pipeline')
    target_name = 'income'
    try:
        data_test = pd.read_csv(
            os.path.join(input_csv_directory_path, input_csv_file_name))
        print(data_test.head())
        print(data_test.shape)
        data_prepare = DataPrepare()
        df_clean = data_prepare.dataPrepare(data_test)
        feature_engineering = FeatureEngineering()
        df_features_target = feature_engineering.featureEngineering(df_clean)
        # Dropping missing values if any
        df_features_target.dropna(axis=0, inplace=True)
        model_pipeline = helper_models.load_pipeline()
        # Hoisted: the feature frame was built twice before.
        features = df_features_target.drop(columns=target_name)
        prediction = model_pipeline.predict(features)
        probability = model_pipeline.predict_proba(features)
        print(
            "Classification report: \n ",
            classification_report(df_features_target[target_name],
                                  prediction))
        helper_models.fill_confusion_matrix_and_save(
            df_features_target[target_name],
            prediction,
            f_name='Test Confusion matrix',
            out_dir=input_csv_directory_path)
        helper_models.plot_roc_curve_and_save(
            df_features_target[target_name],
            probability,
            f_name='Test Roc Curve',
            out_dir=input_csv_directory_path)
        print(
            'Pipeline completed successfully and results are stored in data directory'
        )
    except Exception as ex:
        # BUG FIX: print() was given logging-style '%s' arguments and
        # printed a tuple; interpolate explicitly instead.
        print('Something went wrong with the Pipeline %s' % ex)
        # Bare raise preserves the original traceback ("raise ex" reset it).
        raise
def create_reader_new(self):
    """Build and initialise a tensor-dict reader from the model description.

    Dispatches on ``dataSource.source_type`` ("kafka" or "file"), wraps the
    raw reader with FeatureEngineering, and initialises it with the context.

    :return: the initialised reader
    :raises ValueError: if the configured source_type is not supported
    """
    data_source = self._model_desc_obj.get("dataSource")
    source_type = data_source.get("source_type")
    parameters = data_source.get("parameters")
    print('--------- create_reader parameters start ---------')
    for k in sorted(parameters.keys()):
        print(k, parameters[k])
    print('---------- create_reader parameters end ----------')
    if source_type == "kafka":
        reader = self.tensor_dict_from_kafka(parameters)
    elif source_type == "file":
        reader = self.tensor_dict_from_hdfs(parameters)
    else:
        # BUG FIX: an unknown source_type previously fell through and
        # raised NameError on the undefined `reader` below.
        raise ValueError("unsupported source_type: %r" % source_type)
    fe = FeatureEngineering()
    reader = fe.get_tensor_dict(reader)
    reader.init(self.context)
    return reader
def get_data_and_pipeline(self):
    """Build the feature matrix and preprocessing pipeline.

    Uses the FeatureEngineering class imported above among the libraries;
    stores the resulting matrix in ``self.data`` and the pipeline object
    in ``self.pipeline``.

    Args:
        None
    Returns:
        None
    """
    engineered = FeatureEngineering(self.data_path)
    self.data, self.pipeline = engineered.build_pipe(hash_size=100)
class FeatureSelection:
    """Perform feature selection on cleaned data.

    Takes the cleaned data and the y_column_name (ratings).
    """

    def __init__(self, data, y_column_name):
        self.data = data
        self.y_column_name = y_column_name
        # Target kept aside so it can be re-attached after selection.
        self.y = self.data[[self.y_column_name]]
        self.feature_engineering = FeatureEngineering(data, y_column_name)

    def preprocess_my_data(self):
        """Input rare categoricals, encode and scale the data.

        :return: preprocessed full data
        """
        self.data = self.feature_engineering.input_rare_categorical()
        self.data = self.feature_engineering.encode_categorical_features()
        self.data = self.feature_engineering.scale_features()
        return self.data

    def perform_feature_selection(self, num_of_features_to_select,
                                  train_rows=300000):
        """Keep only the n most important features.

        Importance is estimated with an ExtraTreesClassifier fitted on the
        leading ``train_rows`` rows.

        :param num_of_features_to_select: number of best features to select
        :param train_rows: number of leading rows treated as the training
            split (generalized — previously hard-coded to 300000, which
            remains the default)
        :return: full data with the target and the n selected features
        """
        data = self.preprocess_my_data()
        self.train = data[0:train_rows]
        label_encoder = LabelEncoder()
        ytrain = label_encoder.fit_transform(self.train[self.y_column_name])
        xtrain = self.train.drop([self.y_column_name], axis=1)
        feature_sel_model = ExtraTreesClassifier().fit(xtrain, ytrain)
        feat_importances = pd.Series(feature_sel_model.feature_importances_,
                                     index=xtrain.columns)
        selected_features = feat_importances.nlargest(
            num_of_features_to_select)
        selected_features_list = selected_features.to_frame().index.tolist()
        data = self.data[selected_features_list]
        self.data = pd.concat([self.y, data], axis=1)
        return self.data
class PreprocessedData:
    """Chain feature engineering and feature selection over stacked
    train/test data."""

    def __init__(self, train, test, id_column, y_column_name):
        self.train = train
        self.test = test
        self.id_column = id_column
        self.y_column_name = y_column_name
        self.data = pd.concat([train, test], ignore_index=True)
        self.feature_engineering = FeatureEngineering(train, test, id_column,
                                                      y_column_name)
        self.feature_selection = FeatureSelection(train, test, id_column,
                                                  y_column_name)

    def preprocess_my_data(self, num_of_features_to_select):
        """Run every engineering step in order, then keep the n best
        features.

        :param num_of_features_to_select: number of best features to keep
        :return: preprocessed data with the selected features
        """
        for step in (self.feature_engineering.fill_na_categorical,
                     self.feature_engineering.fill_na_numerical,
                     self.feature_engineering.input_rare_categorical,
                     self.feature_engineering.label_encoder,
                     self.feature_engineering.get_scale_features):
            self.data = step()
        self.data = self.feature_selection.perform_feature_selection(
            num_of_features_to_select)
        return self.data
def setUpDataFrame():
    """Read the raw training CSV, clean it, add date features, and cache
    the result as a feather file under tmp/raw.

    The CSV path comes from baseConfig.yaml.
    """
    train_filepath = read_yaml('baseConfig.yaml')
    df_raw = pd.read_csv(train_filepath, low_memory=False,
                         parse_dates=['saledate'])
    print('The shape of dataframe is %s' % (str(df_raw.shape)))
    cleaning = Cleaning()
    print('Converting sale price to log of sale price')
    df_raw = cleaning.convertFeatureToItsLog(df_raw, 'SalePrice')
    print("Turning string to categorical variables")
    df_raw = cleaning.turnStringToCategorical(df_raw)
    # Aligning the levels properly.
    # BUG FIX: `inplace=True` was removed from Categorical.set_categories
    # in pandas 2.0 — assign the result back instead.
    df_raw.UsageBand = df_raw.UsageBand.cat.set_categories(
        ['High', 'Medium', 'Low'], ordered=True)
    # converting date and time to features
    feat_eng = FeatureEngineering()
    feat_eng.convertDatesToFeatures(df_raw, 'saledate')
    # saving as feather
    try:
        os.makedirs('tmp', exist_ok=True)
        df_raw.to_feather('tmp/raw')
    except (FileNotFoundError, IOError) as e:
        print(e)
class ProcessedData:
    """Scale the stacked train/test data and keep the n best features."""

    def __init__(self, train, test, id_column, y_column_name):
        self.train = train
        self.test = test
        self.id_column = id_column
        self.y_column_name = y_column_name
        self.data = pd.concat([self.train, self.test], ignore_index=True)
        self.feature_engineering = FeatureEngineering(
            self.train, self.test, self.id_column, self.y_column_name)
        self.feature_selection = FeatureSelection(
            self.train, self.test, self.id_column, self.y_column_name)

    def preprocess_my_data(self, num_of_features_to_select):
        """Scale features, then run feature selection.

        :param num_of_features_to_select: number of best features to keep
        :return: processed data with the selected features
        """
        self.data = self.feature_engineering.scale_features()
        self.data = self.feature_selection.perform_feature_selection(
            num_of_features_to_select)
        return self.data
def main(data_directory_path, merge_csv_file_name, prepared_csv_file_name,
         features_target_csv_file_name):
    """Wire the pipeline stages together and fit the full model pipeline.

    :param data_directory_path: directory holding all CSV artifacts
    :param merge_csv_file_name: name of the merged-data CSV
    :param prepared_csv_file_name: name of the prepared-data CSV
    :param features_target_csv_file_name: name of the features/target CSV
    """
    print("Model Process starts")
    start = time.time()
    # Each stage is constructed here; the ModelPipeline drives them.
    data_read_and_merge = DataReadAndMerge(data_directory_path,
                                           merge_csv_file_name)
    data_prepare = DataPrepare(data_directory_path, merge_csv_file_name)
    feature_engineering = FeatureEngineering(data_directory_path,
                                             prepared_csv_file_name)
    modelling = Modelling(data_directory_path,
                          features_target_csv_file_name)
    model_pipeline = ModelPipeline(data_read_and_merge, data_prepare,
                                   feature_engineering, modelling)
    model_pipeline.fit(data_directory_path, merge_csv_file_name,
                       prepared_csv_file_name,
                       features_target_csv_file_name)
    print("Model Process ends", time.time() - start, "s")
def main(data_directory_path):
    """Build every pipeline stage and fit the model pipeline on the data
    directory.

    :param data_directory_path: directory containing the input data
    """
    print("Model Process starts")
    start = time.time()
    stages = (DataRead(data_directory_path), DataExploration(),
              DataPrepare(), FeatureEngineering(), Modelling())
    data_read, data_explore, data_prepare, feature_engineering, modelling = stages
    model_pipeline = ModelPipeline(data_read, data_explore, data_prepare,
                                   feature_engineering, modelling)
    model_pipeline.fit(data_directory_path)
    print("Model Process ends", time.time() - start, "s")
def __init__(self):
    """Run every engineered feature, drop raw columns, then one-hot
    encode the result into ``self.df_data``."""
    fe = FeatureEngineering()
    # Each call mutates fe.df_data with one derived feature.
    for build_feature in (fe.feature_age,
                          fe.feature_days_admitted,
                          fe.feature_total_medical_history,
                          fe.feature_total_preop_medication,
                          fe.feature_total_symptoms,
                          fe.feature_lab_results_ratios):
        build_feature()
    trimmed = self.__drop_cols(fe.df_data)
    self.df_data = self.__get_dummy_vars(trimmed)
# Train a Multinomial Naive Bayes model on the engineered device features
# and cross-validate it with the multi-class log loss.
# from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
import numpy as np
from feature_engineering import FeatureEngineering
from cross_validation import CrossValidation
from multi_log_loss import multi_log_loss

f = FeatureEngineering('../data/gender_age_train.csv',
                       '../data/gender_age_test.csv',
                       'device_id',
                       wide_files=[  # '../features/apps_per_event.csv',
                           '../features/avg_position.csv',
                           # '../features/count_by_hour.csv',
                           '../features/count_by_period.csv',
                           '../features/event_counts.csv',
                           '../features/sd_position.csv'],
                       long_files=[  # '../features/active_app_category_counts.csv',
                           # '../features/installed_app_category_counts.csv',
                           '../features/phone_brand.csv'])
labels, features, colnames = f.extract_features()
labels.set_index(np.arange(labels.shape[0]), inplace=True)
colnames.set_index(np.arange(colnames.shape[0]), inplace=True)

# Train rows have a known age; test rows have age NaN.
train_filter = [i for i, x in enumerate(labels['age'].tolist())
                if not np.isnan(x)]
test_filter = [i for i, x in enumerate(labels['age'].tolist())
               if np.isnan(x)]

# BUG FIX: DataFrame.ix was removed from pandas (0.20 deprecation, gone in
# 1.0). The index was reset to a positional range above, so .loc is
# equivalent here.
cv = CrossValidation(features[train_filter, :],
                     labels.loc[train_filter, 'group'],
                     features[test_filter, :],
                     multi_log_loss)
model = MultinomialNB()
# Deliberate hack: CrossValidation calls .predict, but log loss needs
# class probabilities, so predict is aliased to predict_proba.
model.predict = model.predict_proba
out = cv.run(model, 'test')
def __init__(self):
    """Initialise all mixin bases explicitly.

    NOTE(review): the parents are called directly rather than via
    super(); this assumes their __init__ methods take no arguments and
    do not rely on cooperative MRO chaining — confirm before refactoring
    to super().__init__().
    """
    Helpers.__init__(self)
    WordEmbeddings.__init__(self)
    FeatureEngineering.__init__(self)
# Smoke-test FeatureEngineering against the bundled example CSV files.
from feature_engineering import FeatureEngineering

f = FeatureEngineering('ex_train.csv', 'ex_test.csv', 'key',
                       ['ex_wide_1.csv', 'ex_wide_2.csv'],
                       'ex_long.csv')
d = f.extract_features()
import mlflow.tensorflow
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from data_processing import DataProcessing
from feature_engineering import FeatureEngineering
from classifier import QuestionAnswerClassifer

if __name__ == '__main__':
    # NOTE(review): `new_df` is not defined anywhere in this visible
    # chunk — presumably it is produced earlier (e.g. by DataProcessing)
    # in the full file; confirm before running this script standalone.
    Feature_Engineering = FeatureEngineering(new_df, 'Question', 'Label', 'Sentence')
    # Pad length: longest observed sequence plus a 10-token margin.
    maxlen = Feature_Engineering.determine_maxlen() + 10
    print(f'Maxlen:{maxlen}')
    # Split the dataset into train and test set; the model input is the
    # question concatenated with its candidate sentence.
    new_df['Question_Sentence'] = new_df['Question'] + ' ' + new_df['Sentence']
    features = new_df['Question_Sentence']
    target = new_df['Label']
    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.25, random_state=0)
    # Tokenizer is fitted on the training split only (top 5000 words).
    tokenizer = Feature_Engineering.text_tokenize(x_train.values, num_words=5000)
def __init__(self, data, y_column_name):
    """Keep the cleaned data, set aside the target column, and build the
    feature-engineering helper."""
    self.data = data
    self.y_column_name = y_column_name
    self.y = data[[y_column_name]]
    self.feature_engineering = FeatureEngineering(data, y_column_name)
print("Turning string to categorical variables")
df_raw = cleaning.turnStringToCategorical(df_raw)
# Aligning the levels properly.
# BUG FIX: `inplace=True` was removed from Categorical.set_categories in
# pandas 2.0 — assign the result back instead.
df_raw.UsageBand = df_raw.UsageBand.cat.set_categories(
    ['High', 'Medium', 'Low'], ordered=True)
# converting date and time to features
feat_eng = FeatureEngineering()
feat_eng.convertDatesToFeatures(df_raw, 'saledate')
# saving as feather
try:
    os.makedirs('tmp', exist_ok=True)
    df_raw.to_feather('tmp/raw')
except (FileNotFoundError, IOError) as e:
    print(e)

feat_eng = FeatureEngineering()
print(feat_eng.testIfDateTimeWorks())  # stray trailing semicolon removed

base_config = read_yaml('baseConfig.yaml')

# Reading files: prefer the cached feather file if it exists.
try:
    df_raw = pd.read_feather(base_config.parameters.bulldozer_train_feather)
    print('Finished reading feather file')
    if 'saleYear' in df_raw.columns:
        print('Features from dates are present in this feather file')
except (IOError, OSError) as e:
    print('Feather file does not exist')
    print(e)
    print('Doing the initial setup')
def __init__(self, data, y_column_name):
    """Keep the cleaned data and construct both preprocessing helpers."""
    self.data = data
    self.y_column_name = y_column_name
    self.feature_engineering = FeatureEngineering(data, y_column_name)
    self.feature_selection = FeatureSelection(data, y_column_name)