import json

from sklearn.model_selection import train_test_split

# Logger, LoadValidate, Preprocessor, ModelTuner, FileOperation and KMeansCluster
# are the project's own helper modules (their import paths are omitted in the source).


class TrainModel:
    def __init__(self, run_id, data_path):
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'TrainModel', 'training')
        self.loadValidate = LoadValidate(self.run_id, self.data_path, 'training')
        self.preProcess = Preprocessor(self.run_id, self.data_path, 'training')
        self.modelTuner = ModelTuner(self.run_id, self.data_path, 'training')
        self.fileOperation = FileOperation(self.run_id, self.data_path, 'training')
        self.cluster = KMeansCluster(self.run_id, self.data_path)

    def training_model(self):
        try:
            self.logger.info('Start of Training')
            self.logger.info('Run_id: ' + str(self.run_id))
            # load, validate and transform the training set
            self.loadValidate.validate_trainset()
            # preprocessing activities
            self.X, self.y = self.preProcess.preprocess_trainset()
            # persist the training columns so the predict set can be aligned later
            columns = {"data_columns": [col for col in self.X.columns]}
            with open('apps/database/columns.json', 'w') as f:
                f.write(json.dumps(columns))
            # find the optimum number of clusters
            number_of_clusters = self.cluster.elbow_plot(self.X)
            # divide the data into clusters
            self.X = self.cluster.create_clusters(self.X, number_of_clusters)
            # create a new column in the dataset holding the corresponding labels
            self.X['Labels'] = self.y
            # get the unique clusters from the data set
            list_of_clusters = self.X['Cluster'].unique()
            # parse all the clusters and look for the best ML algorithm to fit each one
            for i in list_of_clusters:
                # filter the data for one cluster and prepare its feature and label columns
                cluster_data = self.X[self.X['Cluster'] == i]
                cluster_features = cluster_data.drop(['Labels', 'Cluster'], axis=1)
                cluster_label = cluster_data['Labels']
                # split the cluster's data into training and test sets
                x_train, x_test, y_train, y_test = train_test_split(
                    cluster_features, cluster_label, test_size=0.2, random_state=0)
                # get the best model for this cluster and save it to the models directory
                best_model_name, best_model = self.modelTuner.get_best_model(
                    x_train, y_train, x_test, y_test)
                self.fileOperation.save_model(best_model, best_model_name + str(i))
            self.logger.info('End of Training')
        except Exception:
            self.logger.exception('Unsuccessful End of Training')
            raise
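A minimal driver for the training pipeline above, shown as a sketch rather than code from the source; the run id and data path values are hypothetical placeholders:

train_model = TrainModel(run_id='2020051201', data_path='apps/data/training')  # hypothetical values
train_model.training_model()  # validates, preprocesses, clusters, then trains and saves one model per cluster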
import os

import pandas as pd


class PredictModel:
    """
    **************************************************************************
    *
    * filename:       PredictModel.py
    * version:        1.0
    * creation date:  05-MAY-2020
    *
    * change history:
    *
    * description:    Class to predict the results
    *
    **************************************************************************
    """

    def __init__(self, run_id, data_path):
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'PredictModel', 'prediction')
        self.loadValidate = LoadValidate(self.run_id, self.data_path, 'prediction')
        self.preProcess = Preprocessor(self.run_id, self.data_path, 'prediction')
        self.fileOperation = FileOperation(self.run_id, self.data_path, 'prediction')

    def batch_predict_from_model(self):
        """
        * method:      batch_predict_from_model
        * description: method to predict the results for a batch of records
        * return:      none
        *
        * Parameters
        *   none
        """
        try:
            self.logger.info('Start of Prediction')
            self.logger.info('run_id: ' + str(self.run_id))
            # validation and transformation
            self.loadValidate.validate_predictset()
            # preprocessing activities
            self.X = self.preProcess.preprocess_predictset()
            # load the KMeans model and assign each record to a cluster
            kmeans = self.fileOperation.load_model('KMeans')
            clusters = kmeans.predict(self.X.drop(['empid'], axis=1))
            self.X['clusters'] = clusters
            clusters = self.X['clusters'].unique()
            for i in clusters:
                self.logger.info('clusters loop started')
                cluster_data = self.X[self.X['clusters'] == i]
                cluster_data_new = cluster_data.drop(['empid', 'clusters'], axis=1)
                # load the model trained for this cluster and predict
                model_name = self.fileOperation.correct_model(i)
                model = self.fileOperation.load_model(model_name)
                y_predicted = model.predict(cluster_data_new)
                result = pd.DataFrame({
                    "EmpId": cluster_data['empid'],
                    "Prediction": y_predicted
                })
                result_path = self.data_path + '_results/' + 'Predictions.csv'
                # append this cluster's chunk; write the header only if the file does not exist yet
                result.to_csv(result_path, header=not os.path.isfile(result_path),
                              mode='a+', index=False)
            self.logger.info('End of Prediction')
        except Exception:
            self.logger.exception('Unsuccessful End of Prediction')
            raise

    def single_predict_from_model(self, data):
        """
        * method:      single_predict_from_model
        * description: method to predict the result for a single record
        * return:      the predicted class as an int
        *
        * Parameters
        *   data: a one-row DataFrame with the raw input columns
        """
        try:
            self.logger.info('Start of Prediction')
            self.logger.info('run_id: ' + str(self.run_id))
            # preprocessing activities
            self.X = self.preProcess.preprocess_predict(data)
            # load the KMeans model and assign the record to a cluster
            kmeans = self.fileOperation.load_model('KMeans')
            clusters = kmeans.predict(self.X.drop(['empid'], axis=1))
            self.X['clusters'] = clusters
            clusters = self.X['clusters'].unique()
            y_predicted = []
            for i in clusters:
                self.logger.info('clusters loop started')
                cluster_data = self.X[self.X['clusters'] == i]
                cluster_data_new = cluster_data.drop(['empid', 'clusters'], axis=1)
                model_name = self.fileOperation.correct_model(i)
                model = self.fileOperation.load_model(model_name)
                self.logger.info('Shape of Data ' + str(cluster_data_new.shape))
                y_predicted = model.predict(cluster_data_new)
                self.logger.info('Output : ' + str(y_predicted))
            self.logger.info('End of Prediction')
            return int(y_predicted[0])
        except Exception:
            self.logger.exception('Unsuccessful End of Prediction')
            raise
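Both prediction entry points can be exercised as in the sketch below; the run id and data path are hypothetical placeholders, and the single-record columns must match the raw training schema:

predict_model = PredictModel(run_id='2020051202', data_path='apps/data/prediction')  # hypothetical values
predict_model.batch_predict_from_model()  # appends to <data_path>_results/Predictions.csv
# for a single record, pass a one-row DataFrame with the raw input columns:
# label = predict_model.single_predict_from_model(pd.DataFrame([record]))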
import os
import pickle
import shutil


class FileOperation:
    def __init__(self, run_id, data_path, mode):
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'FileOperation', mode)

    def save_model(self, model, file_name):
        try:
            self.logger.info('Start of Save Models')
            # create a separate directory for each cluster's model
            path = os.path.join('apps/models/', file_name)
            if os.path.isdir(path):
                # remove previously existing models for each cluster
                shutil.rmtree('apps/models')
            os.makedirs(path)
            with open(path + '/' + file_name + '.sav', 'wb') as f:
                pickle.dump(model, f)  # save the model to file
            self.logger.info('Model File ' + file_name + ' saved')
            self.logger.info('End of Save Models')
            return 'success'
        except Exception as e:
            self.logger.exception('Exception raised while Save Models: %s' % e)
            raise

    def load_model(self, file_name):
        try:
            self.logger.info('Start of Load Model')
            with open('apps/models/' + file_name + '/' + file_name + '.sav', 'rb') as f:
                self.logger.info('Model File ' + file_name + ' loaded')
                self.logger.info('End of Load Model')
                return pickle.load(f)
        except Exception as e:
            self.logger.exception('Exception raised while Loading Model: %s' % e)
            raise

    def correct_model(self, cluster_number):
        try:
            self.logger.info('Start of finding correct model')
            folder_name = 'apps/models'
            model_name = None
            for file in os.listdir(folder_name):
                # pick the file whose name contains the cluster number
                if str(cluster_number) in file:
                    model_name = file
            model_name = model_name.split('.')[0]
            self.logger.info('End of finding correct model')
            return model_name
        except Exception as e:
            self.logger.exception('Exception raised while finding correct model: %s' % e)
            raise
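The save/load contract can be sanity-checked with a round trip. This is a sketch with a stand-in estimator, assuming the apps/models layout used above; the run id and data path are hypothetical:

from sklearn.ensemble import RandomForestClassifier

file_op = FileOperation(run_id='2020051203', data_path='apps/data/training', mode='training')  # hypothetical values
file_op.save_model(RandomForestClassifier(), 'RandomForest0')  # note: clears any previously saved models
model = file_op.load_model('RandomForest0')  # unpickles apps/models/RandomForest0/RandomForest0.sav
name = file_op.correct_model(0)              # resolves 'RandomForest0' from the cluster number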
import csv
import os
import shutil
import sqlite3
from os import listdir


class DatabaseOperation:
    def __init__(self, run_id, data_path, mode):
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'DatabaseOperation', mode)

    def database_connection(self, database_name):
        try:
            conn = sqlite3.connect('apps/database/' + database_name + '.db')
            self.logger.info("Opened %s database successfully" % database_name)
        except sqlite3.Error as e:
            self.logger.exception("Error while connecting to database: %s" % e)
            raise
        return conn

    def create_table(self, database_name, table_name, column_names):
        try:
            self.logger.info('Start of Creating Table...')
            conn = self.database_connection(database_name)
            if database_name == 'prediction':
                conn.execute("DROP TABLE IF EXISTS '" + table_name + "';")
            c = conn.cursor()
            c.execute("SELECT count(name) FROM sqlite_master WHERE type = 'table' AND name = '" + table_name + "'")
            if c.fetchone()[0] == 1:
                conn.close()
                self.logger.info('Table %s already exists' % table_name)
                self.logger.info("Closed %s database successfully" % database_name)
            else:
                for key in column_names.keys():
                    data_type = column_names[key]
                    # if the table already exists, add the column to it; otherwise the
                    # ALTER fails and the table is created in the except block instead
                    try:
                        conn.execute("ALTER TABLE " + table_name +
                                     " ADD COLUMN {column_name} {data_type}".format(
                                         column_name=key, data_type=data_type))
                        self.logger.info("ALTER TABLE " + table_name + " ADD COLUMN " + key)
                    except sqlite3.OperationalError:
                        conn.execute("CREATE TABLE " + table_name +
                                     " ({column_name} {data_type})".format(
                                         column_name=key, data_type=data_type))
                        self.logger.info("CREATE TABLE " + table_name + " with column " + key)
                conn.close()
            self.logger.info('End of Creating Table...')
        except Exception as e:
            self.logger.exception('Exception raised while Creating Table: %s' % e)
            raise e

    def insert_data(self, database_name, table_name):
        conn = self.database_connection(database_name)
        good_data_path = self.data_path
        bad_data_path = self.data_path + '_rejects'
        only_files = [f for f in listdir(good_data_path)]
        self.logger.info('Start of Inserting Data into Table...')
        for file in only_files:
            try:
                with open(good_data_path + '/' + file, "r") as f:
                    next(f)  # skip the header row
                    reader = csv.reader(f, delimiter=",")
                    for line in reader:
                        # quote each field and join them into the VALUES list
                        to_db = ','.join("'" + field + "'" for field in line)
                        conn.execute("INSERT INTO " + table_name +
                                     " values ({values})".format(values=to_db))
                conn.commit()
            except Exception as e:
                conn.rollback()
                self.logger.exception('Exception raised while Inserting Data into Table: %s' % e)
                shutil.move(good_data_path + '/' + file, bad_data_path)
        conn.close()
        self.logger.info('End of Inserting Data into Table...')

    def export_csv(self, database_name, table_name):
        self.file_from_db = self.data_path + '_validation/'
        self.file_name = 'InputFile.csv'
        try:
            self.logger.info('Start of Exporting Data into CSV...')
            conn = self.database_connection(database_name)
            cursor = conn.cursor()
            cursor.execute("SELECT * FROM " + table_name)
            results = cursor.fetchall()
            # get the headers of the csv file
            headers = [i[0] for i in cursor.description]
            # make the CSV output directory
            if not os.path.isdir(self.file_from_db):
                os.makedirs(self.file_from_db)
            # open the CSV file for writing and add the headers and data
            with open(self.file_from_db + self.file_name, 'w', newline='') as out_file:
                csv_writer = csv.writer(out_file, delimiter=',', lineterminator='\r\n',
                                        quoting=csv.QUOTE_ALL, escapechar='\\')
                csv_writer.writerow(headers)
                csv_writer.writerows(results)
            self.logger.info('End of Exporting Data into CSV...')
        except Exception as e:
            self.logger.exception('Exception raised while Exporting Data into CSV: %s' % e)
            raise
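insert_data builds each INSERT by string concatenation, which breaks on values containing quotes. The sketch below is an editor's suggestion rather than the project's code: sqlite3 '?' placeholders let the driver handle quoting and escaping per field (the table name itself cannot be parameterized):

import csv
import sqlite3

def insert_csv(conn: sqlite3.Connection, table_name: str, csv_path: str) -> None:
    """Parameterized alternative to the string-built INSERT above (a sketch)."""
    with open(csv_path, newline='') as f:
        reader = csv.reader(f)
        header = next(reader)                       # skip the header row
        placeholders = ','.join('?' * len(header))  # one '?' per column
        conn.executemany(
            'INSERT INTO {t} VALUES ({p})'.format(t=table_name, p=placeholders), reader)
    conn.commit()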
import json

import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer


class Preprocessor:
    def __init__(self, run_id, data_path, mode):
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'Preprocessor', mode)

    def get_data(self):
        try:
            # reading the data file
            self.logger.info('Start of reading dataset...')
            self.data = pd.read_csv(self.data_path + '_validation/InputFile.csv')
            self.logger.info('End of reading dataset...')
            return self.data
        except Exception as e:
            self.logger.exception('Exception raised while reading dataset: ' + str(e))
            raise

    def drop_columns(self, data, columns):
        self.data = data
        self.columns = columns
        try:
            self.logger.info('Start of Dropping Columns...')
            self.useful_data = self.data.drop(labels=self.columns, axis=1)  # drop the specified columns
            self.logger.info('End of Dropping Columns...')
            return self.useful_data
        except Exception as e:
            self.logger.exception('Exception raised while Dropping Columns: ' + str(e))
            raise

    def is_null_present(self, data):
        self.null_present = False
        try:
            self.logger.info('Start of finding missing values...')
            self.null_counts = data.isna().sum()  # count of null values per column
            for i in self.null_counts:
                if i > 0:
                    self.null_present = True
                    break
            if self.null_present:
                # store which columns have null values to a file
                dataframe_with_null = pd.DataFrame()
                dataframe_with_null['columns'] = data.columns
                dataframe_with_null['missing values count'] = np.asarray(data.isna().sum())
                dataframe_with_null.to_csv(self.data_path + '_validation/' + 'null_values.csv')
            self.logger.info('End of finding missing values...')
            return self.null_present
        except Exception as e:
            self.logger.exception('Exception raised while finding missing values: ' + str(e))
            raise

    def impute_missing_values(self, data):
        self.data = data
        try:
            self.logger.info('Start of imputing missing values...')
            imputer = KNNImputer(n_neighbors=3, weights='uniform', missing_values=np.nan)
            self.new_array = imputer.fit_transform(self.data)  # impute the missing values
            # convert the ndarray returned above back to a DataFrame
            self.new_data = pd.DataFrame(data=self.new_array, columns=self.data.columns)
            self.logger.info('End of imputing missing values...')
            return self.new_data
        except Exception as e:
            self.logger.exception('Exception raised while imputing missing values: ' + str(e))
            raise

    def feature_encoding(self, data):
        try:
            self.logger.info('Start of feature encoding...')
            self.new_data = data.select_dtypes(include=['object']).copy()
            # dummy-encode the categorical columns into numerical ones
            for col in self.new_data.columns:
                self.new_data = pd.get_dummies(self.new_data, columns=[col], prefix=[col], drop_first=True)
            self.logger.info('End of feature encoding...')
            return self.new_data
        except Exception as e:
            self.logger.exception('Exception raised while feature encoding: ' + str(e))
            raise

    def split_features_label(self, data, label_name):
        self.data = data
        try:
            self.logger.info('Start of splitting features and label...')
            self.X = self.data.drop(labels=label_name, axis=1)  # separate the feature columns
            self.y = self.data[label_name]  # filter the label column
            self.logger.info('End of splitting features and label...')
            return self.X, self.y
        except Exception as e:
            self.logger.exception('Exception raised while splitting features and label: ' + str(e))
            raise

    def final_predictset(self, data):
        try:
            self.logger.info('Start of building final predictset...')
            with open('apps/database/columns.json', 'r') as f:
                data_columns = json.load(f)['data_columns']
            # align the predict set to the training columns; missing dummy columns become 0
            df = pd.DataFrame(data=None, columns=data_columns)
            df_new = pd.concat([df, data], ignore_index=True, sort=False)
            data_new = df_new.fillna(0)
            self.logger.info('End of building final predictset...')
            return data_new
        except ValueError:
            self.logger.exception('ValueError raised while building final predictset')
            raise
        except KeyError:
            self.logger.exception('KeyError raised while building final predictset')
            raise
        except Exception as e:
            self.logger.exception('Exception raised while building final predictset: %s' % e)
            raise e

    def preprocess_trainset(self):
        try:
            self.logger.info('Start of Preprocessing...')
            data = self.get_data()                      # get data into a pandas DataFrame
            data = self.drop_columns(data, ['empid'])   # drop unwanted columns
            cat_df = self.feature_encoding(data)        # dummy-encode the categorical columns
            data = pd.concat([data, cat_df], axis=1)
            data = self.drop_columns(data, ['salary'])  # drop the original categorical column
            # if missing values are present, impute them
            if self.is_null_present(data):
                data = self.impute_missing_values(data)
            # create separate features and labels
            self.X, self.y = self.split_features_label(data, label_name='left')
            self.logger.info('End of Preprocessing...')
            return self.X, self.y
        except Exception:
            self.logger.exception('Unsuccessful End of Preprocessing...')
            raise

    def preprocess_predictset(self):
        try:
            self.logger.info('Start of Preprocessing...')
            data = self.get_data()
            # 'empid' is kept here so predictions can be joined back to employees
            cat_df = self.feature_encoding(data)
            data = pd.concat([data, cat_df], axis=1)
            data = self.drop_columns(data, ['salary'])
            if self.is_null_present(data):
                data = self.impute_missing_values(data)
            data = self.final_predictset(data)
            self.logger.info('End of Preprocessing...')
            return data
        except Exception:
            self.logger.exception('Unsuccessful End of Preprocessing...')
            raise

    def preprocess_predict(self, data):
        try:
            self.logger.info('Start of Preprocessing...')
            cat_df = self.feature_encoding(data)
            data = pd.concat([data, cat_df], axis=1)
            data = self.drop_columns(data, ['salary'])
            if self.is_null_present(data):
                data = self.impute_missing_values(data)
            data = self.final_predictset(data)
            self.logger.info('End of Preprocessing...')
            return data
        except Exception:
            self.logger.exception('Unsuccessful End of Preprocessing...')
            raise
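A toy illustration (not from the source) of why final_predictset exists: dummy-encoding a single record yields only the dummy columns present in that record, so the frame must be re-aligned to the training-time columns before prediction:

import pandas as pd

train = pd.get_dummies(pd.DataFrame({'salary': ['low', 'medium', 'high']}), drop_first=True)
single = pd.get_dummies(pd.DataFrame({'salary': ['low']}), drop_first=True)  # yields no dummy columns at all
# recreate the missing training columns with 0, much like the concat + fillna(0) above
aligned = single.reindex(columns=train.columns, fill_value=0)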
import json
import os
import shutil
from datetime import datetime
from os import listdir

import pandas as pd


class LoadValidate:
    def __init__(self, run_id, data_path, mode):
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'LoadValidate', mode)
        self.dbOperation = DatabaseOperation(self.run_id, self.data_path, mode)

    def values_from_schema(self, schema_file):
        try:
            self.logger.info('Start of Reading values From Schema...')
            with open('apps/database/' + schema_file + '.json', 'r') as f:
                dic = json.load(f)
            column_names = dic['ColName']
            number_of_columns = dic['NumberofColumns']
            self.logger.info('End of Reading values From Schema...')
        except ValueError:
            self.logger.exception('ValueError raised while Reading values From Schema')
            raise
        except KeyError:
            self.logger.exception('KeyError raised while Reading values From Schema')
            raise
        except Exception as e:
            self.logger.exception('Exception raised while Reading values From Schema: %s' % e)
            raise e
        return column_names, number_of_columns

    def validate_column_length(self, number_of_columns):
        try:
            self.logger.info('Start of Validating Column Length...')
            for file in listdir(self.data_path):
                csv_data = pd.read_csv(self.data_path + '/' + file)
                if csv_data.shape[1] != number_of_columns:
                    shutil.move(self.data_path + '/' + file, self.data_path + '_rejects')
                    self.logger.info("Invalid Column Length :: %s" % file)
            self.logger.info('End of Validating Column Length...')
        except OSError:
            self.logger.exception('OSError raised while Validating Column Length')
            raise
        except Exception as e:
            self.logger.exception('Exception raised while Validating Column Length: %s' % e)
            raise e

    def validate_missing_values(self):
        try:
            self.logger.info('Start of Validating Missing Values...')
            for file in listdir(self.data_path):
                csv_data = pd.read_csv(self.data_path + '/' + file)
                for column in csv_data:
                    # reject the file if any column has all of its values missing
                    if (len(csv_data[column]) - csv_data[column].count()) == len(csv_data[column]):
                        shutil.move(self.data_path + '/' + file, self.data_path + '_rejects')
                        self.logger.info("All Missing Values in Column :: %s" % file)
                        break
            self.logger.info('End of Validating Missing Values...')
        except OSError:
            self.logger.exception('OSError raised while Validating Missing Values')
            raise
        except Exception as e:
            self.logger.exception('Exception raised while Validating Missing Values: %s' % e)
            raise e

    def replace_missing_values(self):
        try:
            self.logger.info('Start of Replacing Missing Values with NULL...')
            for file in listdir(self.data_path):
                csv_data = pd.read_csv(self.data_path + '/' + file)
                csv_data.fillna('NULL', inplace=True)
                csv_data.to_csv(self.data_path + '/' + file, index=None, header=True)
                self.logger.info('%s: File Transformed successfully!!' % file)
            self.logger.info('End of Replacing Missing Values with NULL...')
        except Exception as e:
            self.logger.exception('Exception raised while Replacing Missing Values with NULL: %s' % e)
            raise e

    def archive_old_files(self):
        now = datetime.now()
        date = now.date()
        time = now.strftime("%H%M%S")
        try:
            self.logger.info('Start of Archiving Old Files...')
            # each (suffix, label) pair maps a working directory to its archive sub-directory
            for suffix, label in [('_rejects', 'reject'), ('_validation', 'validation'),
                                  ('_processed', 'processed'), ('_results', 'results')]:
                source = self.data_path + suffix + '/'
                if not os.path.isdir(source):
                    continue
                path = self.data_path + '_archive'
                if not os.path.isdir(path):
                    os.makedirs(path)
                dest = path + '/' + label + '_' + str(date) + "_" + str(time)
                for f in os.listdir(source):
                    if not os.path.isdir(dest):
                        os.makedirs(dest)
                    if f not in os.listdir(dest):
                        shutil.move(source + f, dest)
                self.logger.info('Archived old %s files' % label)
            self.logger.info('End of Archiving Old Files...')
        except Exception as e:
            self.logger.exception('Exception raised while Archiving Old Files: %s' % e)
            raise e

    def move_processed_files(self):
        try:
            self.logger.info('Start of Moving Processed Files...')
            for file in listdir(self.data_path):
                shutil.move(self.data_path + '/' + file, self.data_path + '_processed')
                self.logger.info("Moved the already processed file %s" % file)
            self.logger.info('End of Moving Processed Files...')
        except Exception as e:
            self.logger.exception('Exception raised while Moving Processed Files: %s' % e)
            raise e

    def validate_trainset(self):
        try:
            self.logger.info('Start of Data Load, validation and transformation')
            # archive old files
            self.archive_old_files()
            # extract values from the training schema
            column_names, number_of_columns = self.values_from_schema('schema_train')
            # validate the column length in each file
            self.validate_column_length(number_of_columns)
            # validate whether any column has all of its values missing
            self.validate_missing_values()
            # replace blanks in the csv files with "NULL" values
            self.replace_missing_values()
            # create the database if needed, open a connection and create the table
            # with the columns given in the schema
            self.dbOperation.create_table('training', 'training_raw_data_t', column_names)
            # insert the csv files into the table
            self.dbOperation.insert_data('training', 'training_raw_data_t')
            # export the table to a single csv file
            self.dbOperation.export_csv('training', 'training_raw_data_t')
            # move processed files
            self.move_processed_files()
            self.logger.info('End of Data Load, validation and transformation')
        except Exception:
            self.logger.exception('Unsuccessful End of Data Load, validation and transformation')
            raise

    def validate_predictset(self):
        try:
            self.logger.info('Start of Data Load, validation and transformation')
            # archive old files
            self.archive_old_files()
            # extract values from the prediction schema
            column_names, number_of_columns = self.values_from_schema('schema_predict')
            # validate the column length in each file
            self.validate_column_length(number_of_columns)
            # validate whether any column has all of its values missing
            self.validate_missing_values()
            # replace blanks in the csv files with "NULL" values
            self.replace_missing_values()
            # create the database if needed, open a connection and create the table
            # with the columns given in the schema
            self.dbOperation.create_table('prediction', 'prediction_raw_data_t', column_names)
            # insert the csv files into the table
            self.dbOperation.insert_data('prediction', 'prediction_raw_data_t')
            # export the table to a single csv file
            self.dbOperation.export_csv('prediction', 'prediction_raw_data_t')
            # move processed files
            self.move_processed_files()
            self.logger.info('End of Data Load, validation and transformation')
        except Exception:
            self.logger.exception('Unsuccessful End of Data Load, validation and transformation')
            raise
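For reference, values_from_schema above expects a JSON file shaped like the sketch below. The two key names come from the code; the column names and types are hypothetical examples, not the project's actual schema:

import json

schema = {
    "NumberofColumns": 2,              # key name as read by values_from_schema
    "ColName": {                       # maps column name to its sqlite type
        "satisfaction_level": "REAL",  # hypothetical example column
        "salary": "TEXT"               # hypothetical example column
    }
}
with open('apps/database/schema_train.json', 'w') as f:
    json.dump(schema, f, indent=4)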
import json

import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.impute import KNNImputer


class Preprocessor:
    """
    *****************************************************************************
    *
    * filename:       Preprocessor.py
    * version:        1.0
    * author:
    * creation date:
    *
    * description:    Class to pre-process the training and predict datasets
    *
    *****************************************************************************
    """

    def __init__(self, run_id, data_path, mode):
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'Preprocessor', mode)

    def get_data(self):
        """
        * method:      get_data
        * description: method to read the datafile
        * return:      A pandas DataFrame
        """
        try:
            # reading the data file
            self.logger.info('Start of reading dataset...')
            self.data = pd.read_csv(self.data_path + '_validation/InputFile.csv')
            self.logger.info('End of reading dataset...')
            return self.data
        except Exception as e:
            self.logger.exception('Exception raised while reading dataset: %s' % e)
            raise

    def save_encoded_data(self):
        """
        * method:      save_encoded_data
        * description: method to save the encoded datafile
        * return:      A pandas DataFrame
        """
        try:
            self.logger.info('Start of saving dataset...')
            self.data.to_csv(self.data_path + '_encode/encoded.csv')
            self.logger.info('End of saving dataset...')
            return self.data
        except Exception as e:
            self.logger.exception('Exception raised while saving dataset: %s' % e)
            raise

    def drop_columns(self, data, columns):
        """
        * method:      drop_columns
        * description: method to drop columns
        * return:      A pandas DataFrame after removing the specified columns
        *
        * Parameters
        *   data:
        *   columns:
        """
        self.data = data
        self.columns = columns
        try:
            self.logger.info('Start of Dropping Columns...')
            self.useful_data = self.data.drop(labels=self.columns, axis=1)
            self.logger.info('End of Dropping Columns...')
            return self.useful_data
        except Exception as e:
            self.logger.exception('Exception raised while Dropping Columns: %s' % e)
            raise

    def replace_invalid_values_with_null(self, data):
        """
        * method:      replace_invalid_values_with_null
        * description: method to replace invalid values, i.e. '?', with null,
        *              as decided during EDA
        * return:      A pandas DataFrame
        """
        try:
            self.logger.info('Start of replacing invalid values...')
            for column in data.columns:
                count = data[column][data[column] == '?'].count()
                if count != 0:
                    data[column] = data[column].replace('?', np.nan)
            self.logger.info('End of replacing invalid values...')
            return data
        except Exception as e:
            self.logger.exception('Exception raised while replacing invalid values: %s' % e)
            raise

    def is_null_present(self, data):
        """
        * method:      is_null_present
        * description: method to check for null values
        * return:      True if null values are present in the DataFrame, False if not
        *
        * Parameters
        *   data:
        """
        self.null_present = False
        try:
            self.logger.info('Start of finding missing values...')
            self.null_counts = data.isna().sum()  # count of null values per column
            for i in self.null_counts:
                if i > 0:
                    self.null_present = True
                    break
            if self.null_present:
                # store which columns have null values to a file
                dataframe_with_null = pd.DataFrame()
                dataframe_with_null['columns'] = data.columns
                dataframe_with_null['missing values count'] = np.asarray(data.isna().sum())
                dataframe_with_null.to_csv(self.data_path + '_validation/' + 'null_values.csv')
            self.logger.info('End of finding missing values...')
            return self.null_present
        except Exception as e:
            self.logger.exception('Exception raised while finding missing values: %s' % e)
            raise

    def impute_missing_values(self, data):
        """
        * method:      impute_missing_values
        * description: method to impute missing values
        * return:      A pandas DataFrame with imputed values
        *
        * Parameters
        *   data:
        """
        self.data = data
        try:
            self.logger.info('Start of imputing missing values...')
            imputer = KNNImputer(n_neighbors=3, weights='uniform', missing_values=np.nan)
            self.new_array = imputer.fit_transform(self.data)  # impute the missing values
            # convert the ndarray returned above back to a DataFrame
            self.new_data = pd.DataFrame(data=self.new_array, columns=self.data.columns)
            self.logger.info('End of imputing missing values...')
            return self.new_data
        except Exception as e:
            self.logger.exception('Exception raised while imputing missing values: %s' % e)
            raise

    def feature_encoding(self, data):
        """
        * method:      feature_encoding
        * description: method to convert categorical columns to numerical ones
        * return:      A pandas DataFrame of dummy-encoded columns
        *
        * Parameters
        *   data:
        """
        try:
            self.logger.info('Start of feature encoding...')
            self.new_data = data.select_dtypes(include=['object']).copy()
            # dummy-encode the categorical columns into numerical ones
            for col in self.new_data.columns:
                self.new_data = pd.get_dummies(self.new_data, columns=[col], prefix=[col], drop_first=True)
            self.logger.info('End of feature encoding...')
            return self.new_data
        except Exception as e:
            self.logger.exception('Exception raised while feature encoding: %s' % e)
            raise

    def encode_categorical_values(self, data):
        """
        * method:      encode_categorical_values
        * description: method to encode all the categorical values in the training set
        * return:      A DataFrame with all the categorical values encoded
        """
        try:
            self.logger.info('Start of encoding categorical values...')
            # map the binary categorical values directly:
            data['Gender'] = data['Gender'].map({'a': 0, 'b': 1})
            # these columns hold the same two categories, 'f' and 't'
            data['PriorDefault'] = data['PriorDefault'].map({'f': 0, 't': 1})
            data['Employed'] = data['Employed'].map({'f': 0, 't': 1})
            data['DriversLicense'] = data['DriversLicense'].map({'f': 0, 't': 1})
            self.logger.info('End of encoding categorical values...')
            return data
        except Exception as e:
            self.logger.exception('Exception raised while encoding categorical values: %s' % e)
            raise

    def feature_selection(self, data):
        """
        * method:      feature_selection
        * description: method to select the features of the training dataset
        * return:      A pandas DataFrame with the selected features and the label
        """
        self.data = data
        try:
            self.logger.info('Start feature selection of dataset...')
            X = self.data.iloc[:, :-18]  # feature columns (all but the trailing encoded columns)
            y = self.data['Approved']    # label column
            # rank the features by their chi-squared score and store the ranking
            ordered_rank_features = SelectKBest(score_func=chi2, k='all')
            ordered_feature = ordered_rank_features.fit(X, y)
            data_scores = pd.DataFrame(ordered_feature.scores_, columns=["Score"])
            data_columns = pd.DataFrame(X.columns)
            features_rank = pd.concat([data_columns, data_scores], axis=1)
            features_rank.columns = ['Features', 'Score']
            features_rank.nlargest(10, 'Score').to_csv(self.data_path + '_encode/features_rank.csv')
            data1 = self.data[['PriorDefault', 'YearsEmployed', 'CreditScore', 'Income', 'Approved']]
            data1.to_csv(self.data_path + '_encode/feature_selection.csv')
            self.logger.info('End feature selection of dataset...')
            return data1
        except Exception as e:
            self.logger.exception('Exception raised while feature selection of dataset: %s' % e)
            raise

    def feature_select(self, data):
        """
        * method:      feature_select
        * description: method to keep only the selected features (prediction set, no label)
        * return:      A pandas DataFrame
        """
        self.data = data
        try:
            self.logger.info('Start feature selection of dataset...')
            data1 = self.data[['PriorDefault', 'YearsEmployed', 'CreditScore', 'Income']]
            self.logger.info('End feature selection of dataset...')
            return data1
        except Exception as e:
            self.logger.exception('Exception raised while feature selection of dataset: %s' % e)
            raise

    def split_features_label(self, data, label_name):
        """
        * method:      split_features_label
        * description: method to separate features and label
        * return:      feature and label DataFrames
        *
        * Parameters
        *   data:
        *   label_name:
        """
        self.data = data
        try:
            self.logger.info('Start of splitting features and label...')
            self.X = self.data.drop(labels=label_name, axis=1)  # separate the feature columns
            self.y = self.data[label_name]  # filter the label column
            self.logger.info('End of splitting features and label...')
            return self.X, self.y
        except Exception as e:
            self.logger.exception('Exception raised while splitting features and label: %s' % e)
            raise

    def final_predictset(self, data):
        """
        * method:      final_predictset
        * description: method to build the final predict set by adding any missing
        *              encoded column with the value 0
        * return:      A pandas DataFrame aligned to the training columns
        """
        try:
            self.logger.info('Start of building final predictset...')
            with open('apps/database/columns.json', 'r') as f:
                data_columns = json.load(f)['data_columns']
            df = pd.DataFrame(data=None, columns=data_columns)
            df_new = pd.concat([df, data], ignore_index=True, sort=False)
            data_new = df_new.fillna(0)
            self.logger.info('End of building final predictset...')
            return data_new
        except ValueError:
            self.logger.exception('ValueError raised while building final predictset')
            raise
        except KeyError:
            self.logger.exception('KeyError raised while building final predictset')
            raise
        except Exception as e:
            self.logger.exception('Exception raised while building final predictset: %s' % e)
            raise e

    def preprocess_trainset(self):
        """
        * method:      preprocess_trainset
        * description: method to pre-process the training data
        * return:      feature and label DataFrames
        """
        try:
            self.logger.info('Start of Preprocessing...')
            data = self.get_data()                               # get data into a pandas DataFrame
            data = self.drop_columns(data, ['ZipCode'])          # drop unwanted columns
            data = self.replace_invalid_values_with_null(data)   # replace '?' with nan
            data = self.encode_categorical_values(data)          # map the binary categorical values
            cat_df = self.feature_encoding(data)                 # dummy-encode the remaining categorical columns
            data = pd.concat([data, cat_df], axis=1)
            data = self.drop_columns(data, ['Married', 'BankCustomer', 'Citizen',
                                            'EducationLevel', 'Ethnicity'])
            # if missing values are present, impute them
            if self.is_null_present(data):
                data = self.impute_missing_values(data)
            # feature engineering, then separate features and label
            data1 = self.feature_selection(data)
            self.X, self.y = self.split_features_label(data1, label_name='Approved')
            self.logger.info('End of Preprocessing...')
            return self.X, self.y
        except Exception:
            self.logger.exception('Unsuccessful End of Preprocessing...')
            raise

    def preprocess_predictset(self):
        """
        * method:      preprocess_predictset
        * description: method to pre-process the prediction data
        * return:      A pandas DataFrame
        """
        try:
            self.logger.info('Start of Preprocessing...')
            data = self.get_data()
            data = self.drop_columns(data, ['ZipCode'])
            data = self.replace_invalid_values_with_null(data)
            data = self.encode_categorical_values(data)
            cat_df = self.feature_encoding(data)
            data = pd.concat([data, cat_df], axis=1)
            data = self.drop_columns(data, ['Married', 'BankCustomer', 'Citizen',
                                            'EducationLevel', 'Ethnicity'])
            if self.is_null_present(data):
                data = self.impute_missing_values(data)
            # feature engineering, then align to the training columns
            data1 = self.feature_select(data)
            data = self.final_predictset(data1)
            self.logger.info('End of Preprocessing...')
            return data
        except Exception:
            self.logger.exception('Unsuccessful End of Preprocessing...')
            raise

    def preprocess_predict(self, data):
        """
        * method:      preprocess_predict
        * description: method to pre-process a single prediction record
        * return:      A pandas DataFrame
        """
        try:
            self.logger.info('Start of Preprocessing...')
            data = self.encode_categorical_values(data)
            cat_df = self.feature_encoding(data)
            data = pd.concat([data, cat_df], axis=1)
            data = self.drop_columns(data, ['Married', 'BankCustomer', 'Citizen',
                                            'EducationLevel', 'Ethnicity'])
            if self.is_null_present(data):
                data = self.impute_missing_values(data)
            data = self.final_predictset(data)
            self.logger.info('End of Preprocessing...')
            return data
        except Exception:
            self.logger.exception('Unsuccessful End of Preprocessing...')
            raise
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier


class ModelTuner:
    """
    *****************************************************************************
    *
    * filename:       model_tuner.py
    * version:        1.0
    * author:
    * creation date:
    *
    * description:    Class to tune and select the best model
    *
    *****************************************************************************
    """

    def __init__(self, run_id, data_path, mode):
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'ModelTuner', mode)
        self.rfc = RandomForestClassifier()
        self.xgb = XGBClassifier(objective='binary:logistic')

    def best_params_randomforest(self, train_x, train_y):
        """
        * method:      best_params_randomforest
        * description: method to get the parameters for the Random Forest algorithm
        *              which give the best accuracy, using hyper-parameter tuning
        * return:      the model with the best parameters
        *
        * Parameters
        *   train_x:
        *   train_y:
        """
        try:
            self.logger.info('Start of finding best params for randomforest algo...')
            # initializing with different combinations of parameters
            self.param_grid = {"n_estimators": [2, 3, 4],
                               "criterion": ['gini', 'entropy'],
                               "max_depth": range(2, 4, 1),
                               "max_features": ['auto', 'log2']}
            # creating an object of the Grid Search class and finding the best parameters
            self.grid = GridSearchCV(estimator=self.rfc, param_grid=self.param_grid, cv=5)
            self.grid.fit(train_x, train_y)
            # extracting the best parameters
            self.criterion = self.grid.best_params_['criterion']
            self.max_depth = self.grid.best_params_['max_depth']
            self.max_features = self.grid.best_params_['max_features']
            self.n_estimators = self.grid.best_params_['n_estimators']
            # creating a new model with the best parameters and training it
            self.rfc = RandomForestClassifier(n_estimators=self.n_estimators,
                                              criterion=self.criterion,
                                              max_depth=self.max_depth,
                                              max_features=self.max_features)
            self.rfc.fit(train_x, train_y)
            self.logger.info('Random Forest best params: ' + str(self.grid.best_params_))
            self.logger.info('End of finding best params for randomforest algo...')
            return self.rfc
        except Exception as e:
            self.logger.exception('Exception raised while finding best params for randomforest algo: ' + str(e))
            raise

    def best_params_xgboost(self, train_x, train_y):
        """
        * method:      best_params_xgboost
        * description: method to get the parameters for the XGBoost algorithm
        *              which give the best accuracy, using hyper-parameter tuning
        * return:      the model with the best parameters
        *
        * Parameters
        *   train_x:
        *   train_y:
        """
        try:
            self.logger.info('Start of finding best params for XGBoost algo...')
            # initializing with different combinations of parameters
            self.param_grid_xgboost = {'learning_rate': [0.5, 0.1, 0.01, 0.001],
                                       'max_depth': [3, 5, 10, 20],
                                       'n_estimators': [10, 50, 100, 200]}
            # creating an object of the Grid Search class and finding the best parameters
            self.grid = GridSearchCV(XGBClassifier(objective='binary:logistic'),
                                     self.param_grid_xgboost, cv=5)
            self.grid.fit(train_x, train_y)
            # extracting the best parameters
            self.learning_rate = self.grid.best_params_['learning_rate']
            self.max_depth = self.grid.best_params_['max_depth']
            self.n_estimators = self.grid.best_params_['n_estimators']
            # creating a new model with the best parameters and training it
            self.xgb = XGBClassifier(objective='binary:logistic',
                                     learning_rate=self.learning_rate,
                                     max_depth=self.max_depth,
                                     n_estimators=self.n_estimators)
            self.xgb.fit(train_x, train_y)
            self.logger.info('XGBoost best params: ' + str(self.grid.best_params_))
            self.logger.info('End of finding best params for XGBoost algo...')
            return self.xgb
        except Exception as e:
            self.logger.exception('Exception raised while finding best params for XGBoost algo: ' + str(e))
            raise

    def get_best_model(self, train_x, train_y, test_x, test_y):
        """
        * method:      get_best_model
        * description: method to find the best of the tuned models
        * return:      the best model's name and the fitted model
        *
        * Parameters
        *   train_x:
        *   train_y:
        *   test_x:
        *   test_y:
        """
        try:
            self.logger.info('Start of finding best model...')
            # tune and score XGBoost
            self.xgb = self.best_params_xgboost(train_x, train_y)
            self.prediction_xgb = self.xgb.predict(test_x)
            if len(test_y.unique()) == 1:
                # if there is only one label in y, roc_auc_score raises an error,
                # so fall back to accuracy in that case
                self.xgb_score = accuracy_score(test_y, self.prediction_xgb)
                self.logger.info('Accuracy for XGBoost: ' + str(self.xgb_score))
            else:
                self.xgb_score = roc_auc_score(test_y, self.prediction_xgb)
                self.logger.info('AUC for XGBoost: ' + str(self.xgb_score))
            # tune and score Random Forest
            self.random_forest = self.best_params_randomforest(train_x, train_y)
            self.prediction_random_forest = self.random_forest.predict(test_x)
            if len(test_y.unique()) == 1:
                self.random_forest_score = accuracy_score(test_y, self.prediction_random_forest)
                self.logger.info('Accuracy for Random Forest: ' + str(self.random_forest_score))
            else:
                self.random_forest_score = roc_auc_score(test_y, self.prediction_random_forest)
                self.logger.info('AUC for Random Forest: ' + str(self.random_forest_score))
            # comparing the two models
            self.logger.info('End of finding best model...')
            if self.random_forest_score < self.xgb_score:
                return 'XGB', self.xgb
            else:
                return 'RandomForest', self.random_forest
        except Exception as e:
            self.logger.exception('Exception raised while finding best model: ' + str(e))
            raise
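A usage sketch on toy data (not from the source); the tuner expects pandas objects because get_best_model calls test_y.unique(), and the run id and data path are hypothetical:

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=200, n_features=4, random_state=0)
X, y = pd.DataFrame(X), pd.Series(y)  # convert to pandas for test_y.unique()
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
tuner = ModelTuner(run_id='2020051204', data_path='apps/data/training', mode='training')  # hypothetical values
best_name, best_model = tuner.get_best_model(x_train, y_train, x_test, y_test)
print(best_name)  # 'XGB' or 'RandomForest', whichever scored higher on the held-out split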
import json

from sklearn.model_selection import train_test_split


class TrainModel:
    """
    *****************************************************************************
    *
    * filename:       TrainModel.py
    * version:        1.0
    * author:
    * creation date:
    *
    * description:    Class to train the model
    *
    *****************************************************************************
    """

    def __init__(self, run_id, data_path):
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'TrainModel', 'training')
        self.loadValidate = LoadValidate(self.run_id, self.data_path, 'training')
        self.preProcess = Preprocessor(self.run_id, self.data_path, 'training')
        self.modelTuner = ModelTuner(self.run_id, self.data_path, 'training')
        self.fileOperation = FileOperation(self.run_id, self.data_path, 'training')

    def training_model(self):
        """
        * method:      training_model
        * description: method to train the model
        * return:      none
        """
        try:
            self.logger.info('Start of Training')
            self.logger.info('Run_id: ' + str(self.run_id))
            # load, validate and transform the training set
            self.loadValidate.validate_trainset()
            # preprocessing activities
            self.X, self.y = self.preProcess.preprocess_trainset()
            # persist the training columns so the predict set can be aligned later
            columns = {"data_columns": [col for col in self.X.columns]}
            with open('apps/database/columns.json', 'w') as f:
                f.write(json.dumps(columns))
            # split the data into training and test sets
            x_train, x_test, y_train, y_test = train_test_split(
                self.X, self.y, test_size=0.2, random_state=0)
            # get the best model and save it to the models directory
            best_model_name, best_model = self.modelTuner.get_best_model(
                x_train, y_train, x_test, y_test)
            self.fileOperation.save_model(best_model, best_model_name)
            self.logger.info('End of Training')
        except Exception:
            self.logger.exception('Unsuccessful End of Training')
            raise
import os
import pickle
import shutil


class FileOperation:
    """
    *****************************************************************************
    *
    * file_name:      FileOperation.py
    * version:        1.0
    * author:
    * creation date:
    *
    * change history:
    *
    * description:    Class for file operations
    *
    *****************************************************************************
    """

    def __init__(self, run_id, data_path, mode):
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'FileOperation', mode)

    def save_model(self, model, file_name):
        """
        * method:      save_model
        * description: method to save the model file
        * return:      'success' once the file is saved
        *
        * Parameters
        *   model:
        *   file_name:
        """
        try:
            self.logger.info('Start of Save Models')
            # create a separate directory for each model
            path = os.path.join('apps/models/', file_name)
            if os.path.isdir(path):
                # remove previously existing models
                shutil.rmtree('apps/models')
            os.makedirs(path)
            with open(path + '/' + file_name + '.sav', 'wb') as f:
                pickle.dump(model, f)  # save the model to file
            self.logger.info('Model File ' + file_name + ' saved')
            self.logger.info('End of Save Models')
            return 'success'
        except Exception as e:
            self.logger.exception('Exception raised while Save Models: %s' % e)
            raise

    def load_model(self, file_name):
        """
        * method:      load_model
        * description: method to load the model file
        * return:      the loaded model object
        *
        * Parameters
        *   file_name:
        """
        try:
            self.logger.info('Start of Load Model')
            with open('apps/models/' + file_name + '/' + file_name + '.sav', 'rb') as f:
                self.logger.info('Model File ' + file_name + ' loaded')
                self.logger.info('End of Load Model')
                return pickle.load(f)
        except Exception as e:
            self.logger.exception('Exception raised while Loading Model: %s' % e)
            raise
import matplotlib.pyplot as plt
from kneed import KneeLocator
from sklearn.cluster import KMeans


class KMeansCluster:
    """
    *****************************************************************************
    *
    * filename:       KMeansCluster.py
    * version:        1.0
    * author:         CODESTUDIO
    * creation date:  05-MAY-2020
    *
    * change history:
    *
    *    who          when         version  change (include bug# if apply)
    *    ----------   -----------  -------  ------------------------------
    *    bcheekati    05-MAY-2020  1.0      initial creation
    *
    * description:    Class to cluster the dataset
    *
    *****************************************************************************
    """

    def __init__(self, run_id, data_path):
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'KMeansCluster', 'training')
        self.fileOperation = FileOperation(self.run_id, self.data_path, 'training')

    def elbow_plot(self, data):
        """
        * method:      elbow_plot
        * description: method to save the plot used to decide the optimum number
        *              of clusters to file
        * return:      the optimum number of clusters
        *
        * Parameters
        *   data:
        """
        wcss = []  # within-cluster sum of squares, one entry per cluster count
        try:
            self.logger.info('Start of elbow plotting...')
            for i in range(1, 11):
                kmeans = KMeans(n_clusters=i, init='k-means++', random_state=0)
                kmeans.fit(data)  # fit the data to the KMeans algorithm
                wcss.append(kmeans.inertia_)
            # plot WCSS against the number of clusters and save the figure locally
            plt.plot(range(1, 11), wcss)
            plt.title('The Elbow Method')
            plt.xlabel('Number of clusters')
            plt.ylabel('WCSS')
            plt.savefig('apps/models/kmeans_elbow.png')
            # find the optimum cluster count programmatically via the knee point
            self.kn = KneeLocator(range(1, 11), wcss, curve='convex', direction='decreasing')
            self.logger.info('The optimum number of clusters is: ' + str(self.kn.knee))
            self.logger.info('End of elbow plotting...')
            return self.kn.knee
        except Exception as e:
            self.logger.exception('Exception raised while elbow plotting: ' + str(e))
            raise

    def create_clusters(self, data, number_of_clusters):
        """
        * method:      create_clusters
        * description: method to create clusters
        * return:      A data frame with a cluster column
        *
        * Parameters
        *   data:
        *   number_of_clusters:
        """
        self.data = data
        try:
            self.logger.info('Start of Create clusters...')
            self.kmeans = KMeans(n_clusters=number_of_clusters, init='k-means++', random_state=0)
            self.y_kmeans = self.kmeans.fit_predict(data)  # divide the data into clusters
            # save the KMeans model so prediction can assign records to clusters
            self.saveModel = self.fileOperation.save_model(self.kmeans, 'KMeans')
            # store the cluster assignment in a new column
            self.data['Cluster'] = self.y_kmeans
            self.logger.info('successfully created ' + str(number_of_clusters) + ' clusters.')
            self.logger.info('End of Create clusters...')
            return self.data
        except Exception as e:
            self.logger.exception('Exception raised while Creating clusters: ' + str(e))
            raise
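An end-to-end clustering sketch on toy data (not from the source); it assumes the apps/models directory exists, since elbow_plot saves the elbow figure there, and the run id and data path are hypothetical:

import pandas as pd
from sklearn.datasets import make_blobs

features, _ = make_blobs(n_samples=300, centers=3, random_state=0)
features = pd.DataFrame(features, columns=['f1', 'f2'])
cluster = KMeansCluster(run_id='2020051205', data_path='apps/data/training')  # hypothetical values
n = cluster.elbow_plot(features)                  # saves apps/models/kmeans_elbow.png, returns the knee
clustered = cluster.create_clusters(features, n)  # pickles the KMeans model, adds the 'Cluster' column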
class Preprocessor:
    """
    *****************************************************************************
    *
    * filename:       Preprocessor.py
    * version:        1.0
    * author:         CODESTUDIO
    * creation date:  05-MAY-2020
    *
    * change history:
    *
    *  who          when           version  change (include bug# if apply)
    *  ----------   -----------    -------  ------------------------------
    *  bcheekati    05-MAY-2020    1.0      initial creation
    *
    * description:    Class to pre-process the training dataset
    *
    ****************************************************************************
    """
    def __init__(self, run_id, data_path, mode):
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'Preprocessor', mode)

    def get_data(self):
        """
        * method: get_data
        * description: method to read the data file
        * return: A pandas DataFrame
        *
        *  who          when           version  change (include bug# if apply)
        *  ----------   -----------    -------  ------------------------------
        *  bcheekati    05-MAY-2020    1.0      initial creation
        *
        * Parameters
        *   none:
        """
        try:
            self.logger.info('Start of reading dataset...')
            self.data = pd.read_csv(self.data_path + '_validation/InputFile.csv')
            self.logger.info('End of reading dataset...')
            return self.data
        except Exception as e:
            self.logger.exception('Exception raised while reading dataset: %s' % e)
            raise Exception()

    def drop_columns(self, data, columns):
        """
        * method: drop_columns
        * description: method to drop columns
        * return: A pandas DataFrame after removing the specified columns
        *
        *  who          when           version  change (include bug# if apply)
        *  ----------   -----------    -------  ------------------------------
        *  bcheekati    05-MAY-2020    1.0      initial creation
        *
        * Parameters
        *   data:
        *   columns:
        """
        self.data = data
        self.columns = columns
        try:
            self.logger.info('Start of Dropping Columns...')
            self.useful_data = self.data.drop(labels=self.columns, axis=1)  # drop the labels specified in columns
            self.logger.info('End of Dropping Columns...')
            return self.useful_data
        except Exception as e:
            self.logger.exception('Exception raised while Dropping Columns: %s' % e)
            raise Exception()

    def is_null_present(self, data):
        """
        * method: is_null_present
        * description: method to check for null values
        * return: True if null values are present in the DataFrame, False if they are not
        *
        *  who          when           version  change (include bug# if apply)
        *  ----------   -----------    -------  ------------------------------
        *  bcheekati    05-MAY-2020    1.0      initial creation
        *
        * Parameters
        *   data:
        """
        self.null_present = False
        try:
            self.logger.info('Start of finding missing values...')
            self.null_counts = data.isna().sum()  # count of null values per column
            for i in self.null_counts:
                if i > 0:
                    self.null_present = True
                    break
            if self.null_present:
                # record which columns hold null values
                dataframe_with_null = pd.DataFrame()
                dataframe_with_null['columns'] = data.columns
                dataframe_with_null['missing values count'] = np.asarray(data.isna().sum())
                dataframe_with_null.to_csv(self.data_path + '_validation/' + 'null_values.csv')  # store the null-column information to file
            self.logger.info('End of finding missing values...')
            return self.null_present
        except Exception as e:
            self.logger.exception('Exception raised while finding missing values: %s' % e)
            raise Exception()

    def impute_missing_values(self, data):
        """
        * method: impute_missing_values
        * description: method to impute missing values using KNNImputer
        * return: A pandas DataFrame with the missing values imputed
        *
        *  who          when           version  change (include bug# if apply)
        *  ----------   -----------    -------  ------------------------------
        *  bcheekati    05-MAY-2020    1.0      initial creation
        *
        * Parameters
        *   data:
        """
        self.data = data
        try:
            self.logger.info('Start of imputing missing values...')
            imputer = KNNImputer(n_neighbors=3, weights='uniform', missing_values=np.nan)
            self.new_array = imputer.fit_transform(self.data)  # impute the missing values
            # convert the ndarray returned in the step above back to a DataFrame
            self.new_data = pd.DataFrame(data=self.new_array, columns=self.data.columns)
            self.logger.info('End of imputing missing values...')
            return self.new_data
        except Exception as e:
            self.logger.exception('Exception raised while imputing missing values: %s' % e)
            raise Exception()

    def feature_encoding(self, data):
        """
        * method: feature_encoding
        * description: method to dummy-encode the categorical columns
        * return: A pandas DataFrame with the encoded columns
        *
        *  who          when           version  change (include bug# if apply)
        *  ----------   -----------    -------  ------------------------------
        *  bcheekati    05-MAY-2020    1.0      initial creation
        *
        * Parameters
        *   data:
        """
        try:
            self.logger.info('Start of feature encoding...')
            self.new_data = data.select_dtypes(include=['object']).copy()
            # use dummy encoding to turn the categorical columns into numerical ones
            for col in self.new_data.columns:
                self.new_data = pd.get_dummies(self.new_data, columns=[col], prefix=[col], drop_first=True)
            self.logger.info('End of feature encoding...')
            return self.new_data
        except Exception as e:
            self.logger.exception('Exception raised while feature encoding: %s' % e)
            raise Exception()

    def split_features_label(self, data, label_name):
        """
        * method: split_features_label
        * description: method to separate features and label
        * return: the feature DataFrame X and the label Series y
        *
        *  who          when           version  change (include bug# if apply)
        *  ----------   -----------    -------  ------------------------------
        *  bcheekati    05-MAY-2020    1.0      initial creation
        *
        * Parameters
        *   data:
        *   label_name:
        """
        self.data = data
        try:
            self.logger.info('Start of splitting features and label...')
            self.X = self.data.drop(labels=label_name, axis=1)  # drop the label and keep the feature columns
            self.y = self.data[label_name]  # filter the label column
            self.logger.info('End of splitting features and label...')
            return self.X, self.y
        except Exception as e:
            self.logger.exception('Exception raised while splitting features and label: %s' % e)
            raise Exception()

    def final_predictset(self, data):
        """
        * method: final_predictset
        * description: method to build the final predict set by adding any encoded
        *              column seen at training time but absent here, with value 0
        * return: A pandas DataFrame aligned to the training-time column set
        *
        *  who          when           version  change (include bug# if apply)
        *  ----------   -----------    -------  ------------------------------
        *  bcheekati    05-MAY-2020    1.0      initial creation
        *
        * Parameters
        *   data:
        """
        try:
            self.logger.info('Start of building final predictset...')
            with open('apps/database/columns.json', 'r') as f:
                data_columns = json.load(f)['data_columns']
            df = pd.DataFrame(data=None, columns=data_columns)  # empty frame with the training-time columns
            df_new = pd.concat([df, data], ignore_index=True, sort=False)
            data_new = df_new.fillna(0)  # columns unseen at predict time get 0
            self.logger.info('End of building final predictset...')
            return data_new
        except ValueError:
            self.logger.exception('ValueError raised while building final predictset')
            raise ValueError
        except KeyError:
            self.logger.exception('KeyError raised while building final predictset')
            raise KeyError
        except Exception as e:
            self.logger.exception('Exception raised while building final predictset: %s' % e)
            raise e

    def preprocess_trainset(self):
        """
        * method: preprocess_trainset
        * description: method to pre-process the training data
        * return: the feature DataFrame X and the label Series y
        *
        *  who          when           version  change (include bug# if apply)
        *  ----------   -----------    -------  ------------------------------
        *  bcheekati    05-MAY-2020    1.0      initial creation
        *
        * Parameters
        *   none:
        """
        try:
            self.logger.info('Start of Preprocessing...')
            # get data into a pandas DataFrame
            data = self.get_data()
            # drop unwanted columns
            data = self.drop_columns(data, ['empid'])
            # dummy-encode the categorical columns
            cat_df = self.feature_encoding(data)
            data = pd.concat([data, cat_df], axis=1)
            # drop the original categorical column
            data = self.drop_columns(data, ['salary'])
            # check whether missing values are present in the dataset
            is_null_present = self.is_null_present(data)
            # if missing values are there, replace them appropriately
            if is_null_present:
                data = self.impute_missing_values(data)  # missing-value imputation
            # create separate features and labels
            self.X, self.y = self.split_features_label(data, label_name='left')
            self.logger.info('End of Preprocessing...')
            return self.X, self.y
        except Exception:
            self.logger.exception('Unsuccessful End of Preprocessing...')
            raise Exception()

    def preprocess_predictset(self):
        """
        * method: preprocess_predictset
        * description: method to pre-process the prediction data
        * return: the pre-processed prediction DataFrame
        *
        *  who          when           version  change (include bug# if apply)
        *  ----------   -----------    -------  ------------------------------
        *  bcheekati    05-MAY-2020    1.0      initial creation
        *
        * Parameters
        *   none:
        """
        try:
            self.logger.info('Start of Preprocessing...')
            # get data into a pandas DataFrame
            data = self.get_data()
            # 'empid' is kept here so predictions can be written out against it
            #data = self.drop_columns(data, ['empid'])
            # dummy-encode the categorical columns
            cat_df = self.feature_encoding(data)
            data = pd.concat([data, cat_df], axis=1)
            # drop the original categorical column
            data = self.drop_columns(data, ['salary'])
            # check whether missing values are present in the dataset
            is_null_present = self.is_null_present(data)
            # if missing values are there, replace them appropriately
            if is_null_present:
                data = self.impute_missing_values(data)  # missing-value imputation
            data = self.final_predictset(data)  # align columns with the training set
            self.logger.info('End of Preprocessing...')
            return data
        except Exception:
            self.logger.exception('Unsuccessful End of Preprocessing...')
            raise Exception()

    def preprocess_predict(self, data):
        """
        * method: preprocess_predict
        * description: method to pre-process a single prediction record
        * return: the pre-processed prediction DataFrame
        *
        *  who          when           version  change (include bug# if apply)
        *  ----------   -----------    -------  ------------------------------
        *  bcheekati    05-MAY-2020    1.0      initial creation
        *
        * Parameters
        *   data:
        """
        try:
            self.logger.info('Start of Preprocessing...')
            cat_df = self.feature_encoding(data)
            data = pd.concat([data, cat_df], axis=1)
            # drop the original categorical column
            data = self.drop_columns(data, ['salary'])
            # check whether missing values are present in the dataset
            is_null_present = self.is_null_present(data)
            # if missing values are there, replace them appropriately
            if is_null_present:
                data = self.impute_missing_values(data)  # missing-value imputation
            data = self.final_predictset(data)  # align columns with the training set
            self.logger.info('End of Preprocessing...')
            return data
        except Exception:
            self.logger.exception('Unsuccessful End of Preprocessing...')
            raise Exception()
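For reference, a minimal standalone sketch of the same preprocessing steps (dummy encoding, KNN imputation, feature/label split) on a toy frame. The 'salary' and 'left' column names mirror the dataset above; the values and the frame itself are invented for illustration:

import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer

# Toy frame -- values are made up; only the column names echo the project.
df = pd.DataFrame({
    'satisfaction': [0.4, np.nan, 0.9, 0.2],
    'salary': ['low', 'medium', 'high', 'low'],
    'left': [1, 0, 0, 1],
})

# Dummy-encode the categorical column, dropping the first level
# (what drop_first=True does in feature_encoding above).
cat_df = pd.get_dummies(df.select_dtypes(include=['object']), drop_first=True)
df = pd.concat([df.drop(columns=['salary']), cat_df], axis=1)

# Fill remaining NaNs from the 3 nearest rows, matching KNNImputer(n_neighbors=3).
df = pd.DataFrame(KNNImputer(n_neighbors=3).fit_transform(df), columns=df.columns)

X, y = df.drop(columns=['left']), df['left']  # split features and label
print(X.shape, y.tolist())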
class PredictModel:
    def __init__(self, run_id, data_path):
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'PredictModel', 'prediction')
        self.loadValidate = LoadValidate(self.run_id, self.data_path, 'prediction')
        self.preProcess = Preprocessor(self.run_id, self.data_path, 'prediction')
        self.fileOperation = FileOperation(self.run_id, self.data_path, 'prediction')

    def batch_predict_from_model(self):
        try:
            self.logger.info('Start of Prediction')
            self.logger.info('run_id:' + str(self.run_id))
            # validations and transformation
            self.loadValidate.validate_predictset()
            # preprocessing activities
            self.X = self.preProcess.preprocess_predictset()
            # load the clustering model
            kmeans = self.fileOperation.load_model('KMeans')
            # cluster selection; 'empid' is excluded from the features fed to KMeans
            clusters = kmeans.predict(self.X.drop(['empid'], axis=1))
            self.X['clusters'] = clusters
            clusters = self.X['clusters'].unique()
            for i in clusters:
                self.logger.info('clusters loop started')
                cluster_data = self.X[self.X['clusters'] == i]
                cluster_data_new = cluster_data.drop(['empid', 'clusters'], axis=1)
                model_name = self.fileOperation.correct_model(i)  # model saved for this cluster
                model = self.fileOperation.load_model(model_name)
                y_predicted = model.predict(cluster_data_new)
                result = pd.DataFrame({"EmpId": cluster_data['empid'],
                                       "Prediction": y_predicted})
                # append each cluster's predictions to the shared results file
                result.to_csv(self.data_path + '_results/' + 'Predictions.csv',
                              header=True, mode='a+', index=False)
            self.logger.info('End of Prediction')
        except Exception:
            self.logger.exception('Unsuccessful End of Prediction')
            raise Exception()

    def single_predict_from_model(self, data):
        try:
            self.logger.info('Start of Prediction')
            self.logger.info('run_id:' + str(self.run_id))
            # preprocessing activities
            self.X = self.preProcess.preprocess_predict(data)
            # load the clustering model
            kmeans = self.fileOperation.load_model('KMeans')
            # cluster selection
            clusters = kmeans.predict(self.X.drop(['empid'], axis=1))
            self.X['clusters'] = clusters
            clusters = self.X['clusters'].unique()
            y_predicted = []
            for i in clusters:
                self.logger.info('clusters loop started')
                cluster_data = self.X[self.X['clusters'] == i]
                cluster_data_new = cluster_data.drop(['empid', 'clusters'], axis=1)
                model_name = self.fileOperation.correct_model(i)
                model = self.fileOperation.load_model(model_name)
                self.logger.info('Shape of Data ' + str(cluster_data_new.shape))
                y_predicted = model.predict(cluster_data_new)
                self.logger.info('Output : ' + str(y_predicted))
            self.logger.info('End of Prediction')
            return int(y_predicted[0])  # single record, so a single prediction
        except Exception:
            self.logger.exception('Unsuccessful End of Prediction')
            raise Exception()
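To make the cluster-routing flow concrete, here is a self-contained sketch of the same idea on synthetic data: fit KMeans, train one classifier per cluster, then route each incoming row to its cluster's model. The models dict stands in for what correct_model/load_model recover from disk; nothing here comes from the project's files:

import pandas as pd
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier

# Two well-separated synthetic groups and a toy label.
X = pd.DataFrame({'f1': [0.1, 0.2, 5.1, 5.3], 'f2': [1.0, 1.1, 9.0, 9.2]})
y = pd.Series([0, 0, 1, 1])

kmeans = KMeans(n_clusters=2, init='k-means++', random_state=0, n_init=10).fit(X)

# One model per cluster, trained only on that cluster's rows.
models = {c: RandomForestClassifier(random_state=0).fit(X[kmeans.labels_ == c],
                                                        y[kmeans.labels_ == c])
          for c in set(kmeans.labels_)}

# At predict time: assign clusters, then score each row with its cluster's model.
clusters = kmeans.predict(X)
for c in set(clusters):
    rows = X[clusters == c]
    print('cluster', c, '->', models[c].predict(rows))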
class ModelTuner:
    """
    **************************************************************************
    *
    * filename:       model_tuner.py
    * version:        1.0
    *
    * change history:
    *
    * description:    Class to tune and select the best model
    *
    **************************************************************************
    """
    def __init__(self, run_id, data_path, mode):
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'ModelTuner', mode)
        self.rfc = RandomForestClassifier()
        self.xgb = XGBClassifier(objective='binary:logistic')

    def best_params_randomforest(self, train_x, train_y):
        """
        * method: best_params_randomforest
        * description: method to get the Random Forest parameters that give the
        *              best accuracy, using hyperparameter tuning
        * return: The model with the best parameters
        *
        * Parameters
        *   train_x:
        *   train_y:
        """
        try:
            self.logger.info('Start of finding best params for randomforest algo...')
            # initializing with different combinations of parameters
            # note: 'auto' was an alias for 'sqrt' and was removed in
            # scikit-learn >= 1.3; use 'sqrt' on newer versions
            self.param_grid = {"n_estimators": [10, 50, 100, 130],
                               "criterion": ['gini', 'entropy'],
                               "max_depth": range(2, 4, 1),
                               "max_features": ['auto', 'log2']}
            # creating an object of the Grid Search class
            self.grid = GridSearchCV(estimator=self.rfc, param_grid=self.param_grid, cv=5)
            # finding the best parameters
            self.grid.fit(train_x, train_y)
            # extracting the best parameters
            self.criterion = self.grid.best_params_['criterion']
            self.max_depth = self.grid.best_params_['max_depth']
            self.max_features = self.grid.best_params_['max_features']
            self.n_estimators = self.grid.best_params_['n_estimators']
            # creating a new model with the best parameters
            self.rfc = RandomForestClassifier(n_estimators=self.n_estimators,
                                              criterion=self.criterion,
                                              max_depth=self.max_depth,
                                              max_features=self.max_features)
            # training the new model
            self.rfc.fit(train_x, train_y)
            self.logger.info('Random Forest best params: ' + str(self.grid.best_params_))
            self.logger.info('End of finding best params for randomforest algo...')
            return self.rfc
        except Exception as e:
            self.logger.exception('Exception raised while finding best params for randomforest algo: %s' % e)
            raise Exception()

    def best_params_xgboost(self, train_x, train_y):
        """
        * method: best_params_xgboost
        * description: method to get the XGBoost parameters that give the best
        *              accuracy, using hyperparameter tuning
        * return: The model with the best parameters
        *
        * Parameters
        *   train_x:
        *   train_y:
        """
        try:
            self.logger.info('Start of finding best params for XGBoost algo...')
            # initializing with different combinations of parameters
            self.param_grid_xgboost = {
                'learning_rate': [0.5, 0.1, 0.01, 0.001],
                'max_depth': [3, 5, 10, 20],
                'n_estimators': [10, 50, 100, 200]
            }
            # creating an object of the Grid Search class
            self.grid = GridSearchCV(XGBClassifier(objective='binary:logistic'),
                                     self.param_grid_xgboost, cv=5)
            # finding the best parameters
            self.grid.fit(train_x, train_y)
            # extracting the best parameters
            self.learning_rate = self.grid.best_params_['learning_rate']
            self.max_depth = self.grid.best_params_['max_depth']
            self.n_estimators = self.grid.best_params_['n_estimators']
            # creating a new model with the best parameters
            self.xgb = XGBClassifier(objective='binary:logistic',
                                     learning_rate=self.learning_rate,
                                     max_depth=self.max_depth,
                                     n_estimators=self.n_estimators)
            # training the new model
            self.xgb.fit(train_x, train_y)
            self.logger.info('XGBoost best params: ' + str(self.grid.best_params_))
            self.logger.info('End of finding best params for XGBoost algo...')
            return self.xgb
        except Exception as e:
            self.logger.exception('Exception raised while finding best params for XGBoost algo: %s' % e)
            raise Exception()

    def get_best_model(self, train_x, train_y, test_x, test_y):
        """
        * method: get_best_model
        * description: method to pick the better of the two tuned models
        * return: the best model's name and the fitted model object
        *
        * Parameters
        *   train_x:
        *   train_y:
        *   test_x:
        *   test_y:
        """
        try:
            self.logger.info('Start of finding best model...')
            self.xgboost = self.best_params_xgboost(train_x, train_y)
            self.prediction_xgboost = self.xgboost.predict(test_x)  # predictions using the XGBoost model
            if len(test_y.unique()) == 1:
                # roc_auc_score errors when y holds a single label; fall back to accuracy
                self.xgboost_score = accuracy_score(test_y, self.prediction_xgboost)
                self.logger.info('Accuracy for XGBoost:' + str(self.xgboost_score))
            else:
                self.xgboost_score = roc_auc_score(test_y, self.prediction_xgboost)  # AUC for XGBoost
                self.logger.info('AUC for XGBoost:' + str(self.xgboost_score))

            # create the best model for Random Forest
            self.random_forest = self.best_params_randomforest(train_x, train_y)
            self.prediction_random_forest = self.random_forest.predict(test_x)  # predictions using the Random Forest model
            if len(test_y.unique()) == 1:
                # same single-label fallback as above
                self.random_forest_score = accuracy_score(test_y, self.prediction_random_forest)
                self.logger.info('Accuracy for Random Forest:' + str(self.random_forest_score))
            else:
                self.random_forest_score = roc_auc_score(test_y, self.prediction_random_forest)  # AUC for Random Forest
                self.logger.info('AUC for Random Forest:' + str(self.random_forest_score))

            # comparing the two models
            self.logger.info('End of finding best model...')
            if self.random_forest_score < self.xgboost_score:
                return 'XGBoost', self.xgboost
            else:
                return 'RandomForest', self.random_forest
        except Exception as e:
            self.logger.exception('Exception raised while finding best model: %s' % e)
            raise Exception()
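The selection rule above reduces to: grid-search each candidate, score it on the held-out split with AUC (or accuracy when the split contains only one label), and keep the higher scorer. A compact sketch on synthetic data, showing only the Random Forest side so it runs without the xgboost dependency:

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split

X, y = make_classification(n_samples=200, random_state=0)
x_tr, x_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)

# Grid-search a small parameter space, refit on the best combination.
rf = GridSearchCV(RandomForestClassifier(random_state=0),
                  {'n_estimators': [10, 50], 'max_depth': [2, 3]},
                  cv=5).fit(x_tr, y_tr)

if len(set(y_te)) == 1:   # degenerate split: AUC is undefined, use accuracy
    score = accuracy_score(y_te, rf.predict(x_te))
else:
    score = roc_auc_score(y_te, rf.predict(x_te))
print(rf.best_params_, score)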
class KMeansCluster:
    def __init__(self, run_id, data_path):
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'KMeansCluster', 'training')
        self.fileOperation = FileOperation(self.run_id, self.data_path, 'training')

    def elbow_plot(self, data):
        wcss = []  # within-cluster sum of squares for each candidate k
        try:
            self.logger.info('Start of elbow plotting...')
            for i in range(1, 11):
                kmeans = KMeans(n_clusters=i, init='k-means++', random_state=0)  # initializing the KMeans object
                kmeans.fit(data)  # fitting the data to the KMeans algorithm
                wcss.append(kmeans.inertia_)
            plt.plot(range(1, 11), wcss)  # graph of WCSS against the number of clusters
            plt.title('The Elbow Method')
            plt.xlabel('Number of clusters')
            plt.ylabel('WCSS')
            #plt.show()
            plt.savefig('apps/models/kmeans_elbow.png')  # saving the elbow plot locally
            # finding the optimum number of clusters programmatically
            self.kn = KneeLocator(range(1, 11), wcss, curve='convex', direction='decreasing')
            self.logger.info('The optimum number of clusters is: ' + str(self.kn.knee))
            self.logger.info('End of elbow plotting...')
            return self.kn.knee
        except Exception as e:
            self.logger.exception('Exception raised while elbow plotting: %s' % e)
            raise Exception()

    def create_clusters(self, data, number_of_clusters):
        self.data = data
        try:
            self.logger.info('Start of Create clusters...')
            self.kmeans = KMeans(n_clusters=number_of_clusters, init='k-means++', random_state=0)
            self.y_kmeans = self.kmeans.fit_predict(data)  # divide the data into clusters
            # save the KMeans model so prediction can route rows to the right cluster
            self.saveModel = self.fileOperation.save_model(self.kmeans, 'KMeans')
            self.data['Cluster'] = self.y_kmeans  # new column storing the cluster assignment
            self.logger.info('Successfully created ' + str(number_of_clusters) + ' clusters.')
            self.logger.info('End of Create clusters...')
            return self.data
        except Exception as e:
            self.logger.exception('Exception raised while Creating clusters: %s' % e)
            raise Exception()
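A standalone sketch of the elbow search on three synthetic blobs; KneeLocator (from the kneed package, as above) picks the bend in the WCSS curve. The blob data is invented for illustration:

import numpy as np
from kneed import KneeLocator
from sklearn.cluster import KMeans

# Three well-separated 2-D blobs, 50 points each.
rng = np.random.default_rng(0)
data = np.vstack([rng.normal(loc, 0.5, size=(50, 2)) for loc in (0, 5, 10)])

# WCSS (inertia) for k = 1..10, mirroring the loop in elbow_plot.
wcss = [KMeans(n_clusters=i, init='k-means++', random_state=0, n_init=10)
        .fit(data).inertia_ for i in range(1, 11)]

knee = KneeLocator(range(1, 11), wcss, curve='convex', direction='decreasing')
print('optimal k:', knee.knee)  # expected: 3 for three separated blobs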
class FileOperation:
    """
    ***************************************************************************
    *
    * filename:       file_operation.py
    * version:        1.0
    * author:         Abdullah Makhdoom
    * creation:       23-DEC-2020
    *
    * change_history:
    *  who          when           version  change (include bug# if apply)
    *  ----------   -----------    -------  ------------------------------
    *
    * description:    Class for file operations
    *
    ****************************************************************************
    """
    def __init__(self, run_id, data_path, mode):
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'FileOperation', mode)

    def save_model(self, model, file_name):
        """
        * method: save_model
        * description: method to save the model file
        * return: 'success' once the file is saved
        *
        * Parameters
        *   model:
        *   file_name:
        """
        try:
            self.logger.info('Start of Save Models')
            path = os.path.join('apps/models/', file_name)  # create a separate directory for each cluster
            if os.path.isdir(path):  # remove previously existing models for each cluster
                shutil.rmtree('apps/models')
                os.makedirs(path)
            else:
                os.makedirs(path)
            with open(path + '/' + file_name + '.sav', 'wb') as f:
                pickle.dump(model, f)
            self.logger.info('Model File ' + file_name + ' saved')
            self.logger.info('End of Save Models')
            return 'success'
        except Exception as e:
            self.logger.exception('Exception raised while Saving Models: %s' % e)
            raise Exception()

    def load_model(self, file_name):
        """
        * method: load_model
        * description: method to load the model file
        * return: the de-serialized model object
        *
        * Parameters
        *   file_name:
        """
        try:
            self.logger.info('Start of Load Models')
            with open('apps/models/' + file_name + '/' + file_name + '.sav', 'rb') as f:
                self.logger.info('Model File ' + file_name + ' loaded')
                self.logger.info('End of Load Model')
                return pickle.load(f)
        except Exception as e:
            self.logger.exception('Exception raised while Loading Model: %s' % e)
            raise Exception()

    def correct_model(self, cluster_number):
        """
        * method: correct_model
        * description: method to find the model file saved for a cluster
        * return: The model file name
        *
        * Parameters
        *   cluster_number:
        """
        try:
            self.logger.info('Start of finding correct model')
            self.cluster_number = cluster_number
            self.folder_name = 'apps/models'
            self.list_of_model_files = []
            self.list_of_files = os.listdir(self.folder_name)
            for self.file in self.list_of_files:
                try:
                    # str.index raises ValueError when the cluster number
                    # is not part of the file name
                    if self.file.index(str(self.cluster_number)) != -1:
                        self.model_name = self.file
                except ValueError:
                    continue
            self.model_name = self.model_name.split('.')[0]
            self.logger.info('End of finding correct model')
            return self.model_name
        except Exception as e:
            self.logger.exception('Exception raised while finding correct model: %s' % e)
            raise Exception()
class FileOperation:
    """
    **********************************************************************************
    *
    * file name      : file_operation.py
    * version        : 1.0
    * author         : Moncy Kurien
    * creation date  : 04-Jan-2021
    *
    * change history:
    *
    *  who            when          version   change (include bug# if apply)
    *  ----------     -------       --------  -----------------------------
    *  Moncy Kurien   04-Jan-2021   1.0       Initial Creation
    *
    * description: Class for file operations
    *
    **********************************************************************************
    """
    def __init__(self, run_id, data_path, mode):
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'FileOperation', mode)

    def save_model(self, model, file_name):
        """
        * method      : save_model
        * parameters  : model     - Type(Object) : model object reference
        *               file_name - Type(String) : name of the file
        * description : method to save the ML model file
        * return      : none - file gets saved
        *
        *  who            when          version   change (include bug# if apply)
        *  ----------     -------       --------  -----------------------------
        *  Moncy Kurien   04-Jan-2021   1.0       Initial Creation
        """
        try:
            self.logger.info('Start of Save Models')
            path = os.path.join('apps/models', file_name)  # create a separate directory for each cluster
            if os.path.isdir(path):
                shutil.rmtree('apps/models')
                os.makedirs(path)
            else:
                os.makedirs(path)
            with open(path + '/' + file_name + '.sav', 'wb') as f:
                pickle.dump(model, f)  # save the model to the file
            self.logger.info('Model File ' + file_name + ' saved')
            self.logger.info('End of Save Models')
            return 'success'
        except Exception as e:
            self.logger.exception('Exception raised while Save Models: %s' % e)
            raise Exception()

    def load_model(self, file_name):
        """
        * method      : load_model
        * parameters  : file_name - Type(String) : name of the file
        * description : method to load the ML model from file
        * return      : the de-serialized model object
        *
        *  who            when          version   change (include bug# if apply)
        *  ----------     -------       --------  -----------------------------
        *  Moncy Kurien   04-Jan-2021   1.0       Initial Creation
        """
        try:
            self.logger.info('Start of Load Model')
            with open('apps/models/' + file_name + '/' + file_name + '.sav', 'rb') as f:
                model = pickle.load(f)
            self.logger.info('Model File ' + file_name + ' loaded.')
            self.logger.info('End of Load Models')
            return model
        except Exception as e:
            self.logger.exception('Exception raised while Load Models: %s' % e)
            raise Exception()

    def correct_model(self, cluster_number):
        """
        * method      : correct_model
        * parameters  : cluster_number
        * description : method to find the model file saved for a cluster
        * return      : the model file name
        *
        *  who            when          version   change (include bug# if apply)
        *  ----------     -------       --------  -----------------------------
        *  Moncy Kurien   04-Jan-2021   1.0       Initial Creation
        """
        try:
            self.logger.info('Start of finding Correct Model.')
            self.cluster_number = cluster_number
            self.folder_name = 'apps/models'
            self.list_of_model_files = []
            self.list_of_files = os.listdir(self.folder_name)
            for self.file in self.list_of_files:
                try:
                    # str.index returns the position of str(i) in the file name
                    # and raises ValueError when it is not found
                    if self.file.index(str(self.cluster_number)) != -1:
                        self.model_name = self.file
                except ValueError:
                    continue
            self.model_name = self.model_name.split('.')[0]
            self.logger.info('End of Correct Model.')
            return self.model_name
        except Exception as e:
            self.logger.exception('Exception raised while finding Correct Model: %s' % e)
            raise Exception()
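Both FileOperation variants reduce to the same pickle round trip under a directory-per-model layout ('apps/models/<name>/<name>.sav'). A minimal sketch, using a throwaway tmp_models directory so it does not touch the project's paths:

import os
import pickle
from sklearn.cluster import KMeans

model, name = KMeans(n_clusters=2, n_init=10), 'KMeans0'
path = os.path.join('tmp_models', name)  # illustrative path, not the project's
os.makedirs(path, exist_ok=True)

with open(os.path.join(path, name + '.sav'), 'wb') as f:
    pickle.dump(model, f)    # serialize the estimator

with open(os.path.join(path, name + '.sav'), 'rb') as f:
    restored = pickle.load(f)  # deserialize it back
print(type(restored).__name__)  # -> KMeans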