class prediction:
    """
    Featurise a single test data point and predict its class with the
    models produced by training.
    """

    def __init__(self):
        self.log_writer = App_Logger()
        # Opened once in append mode and shared by every log call made through
        # this instance; nothing in this class closes it.
        self.file_object = open('./Prediction_log/prediction.txt', 'a+')

    def preprocessing_prediction(self, data):
        """
            Method Name: preprocessing_prediction
            Description: this method will perform the preprocessing of test data
                         by delegating to post_training.load_vectorizer.
            Output: whatever load_vectorizer returns; None when an exception
                    was raised (the error is logged, not re-raised).

            Written by: Harsh Kumar
            Version: 1.0
            Revisions: None
        """
        try:
            # featurising the features of the test data
            get_features = post_training()
            data1 = get_features.load_vectorizer(data)
            self.log_writer.log(self.file_object,
                                'Successfully fetched the data')
            return data1
        except Exception as e:
            self.log_writer.log(
                self.file_object,
                'Error %s occurred while preprocessing the data' % e)

    def _predict_with_importance(self, test_data, data, sig_clf, clf):
        """Predict one data point and collect its most important features.

        Shared by the OneHot and Tfidf branches of predict_values so both
        take exactly the same steps. Exceptions propagate to the caller.
        """
        # The classifiers are called with a 2-D array holding a single row.
        row = data.reshape(1, data.shape[1])
        predicted_cls = sig_clf.predict(row)
        # Round the probabilities (not the classifier input) to 4 decimals.
        predicted_prob = np.round(sig_clf.predict_proba(row), 4)
        predicted_probs = [
            (i + 1, v) for i, v in enumerate(predicted_prob[0])
        ]
        no_features = 100
        # Coefficients ordered by decreasing absolute weight for the predicted
        # class. NOTE(review): `predicted_cls - 1` presumes class labels are
        # the integers 1..K in the row order of clf.coef_ -- confirm.
        indices = np.argsort(
            -1 * abs(clf.coef_))[predicted_cls - 1][:, :no_features]
        fea_imp = feature_imp(self.log_writer, self.file_object)
        important_features = fea_imp.get_feature_imp(
            indices[0], test_data['TEXT'].iloc[0],
            test_data['Gene'].iloc[0].lower(),
            test_data['Variation'].iloc[0].lower(), no_features,
            predicted_cls[0])
        return important_features, predicted_probs, predicted_cls

    def predict_values(self, test_data, data, sig_clf, clf):
        """
            Method Name: predict_values
            Description: this method will perform the prediction of test data.
                         The first line of ./Save_Load/vectorizer.txt decides
                         whether the OneHot or the Tfidf branch is taken.
            Output: Returns feature importance, predicted probabilities and
                    predicted class as a tuple
                    (important_features, predicted_probs, predicted_cls).
                    Returns None when the vectorizer file names neither
                    encoding or when any step raises (the error is logged).

            test_data: object indexed as test_data['TEXT'|'Gene'|'Variation'].iloc[0]
            data: featurised single row exposing .reshape and .shape
            sig_clf: classifier exposing predict and predict_proba
            clf: classifier exposing coef_

            Written by: Harsh Kumar
            Version: 1.0
            Revisions: OneHot branch now follows the same steps as the Tfidf
                       branch (2-D input, rounding applied to the
                       probabilities, predicted class kept as an array).
        """
        self.path = './Save_Load/vectorizer.txt'
        try:
            with open(self.path, encoding='utf8') as f:
                path_1 = f.readline()
            print(path_1)
            # Characters 18.. of the recorded path hold the encoding name
            # that follows the './CountVectorizer/' prefix.
            if path_1[18:24] == 'OneHot':
                try:
                    return self._predict_with_importance(
                        test_data, data, sig_clf, clf)
                except Exception as e:
                    self.log_writer.log(
                        self.file_object,
                        "Error %s in computing the feature importance of one Hot Encoded features"
                        % e)
            elif path_1[18:23] == 'Tfidf':
                try:
                    result = self._predict_with_importance(
                        test_data, data, sig_clf, clf)
                    self.log_writer.log(self.file_object,
                                        'Successfully returned the values')
                    # Second message goes through a separate, short-lived
                    # handle on the same log file, as before.
                    with open('./Prediction_log/prediction.txt',
                              'a+') as log_file:
                        self.log_writer.log(
                            log_file,
                            'Successfully returned important_features,predicted_prob,predicted_cls values'
                        )
                    return result
                except Exception as e:
                    self.log_writer.log(
                        self.file_object,
                        'Error %s in computing the feature importance for tfidf features'
                        % e)
        except Exception as e:
            self.log_writer.log(
                self.file_object,
                'Error %s in computing the feature importance' % e)
def __init__(self):
    """Create the logger and open the prediction log file in append mode."""
    self.log_writer = App_Logger()
    # Opened with 'a+'; this method keeps the handle on the instance and
    # does not close it.
    self.file_object = open('./Prediction_log/prediction.txt', 'a+')
def __init__(self):
    """Store the database directory prefix and create the logger."""
    # Relative directory prefix kept on the instance.
    self.path = 'Training_Database/'
    self.logger = App_Logger()
class Db_operation:
    """
        This class shall be used for handling all the SQL operations.

        Written By: Harsh Kumar
        Version: 1.0
        Revisions: None
    """

    # NOTE(review): the two connection-log names differ only in letter case
    # ("DataBase" vs "Database"). Both spellings are kept exactly as the code
    # has always used them; confirm whether a single file was intended.
    _OPEN_LOG = 'Training_Logs/DataBaseConnectionLog.txt'
    _CLOSE_LOG = 'Training_Logs/DatabaseConnectionLog.txt'
    _TABLE_LOG = 'Training_Logs/DbTableCreateLog.txt'
    _INSERT_LOG = 'Training_Logs/DbInsertLog.txt'

    def __init__(self):
        self.path = 'Training_Database/'
        self.logger = App_Logger()

    def _log(self, log_path, message):
        """Append one message to the log at log_path and close the file."""
        with open(log_path, 'a+') as log_file:
            self.logger.log(log_file, message)

    def database_connection(self, DatabaseName):
        """
            Method Name: dataBaseConnection
            Description: This method creates the database with the given name and if Database already exists then opens the connection to the DB.
            Output: Connection to the DB
            On Failure: Raise ConnectionError

            Written By: Harsh Kumar
            Version: 1.0
            Revisions: sqlite3 errors are now translated into ConnectionError.
        """
        try:
            conn = sqlite3.connect(self.path + DatabaseName + '.db')
        except (sqlite3.Error, ConnectionError) as exc:
            # sqlite3.connect reports failures as sqlite3.Error subclasses,
            # so those must be caught to honour the documented contract.
            self._log(self._OPEN_LOG,
                      'Error while openening %s database' % DatabaseName)
            raise ConnectionError(
                'Could not open database %s' % DatabaseName) from exc
        self._log(self._OPEN_LOG,
                  "Opened %s database successfully" % DatabaseName)
        return conn

    def _create_table(self, DatabaseName, table_name, column_names,
                      entered_msg, exists_msg, created_msg,
                      skip_columns=(), log_close=False):
        """Create table_name with the given columns unless it already exists.

        column_names maps column name -> SQL type. Columns listed in
        skip_columns are left out. When log_close is true, a "Closed ..."
        line is also written to the connection log after a successful create.
        Any exception is logged and re-raised.
        """
        try:
            self._log(self._TABLE_LOG, entered_msg)
            conn = self.database_connection(DatabaseName)
            try:
                c = conn.cursor()
                c.execute(
                    "SELECT count(name) FROM sqlite_master WHERE type='table' AND name=?",
                    (table_name, ))
                already_there = c.fetchone()[0] == 1
                if not already_there:
                    # Identifiers cannot be bound as parameters, so the names
                    # are double-quoted (with embedded quotes doubled). The
                    # type text is inserted as given and must come from a
                    # trusted schema definition.
                    columns = [
                        '"{}" {}'.format(
                            str(column).replace('"', '""'), data_type)
                        for column, data_type in column_names.items()
                        if column not in skip_columns
                    ]
                    # With no columns there is nothing to create.
                    if columns:
                        c.execute('CREATE TABLE "{}"({})'.format(
                            table_name, ', '.join(columns)))
                        conn.commit()
            finally:
                conn.close()
            if already_there:
                self._log(self._TABLE_LOG, exists_msg)
            else:
                self._log(self._TABLE_LOG, created_msg)
                if log_close:
                    self._log(
                        self._CLOSE_LOG,
                        'Closed %s database successfully' % DatabaseName)
        except Exception as e:
            self._log(self._TABLE_LOG,
                      'Error while creating the table %s' % e)
            self._log(self._CLOSE_LOG,
                      'Closed %s database successfully' % DatabaseName)
            raise

    def createTrainTextTable(self, DatabaseName, column_names):
        """
            Method Name: createTextTable
            Description: This method creates a table for text data in the given database which will be used to insert training data.
            Output: None
            On Failure: Raise Exception

            Written By: Harsh Kumar
            Version: 1.0
            Revisions: None
        """
        self._create_table(
            DatabaseName, 'Train_Text_Table', column_names,
            entered_msg='Entered into createTrainTextTable table',
            exists_msg='Table already exist!!',
            created_msg='Table created successfully',
            log_close=True)

    def createPredTextTable(self, DatabaseName, column_names):
        """
            Method Name: createPredTextTable
            Description: This method creates a table for prediction text data in the given database which will be used to insert Prediction data.
            Output: None
            On Failure: Raise Exception

            Written By: Harsh Kumar
            Version: 1.0
            Revisions: None
        """
        self._create_table(
            DatabaseName, 'Pred_Text_Table', column_names,
            entered_msg='Entered into createPredTextTable table',
            exists_msg='Table already exist!!',
            created_msg='Table created successfully',
            log_close=True)

    def createTrainGeneVarTable(self, DatabaseName, column_names):
        """
            Method Name: createTrainGeneVarTable
            Description: This method creates a table for gene and variation features in the given database which will be used to insert training data.
            Output: None
            On Failure: Raise Exception

            Written By: Harsh Kumar
            Version: 1.0
            Revisions: None
        """
        self._create_table(
            DatabaseName, 'TrainGeneVarTable', column_names,
            entered_msg='Entered in createTrainGeneVarTable function',
            exists_msg='TrainGeneVarTable table already exists!!!',
            created_msg='TrainGeneVarTable created successfully!!')

    def createPredGeneVarTable(self, DatabaseName, column_names):
        """
            Method Name: createPredGeneVarTable
            Description: This method creates a table for gene and variation features in the given database which will be used to insert Prediction data.
                         The 'Class' column is left out of this table.
            Output: None
            On Failure: Raise Exception

            Written By: Harsh Kumar
            Version: 1.0
            Revisions: None
        """
        self._create_table(
            DatabaseName, 'PredGeneVarTable', column_names,
            entered_msg='Entered in createPredGeneVarTable function',
            exists_msg='PredGeneVarTable table already exists',
            created_msg='PredGeneVarTable table created Successfully!!',
            skip_columns=('Class', ))

    @staticmethod
    def _parse_text_row(line):
        """Split an 'ID||text' line into (int id, text)."""
        parts = line.split('||')
        # The text keeps whatever follows the separator, including the
        # line's trailing newline.
        return int(parts[0]), parts[1]

    @staticmethod
    def _parse_train_variant_row(line):
        """Split an 'ID,Gene,Variation,Class' line into typed values."""
        fields = line.split(',')
        # NOTE(review): only the first character of the class field is
        # converted, so a multi-digit class would be truncated -- confirm
        # that classes are always single digits.
        return int(fields[0]), fields[1], fields[2], int(fields[3][0])

    @staticmethod
    def _parse_pred_variant_row(line):
        """Split an 'ID,Gene,Variation' line into typed values."""
        fields = line.split(',')
        # NOTE(review): the last field keeps the line's trailing newline.
        return int(fields[0]), fields[1], fields[2]

    def _insert_rows(self, DatabaseName, log_file, table_name, source_path,
                     header_line, sql_insert, parse_row, already_msg,
                     success_msg):
        """Load source_path into table_name unless the table already has rows.

        Lines equal to header_line are skipped. A line that cannot be parsed
        or inserted is logged and skipped; the remaining lines are still
        processed. The connection is always closed before returning.
        """
        conn = self.database_connection(DatabaseName)
        try:
            c = conn.cursor()
            c.execute('SELECT COUNT(*) FROM ' + table_name)
            if c.fetchone()[0] >= 1:
                self.logger.log(log_file, already_msg)
                return
            with open(source_path, encoding='utf8') as f:
                for line in f:
                    if line == header_line:
                        continue
                    try:
                        c.execute(sql_insert, parse_row(line))
                        conn.commit()
                    except Exception as e:
                        self.logger.log(
                            log_file,
                            "Error while inserting into table: %s " % e)
            self.logger.log(log_file, success_msg)
        finally:
            conn.close()

    def insertDataIntoTextTable(self, DatabaseName, name):
        """
            Method Name: insertDataIntoTrainTable
            Description: This method inserts the text data into the above created table.
                         name selects the target: 'train' or 'Pred'; any other
                         value does nothing.
            Output: None
            On Failure: Raise Exception

            Written By: Harsh Kumar
            Version: 1.0
            Revisions: None
        """
        # One handle for the whole call, so no message is written to a
        # log file that has already been closed.
        with open(self._INSERT_LOG, 'a+') as log_file:
            if name == 'train':
                self.logger.log(
                    log_file,
                    'Entered into insertDataIntoTrainTable for inserting data')
                self._insert_rows(
                    DatabaseName, log_file,
                    table_name='Train_Text_Table',
                    source_path='RawTrainData/training_text',
                    header_line='ID,Text\n',
                    sql_insert='INSERT INTO Train_Text_Table(ID,TEXT) VALUES (?,?)',
                    parse_row=self._parse_text_row,
                    already_msg='Values already inserted',
                    success_msg='Data inserted successfully')
            elif name == 'Pred':
                self.logger.log(
                    log_file,
                    'Entered into insertDataIntoPredTable for inserting data')
                self._insert_rows(
                    DatabaseName, log_file,
                    table_name='Pred_Text_Table',
                    source_path='RawPredictionData/test_text',
                    header_line='ID,Text\n',
                    sql_insert='INSERT INTO Pred_Text_Table(ID,TEXT) VALUES (?,?)',
                    parse_row=self._parse_text_row,
                    already_msg='Values are already inserted',
                    success_msg='Data inserted successfully')

    def insertDataIntoGenVarTable(self, DatabaseName, name):
        """
            Method Name: insertDataIntoGenVarTable
            Description: This method inserts the data files into the above created table.
                         name selects the target: 'train' or 'Pred'; any other
                         value does nothing.
            Output: None
            On Failure: Raise Exception

            Written By: Harsh Kumar
            Version: 1.0
            Revisions: None
        """
        with open(self._INSERT_LOG, 'a+') as log_file:
            if name == 'train':
                self.logger.log(
                    log_file,
                    'Entered into TrainGeneVarTable table for inserting data')
                self._insert_rows(
                    DatabaseName, log_file,
                    table_name='TrainGeneVarTable',
                    source_path='RawTrainData/training_variants',
                    header_line='ID,Gene,Variation,Class\n',
                    sql_insert='INSERT INTO TrainGeneVarTable(ID,Gene,Variation,Class) VALUES (?,?,?,?)',
                    parse_row=self._parse_train_variant_row,
                    already_msg='Values are already inserted',
                    success_msg='Data Entered into TrainGeneVarTable successfully !!')
            elif name == 'Pred':
                self.logger.log(
                    log_file,
                    'Entered into PredGeneVarTable table for inserting data')
                self._insert_rows(
                    DatabaseName, log_file,
                    table_name='PredGeneVarTable',
                    source_path='RawPredictionData/test_variants',
                    header_line='ID,Gene,Variation\n',
                    sql_insert='INSERT INTO PredGeneVarTable(ID,Gene,Variation) VALUES (?,?,?)',
                    parse_row=self._parse_pred_variant_row,
                    already_msg='Values are already inserted',
                    success_msg='Data Entered into PredGeneVarTable successfully !!')
def __init__(self):
    """Create the logger and open the main training log in append mode."""
    self.log_writer = App_Logger()
    # Opened with 'a+'; the handle is kept on the instance and is not
    # closed by this method.
    self.file_object = open('Training_Logs/Training_Main_Log.txt', 'a+')
class training:
    """Drive the whole training pipeline, logging every stage."""

    def __init__(self):
        self.log_writer = App_Logger()
        self.file_object = open('Training_Logs/Training_Main_Log.txt', 'a+')

    def training_model(self):
        """
            Method Name: training
            Description: Entry point of the Machine Learning pipeline. Runs, in
                         order: pre-training setup, data retrieval, merge,
                         null dropping, text preprocessing, data split,
                         OneHot and Tf-idf featurisation, model selection and
                         model saving.
            Output: None. Any exception is logged and not re-raised.

            Written by: Harsh Kumar
            Version: 1.0
            Revisions: None
        """

        def note(message):
            # Every stage reports through the same writer and log file.
            self.log_writer.log(self.file_object, message)

        try:
            note('start of training')

            # Stage 1: pre-training tasks (table creation, table insertion,
            # unzipping), all performed by Pretrain.Pre_training.
            note('Entering the Pre_Training Method for all the unzipping and database operations')
            setup = Pretrain(self.log_writer, self.file_object)
            setup.Pre_training()
            note('All Pre_training task completed successfully')

            # Stage 2: pull the data into pandas dataframes.
            note('Entering into get_data method for retrieving data into dataFrame')
            getter = data_getter(self.log_writer, self.file_object)
            frames = getter.get_data()
            (self.train_text, self.train_Gene_Var, self.Pred_text,
             self.Pred_Gene_Var) = frames
            note('Got the data into Pandas Dataframe')

            # Stage 3: merge the text data with the gene/variation data.
            note('Entering into method for merging Text table and GeneVarTable')
            getter = data_getter(self.log_writer, self.file_object)
            self.data = getter.merge_data(self.train_Gene_Var,
                                          self.train_text)
            note('Merged Two pandas dataframe successfully')

            # Stage 4: drop rows with nulls.
            note('Entering into drop_null method')
            cleaner = preprocess(self.log_writer, self.file_object)
            self.data = cleaner.drop_null(self.data)
            note('Dropped all the null values rows succesfully')

            # Stage 5: clean the TEXT feature.
            note('Entering into text_preprocessing method')
            cleaner = preprocess(self.log_writer, self.file_object)
            self.data = cleaner.text_preprocessing(self.data)
            note('Preprocessing of the TEXT feature completed successfully')

            # Stage 6: train / test / cross-validation split.
            note('Entering into splitData method for splitting data into train,Cv,Test')
            cleaner = preprocess(self.log_writer, self.file_object)
            parts = cleaner.splitData(self.data)
            train_df, test_df, cv_df, y_train, y_test, y_cv = parts
            note('Splitted data successfully!!')

            # Stage 7: One Hot Encoding features.
            note('Entering into OneHot method for generating the one Hot encodded features and saving the respective objects')
            cleaner = preprocess(self.log_writer, self.file_object)
            onehot = cleaner.OneHot(train_df, cv_df, test_df)
            train_x_onehotCoding, test_x_onehotCoding, cv_x_onehotCoding = onehot
            note('Generated One Hot Encoded Features for train,test and cv successfullyy!!')

            # Stage 8: Tf-idf features.
            note('Entering into the TfIdf method for generating the Tf-idf encodded feature and saving the respective objectcs')
            cleaner = preprocess(self.log_writer, self.file_object)
            tfidf = cleaner.TfIdf(train_df, cv_df, test_df)
            train_x_Tfidf, test_x_Tfidf, cv_x_Tfidf = tfidf

            # Stage 9: model selection.
            note('Entering into the FindBestModel method for finding the best model')
            tuner = Tunner()
            self.best_model, self.feature_imp_model = tuner.FindBestModel(
                train_x_onehotCoding, test_x_onehotCoding, cv_x_onehotCoding,
                train_x_Tfidf, test_x_Tfidf, cv_x_Tfidf,
                y_train, y_test, y_cv)
            note('Found the best Model and the original model for feature importance ')

            # Stage 10: persist both models.
            note('Entering the saving method for saving the model')
            saver = post_training()
            saver.save(self.best_model, self.feature_imp_model)
            note('Best Model and the model for feature importance saved successfully!!')
        except Exception as e:
            note('Error %s occurred while training the model' % e)
def __init__(self):
    """Create the logger, open the tuning log and set the vectorizer record path."""
    self.log_writer = App_Logger()
    # Opened with 'a+'; the handle is kept on the instance and is not
    # closed by this method.
    self.file_object = open('Training_Logs/Model_Tunning_Logs.txt', 'a+')
    # Relative path of the text file kept on the instance.
    self.path = './Save_Load/vectorizer.txt'
class Tunner:
    """Tune three logistic-regression candidates and keep the one with the least log-loss."""

    def __init__(self):
        self.log_writer = App_Logger()
        self.file_object = open('Training_Logs/Model_Tunning_Logs.txt', 'a+')
        self.path = './Save_Load/vectorizer.txt'

    @staticmethod
    def _fit_calibrated(train_x, y_train, alpha, class_weight, random_state):
        """Fit an SGD logistic model and a sigmoid-calibrated wrapper around it."""
        # NOTE(review): `loss='log'` (here) and `eps=` (in the log_loss calls
        # of this class) are kept as originally written; recent scikit-learn
        # releases renamed/deprecated them -- confirm against the installed
        # version.
        clf = SGDClassifier(class_weight=class_weight,
                            alpha=alpha,
                            penalty='l2',
                            loss='log',
                            random_state=random_state)
        clf.fit(train_x, y_train)
        sig_clf = CalibratedClassifierCV(clf, method='sigmoid')
        sig_clf.fit(train_x, y_train)
        return clf, sig_clf

    def _tune_alpha(self, train_x, cv_x, test_x, y_train, y_cv, y_test,
                    class_weight, random_state, banner):
        """Pick the alpha with the lowest CV log-loss and score it on the test set.

        Returns (alphas, index of the best alpha, test log-loss).
        """
        alphas = [10**x for x in range(-3, 3)]
        cv_log_error_array = []
        for alpha in alphas:
            print(banner, alpha)
            clf, sig_clf = self._fit_calibrated(train_x, y_train, alpha,
                                                class_weight, random_state)
            cv_loss = log_loss(y_cv,
                               sig_clf.predict_proba(cv_x),
                               labels=clf.classes_,
                               eps=1e-15)
            cv_log_error_array.append(cv_loss)
            # Print the value that is actually used for selection.
            print("Log Loss :", cv_loss)
        best = np.argmin(cv_log_error_array)
        clf, sig_clf = self._fit_calibrated(train_x, y_train, alphas[best],
                                            class_weight, random_state)
        test_loss = log_loss(y_test,
                             sig_clf.predict_proba(test_x),
                             labels=clf.classes_,
                             eps=1e-15)
        return alphas, best, test_loss

    def FindBestModel(self, train_x_onehotCoding, test_x_onehotCoding,
                      cv_x_onehotCoding, train_x_Tfidf, test_x_Tfidf,
                      cv_x_Tfidf, y_train, y_test, y_cv):
        """
            Method Name: FindBestModel
            Description: this method will find the best model that gives us least log-loss.
                         Three candidates are tuned: balanced + OneHot,
                         balanced + Tfidf, unbalanced + Tfidf. A candidate that
                         fails is logged and left out of the comparison. The
                         vectorizer paths of the winner are written to
                         self.path, replacing any previous content.
            Return : (calibrated model, underlying SGD model) for the winner;
                     None when no candidate could be trained or another error
                     occurred (the error is logged).

            Written by: Harsh Kumar
            Version: 1.0
            Revisions: vectorizer record is overwritten instead of appended;
                       failed candidates no longer break the selection.
        """
        try:
            # Each entry: (test loss, training matrix, class_weight, alpha,
            #              "chosen" log message, vectorizer file prefix)
            candidates = []

            # training Logistic regression with balanced class and One Hot Encodded features
            try:
                self.alpha_1, self.best_alpha_1, self.loss_lr_OneHot_bal = self._tune_alpha(
                    train_x_onehotCoding, cv_x_onehotCoding,
                    test_x_onehotCoding, y_train, y_cv, y_test, 'balanced',
                    123, 'for alpha i')
                self.log_writer.log(
                    self.file_object,
                    'For Logistic Reg. with balanced class and OneHot features the best alpha value is:{}, The Test loss is {}'
                    .format(self.alpha_1[self.best_alpha_1],
                            self.loss_lr_OneHot_bal))
                candidates.append(
                    (self.loss_lr_OneHot_bal, train_x_onehotCoding,
                     'balanced', self.alpha_1[self.best_alpha_1],
                     'Best Model choosen is Logistic Reg. with balanced class and OneHot features with Log loss {}',
                     'OneHot'))
                self.log_writer.log(
                    self.file_object,
                    'Logistic Reg. with balanced class and OneHot features trainied successfully!!'
                )
            except Exception as e:
                self.log_writer.log(
                    self.file_object,
                    'Error %s Occurred while finding the log loss of Logistic Reg whith OneHot features and class balanced'
                    % e)

            # training Logistic regression with class balanced and Tf-idf features
            try:
                self.alpha_2, self.best_alpha_2, self.loss_lr_Tfidf_bal = self._tune_alpha(
                    train_x_Tfidf, cv_x_Tfidf, test_x_Tfidf, y_train, y_cv,
                    y_test, 'balanced', 42, "for alpha =")
                self.log_writer.log(
                    self.file_object,
                    'For Logistic Reg with balanced class and Tfidf features the best alpha value is:{}, The Test loss is {}'
                    .format(self.alpha_2[self.best_alpha_2],
                            self.loss_lr_Tfidf_bal))
                candidates.append(
                    (self.loss_lr_Tfidf_bal, train_x_Tfidf, 'balanced',
                     self.alpha_2[self.best_alpha_2],
                     'Best Model choosen is Logistic Reg with balanced class and Tfidf features with Log loss {}',
                     'Tfidf'))
                self.log_writer.log(
                    self.file_object,
                    'Logistic Reg with balanced class and Tfidf features trainined successfully'
                )
            except Exception as e:
                self.log_writer.log(
                    self.file_object,
                    'Error %s occurred while training the Logistic Reg with balanced class and Tfidf features model '
                    % e)

            # Logistic Regression with Tfidf features and without class balancing
            try:
                self.alpha_3, self.best_alpha_3, self.loss_lr_Tfidf_Notbal = self._tune_alpha(
                    train_x_Tfidf, cv_x_Tfidf, test_x_Tfidf, y_train, y_cv,
                    y_test, None, 42, "for alpha =")
                self.log_writer.log(
                    self.file_object,
                    'For Logistic Reg without balanced class and Tfidf features the best alpha value is:{}, The Test loss is {}'
                    .format(self.alpha_3[self.best_alpha_3],
                            self.loss_lr_Tfidf_Notbal))
                candidates.append(
                    (self.loss_lr_Tfidf_Notbal, train_x_Tfidf, None,
                     self.alpha_3[self.best_alpha_3],
                     'Best Model choosen is Logistic Reg without balanced class and Tfidf features with Log loss {}',
                     'Tfidf'))
                self.log_writer.log(
                    self.file_object,
                    'Logistic Reg without balanced class and Tfidf features trainined successfully'
                )
            except Exception as e:
                self.log_writer.log(
                    self.file_object,
                    'Error %s occurred while training the Logistic Reg without balanced class and Tfidf features model '
                    % e)

            # finding the best model; argmin keeps the earliest candidate on
            # ties and raises (logged below) when every candidate failed.
            loss = [candidate[0] for candidate in candidates]
            winner = candidates[int(np.argmin(loss))]
            least_loss, train_x, class_weight, alpha, chosen_msg, prefix = winner

            # finally training the model with best parameters
            # NOTE(review): the final fit always uses random_state=42, also
            # for the OneHot candidate that was tuned with 123 -- kept as is.
            clf, sig_clf = self._fit_calibrated(train_x, y_train, alpha,
                                                class_weight, 42)
            self.log_writer.log(self.file_object,
                                chosen_msg.format(least_loss))

            # Readers only look at the first three lines of this file, so it
            # is overwritten rather than appended to.
            with open(self.path, 'w') as f:
                for part in ('Gene', 'Variation', 'Text'):
                    f.write('./CountVectorizer/%s%s.pkl' % (prefix, part) +
                            "\n")
            return sig_clf, clf
        except Exception as e:
            self.log_writer.log(
                self.file_object,
                'Error %s occurred while finding the best model' % e)
class post_training:
    """Save and load the trained models and featurise test data with the recorded vectorizers."""

    def __init__(self):
        self.log_writer = App_Logger()
        self.file_object = open('./Prediction_log/prediction.txt', 'a+')
        self.model_directory = './Model/'

    def save(self, calibrated_model, original_model):
        """
            Method Name: save
            Description: this method will save the best model that has been trained.
                         Both models are pickled into self.model_directory; a
                         failure of either dump is logged and not re-raised.
            Return :None

            Written by: Harsh Kumar
            Version: 1.0
            Revisions: None
        """
        self.best_model = calibrated_model
        self.feature_imp_model = original_model
        try:
            with open(self.model_directory + 'best_model' + '.pkl',
                      'wb') as f:
                pickle.dump(self.best_model, f)
        except Exception as e:
            self.log_writer.log(
                self.file_object,
                'Error %s occurred while saving the best model' % e)

        try:
            with open(self.model_directory + 'feature_imp_model' + '.pkl',
                      'wb') as f:
                pickle.dump(self.feature_imp_model, f)
        except Exception as e:
            self.log_writer.log(
                self.file_object,
                'Error %s occurred while saving the original model which will give the festure importance'
                % e)

    @staticmethod
    def _load_pickle(path_line):
        """Unpickle the file named by one line of the vectorizer record.

        Unpickling executes code from the file; only files written by this
        project's own training run should be loaded.
        """
        # Drop only the line terminator, so a last line without a newline
        # still yields the full path.
        with open(path_line.rstrip('\n'), 'rb') as f:
            return pickle.load(f)

    def load_model(self):
        """
            Method Name: load_model
            Description: this method will load the best model that has been trained.
            Return : (best model, feature importance model); None when
                     loading failed (the error is logged).

            Written by: Harsh Kumar
            Version: 1.0
            Revisions: None
        """
        self.path = './Model/'
        try:
            self.model = self._load_pickle(self.path + 'best_model' + '.pkl')
        except Exception as e:
            self.log_writer.log(
                self.file_object,
                'Error %s occurred while loading the best model' % e)

        try:
            self.feature_imp_model = self._load_pickle(
                self.path + 'feature_imp_model' + '.pkl')
            return self.model, self.feature_imp_model
        except Exception as e:
            self.log_writer.log(
                self.file_object,
                'Error %s occurred while loading the original model for feature importance'
                % e)

    @staticmethod
    def _stack_features(gene_features, variation_features, text_features):
        """Normalise the text block and join gene, variation and text column-wise."""
        text_features = normalize(text_features, axis=0)
        # np.hstack takes ONE sequence of arrays, hence the tuples.
        gene_var = np.hstack((gene_features, variation_features))
        return np.hstack((gene_var, text_features))

    def load_vectorizer(self, data):
        """
            Method Name: load_vectorizer
            Description: this method will load the vectorizer required for featuring our original features.
                         The first three lines of ./Save_Load/vectorizer.txt
                         name the pickled vocabularies for Gene, Variation and
                         TEXT; the first line also selects OneHot or Tfidf.
            Return : dense array with gene, variation and text features joined
                     column-wise; None when the record names neither encoding
                     or when any step raises (the error is logged).

            Written by: Harsh Kumar
            Version: 1.0
            Revisions: OneHot branch now stacks its features correctly.
        """
        self.path = './Save_Load/vectorizer.txt'
        try:
            with open(self.path, encoding='utf8') as f:
                path_1 = f.readline()
                path_2 = f.readline()
                path_3 = f.readline()

            if path_1[18:24] == 'OneHot':
                try:
                    # featuring the OneHot coding for test data point
                    gene_vectorizer = CountVectorizer(
                        vocabulary=self._load_pickle(path_1))
                    variation_vectorizer = CountVectorizer(
                        vocabulary=self._load_pickle(path_2))
                    text_vectorizer = CountVectorizer(
                        min_df=3, vocabulary=self._load_pickle(path_3))

                    test_gene_feature_onehotCoding = gene_vectorizer.fit_transform(
                        data['Gene']).toarray()
                    test_variation_feature_onehotCoding = variation_vectorizer.fit_transform(
                        data['Variation']).toarray()
                    test_text_feature_onehotCoding = text_vectorizer.fit_transform(
                        data['TEXT']).toarray()

                    test_x_onehotCoding = self._stack_features(
                        test_gene_feature_onehotCoding,
                        test_variation_feature_onehotCoding,
                        test_text_feature_onehotCoding)
                    self.log_writer.log(self.file_object,
                                        'Successfully featurized ')
                    return test_x_onehotCoding
                except Exception as e:
                    self.log_writer.log(
                        self.file_object,
                        'Error %s occurred while featurizing OneHotEncoding for the test data point'
                        % e)

            elif path_1[18:23] == 'Tfidf':
                try:
                    # featurizing the Tfidf coding for test data point
                    # NOTE(review): fit_transform is applied to the test data
                    # here; only the vocabulary comes from the pickles --
                    # confirm this matches how training weighted the terms.
                    gene_tfidf_vectorizer = TfidfVectorizer(
                        vocabulary=self._load_pickle(path_1))
                    variation_tfidf_vectorizer = TfidfVectorizer(
                        vocabulary=self._load_pickle(path_2))
                    text_tfidf_vectorizer = TfidfVectorizer(
                        vocabulary=self._load_pickle(path_3))

                    test_gene_feature_Tfidf = gene_tfidf_vectorizer.fit_transform(
                        data['Gene'].values).toarray()
                    test_variation_feature_Tfidf = variation_tfidf_vectorizer.fit_transform(
                        data['Variation'].values).toarray()
                    test_text_feature_Tfidf = text_tfidf_vectorizer.fit_transform(
                        data['TEXT'].values).toarray()

                    test_x_Tfidf = self._stack_features(
                        test_gene_feature_Tfidf,
                        test_variation_feature_Tfidf,
                        test_text_feature_Tfidf)
                    self.log_writer.log(self.file_object,
                                        'Successfully featurized ')
                    return test_x_Tfidf
                except Exception as e:
                    self.log_writer.log(
                        self.file_object,
                        'Error %s occurred while featurizing Tfidf for the test data point'
                        % e)
        except Exception as e:
            self.log_writer.log(
                self.file_object,
                'Error %s Occurred while preprocessing the test data point' %
                e)