class App_LoggerDB:

    def __init__(self, execution_id):
        self.mongoDBObject = MongodbOperation()
        self.azureBlobObject = AzureBlobManagement()
        self.execution_id = execution_id

    def log(self, database_name, collection_name, log_message):
        try:
            self.now = datetime.now()
            self.date = self.now.date()
            self.current_time = self.now.strftime("%H:%M:%S")
            log = {
                'Log_updated_date': self.now,
                'Log_update_time': self.current_time,
                'Log_message': log_message,
                'execution_id': self.execution_id
            }
            res = self.mongoDBObject.insertRecordInCollection(
                database_name, collection_name, log)
            if res > 0:
                return True
            else:
                # Database insert reported no inserted record: keep the log entry in a CSV on blob storage instead.
                log = {
                    'Log_updated_date': [self.now],
                    'Log_update_time': [self.current_time],
                    'Log_message': [log_message],
                    'execution_id': self.execution_id
                }
                self.azureBlobObject.saveDataFrametoCSV(
                    "db-fail-log", "log_" + self.execution_id,
                    pd.DataFrame(log), mode="a+", header=True)
                return True
        except Exception as e:
            # Database insert raised: append the record, with the error text attached, to the fallback CSV.
            log = {
                'Log_updated_date': [self.now],
                'Log_update_time': [self.current_time],
                'Log_message': [log_message],
                'execution_id': self.execution_id
            }
            log["Log_message"][0] = log["Log_message"][0] + str(e)
            self.azureBlobObject.saveDataFrametoCSV(
                "db-fail-log", "log_" + self.execution_id,
                pd.DataFrame(log), mode="a+", header=True)
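# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the project API): the fallback pattern that
# App_LoggerDB.log implements -- try to persist a log record to a database and,
# if that fails, append the same record to a CSV file. The insert callable and
# the fallback file name below are hypothetical stand-ins for
# MongodbOperation.insertRecordInCollection and AzureBlobManagement.
# ---------------------------------------------------------------------------
import os
from datetime import datetime

import pandas as pd


def log_with_fallback(insert_record, execution_id, message, fallback_csv="db-fail-log.csv"):
    """Persist a log record via insert_record(); append it to a CSV on failure."""
    now = datetime.now()
    record = {
        "Log_updated_date": now,
        "Log_update_time": now.strftime("%H:%M:%S"),
        "Log_message": message,
        "execution_id": execution_id,
    }
    try:
        if insert_record(record) > 0:       # insert_record returns the number of inserted records
            return True
    except Exception as exc:                # DB unreachable, auth error, etc.
        record["Log_message"] += " | " + str(exc)
    # Fallback: append the record as one CSV row, writing the header only for a new file.
    pd.DataFrame([record]).to_csv(fallback_csv, mode="a",
                                  header=not os.path.exists(fallback_csv), index=False)
    return True

# Example wiring with a stand-in insert function that always "fails":
# log_with_fallback(lambda rec: 0, "exec-001", "prediction started")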
class Prediction_Data_validation: """ This class shall be used for handling all the validation done on the Raw Prediction Data!!. Written By: iNeuron Intelligence Version: 1.0 Revisions: None """ def __init__(self, path, execution_id): self.Batch_Directory = path self.execution_id = execution_id self.collection_name = "strength_schema_prediction" #code added by Avnish yadav self.database_name = "Wafer-sys" #code added by Avnish yadav self.logger_db_writer = App_LoggerDB( execution_id=execution_id) #code added by Avnish yadav self.mongdb = MongodbOperation() self.az_blob_mgt = AzureBlobManagement() self.good_directory_path = "good-raw-file-prediction-validated" self.bad_directory_path = "bad-raw-file-prediction-validated" def valuesFromSchema(self): """ Method Name: valuesFromSchema Description: This method extracts all the relevant information from the pre-defined "Schema" file. Output: LengthOfDateStampInFile, LengthOfTimeStampInFile, column_names, Number of Columns On Failure: Raise ValueError,KeyError,Exception Written By: iNeuron Intelligence Version: 1.0 Revisions: None """ log_database = "strength_prediction_log" log_collection = "values_from_schema_validation" try: log_database = "wafer_prediction_log" log_collection = "values_from_schema_validation" df_schema_training = self.mongdb.getDataFrameofCollection( self.database_name, self.collection_name) dic = {} [ dic.update({i: df_schema_training.loc[0, i]}) for i in df_schema_training.columns ] del df_schema_training #with open(self.schema_path, 'r') as f: # dic = json.load(f) # f.close() pattern = dic['SampleFileName'] LengthOfDateStampInFile = dic['LengthOfDateStampInFile'] LengthOfTimeStampInFile = dic['LengthOfTimeStampInFile'] column_names = dic['ColName'] NumberofColumns = dic['NumberofColumns'] #file = open("Training_Logs/valuesfromSchemaValidationLog.txt", 'a+') message = "LengthOfDateStampInFile:: %s" % LengthOfDateStampInFile + "\t" + "LengthOfTimeStampInFile:: %s" % LengthOfTimeStampInFile + "\t " + "NumberofColumns:: %s" % NumberofColumns + "\n" self.logger_db_writer.log(log_database, log_collection, message) #file.close() except ValueError: self.logger_db_writer.log( log_database, log_collection, "KeyError:Key value error incorrect key passed") raise ValueError except KeyError: self.logger_db_writer.log( log_database, log_collection, "KeyError:Key value error incorrect key passed") raise KeyError except Exception as e: self.logger_db_writer.log(log_database, log_collection, str(e)) raise e return LengthOfDateStampInFile, LengthOfTimeStampInFile, column_names, NumberofColumns def manualRegexCreation(self): """ Method Name: manualRegexCreation Description: This method contains a manually defined regex based on the "FileName" given in "Schema" file. This Regex is used to validate the filename of the prediction data. Output: Regex pattern On Failure: None Written By: iNeuron Intelligence Version: 1.0 Revisions: None """ regex = "['cement_strength']+['\_'']+[\d_]+[\d]+\.csv" return regex def createDirectoryForGoodBadRawData(self): """ Method Name: createDirectoryForGoodBadRawData Description: This method creates directories to store the Good Data and Bad Data after validating the prediction data. 
Output: None On Failure: OSError Written By: iNeuron Intelligence Version: 1.0 Revisions: None """ log_database = "strength_prediction_log" log_collection = "general_log" try: log_database = "strength_prediction_log" log_collection = "general_log" self.az_blob_mgt.createDir(self.good_directory_path, is_replace=True) self.az_blob_mgt.createDir(self.bad_directory_path, is_replace=True) msg = self.good_directory_path + " and " + self.bad_directory_path + " created successfully." self.logger_db_writer.log(log_database, log_collection, msg) except Exception as e: msg = "Error Occured in class Prediction_Data_validation method:createDirectoryForGoodBadRawData error: Failed to create directory " + self.good_directory_path + " and " + self.bad_directory_path self.logger_db_writer.log(log_database, log_collection, msg) raise e def deleteExistingGoodDataTrainingFolder(self): """ Method Name: deleteExistingGoodDataTrainingFolder Description: This method deletes the directory made to store the Good Data after loading the data in the table. Once the good files are loaded in the DB,deleting the directory ensures space optimization. Output: None On Failure: OSError Written By: iNeuron Intelligence Version: 1.0 Revisions: None """ log_database = "strength_prediction_log" log_collection = "general_log" try: log_database = "strength_prediction_log" log_collection = "general_log" self.az_blob_mgt.deleteDir(self.good_directory_path) self.logger_db_writer.log( log_database, log_collection, self.good_directory_path + " deleted successfully!!") except Exception as e: msg = "Error Occured in class Raw_Data_validation method:deleteExistingGoodDataTrainingFolder Error occured while deleting :" + self.good_directory_path self.logger_db_writer.log(log_database, log_collection, msg) raise e def deleteExistingBadDataTrainingFolder(self): """ Method Name: deleteExistingBadDataTrainingFolder Description: This method deletes the directory made to store the bad Data. Output: None On Failure: OSError Written By: iNeuron Intelligence Version: 1.0 Revisions: None """ log_database = "strength_prediction_log" log_collection = "general_log" try: log_database = "strength_prediction_log" log_collection = "general_log" self.az_blob_mgt.deleteDir(self.bad_directory_path) self.logger_db_writer.log( log_database, log_collection, self.bad_directory_path + " deleted successfully!!") except Exception as e: msg = "Error Occured in class Raw_Data_validation method:deleteExistingGoodDataTrainingFolder Error occured while deleting :" + self.good_directory_path self.logger_db_writer.log(log_database, log_collection, msg) raise e def moveBadFilesToArchiveBad(self): """ Method Name: moveBadFilesToArchiveBad Description: This method deletes the directory made to store the Bad Data after moving the data in an archive folder. We archive the bad files to send them back to the client for invalid data issue. 
Output: None On Failure: OSError Written By: iNeuron Intelligence Version: 1.0 Revisions: None """ now = datetime.now() date = now.date() time = now.strftime("%H%M%S") log_database = "strength_prediction_log" log_collection = "general_log" try: log_database = "strength_prediction_log" log_collection = "general_log" # source = 'Training_Raw_files_validated/Bad_Raw/' source = self.bad_directory_path destination = "lap-" + self.execution_id self.logger_db_writer.log(log_database, log_collection, "Started moving bad raw data..") for file in self.az_blob_mgt.getAllFileNameFromDirectory(source): self.az_blob_mgt.moveFileinDir(source, destination, file) self.logger_db_writer.log( log_database, log_collection, "File:" + file + " moved to directory:" + destination + " successfully.") self.logger_db_writer.log( log_database, log_collection, "All bad raw file moved to directory:" + destination) self.az_blob_mgt.deleteDir(source) self.logger_db_writer.log(log_database, log_collection, "Deleting bad raw directory:" + source) except Exception as e: self.logger_db_writer.log( log_database, log_collection, "class Raw_Data_validation method:moveBadFilesToArchiveBad Error while moving bad files to archive:" + str(e)) raise e def validationFileNameRaw(self, regex, LengthOfDateStampInFile, LengthOfTimeStampInFile): """ Method Name: validationFileNameRaw Description: This function validates the name of the prediction csv file as per given name in the schema! Regex pattern is used to do the validation.If name format do not match the file is moved to Bad Raw Data folder else in Good raw data. Output: None On Failure: Exception Written By: iNeuron Intelligence Version: 1.0 Revisions: None """ # delete the directories for good and bad data in case last run was unsuccessful and folders were not deleted. self.createDirectoryForGoodBadRawData() onlyfiles = self.az_blob_mgt.getAllFileNameFromDirectory( self.Batch_Directory) try: log_database = "strength_prediction_log" log_collection = "name_validation_log" for filename in onlyfiles: if (re.match(regex, filename)): splitAtDot = re.split('.csv', filename) splitAtDot = (re.split('_', splitAtDot[0])) if len(splitAtDot[2]) == LengthOfDateStampInFile: if len(splitAtDot[3]) == LengthOfTimeStampInFile: self.az_blob_mgt.CopyFileinDir( self.Batch_Directory, self.good_directory_path, filename) self.logger_db_writer.log( log_database, log_collection, "Valid File name!! File moved to " + self.good_directory_path + filename) else: self.az_blob_mgt.CopyFileinDir( self.Batch_Directory, self.bad_directory_path, filename) msg = "Invalid File Name !! File moved to " + self.bad_directory_path + filename self.logger_db_writer.log(log_database, log_collection, msg) else: self.az_blob_mgt.CopyFileinDir(self.Batch_Directory, self.bad_directory_path, filename) msg = "Invalid File Name !! File moved to " + self.bad_directory_path + filename self.logger_db_writer.log(log_database, log_collection, msg) else: self.az_blob_mgt.CopyFileinDir(self.Batch_Directory, self.bad_directory_path, filename) msg = "Invalid File Name !! File moved to " + self.bad_directory_path + filename self.logger_db_writer.log(log_database, log_collection, msg) except Exception as e: msg = "Error occured while validating FileName " + str(e) self.logger_db_writer.log(log_database, log_collection, msg) raise e def validateColumnLength(self, NumberofColumns): """ Method Name: validateColumnLength Description: This function validates the number of columns in the csv files. It is should be same as given in the schema file. 
If not same file is not suitable for processing and thus is moved to Bad Raw Data folder. If the column number matches, file is kept in Good Raw Data for processing. The csv file is missing the first column name, this function changes the missing name to "Wafer". Output: None On Failure: Exception Written By: iNeuron Intelligence Version: 1.0 Revisions: None """ try: log_database = "strength_prediction_log" log_collection = "column_validation_log" self.logger_db_writer.log(log_database, log_collection, "Column length validation Started!!") #for file in listdir('Prediction_Raw_Files_Validated/Good_Raw/'): for file in self.az_blob_mgt.getAllFileNameFromDirectory( self.good_directory_path): #csv = pd.read_csv("Prediction_Raw_Files_Validated/Good_Raw/" + file) csv = self.az_blob_mgt.readCSVFilefromDir( self.good_directory_path, file) print(csv.shape) if csv.shape[1] == NumberofColumns: #csv.rename(columns={"Unnamed: 0": "Wafer"}, inplace=True) print(csv) #csv.to_csv("Prediction_Raw_Files_Validated/Good_Raw/" + file, index=None, header=True) self.az_blob_mgt.saveDataFrametoCSV( self.good_directory_path, file, csv, index=None, header=True) else: self.az_blob_mgt.moveFileinDir(self.good_directory_path, self.bad_directory_path, file) self.logger_db_writer.log( log_database, log_collection, "Invalid Column Length for the file!! " "File moved to Bad Raw Folder :: %s" % file) self.logger_db_writer.log(log_database, log_collection, "Column Length Validation Completed!!") except Exception as e: self.logger_db_writer.log(log_database, log_collection, 'Error Occured::' + str(e)) raise e def deletePredictionFile(self): try: log_database = "strenth_prediction_log" log_collection = "general_log" directory = "prediction-file" filename = "Prediction.csv" if directory in self.az_blob_mgt.dir_list: filenames = self.az_blob_mgt.getAllFileNameFromDirectory( directory_name=directory) if filename in filenames: self.az_blob_mgt.deleteFilefromDir( directory_name=directory, filename=filename) self.logger_db_writer.log( log_database, log_collection, filename + " is deleted from dir:" + directory + " successfully") except Exception as e: self.logger_db_writer.log( log_database, log_collection, "Error occure while deleting prediction file from prediction-file directory" + str(e)) raise e def validateMissingValuesInWholeColumn(self): """ Method Name: validateMissingValuesInWholeColumn Description: This function validates if any column in the csv file has all values missing. If all the values are missing, the file is not suitable for processing. SUch files are moved to bad raw data. 
Output: None On Failure: Exception Written By: iNeuron Intelligence Version: 1.0 Revisions: None """ try: log_database = "strength_prediction_log" log_collection = "missing_values_in_column" #f = open("Prediction_Logs/missingValuesInColumn.txt", 'a+') #self.logger.log(f, "Missing Values Validation Started!!") self.logger_db_writer.log(log_database, log_collection, "Missing Values Validation Started!!") #for file in listdir('Prediction_Raw_Files_Validated/Good_Raw/'): for file in self.az_blob_mgt.getAllFileNameFromDirectory( self.good_directory_path): #csv = pd.read_csv("Prediction_Raw_Files_Validated/Good_Raw/" + file) csv = self.az_blob_mgt.readCSVFilefromDir( self.good_directory_path, file) print(csv) count = 0 for columns in csv: if (len(csv[columns]) - csv[columns].count()) == len( csv[columns]): count += 1 #shutil.move("Prediction_Raw_Files_Validated/Good_Raw/" + file, # "Prediction_Raw_Files_Validated/Bad_Raw") self.az_blob_mgt.moveFileinDir( self.good_directory_path, self.bad_directory_path, file) #self.logger.log(f,"Invalid Column Length for the file!! File moved to Bad Raw Folder :: %s" % file) self.logger_db_writer.log( log_database, log_collection, "Invalid Column Length for the file!! File moved to Bad Raw Folder :: %s" % file) break if count == 0: csv.rename(columns={"Unnamed: 0": "Wafer"}, inplace=True) print("column unnamed may not be present") self.az_blob_mgt.saveDataFrametoCSV( self.good_directory_path, file, csv, index=None, header=True) #csv.to_csv("Prediction_Raw_Files_Validated/Good_Raw/" + file, index=None, header=True) except Exception as e: self.logger_db_writer.log(log_database, log_collection, "Error occured:" + str(e)) raise e
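# ---------------------------------------------------------------------------
# Illustrative standalone sketch of the file-name check performed by
# Prediction_Data_validation.validationFileNameRaw: the name must match the
# schema regex from manualRegexCreation() and its date/time stamps must have
# the expected lengths. The stamp lengths and sample names below are assumed
# for demonstration; the real values come from the schema collection.
# ---------------------------------------------------------------------------
import re

regex = "['cement_strength']+['\_'']+[\d_]+[\d]+\.csv"   # same pattern as manualRegexCreation()


def is_valid_prediction_filename(filename, date_len=8, time_len=6):
    if not re.match(regex, filename):
        return False
    parts = re.split('_', re.split('.csv', filename)[0])
    # parts ~ ['cement', 'strength', '<datestamp>', '<timestamp>']
    return len(parts) > 3 and len(parts[2]) == date_len and len(parts[3]) == time_len


print(is_valid_prediction_filename("cement_strength_08012020_120000.csv"))   # True  -> good-raw directory
print(is_valid_prediction_filename("cement_strength_0801_120000.csv"))       # False -> bad-raw directory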
class Raw_Data_validation: """ This class shall be used for handling all the validation done on the Raw Training Data!!. Written By: iNeuron Intelligence Version: 1.0 Revisions: None """ def __init__(self, path, execution_id): self.Batch_Directory = path self.execution_id = execution_id #self.schema_path = 'schema_training.json' self.collection_name = "schema-training" #code added by Avnish yadav self.database_name = "Wafer-sys" #code added by Avnish yadav self.logger_db_writer = App_LoggerDB(execution_id=execution_id) self.mongdb = MongodbOperation() self.az_blob_mgt = AzureBlobManagement() self.good_directory_path = "good-raw-file-train-validated" self.bad_directory_path = "bad-raw-file-train-validated" def valuesFromSchema(self): """ Method Name: valuesFromSchema Description: This method extracts all the relevant information from the pre-defined "Schema" file. Output: LengthOfDateStampInFile, LengthOfTimeStampInFile, column_names, Number of Columns On Failure: Raise ValueError,KeyError,Exception Written By: iNeuron Intelligence Version: 1.0 Revisions: None """ log_database = "wafer_training_log" log_collection = "values_from_schema_validation" try: #with open(self.schema_path, 'r') as f: # dic = json.load(f) # f.close() log_database = "wafer_training_log" log_collection = "values_from_schema_validation" df_schema_training = self.mongdb.getDataFrameofCollection( self.database_name, self.collection_name) dic = {} for i in df_schema_training.columns: dic.update({i: df_schema_training.loc[0, i]}) #[dic.update({i: df_schema_training.loc[0, i]}) for i in df_schema_training.columns] print(dic) del df_schema_training pattern = dic['SampleFileName'] LengthOfDateStampInFile = dic['LengthOfDateStampInFile'] LengthOfTimeStampInFile = dic['LengthOfTimeStampInFile'] column_names = dic['ColName'] NumberofColumns = dic['NumberofColumns'] #file = open("Training_Logs/valuesfromSchemaValidationLog.txt", 'a+') message = "LengthOfDateStampInFile:: %s" % LengthOfDateStampInFile + "\t" + "LengthOfTimeStampInFile:: %s" % LengthOfTimeStampInFile + "\t " + "NumberofColumns:: %s" % NumberofColumns + "\n" self.logger_db_writer.log(log_database, log_collection, message) #file.close() except ValueError: file = open("Training_Logs/valuesfromSchemaValidationLog.txt", 'a+') self.logger_db_writer.log( log_database, log_collection, "ValueError:Value not found inside schema_training.json") file.close() raise ValueError except KeyError: file = open("Training_Logs/valuesfromSchemaValidationLog.txt", 'a+') self.logger_db_writer.log( log_database, log_collection, "KeyError:Key value error incorrect key passed") file.close() raise KeyError except Exception as e: file = open("Training_Logs/valuesfromSchemaValidationLog.txt", 'a+') self.logger_db_writer.log(log_database, log_collection, str(e)) file.close() raise e return LengthOfDateStampInFile, LengthOfTimeStampInFile, column_names, NumberofColumns def manualRegexCreation(self): """ Method Name: manualRegexCreation Description: This method contains a manually defined regex based on the "FileName" given in "Schema" file. This Regex is used to validate the filename of the training data. Output: Regex pattern On Failure: None Written By: iNeuron Intelligence Version: 1.0 Revisions: None """ regex = "['wafer']+['\_'']+[\d_]+[\d]+\.csv" return regex def createDirectoryForGoodBadRawData(self): """ Method Name: createDirectoryForGoodBadRawData Description: This method creates directories to store the Good Data and Bad Data after validating the training data. 
Output: None On Failure: OSError Written By: iNeuron Intelligence Version: 1.0 Revisions: None """ """try: path = os.path.join("Training_Raw_files_validated/", "Good_Raw/") if not os.path.isdir(path): os.makedirs(path) path = os.path.join("Training_Raw_files_validated/", "Bad_Raw/") if not os.path.isdir(path): os.makedirs(path)""" log_database = "wafer_training_log" log_collection = "general_log" try: log_database = "wafer_training_log" log_collection = "general_log" self.az_blob_mgt.createDir(self.good_directory_path, is_replace=True) self.az_blob_mgt.createDir(self.bad_directory_path, is_replace=True) msg = self.good_directory_path + " and " + self.bad_directory_path + " created successfully." print(msg) self.logger_db_writer.log(log_database, log_collection, msg) except Exception as e: msg = "Error Occured in class Raw_Data_validation method:createDirectoryForGoodBadRawData error: Failed to create directory " + self.good_directory_path + " and " + self.bad_directory_path self.logger_db_writer.log(log_database, log_collection, msg) raise e def deleteExistingGoodDataTrainingFolder(self): """ Method Name: deleteExistingGoodDataTrainingFolder Description: This method deletes the directory made to store the Good Data after loading the data in the table. Once the good files are loaded in the DB,deleting the directory ensures space optimization. Output: None On Failure: OSError Written By: iNeuron Intelligence Version: 1.0 Revisions: None """ #try: #path = 'Training_Raw_files_validated/' ### if os.path.isdir("ids/" + userName): ## if os.path.isdir(path + 'Bad_Raw/'): ## shutil.rmtree(path + 'Bad_Raw/') #if os.path.isdir(path + 'Good_Raw/'): # shutil.rmtree(path + 'Good_Raw/') # file = open("Training_Logs/GeneralLog.txt", 'a+') # self.logger.log(file,"GoodRaw directory deleted successfully!!!") # file.close() log_database = "wafer_training_log" log_collection = "general_log" try: log_database = "wafer_training_log" log_collection = "general_log" self.az_blob_mgt.deleteDir(self.good_directory_path) self.logger_db_writer.log( log_database, log_collection, self.good_directory_path + " deleted successfully!!") except Exception as e: msg = "Error Occured in class Raw_Data_validation method:deleteExistingGoodDataTrainingFolder Error occured while deleting :" + self.good_directory_path self.logger_db_writer.log(log_database, log_collection, msg) raise e def deleteExistingBadDataTrainingFolder(self): """ Method Name: deleteExistingBadDataTrainingFolder Description: This method deletes the directory made to store the bad Data. 
Output: None On Failure: OSError Written By: iNeuron Intelligence Version: 1.0 Revisions: None """ #try: # # path = 'Training_Raw_files_validated/' # if os.path.isdir(path + 'Bad_Raw/'): # shutil.rmtree(path + 'Bad_Raw/') # file = open("Training_Logs/GeneralLog.txt", 'a+') # self.logger.log(file,"BadRaw directory deleted before starting validation!!!") # file.close() #except OSError as s: # file = open("Training_Logs/GeneralLog.txt", 'a+') # self.logger.log(file,"Error while Deleting Directory : %s" %s) # file.close() # raise OSError log_database = "wafer_training_log" log_collection = "general_log" try: log_database = "wafer_training_log" log_collection = "general_log" self.az_blob_mgt.deleteDir(self.bad_directory_path) self.logger_db_writer.log( log_database, log_collection, self.bad_directory_path + " deleted successfully!!") except Exception as e: msg = "Error Occured in class Raw_Data_validation method:deleteExistingGoodDataTrainingFolder Error occured while deleting :" + self.good_directory_path self.logger_db_writer.log(log_database, log_collection, msg) raise e def moveBadFilesToArchiveBad(self): """ Method Name: moveBadFilesToArchiveBad Description: This method deletes the directory made to store the Bad Data after moving the data in an archive folder. We archive the bad files to send them back to the client for invalid data issue. Output: None On Failure: OSError Written By: iNeuron Intelligence Version: 1.0 Revisions: None """ now = datetime.now() date = now.date() time = now.strftime("%H%M%S") #try: #source = 'Training_Raw_files_validated/Bad_Raw/' #if os.path.isdir(source): # path = "TrainingArchiveBadData" # if not os.path.isdir(path): # os.makedirs(path) # dest = 'TrainingArchiveBadData/BadData_' + str(date)+"_"+str(time) # if not os.path.isdir(dest): # os.makedirs(dest) # files = os.listdir(source) # for f in files: # if f not in os.listdir(dest): # shutil.move(source + f, dest) # file = open("Training_Logs/GeneralLog.txt", 'a+') # self.logger.log(file,"Bad files moved to archive") # path = 'Training_Raw_files_validated/' # if os.path.isdir(path + 'Bad_Raw/'): # shutil.rmtree(path + 'Bad_Raw/') # self.logger.log(file,"Bad Raw Data Folder Deleted successfully!!") # file.close() # log_database = "wafer_training_log" log_collection = "general_log" try: log_database = "wafer_training_log" log_collection = "general_log" # source = 'Training_Raw_files_validated/Bad_Raw/' source = self.bad_directory_path destination = "lat-" + self.execution_id self.logger_db_writer.log(log_database, log_collection, "Started moving bad raw data..") for file in self.az_blob_mgt.getAllFileNameFromDirectory(source): self.az_blob_mgt.moveFileinDir(source, destination, file) self.logger_db_writer.log( log_database, log_collection, "File:" + file + " moved to directory:" + destination + " successfully.") self.logger_db_writer.log( log_database, log_collection, "All bad raw file moved to directory:" + destination) self.az_blob_mgt.deleteDir(source) self.logger_db_writer.log(log_database, log_collection, "Deleting bad raw directory:" + source) except Exception as e: self.logger_db_writer.log( log_database, log_collection, "class Raw_Data_validation method:moveBadFilesToArchiveBad " "Error while moving bad files to archive:" + str(e)) raise e def validationFileNameRaw(self, regex, LengthOfDateStampInFile, LengthOfTimeStampInFile): """ Method Name: validationFileNameRaw Description: This function validates the name of the training csv files as per given name in the schema! 
Regex pattern is used to do the validation.If name format do not match the file is moved to Bad Raw Data folder else in Good raw data. Output: None On Failure: Exception Written By: iNeuron Intelligence Version: 1.0 Revisions: None """ #pattern = "['Wafer']+['\_'']+[\d_]+[\d]+\.csv" # delete the directories for good and bad data in case last run was unsuccessful and folders were not deleted. #self.deleteExistingBadDataTrainingFolder() #self.deleteExistingGoodDataTrainingFolder() #create new directories self.createDirectoryForGoodBadRawData() onlyfiles = self.az_blob_mgt.getAllFileNameFromDirectory( self.Batch_Directory) #onlyfiles = [f for f in listdir(self.Batch_Directory)] log_database = "wafer_training_log" log_collection = "name_validation_log" try: log_database = "wafer_training_log" log_collection = "name_validation_log" #f = open("Training_Logs/nameValidationLog.txt", 'a+') for filename in onlyfiles: if (re.match(regex, filename)): splitAtDot = re.split('.csv', filename) splitAtDot = (re.split('_', splitAtDot[0])) if len(splitAtDot[1]) == LengthOfDateStampInFile: if len(splitAtDot[2]) == LengthOfTimeStampInFile: #shutil.copy("Training_Batch_Files/" + filename, "Training_Raw_files_validated/Good_Raw") #self.logger.log(f,"Valid File name!! File moved to GoodRaw Folder :: %s" % filename) self.az_blob_mgt.CopyFileinDir( self.Batch_Directory, self.good_directory_path, filename) self.logger_db_writer.log( log_database, log_collection, "Valid File name!! File moved to " + self.good_directory_path + filename) else: #shutil.copy("Training_Batch_Files/" + filename, "Training_Raw_files_validated/Bad_Raw") #self.logger.log(f,"Invalid File Name!! File moved to Bad Raw Folder :: %s" % filename) self.az_blob_mgt.CopyFileinDir( self.Batch_Directory, self.bad_directory_path, filename) msg = "Invalid File Name !! File moved to " + self.bad_directory_path + filename self.logger_db_writer.log(log_database, log_collection, msg) else: #shutil.copy("Training_Batch_Files/" + filename, "Training_Raw_files_validated/Bad_Raw") #self.logger.log(f,"Invalid File Name!! File moved to Bad Raw Folder :: %s" % filename) self.az_blob_mgt.CopyFileinDir(self.Batch_Directory, self.bad_directory_path, filename) msg = "Invalid File Name !! File moved to " + self.bad_directory_path + filename self.logger_db_writer.log(log_database, log_collection, msg) else: #shutil.copy("Training_Batch_Files/" + filename, "Training_Raw_files_validated/Bad_Raw") #self.logger.log(f, "Invalid File Name!! File moved to Bad Raw Folder :: %s" % filename) self.az_blob_mgt.CopyFileinDir(self.Batch_Directory, self.bad_directory_path, filename) msg = "Invalid File Name !! File moved to " + self.bad_directory_path + filename self.logger_db_writer.log(log_database, log_collection, msg) # f.close() except Exception as e: msg = "Error occured while validating FileName " + str(e) self.logger_db_writer.log(log_database, log_collection, msg) raise e #f = open("Training_Logs/nameValidationLog.txt", 'a+') #self.logger.log(f, "Error occured while validating FileName %s" % e) #f.close() #raise e def validateColumnLength(self, NumberofColumns): """ Method Name: validateColumnLength Description: This function validates the number of columns in the csv files. It is should be same as given in the schema file. If not same file is not suitable for processing and thus is moved to Bad Raw Data folder. If the column number matches, file is kept in Good Raw Data for processing. 
The csv file is missing the first column name, this function changes the missing name to "Wafer". Output: None On Failure: Exception Written By: iNeuron Intelligence Version: 1.0 Revisions: None """ log_collection = "column_validation_log" log_database = "wafer_training_log" try: log_collection = "column_validation_log" log_database = "wafer_training_log" #f = open("Training_Logs/columnValidationLog.txt", 'a+') #self.logger.log(f,"Column Length Validation Started!!") self.logger_db_writer.log(log_database, log_collection, "Column Length Validation Started!!") print("column lenght validation started") for file in self.az_blob_mgt.getAllFileNameFromDirectory( self.good_directory_path): #csv = pd.read_csv("Training_Raw_files_validated/Good_Raw/" + file) print(file) csv = self.az_blob_mgt.readCSVFilefromDir( self.good_directory_path, file) print(csv.shape) if csv.shape[1] == NumberofColumns: csv.rename(columns={"Unnamed: 0": "Wafer"}, inplace=True) print("pass statement ") pass #for file in listdir('Training_Raw_files_validated/Good_Raw/'): # csv = pd.read_csv("Training_Raw_files_validated/Good_Raw/" + file) # if csv.shape[1] == NumberofColumns: # # pass else: self.az_blob_mgt.moveFileinDir(self.good_directory_path, self.bad_directory_path, file) msg = "Invalid Column Length for the file!! File moved to " + self.bad_directory_path + "file:" + file self.logger_db_writer.log(log_database, log_collection, msg) self.logger_db_writer.log( log_database, log_collection, "Column Length Validation Completed!!") except Exception as e: self.logger_db_writer.log( log_database, log_collection, "Error Occured in class Raw_Data_validation method: validateColumnLength error:" + str(e)) raise e # shutil.move("Training_Raw_files_validated/Good_Raw/" + file, "Training_Raw_files_validated/Bad_Raw") # # self.logger.log(f, "Invalid Column Length for the file!! File moved to Bad Raw Folder :: %s" % file) #self.logger.log(f, "Column Length Validation Completed!!") #except OSError: # f = open("Training_Logs/columnValidationLog.txt", 'a+') # self.logger.log(f, "Error Occured while moving the file :: %s" % OSError) # f.close() # raise OSError #except Exception as e: # f = open("Training_Logs/columnValidationLog.txt", 'a+') # self.logger.log(f, "Error Occured:: %s" % e) # f.close() # # raise e #f.close() def validateMissingValuesInWholeColumn(self): """ Method Name: validateMissingValuesInWholeColumn Description: This function validates if any column in the csv file has all values missing. If all the values are missing, the file is not suitable for processing. SUch files are moved to bad raw data. 
Output: None On Failure: Exception Written By: iNeuron Intelligence Version: 1.0 Revisions: None """ log_database = "wafer_training_log" log_collection = "missing_values_in_column" try: log_database = "wafer_training_log" log_collection = "missing_values_in_column" self.logger_db_writer.log(log_database, log_collection, "Missing Values Validation Started!!") #f = open("Training_Logs/missingValuesInColumn.txt", 'a+') #self.logger.log(f,"Missing Values Validation Started!!") #for file in listdir('Training_Raw_files_validated/Good_Raw/'): # csv = pd.read_csv("Training_Raw_files_validated/Good_Raw/" + file) for file in self.az_blob_mgt.getAllFileNameFromDirectory( self.good_directory_path): csv = self.az_blob_mgt.readCSVFilefromDir( self.good_directory_path, file) print(csv) count = 0 for columns in csv: print(columns) if (len(csv[columns]) - csv[columns].count()) == len( csv[columns]): count += 1 #shutil.move("Training_Raw_files_validated/Good_Raw/" + file, # "Training_Raw_files_validated/Bad_Raw") #self.logger.log(f,"Invalid Column Length for the file!! File moved to Bad Raw Folder :: %s" % file) self.az_blob_mgt.moveFileinDir( self.good_directory_path, self.bad_directory_path, file) msg = "Invalid Column Length for the file!! File moved to " + self.bad_directory_path + ":: %s" % file self.logger_db_writer.log(log_database, log_collection, msg) break if count == 0: print("entering rename") csv.rename(columns={"Unnamed: 0": "Wafer"}, inplace=True) self.az_blob_mgt.saveDataFrametoCSV( self.good_directory_path, file, csv, index=None, header=True) #csv.to_csv("Training_Raw_files_validated/Good_Raw/" + file, index=None, header=True) #except OSError: # f = open("Training_Logs/missingValuesInColumn.txt", 'a+') # self.logger.log(f, "Error Occured while moving the file :: %s" % OSError) # f.close() # raise OSError except Exception as e: # f = open("Training_Logs/missingValuesInColumn.txt", 'a+') # # self.logger.log(f, "Error Occured:: %s" % e) # f.close() # raise e #f.close() self.logger_db_writer.log( log_database, log_collection, "Error Occured class:Raw_Data_validation method:validateMissingValuesInWholeColumn error:" + str(e)) raise e #path = 'training-batch-files' #noofcolumns=592 #a=Raw_Data_validation #res=a.validateMissingValuesInWholeColumn(a(path,333)) #print(res)
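# ---------------------------------------------------------------------------
# Illustrative standalone sketch of the check done by
# validateMissingValuesInWholeColumn: a file is rejected when any column is
# entirely missing. The tiny DataFrame below is made-up demo data.
# ---------------------------------------------------------------------------
import numpy as np
import pandas as pd


def has_fully_missing_column(df: pd.DataFrame) -> bool:
    """Return True if at least one column contains no non-null values."""
    return bool((df.notna().sum() == 0).any())


demo = pd.DataFrame({"Wafer": ["w1", "w2"],
                     "sensor_1": [0.4, np.nan],
                     "sensor_2": [np.nan, np.nan]})
print(has_fully_missing_column(demo))                           # True  -> move file to the bad-raw directory
print(has_fully_missing_column(demo.drop(columns="sensor_2")))  # False -> file stays in the good-raw directory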
class Preprocessor: """ This class shall be used to clean and transform the data before training. Written By: iNeuron Intelligence Version: 1.0 Revisions: None """ def __init__(self, log_database, log_collection, execution_id): self.log_database = log_database self.log_collection = log_collection self.execution_id = execution_id self.mongoDBObject = MongodbOperation() self.log_db_writer = App_LoggerDB(execution_id=execution_id) self.az_blob_mgt = AzureBlobManagement() def remove_columns(self, data, columns): """ Method Name: remove_columns Description: This method removes the given columns from a pandas dataframe. Output: A pandas DataFrame after removing the specified columns. On Failure: Raise Exception Written By: iNeuron Intelligence Version: 1.0 Revisions: None """ self.log_db_writer.log( self.log_database, self.log_collection, "Entered the remove_columns method of the Preprocessor class") self.data = data self.columns = columns try: self.useful_data = self.data.drop( labels=self.columns, axis=1) # drop the labels specified in the columns print("Useful data in dataframe") print(self.useful_data) self.log_db_writer.log( self.log_database, self.log_collection, "Column removal Successful.Exited the " "remove_columns method of the Preprocessor class") return self.useful_data except Exception as e: self.log_db_writer.log( self.log_database, self.log_collection, "Exception occured in remove_columns method" " of the Preprocessor class. Exception message: " + str(e)) self.log_db_writer.log( self.log_database, self.log_collection, 'Column removal Unsuccessful. Exited the ' 'remove_columns method of the Preprocessor class') raise Exception() def separate_label_feature(self, data, label_column_name): """ Method Name: separate_label_feature Description: This method separates the features and a Label Coulmns. Output: Returns two separate Dataframes, one containing features and the other containing Labels . On Failure: Raise Exception Written By: iNeuron Intelligence Version: 1.0 Revisions: None """ self.log_db_writer.log( self.log_database, self.log_collection, 'Entered the separate_label_feature method of the Preprocessor class' ) try: self.X = data.drop( labels=label_column_name, axis=1 ) # drop the columns specified ,i,e output column and separate the feature columns self.Y = data[label_column_name] # Filter the Label columns self.log_db_writer.log( self.log_database, self.log_collection, 'Label Separation Successful. Exited the separate_label_feature method of the Preprocessor class' ) print(self.X, self.Y) return self.X, self.Y except Exception as e: self.log_db_writer.log( self.log_database, self.log_collection, 'Exception occured in separate_label_feature method of the Preprocessor class. Exception message: ' + str(e)) self.log_db_writer.log( self.log_database, self.log_collection, 'Label Separation Unsuccessful. Exited the separate_label_feature' ' method of the Preprocessor class') raise Exception() def is_null_present(self, data): """ Method Name: is_null_present Description: This method checks whether there are null values present in the pandas Dataframe or not. Output: Returns a Boolean Value. True if null values are present in the DataFrame, False if they are not present. 
On Failure: Raise Exception Written By: iNeuron Intelligence Version: 1.0 Revisions: None """ self.log_db_writer.log( self.log_database, self.log_collection, 'Entered the is_null_present method of the Preprocessor class') self.null_present = False try: self.null_counts = data.isna().sum( ) # check for the count of null values per column for i in self.null_counts: if i > 0: self.null_present = True break if (self.null_present ): # write the logs to see which columns have null values dataframe_with_null = pd.DataFrame() dataframe_with_null['columns'] = data.columns dataframe_with_null['missing values count'] = np.asarray( data.isna().sum()) print(dataframe_with_null) #dataframe_with_null.to_csv('preprocessing_data/null_values.csv') # storing the null column information to file self.az_blob_mgt.saveDataFrametoCSV( "preprocessing-data", "null_values.csv", data_frame=dataframe_with_null) self.log_db_writer.log( self.log_database, self.log_collection, 'Finding missing values is a success.Data written' '[preprocessing-data] to the null values file. ' 'Exited the is_null_present method of the Preprocessor class') return self.null_present except Exception as e: self.log_db_writer.log( self.log_database, self.log_collection, 'Exception occured in is_null_present method of the Preprocessor class. Exception message: ' + str(e)) self.log_db_writer.log( self.log_database, self.log_collection, 'Finding missing values failed. Exited the is_null_present method of the Preprocessor class' ) raise Exception() def impute_missing_values(self, data): """ Method Name: impute_missing_values Description: This method replaces all the missing values in the Dataframe using KNN Imputer. Output: A Dataframe which has all the missing values imputed. On Failure: Raise Exception Written By: iNeuron Intelligence Version: 1.0 Revisions: None """ self.log_db_writer.log( self.log_database, self.log_collection, 'Entered the impute_missing_values method of the Preprocessor class' ) self.data = data try: imputer = KNNImputer(n_neighbors=3, weights='uniform', missing_values=np.nan) self.new_array = imputer.fit_transform( self.data) # impute the missing values # convert the nd-array returned in the step above to a Dataframe self.new_data = pd.DataFrame(data=self.new_array, columns=self.data.columns) self.log_db_writer.log( self.log_database, self.log_collection, 'Imputing missing values Successful. Exited the impute_missing_values method of the Preprocessor class' ) return self.new_data except Exception as e: self.log_db_writer.log( self.log_database, self.log_collection, 'Exception occured in impute_missing_values method of the Preprocessor class. Exception message: ' + str(e)) self.log_db_writer.log( self.log_database, self.log_collection, 'Imputing missing values failed. Exited the impute_missing_values method of the Preprocessor class' ) raise Exception() def get_columns_with_zero_std_deviation(self, data): """ Method Name: get_columns_with_zero_std_deviation Description: This method finds out the columns which have a standard deviation of zero. 
Output: List of the columns with standard deviation of zero On Failure: Raise Exception Written By: iNeuron Intelligence Version: 1.0 Revisions: None """ self.log_db_writer.log( self.log_database, self.log_collection, 'Entered the get_columns_with_zero_std_deviation method of the Preprocessor class' ) self.columns = data.columns self.data_n = data.describe() self.col_to_drop = [] try: for x in self.columns: if (self.data_n[x]['std'] == 0 ): # check if standard deviation is zero self.col_to_drop.append( x ) # prepare the list of columns with standard deviation zero self.log_db_writer.log( self.log_database, self.log_collection, 'Column search for Standard Deviation of Zero Successful. Exited the get_columns_with_zero_std_deviation method of the Preprocessor class' ) print(self.col_to_drop) return self.col_to_drop except Exception as e: self.log_db_writer.log( self.log_database, self.log_collection, 'Exception occured in get_columns_with_zero_std_deviation method of the Preprocessor class. Exception message: ' + str(e)) self.log_db_writer.log( self.log_database, self.log_collection, 'Column search for Standard Deviation of Zero Failed. Exited the get_columns_with_zero_std_deviation method of the Preprocessor class' ) raise Exception()
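# ---------------------------------------------------------------------------
# Illustrative standalone sketch of get_columns_with_zero_std_deviation:
# df.describe() exposes a 'std' row, and any numeric column whose standard
# deviation is zero carries no information for training and can be dropped.
# Demo data is made up.
# ---------------------------------------------------------------------------
import pandas as pd


def columns_with_zero_std(df: pd.DataFrame) -> list:
    stats = df.describe()
    return [col for col in stats.columns if stats[col]['std'] == 0]


demo = pd.DataFrame({"a": [1, 2, 3], "b": [5, 5, 5], "c": [0.1, 0.2, 0.3]})
print(columns_with_zero_std(demo))   # ['b']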
class Preprocessor: """ This class shall be used to clean and transform the data before training. Written By: iNeuron Intelligence Version: 1.0 Revisions: None """ def __init__(self, log_database, log_collection, execution_id): #self.file_object = file_object #self.logger_object = logger_object self.log_database = log_database self.log_collection = log_collection #self.execution_id=execution_id self.mongoDBObject = MongodbOperation() self.log_db_writer = App_LoggerDB(execution_id=execution_id) self.az_blob_mgt = AzureBlobManagement() def remove_columns(self, data, columns): """ Method Name: remove_columns Description: This method removes the given columns from a pandas dataframe. Output: A pandas DataFrame after removing the specified columns. On Failure: Raise Exception Written By: iNeuron Intelligence Version: 1.0 Revisions: None """ self.log_db_writer.log( self.log_database, self.log_collection, 'Entered the remove_columns method of the Preprocessor class') self.data = data self.columns = columns try: self.useful_data = self.data.drop( labels=self.columns, axis=1) # drop the labels specified in the columns self.log_db_writer.log( self.log_database, self.log_collection, 'Column removal Successful.Exited the remove_columns method of the Preprocessor class' ) return self.useful_data except Exception as e: self.log_db_writer.log( self.log_database, self.log_collection, 'Exception occured in remove_columns method of the Preprocessor class. Exception message: ' + str(e)) self.log_db_writer.log( self.log_database, self.log_collection, 'Column removal Unsuccessful. Exited the remove_columns method of the Preprocessor class' ) raise Exception() def separate_label_feature(self, data, label_column_name): """ Method Name: separate_label_feature Description: This method separates the features and a Label Coulmns. Output: Returns two separate Dataframes, one containing features and the other containing Labels . On Failure: Raise Exception Written By: iNeuron Intelligence Version: 1.0 Revisions: None """ self.log_db_writer.log( self.log_database, self.log_collection, 'Entered the separate_label_feature method of the Preprocessor class' ) try: self.X = data.drop( labels=label_column_name, axis=1 ) # drop the columns specified and separate the feature columns self.Y = data[label_column_name] # Filter the Label columns self.log_db_writer.log( self.log_database, self.log_collection, 'Label Separation Successful. Exited the separate_label_feature method of the Preprocessor class' ) return self.X, self.Y except Exception as e: self.log_db_writer.log( self.log_database, self.log_collection, 'Exception occured in separate_label_feature method of the Preprocessor class. Exception message: ' + str(e)) self.log_db_writer.log( self.log_database, self.log_collection, 'Label Separation Unsuccessful. Exited the separate_label_feature method of the Preprocessor class' ) raise Exception() def dropUnnecessaryColumns(self, data, columnNameList): """ Method Name: is_null_present Description: This method drops the unwanted columns as discussed in EDA section. Written By: iNeuron Intelligence Version: 1.0 Revisions: None """ data = data.drop(columnNameList, axis=1) return data def replaceInvalidValuesWithNull(self, data): """ Method Name: is_null_present Description: This method replaces invalid values i.e. '?' with null, as discussed in EDA. 
Written By: iNeuron Intelligence Version: 1.0 Revisions: None """ for column in data.columns: count = data[column][data[column] == '?'].count() if count != 0: data[column] = data[column].replace('?', np.nan) return data def is_null_present(self, data): """ Method Name: is_null_present Description: This method checks whether there are null values present in the pandas Dataframe or not. Output: Returns True if null values are present in the DataFrame, False if they are not present and returns the list of columns for which null values are present. On Failure: Raise Exception Written By: iNeuron Intelligence Version: 1.0 Revisions: None """ self.log_db_writer.log( self.log_database, self.log_collection, 'Entered the is_null_present method of the Preprocessor class') self.null_present = False self.cols_with_missing_values = [] self.cols = data.columns try: self.null_counts = data.isna().sum( ) # check for the count of null values per column for i in range(len(self.null_counts)): if self.null_counts[i] > 0: self.null_present = True self.cols_with_missing_values.append(self.cols[i]) if (self.null_present ): # write the logs to see which columns have null values dataframe_with_null = pd.DataFrame() dataframe_with_null['columns'] = data.columns dataframe_with_null['missing values count'] = np.asarray( data.isna().sum()) print(dataframe_with_null) #dataframe_with_null.to_csv('preprocessing_data/null_values.csv') # storing the null column information to file self.az_blob_mgt.saveDataFrametoCSV( "preprocessing-data", "null_values.csv", data_frame=dataframe_with_null) self.log_db_writer.log( self.log_database, self.log_collection, 'Finding missing values is a success.Data written to the null values file. Exited the is_null_present method of the Preprocessor class' ) return self.null_present, self.cols_with_missing_values except Exception as e: self.log_db_writer.log( self.log_database, self.log_collection, 'Exception occured in is_null_present method of the Preprocessor class. Exception message: ' + str(e)) self.log_db_writer.log( self.log_database, self.log_collection, 'Finding missing values failed. Exited the is_null_present method of the Preprocessor class' ) raise Exception() def encodeCategoricalValues(self, data): """ Method Name: encodeCategoricalValues Description: This method encodes all the categorical values in the training set. Output: A Dataframe which has all the categorical values encoded. On Failure: Raise Exception Written By: iNeuron Intelligence Version: 1.0 Revisions: None """ data["class"] = data["class"].map({'p': 1, 'e': 2}) for column in data.drop(['class'], axis=1).columns: data = pd.get_dummies(data, columns=[column]) return data def encodeCategoricalValuesPrediction(self, data): """ Method Name: encodeCategoricalValuesPrediction Description: This method encodes all the categorical values in the prediction set. Output: A Dataframe which has all the categorical values encoded. On Failure: Raise Exception Written By: iNeuron Intelligence Version: 1.0 Revisions: None """ for column in data.columns: data = pd.get_dummies(data, columns=[column]) return data # def handleImbalanceDataset(self,X,Y): # """ # Method Name: handleImbalanceDataset # Description: This method handles the imbalance in the dataset by oversampling. # Output: A Dataframe which is balanced now. 
# On Failure: Raise Exception # # Written By: iNeuron Intelligence # Version: 1.0 # Revisions: None # """ # # # # rdsmple = RandomOverSampler() # x_sampled, y_sampled = rdsmple.fit_sample(X, Y) # # return x_sampled,y_sampled def standardScalingData(self, X): scalar = StandardScaler() X_scaled = scalar.fit_transform(X) return X_scaled def logTransformation(self, X): for column in X.columns: X[column] += 1 X[column] = np.log(X[column]) return X def impute_missing_values(self, data): """ Method Name: impute_missing_values Description: This method replaces all the missing values in the Dataframe using KNN Imputer. Output: A Dataframe which has all the missing values imputed. On Failure: Raise Exception Written By: iNeuron Intelligence Version: 1.0 Revisions: None """ self.log_db_writer.log( self.log_database, self.log_collection, 'Entered the impute_missing_values method of the Preprocessor class' ) self.data = data try: imputer = KNNImputer(n_neighbors=3, weights='uniform', missing_values=np.nan) self.new_array = imputer.fit_transform( self.data) # impute the missing values # convert the nd-array returned in the step above to a Dataframe self.new_data = pd.DataFrame(data=(self.new_array), columns=self.data.columns) self.log_db_writer.log( self.log_database, self.log_collection, 'Imputing missing values Successful. Exited the impute_missing_values method of the Preprocessor class' ) print(self.new_data) return self.new_data except Exception as e: self.log_db_writer.log( self.log_database, self.log_collection, 'Exception occured in impute_missing_values method of the Preprocessor class. Exception message: ' + str(e)) self.log_db_writer.log( self.log_database, self.log_collection, 'Imputing missing values failed. Exited the impute_missing_values method of the Preprocessor class' ) raise Exception() def get_columns_with_zero_std_deviation(self, data): """ Method Name: get_columns_with_zero_std_deviation Description: This method finds out the columns which have a standard deviation of zero. Output: List of the columns with standard deviation of zero On Failure: Raise Exception Written By: iNeuron Intelligence Version: 1.0 Revisions: None """ self.log_db_writer.log( self.log_database, self.log_collection, 'Entered the get_columns_with_zero_std_deviation method of the Preprocessor class' ) self.columns = data.columns self.data_n = data.describe() self.col_to_drop = [] try: for x in self.columns: if (self.data_n[x]['std'] == 0 ): # check if standard deviation is zero self.col_to_drop.append( x ) # prepare the list of columns with standard deviation zero self.log_db_writer.log( self.log_database, self.log_collection, 'Column search for Standard Deviation of Zero Successful. Exited the get_columns_with_zero_std_deviation method of the Preprocessor class' ) print(self.col_to_drop) return self.col_to_drop except Exception as e: self.log_db_writer.log( self.log_database, self.log_collection, 'Exception occured in get_columns_with_zero_std_deviation method of the Preprocessor class. Exception message: ' + str(e)) self.log_db_writer.log( self.log_database, self.log_collection, 'Column search for Standard Deviation of Zero Failed. Exited the get_columns_with_zero_std_deviation method of the Preprocessor class' ) raise Exception()
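# ---------------------------------------------------------------------------
# Illustrative standalone sketch of the imputation step used by
# Preprocessor.impute_missing_values: scikit-learn's KNNImputer fills NaNs from
# the 3 nearest rows, and the ndarray it returns is wrapped back into a
# DataFrame so the original column names are preserved. Demo data is made up.
# ---------------------------------------------------------------------------
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer

demo = pd.DataFrame({"cement": [540.0, 540.0, np.nan, 332.5],
                     "water": [162.0, np.nan, 228.0, 192.0]})
imputer = KNNImputer(n_neighbors=3, weights="uniform", missing_values=np.nan)
imputed = pd.DataFrame(imputer.fit_transform(demo), columns=demo.columns)
print(imputed)   # no NaN values remain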
class DbOperationMongoDB: """ This class shall be used for handling all the mongodb operations. Written By: iNeuron Intelligence Version: 1.0 Revisions: None """ def __init__(self, execution_id): self.mongodb=MongodbOperation() self.az_blob_mgt=AzureBlobManagement() self.logger_db_writer=App_LoggerDB(execution_id=execution_id) self.good_file_path="good-raw-file-prediction-validated" self.bad_file_path="bad-raw-file-prediction-validated" def insertIntoTableGoodData(self,column_names): """ Description: Load all csv file into mongo db database "prediction_database" ,collection:"Good_Raw_Data" from azure storage -good data and clear the storage space. :return: """ try: prediction_database="prediction_database" # mongodb name prediction_collection="Good_Raw_Data" # mongodb name database_name = "wafer_prediction_log" ## logger name collection_name = "db_insert_log" ## logger name self.mongodb.dropCollection(prediction_database,prediction_collection) self.logger_db_writer.log(database_name,collection_name,"Droping collection:"+prediction_collection+" from database:"+prediction_database) self.logger_db_writer.log(database_name, collection_name,"Starting loading of good files in database:training_database and collection: Good_Raw_Data") files = self.az_blob_mgt.getAllFileNameFromDirectory(self.good_file_path) self.logger_db_writer.log(database_name, collection_name,"No of file found in good-raw-file-train-validated " + str(len(files))) for file in files: try: self.logger_db_writer.log(database_name, collection_name, "Insertion of file +" + file + " started...") df = self.az_blob_mgt.readCSVFilefromDir(self.good_file_path, file) df.columns=column_names print("dataframe before insertion") print(df) self.mongodb.insertDataFrame(prediction_database, prediction_collection, df) self.logger_db_writer.log(database_name, collection_name, "File: {0} loaded successfully".format(file)) except Exception as e: self.logger_db_writer.log(database_name, collection_name, str(e)) self.az_blob_mgt.moveFileinDir(self.good_file_path, self.bad_file_path, file) self.logger_db_writer.log(database_name, collection_name, "File: " + file + " was not loaded successfully hence moved to dir:" + self.bad_file_path) except Exception as e: error_message = "Error occured in class:DbOperationMongoDB method:insertIntoTableGoodData error:" + str(e) self.logger_db_writer.log(database_name, collection_name, error_message) def selectingDatafromtableintocsv(self,): """ :return: """ try: directory_name="prediction-file-from-db" # azure storage name file_name="InputFile.csv" # azure storage name database_name = "wafer_prediction_log" # logger name collection_name = "export_to_csv" # logger name prediction_database="prediction_database" # mongodb name prediction_collection="Good_Raw_Data" # mongodb name msg="starting of loading of database:"+prediction_database+",collection:"+prediction_collection+" records into file:"+file_name self.logger_db_writer.log(database_name,collection_name,msg) df=self.mongodb.getDataFrameofCollection(prediction_database,prediction_collection) print("after dataframe from db extraction") print(df) msg="Good_Raw_data has been loaded into pandas dataframe" print(msg) self.logger_db_writer.log(database_name,collection_name,msg) self.az_blob_mgt.saveDataFrametoCSV(directory_name,file_name,df,index=None,header=True) # since the inputFile.csv has unammed column, added index =0 later removed as unnamed 0.1 is present in predictionDatavalidation.py ln 410. 
msg = "InputFile.csv created successfully in directory"+directory_name print(msg) self.logger_db_writer.log(database_name, collection_name, msg) except Exception as e: msg="Error occured in class:DbOperationMongoDB method:insertIntoTableGoodData error:"+str(e) self.logger_db_writer.log(database_name,collection_name,msg)
class DbOperationMongoDB: """ This class shall be used for handling all the SQL operations. Written By: iNeuron Intelligence Version: 1.0 Revisions: None """ def __init__(self, execution_id): self.mongodb = MongodbOperation() self.az_blob_mgt = AzureBlobManagement() self.logger_db_writer = App_LoggerDB(execution_id=execution_id) self.good_file_path = "good-raw-file-train-validated" self.bad_file_path = "bad-raw-file-train-validated" def insertIntoTableGoodData(self, column_name): """ Description: Load all csv file into mongo db database "training_database" ,collection:"Good_Raw_Data" :return: """ database_name = "wafer_training_log" collection_name = "db_insert_log" try: database_name = "wafer_training_log" # logger name collection_name = "db_insert_log" # logger name self.logger_db_writer.log( database_name, collection_name, "Droping existing collection if present in database training_database" ) self.mongodb.dropCollection("training_database", "Good_Raw_Data") self.logger_db_writer.log( database_name, collection_name, "Starting loading of good files in database:training_database and collection: Good_Raw_Data" ) files = self.az_blob_mgt.getAllFileNameFromDirectory( self.good_file_path) self.logger_db_writer.log( database_name, collection_name, "No of file found in good-raw-file-train-validated " + str(len(files))) for file in files: try: self.logger_db_writer.log( database_name, collection_name, "Insertion of file " + file + " started...") df = self.az_blob_mgt.readCSVFilefromDir( self.good_file_path, file) df.columns = column_name self.mongodb.insertDataFrame("training_database", "Good_Raw_Data", df) self.logger_db_writer.log( database_name, collection_name, "File: {0} loaded successfully".format(file)) except Exception as e: self.logger_db_writer.log(database_name, collection_name, str(e)) self.az_blob_mgt.moveFileinDir(self.good_file_path, self.bad_file_path, file) self.logger_db_writer.log( database_name, collection_name, "File " + file + " was not loaded successfully hence moved tp dir:" + self.bad_file_path) except Exception as e: error_message = "Error occured in class:DbOperationMongoDB method:insertIntoTableGoodData error:" + str( e) self.logger_db_writer.log(database_name, collection_name, error_message) def selectingDatafromtableintocsv(self, ): """ :return: """ database_name = "wafer_training_log" collection_name = "export_to_csv" try: directory_name = "training-file-from-db" file_name = "InputFile" training_database = "training_database" training_collection = "Good_Raw_Data" msg = "starting of loading of database:training_database,collection:Good_Raw_Data records into InputFile.csv" print(msg) self.logger_db_writer.log(database_name, collection_name, msg) df = self.mongodb.getDataFrameofCollection(training_database, training_collection) print(df) msg = "Good_Raw_data has been loaded into pandas dataframe" self.logger_db_writer.log(database_name, collection_name, msg) self.az_blob_mgt.saveDataFrametoCSV(directory_name, file_name, df) msg = "InputFile.csv created successfully in directory " + directory_name self.logger_db_writer.log(database_name, collection_name, msg) except Exception as e: msg = "Error occured in class:DbOperationMongoDB method:insertIntoTableGoodData error:" + str( e) self.logger_db_writer.log(database_name, collection_name, msg)
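# ---------------------------------------------------------------------------
# Illustrative sketch of the per-file loop insertIntoTableGoodData runs on the
# training side: every CSV in the good-raw directory is pushed to MongoDB, and
# a file that fails is moved to the bad-raw directory instead of aborting the
# whole batch. list_files / read_csv / insert / move_to_bad are hypothetical
# stand-ins for the AzureBlobManagement and MongodbOperation methods used above.
# ---------------------------------------------------------------------------
def load_good_files(list_files, read_csv, insert, move_to_bad, column_names, log):
    for name in list_files():
        try:
            df = read_csv(name)
            df.columns = column_names        # schema column names replace the raw header
            insert(df)
            log("File: {0} loaded successfully".format(name))
        except Exception as exc:
            log(str(exc))
            move_to_bad(name)                # quarantine the file rather than stop the batch

# Example wiring with in-memory stand-ins:
# load_good_files(lambda: ["wafer_08012020_120000.csv"], my_reader, my_insert, my_mover, cols, print)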
class dataTransformPredict:
    """
    This class shall be used for transforming the Good Raw Prediction Data before loading it in the database.

    Written By: iNeuron Intelligence
    Version: 1.0
    Revisions: None
    """

    def __init__(self, execution_id):
        self.execution_id = execution_id
        #self.goodDataPath = "Prediction_Raw_Files_Validated/Good_Raw"
        self.goodDataPath = "good-raw-file-prediction-validated"
        #self.logger = App_Logger()
        self.log_db_writer = App_LoggerDB(execution_id=execution_id)
        self.log_database = "wafer_prediction_log"
        self.az_blob_mgt = AzureBlobManagement()

    def replaceMissingWithNull(self):
        """
        Method Name: replaceMissingWithNull
        Description: This method replaces the missing values in columns with "NULL" to store them in the table.
                     We use a substring on the first column to keep only "Integer" data to ease up the loading.
                     This column is anyway going to be removed during prediction.

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        log_collection = "data_transform_log"
        try:
            onlyfiles = self.az_blob_mgt.getAllFileNameFromDirectory(self.goodDataPath)
            print(onlyfiles)
            for file in onlyfiles:
                csv = self.az_blob_mgt.readCSVFilefromDir(self.goodDataPath, file)
                csv.fillna('NULL', inplace=True)
                # #csv.update("'" + csv['Wafer'] + "'")
                # csv.update(csv['Wafer'].astype(str))
                csv['Wafer'] = csv['Wafer'].str[6:]  # keep only the numeric part of the identifier
                print("before updating index")
                #csv.to_csv(self.goodDataPath + "/" + file, index=None, header=True)
                self.az_blob_mgt.saveDataFrametoCSV(self.goodDataPath, file, csv, index=None, header=True)
                print("after updating index")
                #self.logger.log(log_file, " %s: File Transformed successfully!!" % file)
                self.log_db_writer.log(
                    self.log_database, log_collection,
                    "File {0} transformed successfully".format(file))
                print("File transformed: missing values replaced with NULL")
        except Exception as e:
            self.log_db_writer.log(
                self.log_database, log_collection,
                'Data Transformation failed because: ' + str(e))
            raise e
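# The transformation above is small: missing cells become the literal string "NULL" and the
# identifier column keeps only the part after its fixed prefix (e.g. "Wafer-12345" -> "12345").
# A self-contained illustration on a toy DataFrame that assumes the same 'Wafer' naming
# convention; the sample values are made up.

import pandas as pd

sample = pd.DataFrame({
    "Wafer": ["Wafer-00001", "Wafer-00002"],
    "Sensor-1": [0.52, None],
    "Sensor-2": [None, 1.73],
})

sample.fillna("NULL", inplace=True)          # missing values stored as the string "NULL"
sample["Wafer"] = sample["Wafer"].str[6:]    # drop the "Wafer-" prefix, keep the numeric part

print(sample)
# Expected output, roughly:
#    Wafer Sensor-1 Sensor-2
# 0  00001     0.52     NULL
# 1  00002     NULL     1.73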
class prediction:

    def __init__(self, path, execution_id):
        #self.file_object = open("Prediction_Logs/Prediction_Log.txt", 'a+')
        #self.log_writer = logger.App_Logger()
        self.execution_id = execution_id
        self.log_database = "strength_prediction_log"
        self.log_collection = "prediction_log"
        self.log_db_writer = App_LoggerDB(execution_id)
        self.az_blob_mgt = AzureBlobManagement()
        if path is not None:
            self.pred_data_val = Prediction_Data_validation(path, execution_id)

    def predictionFromModel(self):
        try:
            self.pred_data_val.deletePredictionFile()  # deletes the existing prediction file from the last run
            self.log_db_writer.log(self.log_database, self.log_collection, 'Start of Prediction')
            print("start of prediction")

            data_getter = data_loader_prediction.Data_Getter_Pred(
                self.log_database, self.log_collection, self.execution_id)
            data = data_getter.get_data()

            path = ""
            if len(data) == 0:
                self.log_db_writer.log(
                    self.log_database, self.log_collection,
                    "No data was present to perform prediction; exiting prediction method")
                return path, "No data was present to perform prediction"

            #code change
            # wafer_names = data['Wafer']
            # data = data.drop(labels=['Wafer'], axis=1)
            preprocessor = preprocessing.Preprocessor(
                self.log_database, self.log_collection, self.execution_id)
            is_null_present, cols_with_missing_values = preprocessor.is_null_present(data)
            if is_null_present:
                data = preprocessor.impute_missing_values(data)

            data = preprocessor.logTransformation(data)
            print("after log transformation")
            print(data)

            # scale the prediction data
            data_scaled = pandas.DataFrame(
                preprocessor.standardScalingData(data), columns=data.columns)
            print("standard scaling for data completed")
            print(data_scaled)

            #data = data.to_numpy()
            file_loader = file_methods.File_Operation(
                self.log_database, self.log_collection, self.execution_id)
            kmeans = file_loader.load_model('kkmeans')

            ##Code changed
            #pred_data = data.drop(['Wafer'], axis=1)
            clusters = kmeans.predict(data_scaled)  # assigns a cluster to every record
            data_scaled['clusters'] = clusters
            clusters = data_scaled['clusters'].unique()

            result = []  # initialize blank list for storing predictions
            for i in clusters:
                cluster_data = data_scaled[data_scaled['clusters'] == i]
                cluster_data = cluster_data.drop(['clusters'], axis=1)
                model_name = file_loader.find_correct_model_file(i)
                print(model_name)
                model = file_loader.load_model(model_name)
                for val in model.predict(cluster_data.values):
                    result.append(val)

            result = pandas.DataFrame(result, columns=['strength-Predictions'])
            print("results after prediction with prediction columns")
            print(result)

            path = "Prediction-Output-File"
            #result.to_csv("Prediction_Output_File/Predictions.csv", header=True)
            # appends result to the prediction file
            self.az_blob_mgt.saveDataFrametoCSV(
                path, "cement-strength-prediction.csv", result, header=True, mode="a+")
            self.log_db_writer.log(self.log_database, self.log_collection, 'End of Prediction')
        except Exception as ex:
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Error occurred while running the prediction!! Error:: %s' % ex)
            raise ex
        return path, result.head().to_json(orient="records")
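# predictionFromModel follows a cluster-then-predict pattern: a saved KMeans model assigns
# each record to a cluster, and a separate regressor trained per cluster produces the final
# prediction. A compact, self-contained sketch of that idea with scikit-learn; the models,
# features and target below are stand-ins, not the project's persisted artifacts.

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(60, 4)), columns=[f"f{i}" for i in range(4)])
y = X.sum(axis=1) + rng.normal(scale=0.1, size=60)

# "Training": one KMeans plus one model per cluster.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0).fit(X)
per_cluster_model = {
    c: LinearRegression().fit(X[kmeans.labels_ == c], y[kmeans.labels_ == c])
    for c in range(3)
}

# "Prediction": route each record to its cluster's model, then collect the results.
clusters = kmeans.predict(X)
parts = []
for c in np.unique(clusters):
    chunk = X[clusters == c]
    parts.append(pd.DataFrame({"prediction": per_cluster_model[c].predict(chunk)}, index=chunk.index))
result = pd.concat(parts).sort_index()
print(result.head())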
class dataTransform:
    """
    This class shall be used for transforming the Good Raw Training Data before loading it in the database.

    Written By: iNeuron Intelligence
    Version: 1.0
    Revisions: None
    """

    def __init__(self, execution_id):
        self.goodDataPath = "good-raw-file-train-validated"
        self.execution_id = execution_id
        self.logger_db_writer = App_LoggerDB(execution_id)
        self.az_blob_mgt = AzureBlobManagement()
        #self.goodDataPath = "Training_Raw_files_validated/Good_Raw"
        #self.logger = App_Logger()

    def replaceMissingWithNull(self):
        """
        Method Name: replaceMissingWithNull
        Description: This method replaces the missing values in columns with "NULL" to store them in the table.
                     We use a substring on the first column to keep only "Integer" data to ease up the loading.
                     This column is anyway going to be removed during training.

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        log_collection = "data_transform_log"
        log_database = "wafer_training_log"
        #log_file = open("Training_Logs/dataTransformLog.txt", 'a+')
        try:
            #onlyfiles = [f for f in listdir(self.goodDataPath)]
            onlyfiles = self.az_blob_mgt.getAllFileNameFromDirectory(self.goodDataPath)
            for file in onlyfiles:
                csv = self.az_blob_mgt.readCSVFilefromDir(self.goodDataPath, filename=file)
                csv.fillna('NULL', inplace=True)
                csv["Wafer"] = csv["Wafer"].str[6:]  # keep only the numeric part of the identifier
                self.az_blob_mgt.saveDataFrametoCSV(self.goodDataPath, file, csv, index=None, header=True)
                self.logger_db_writer.log(
                    log_database, log_collection,
                    "File {0} Transformed successfully!!".format(file))
        except Exception as e:
            msg = "Error occurred in class:dataTransform method:replaceMissingWithNull error: Data Transformation failed because: " + str(e)
            self.logger_db_writer.log(log_database, log_collection, msg)
            raise e
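# Both transform classes hand the updated DataFrame back to AzureBlobManagement.saveDataFrametoCSV,
# which is project code and not shown in this section. As a rough sketch of how a DataFrame can be
# written as a CSV blob with the azure-storage-blob SDK; the connection string, container name and
# function name are assumptions, not the project's helper.

import pandas as pd
from azure.storage.blob import BlobServiceClient


def save_dataframe_to_blob_sketch(connection_string, container_name, blob_name, df: pd.DataFrame):
    """Serialize a DataFrame to CSV and upload it, overwriting any existing blob."""
    service = BlobServiceClient.from_connection_string(connection_string)
    container = service.get_container_client(container_name)
    container.upload_blob(name=blob_name, data=df.to_csv(index=False), overwrite=True)


# Note: the project's helper also accepts a mode="a+" append option; supporting that would mean
# downloading the existing blob, concatenating, and re-uploading, which this sketch leaves out.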
class prediction:

    def __init__(self, path, execution_id):
        self.execution_id = execution_id
        #self.file_object = open("Prediction_Logs/Prediction_Log.txt", 'a+')
        #self.log_writer = logger.App_Logger()
        self.log_database = "wafer_prediction_log"
        self.log_collection = "prediction_log"
        self.log_db_writer = App_LoggerDB(execution_id)
        self.az_blob_mgt = AzureBlobManagement()
        if path is not None:
            self.pred_data_val = Prediction_Data_validation(path, execution_id)

    def predictionFromModel(self):
        try:
            self.pred_data_val.deletePredictionFile()  # deletes the existing prediction file from the last run
            self.log_db_writer.log(self.log_database, self.log_collection, 'Start of Prediction')

            data_getter = data_loader_prediction.Data_Getter_Pred(
                self.log_database, self.log_collection, self.execution_id)
            data = data_getter.get_data()

            path = ""
            if len(data) == 0:
                self.log_db_writer.log(
                    self.log_database, self.log_collection,
                    "No data was present to perform prediction; exiting prediction method")
                return path, "No data was present to perform prediction"

            #code change
            # wafer_names = data['Wafer']
            # data = data.drop(labels=['Wafer'], axis=1)
            preprocessor = preprocessing.Preprocessor(
                self.log_database, self.log_collection, self.execution_id)
            is_null_present = preprocessor.is_null_present(data)
            if is_null_present:
                data = preprocessor.impute_missing_values(data)

            cols_to_drop = preprocessor.get_columns_with_zero_std_deviation(data)
            data = preprocessor.remove_columns(data, cols_to_drop)

            #data = data.to_numpy()
            file_loader = file_methods.File_Operation(
                self.log_database, self.log_collection, self.execution_id)
            kmeans = file_loader.load_model('KMeans')
            #print(kmeans)

            ##Code changed
            pred_data = data.drop(['Wafer'], axis=1)
            clusters = kmeans.predict(pred_data)  # assigns a cluster to every record
            data['clusters'] = clusters
            clust = data['clusters'].unique()

            for i in clust:
                cluster_data = data[data['clusters'] == i]  # filtering of data by cluster number 0, 1, 2, etc.
                wafer_names = list(cluster_data['Wafer'])
                cluster_data = cluster_data.drop(labels=['Wafer'], axis=1)
                cluster_data = cluster_data.drop(['clusters'], axis=1)
                model_name = file_loader.find_correct_model_file(i)
                model = file_loader.load_model(model_name)
                result = list(model.predict(cluster_data))
                self.result = pandas.DataFrame(list(zip(wafer_names, result)), columns=['Wafer', 'Prediction'])
                print(self.result.shape)
                print(self.result)

                #path = "Prediction_Output_File/Predictions.csv"
                path = "prediction-output-file"
                # appends the predictions for this cluster to the prediction file
                self.az_blob_mgt.saveDataFrametoCSV(path, "prediction.csv", self.result, header=True, mode="a+")
                #result.to_csv("Prediction_Output_File/Predictions.csv", header=True, mode='a+')

            #self.log_writer.log(self.file_object, 'End of Prediction')
            self.log_db_writer.log(self.log_database, self.log_collection, 'End of prediction')
        except Exception as ex:
            #self.log_writer.log(self.file_object, 'Error occured while running the prediction!! Error:: %s' % ex)
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Error occurred while running the prediction!! Error:: %s' % ex)
            raise ex
        return path, self.result.head().to_json(orient="records")
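# file_loader.find_correct_model_file(i) comes from the project's file_methods module and is not
# shown in this section. In this family of projects it usually just matches the cluster number
# against the saved model file names; the sketch below illustrates that idea and is an assumption
# (including the model directory layout and the ".sav" extension), not the actual File_Operation code.

import os
import pickle


def find_correct_model_file_sketch(model_directory: str, cluster_number: int) -> str:
    """Return the base name of the saved model file whose name ends with the cluster number."""
    for file_name in os.listdir(model_directory):
        base, _ = os.path.splitext(file_name)
        if base.endswith(str(cluster_number)):
            return base
    raise FileNotFoundError(f"No model file found for cluster {cluster_number}")


def load_model_sketch(model_directory: str, model_name: str):
    """Deserialize a pickled model saved as '<model_name>.sav'."""
    with open(os.path.join(model_directory, model_name + ".sav"), "rb") as handle:
        return pickle.load(handle)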