def __init__(self, mainFilepath, additionalFilepath):
    # Prediction-side counterpart of Train_Validation below: wires together raw-data
    # validation, pre-DB preprocessing, DB operations, and logging.
    self.raw_data = Raw_Data_Validation(mainFilepath, additionalFilepath)
    self.preprocessing_beforeDB = preprocessing_beforeDB()  # typo "preproccesing" fixed
    self.DbOperation = DBOperations()
    self.file_object = open('Prediction_Logs/Prediction_Main_Log.txt', 'a+')
    self.log_writer = App_Logger()
class preprocessing_beforeDB:
    def __init__(self):
        self.goodData_MainFile_path = "Training_Raw_Validated_File/Good_Raw_MainFile"
        self.goodData_AdditionalFile_path = "Training_Raw_Validated_File/Good_Raw_AdditionalFile"
        self.logger = App_Logger()

    def replaceMissingWithNull_MainFile(self):
        log_file = open("Training_Logs/data_preprocessing_beforeDB.txt", "a+")
        try:
            only_files = [f for f in os.listdir(self.goodData_MainFile_path)]
            for file in only_files:
                csv = pd.read_csv(self.goodData_MainFile_path + "/" + file)
                # Replace NaNs with the literal string 'NULL' so the DB-insert step
                # gets a uniform placeholder for missing values.
                csv.fillna('NULL', inplace=True)
                csv.to_csv(self.goodData_MainFile_path + "/" + file, index=None, header=True)
            self.logger.log(log_file, 'Replaced missing values with NULL in Good Raw Main File successfully !!')
        except Exception as e:
            self.logger.log(log_file, 'Replacing missing values with NULL failed in Main File because:: %s' % str(e))
        finally:
            log_file.close()

    def replaceMissingWithNull_AdditionalFile(self):
        log_file = open("Training_Logs/data_preprocessing_beforeDB.txt", "a+")
        try:
            only_files = [f for f in os.listdir(self.goodData_AdditionalFile_path)]
            for file in only_files:
                csv = pd.read_csv(self.goodData_AdditionalFile_path + "/" + file)
                csv.fillna('NULL', inplace=True)
                csv.to_csv(self.goodData_AdditionalFile_path + "/" + file, index=None, header=True)
            self.logger.log(log_file, 'Replaced missing values with NULL in Good Raw Additional File successfully !!')
        except Exception as e:
            self.logger.log(log_file, 'Replacing missing values with NULL failed in Additional File because:: %s' % str(e))
        finally:
            log_file.close()
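# Illustration (not from the source): what the fillna('NULL') step above does to a
# Good Raw file. The two-row frame is made up; note the column dtype becomes object.
import numpy as np
import pandas as pd

demo = pd.DataFrame({'user_id': [1, 2], 'age': [35, np.nan]})
demo.fillna('NULL', inplace=True)
print(demo)
#    user_id   age
# 0        1  35.0
# 1        2  NULL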
class TrainModel:
    def __init__(self):
        self.log_writer = App_Logger()
        self.file_object = open('Training_Logs/ModelTrainingLog.txt', 'a+')

    def train_model(self):
        self.log_writer.log(self.file_object, 'Start of Training')
        try:
            data_getter = data_loader.Data_Getter(self.file_object, self.log_writer)
            main_data, additional_data = data_getter.get_data()

            preprocessor = data_preprocessing.PreProcessor(self.file_object, self.log_writer)
            if preprocessor.is_null_present(main_data):
                main_data = preprocessor.impute_missing_values(main_data)
            main_data = preprocessor.map_ip_to_country(main_data, additional_data)
            main_data = preprocessor.difference_signup_and_purchase(main_data)
            main_data = preprocessor.encoding_browser(main_data)
            main_data = preprocessor.encoding_source(main_data)
            main_data = preprocessor.encoding_sex(main_data)
            main_data = preprocessor.count_frequency_encoding_country(main_data)
            main_data = preprocessor.remove_unwanted_cols(main_data)

            x, y = preprocessor.separate_label_feature(main_data, 'class')
            x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
            # x_train, y_train = preprocessor.over_sampling_smote(x_train, y_train)

            model_finder = tuner.Model_Finder(self.file_object, self.log_writer)
            best_model_name, best_model = model_finder.get_best_model(x_train, y_train, x_test, y_test)

            file_op = file_methods.File_Operation(self.file_object, self.log_writer)
            file_op.save_model(best_model, best_model_name)

            self.log_writer.log(self.file_object, 'Successful End of Training')
            self.file_object.close()
        except Exception as e:
            self.log_writer.log(self.file_object, 'Unsuccessful End of Training')
            self.file_object.close()
            raise e
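# Usage sketch (not from the source): a minimal driver for TrainModel. The module
# name `trainingModel` is an assumption about where the class lives.
from trainingModel import TrainModel

trainer = TrainModel()
trainer.train_model()  # load -> preprocess -> split -> tune -> save best model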
class Prediction_Row:
    def __init__(self):
        self.log_writer = App_Logger()
        self.file_object = open('Prediction_Logs/PredictionLog.txt', 'a+')
        # self.datarow = pd.DataFrame({'signup_time': self.signup_time, 'purchase_time': self.purchase_time,
        #                              'purchase_value': self.purchase_value, 'source': self.source,
        #                              'browser': self.browser, 'sex': self.sex, 'age': self.age,
        #                              'ip_address': self.ip_address})
        # print(self.datarow)

    def predictRow(self, datarow):
        self.log_writer.log(self.file_object, 'Start of DataRow Prediction')
        self.datarow = datarow
        try:
            preprocessor = data_preprocessing.PreProcessorRow(self.datarow, self.file_object, self.log_writer)
            self.datarow = preprocessor.row_map_ip_to_country(self.datarow)
            self.datarow = preprocessor.row_difference_signup_and_purchase(self.datarow)
            self.datarow = preprocessor.row_encoding_browser(self.datarow)
            self.datarow = preprocessor.row_encoding_source(self.datarow)
            self.datarow = preprocessor.row_encoding_sex(self.datarow)
            self.datarow = preprocessor.row_count_frequency_encoding_country(self.datarow)
            self.datarow = preprocessor.row_remove_unwanted_cols(self.datarow)

            file_loader = file_methods.File_Operation(self.file_object, self.log_writer)
            model_name = file_loader.find_correct_model_file()
            model = file_loader.load_model(model_name)

            self.datarow['purchase_value'] = pd.to_numeric(self.datarow['purchase_value'])
            self.datarow['age'] = pd.to_numeric(self.datarow['age'])
            # Re-order the columns to match the feature order the model was trained on.
            self.datarow = self.datarow.reindex(
                ['purchase_value', 'age', 'd_day', 'd_hour', 'd_minutes', 'd_seconds',
                 'FireFox', 'IE', 'Opera', 'Safari', 'Direct', 'SEO', 'M', 'country_encode'],
                axis=1)
            result = model.predict(self.datarow)
            self.log_writer.log(self.file_object, 'Successful End of DataRow Prediction')
            self.file_object.close()
        except Exception as e:
            self.log_writer.log(self.file_object,
                                'Error Occurred while doing the DataRow Prediction !! Error :: %s' % str(e))
            self.file_object.close()
            raise e
        return str(result[0])
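# Usage sketch (not from the source): predicting a single transaction. Field names
# follow the commented-out constructor above; the values are illustrative, and a
# trained model must already have been saved by TrainModel for load_model() to work.
import pandas as pd

row = pd.DataFrame([{
    'signup_time': '2015-02-24 22:55:49', 'purchase_time': '2015-04-18 02:47:11',
    'purchase_value': 34, 'source': 'SEO', 'browser': 'Chrome',
    'sex': 'M', 'age': 39, 'ip_address': 732758368.8,
}])
print(Prediction_Row().predictRow(row))  # fraud class as a string, e.g. '0' or '1'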
def __init__(self):
    # Prediction-side paths; mirrors the Training_Raw_Validated_File layout above.
    self.goodData_MainFile_path = "Prediction_Raw_Validated_File/Good_Raw_MainFile"
    self.goodData_AdditionalFile_path = "Prediction_Raw_Validated_File/Good_Raw_AdditionalFile"
    self.logger = App_Logger()
# Imports needed by this module (restored; the original dump omitted them).
import json
import os
import re
import shutil
from datetime import datetime

import pandas as pd
# from application_logging.logger import App_Logger  # assumed project location


class Raw_Data_Validation:
    def __init__(self, mainfile_path, additionalfile_path):
        self.batch_directory_MainFile = mainfile_path
        self.batch_directory_AdditionalFile = additionalfile_path
        self.schema_path = 'schema_Training.json'
        self.logger = App_Logger()

    def fetch_values_from_schema(self):
        try:
            with open(self.schema_path, 'r') as r:
                dic = json.load(r)
            main_file = dic['SampleFileName_Main']
            additional_file = dic['SampleFileName_Additional']
            main_lengthofdatestampinfile = dic['Main_LengthOfDateStampInFile']
            additional_lengthofdatestampinfile = dic['Additional_LengthOfDateStampInFile']
            main_lengthoftimestampinfile = dic['Main_LengthOfTimeStampInFile']
            additional_lengthoftimestampinfile = dic['Additional_LengthOfTimeStampInFile']
            no_col_mainfile = dic['NumberOfColumns_MainFile']
            no_col_additionalfile = dic['NumberOfColumns_AdditionalFile']
            mainfile_col_name = dic['MainFile_ColName']
            additionalfile_colname = dic['AdditionalFile_ColName']

            file = open('Training_Logs/valuesfromschema_Validation_Log.txt', 'a+')
            # The original message labelled the column-name dicts as column counts.
            message = ("Main File column names:: %s" % mainfile_col_name
                       + "\n" + "Additional File column names:: %s" % additionalfile_colname
                       + "\n" + "MainFile Length of DateStamp:: %s" % main_lengthofdatestampinfile
                       + "\n" + "MainFile Length of TimeStamp:: %s" % main_lengthoftimestampinfile)
            self.logger.log(file, message)
            file.close()
        except ValueError:
            file = open('Training_Logs/valuesfromschema_Validation_Log.txt', 'a+')
            self.logger.log(file, 'Value Error: value not found inside schema_Training.json')
            file.close()
            raise
        except KeyError:
            file = open('Training_Logs/valuesfromschema_Validation_Log.txt', 'a+')
            self.logger.log(file, 'Key Error: incorrect key passed !!')
            file.close()
            raise
        except Exception as e:
            file = open('Training_Logs/valuesfromschema_Validation_Log.txt', 'a+')
            self.logger.log(file, str(e))
            file.close()
            raise e
        return (main_file, additional_file,
                main_lengthofdatestampinfile, main_lengthoftimestampinfile,
                additional_lengthofdatestampinfile, additional_lengthoftimestampinfile,
                mainfile_col_name, additionalfile_colname,
                no_col_mainfile, no_col_additionalfile)

    def mainfile_manualRegexCreation(self):
        # The original pattern used character classes (e.g. ['Fraud_Data_']+), which
        # match sets of characters rather than the literal prefix; rewritten to match
        # names of the form Fraud_Data_<datestamp>_<timestamp>.csv.
        regex = r"Fraud_Data_\d+_\d+\.csv"
        return regex

    def additionalfile_manualRegexCreation(self):
        regex = r"IPAddress_To_Country_\d+_\d+\.csv"
        return regex

    def createDirectoryFor_GoodBadRawData_MainFile(self):
        try:
            path = os.path.join("Training_Raw_Validated_File/", "Good_Raw_MainFile/")
            if not os.path.isdir(path):
                os.makedirs(path)
            path = os.path.join("Training_Raw_Validated_File/", "Bad_Raw_MainFile/")
            if not os.path.isdir(path):
                os.makedirs(path)
        except OSError as ex:
            file = open("Training_Logs/GeneralLog.txt", 'a+')
            self.logger.log(file, 'Error while creating MainFile Good and Bad Directory %s' % ex)
            file.close()
            raise

    def createDirectoryFor_GoodBadRawData_AdditionalFile(self):
        try:
            path = os.path.join("Training_Raw_Validated_File/", "Good_Raw_AdditionalFile")
            if not os.path.isdir(path):
                os.makedirs(path)
            path = os.path.join("Training_Raw_Validated_File/", "Bad_Raw_AdditionalFile")
            if not os.path.isdir(path):
                os.makedirs(path)
        except OSError as ex:
            file = open("Training_Logs/GeneralLog.txt", 'a+')
            self.logger.log(file, 'Error while creating Additional Good and Bad Directory %s' % ex)
            file.close()
            raise

    # Note: the log file name is standardized to GeneralLog.txt below; the original
    # mixed "GeneralLog.txt" and "General_Log.txt".
    def deleteExistingGoodDataTrainingDir_MainFile(self):
        try:
            path = "Training_Raw_Validated_File/"
            if os.path.isdir(path + 'Good_Raw_MainFile/'):
                shutil.rmtree(path + 'Good_Raw_MainFile/')
                file = open('Training_Logs/GeneralLog.txt', 'a+')
                self.logger.log(file, 'Good Raw Main File Directory deleted successfully !!!')
                file.close()
        except OSError as ex:
            file = open('Training_Logs/GeneralLog.txt', 'a+')
            self.logger.log(file, 'Error while deleting Main File Good Raw Directory: %s' % ex)
            file.close()
            raise

    def deleteExistingGoodDataTrainingDir_AdditionalFile(self):
        try:
            path = "Training_Raw_Validated_File/"
            if os.path.isdir(path + 'Good_Raw_AdditionalFile/'):
                shutil.rmtree(path + 'Good_Raw_AdditionalFile/')
                file = open('Training_Logs/GeneralLog.txt', 'a+')
                self.logger.log(file, 'Good Raw Additional File Directory deleted successfully !!!')
                file.close()
        except OSError as ex:
            file = open('Training_Logs/GeneralLog.txt', 'a+')
            self.logger.log(file, 'Error while deleting Additional File Good Raw Directory: %s' % ex)
            file.close()
            raise

    def deleteExistingBadDataTrainingDir_MainFile(self):
        try:
            path = "Training_Raw_Validated_File/"
            if os.path.isdir(path + 'Bad_Raw_MainFile/'):
                shutil.rmtree(path + 'Bad_Raw_MainFile/')
                file = open('Training_Logs/GeneralLog.txt', 'a+')
                self.logger.log(file, 'Bad Raw Main File Directory deleted successfully !!!')
                file.close()
        except OSError as ex:
            file = open('Training_Logs/GeneralLog.txt', 'a+')
            self.logger.log(file, 'Error while deleting Main File Bad Raw Directory: %s' % ex)
            file.close()
            raise

    def deleteExistingBadDataTrainingDir_AdditionalFile(self):
        try:
            path = "Training_Raw_Validated_File/"
            if os.path.isdir(path + 'Bad_Raw_AdditionalFile/'):
                shutil.rmtree(path + 'Bad_Raw_AdditionalFile/')
                file = open('Training_Logs/GeneralLog.txt', 'a+')
                self.logger.log(file, 'Bad Raw Additional File Directory deleted successfully !!!')
                file.close()
        except OSError as ex:
            file = open('Training_Logs/GeneralLog.txt', 'a+')
            self.logger.log(file, 'Error while deleting Additional Bad Raw Directory: %s' % ex)
            file.close()
            raise

    def moveBadFilesToArchiveBad_MainFile(self):
        now = datetime.now()
        date = now.date()
        time = now.strftime("%H%M%S")
        try:
            source = 'Training_Raw_Validated_File/Bad_Raw_MainFile/'
            if os.path.isdir(source):
                path = 'TrainingArchiveBadData_MainFile'
                if not os.path.isdir(path):
                    os.makedirs(path)
                destination = 'TrainingArchiveBadData_MainFile/Bad_Data_' + str(date) + "_" + str(time)
                if not os.path.isdir(destination):
                    os.makedirs(destination)
                for f in os.listdir(source):
                    if f not in os.listdir(destination):
                        shutil.move(source + f, destination)
                file = open("Training_Logs/GeneralLog.txt", 'a+')
                self.logger.log(file, 'Bad Main files moved to archive')
                path = "Training_Raw_Validated_File/"  # trailing slash was missing in the original
                if os.path.isdir(path + 'Bad_Raw_MainFile/'):
                    shutil.rmtree(path + 'Bad_Raw_MainFile/')
                self.logger.log(file, 'Bad Raw Main Files Data Directory Removed Successfully!!')
                file.close()
        except Exception as e:
            file = open("Training_Logs/GeneralLog.txt", 'a+')
            self.logger.log(file, 'Error while moving bad main files to Archive:: %s' % e)
            file.close()
            raise e

    def moveBadFilesToArchiveBad_AdditionalFile(self):
        now = datetime.now()
        date = now.date()
        time = now.strftime("%H%M%S")
        try:
            source = 'Training_Raw_Validated_File/Bad_Raw_AdditionalFile/'
            if os.path.isdir(source):
                path = 'TrainingArchiveBadData_AdditionalFile'
                if not os.path.isdir(path):
                    os.makedirs(path)
                destination = 'TrainingArchiveBadData_AdditionalFile/Bad_Data_' + str(date) + "_" + str(time)
                if not os.path.isdir(destination):
                    os.makedirs(destination)
                for f in os.listdir(source):
                    if f not in os.listdir(destination):
                        shutil.move(source + f, destination)
                file = open("Training_Logs/GeneralLog.txt", 'a+')
                self.logger.log(file, 'Bad Additional files moved to archive')
                path = "Training_Raw_Validated_File/"
                if os.path.isdir(path + 'Bad_Raw_AdditionalFile/'):
                    shutil.rmtree(path + 'Bad_Raw_AdditionalFile/')
                self.logger.log(file, 'Bad Raw Additional Files Data Directory Removed Successfully!!')
                file.close()
        except Exception as e:
            file = open("Training_Logs/GeneralLog.txt", 'a+')
            self.logger.log(file, 'Error while moving bad additional files to Archive:: %s' % e)
            file.close()
            raise e

    def validationFileNameRaw_MainFile(self, mainfile_Regex,
                                       main_lengthofdatestampinfile,
                                       main_lengthoftimestampinfile):
        self.deleteExistingBadDataTrainingDir_MainFile()
        self.deleteExistingGoodDataTrainingDir_MainFile()
        self.createDirectoryFor_GoodBadRawData_MainFile()
        onlyfiles = [f for f in os.listdir(self.batch_directory_MainFile)]
        try:
            file = open("Training_Logs/nameValidationLog.txt", 'a+')
            for filename in onlyfiles:
                if re.match(mainfile_Regex, filename):
                    # '.' must be escaped; unescaped it matches any character.
                    split = re.split(r'\.csv', filename)
                    split = re.split('_', split[0])
                    if (len(split[2]) == main_lengthofdatestampinfile
                            and len(split[3]) == main_lengthoftimestampinfile):
                        shutil.copy("Training_Batch_Files/Main_File/" + filename,
                                    "Training_Raw_Validated_File/Good_Raw_MainFile")
                        self.logger.log(file, 'Valid File Name !! File moved to GoodRaw_Main Directory ::%s' % filename)
                    else:
                        shutil.copy("Training_Batch_Files/Main_File/" + filename,
                                    "Training_Raw_Validated_File/Bad_Raw_MainFile")
                        self.logger.log(file, 'Invalid File Name!! File moved to Bad Raw Main File Directory')
                else:
                    shutil.copy("Training_Batch_Files/Main_File/" + filename,
                                "Training_Raw_Validated_File/Bad_Raw_MainFile")
                    self.logger.log(file, 'Invalid File Name!! File moved to Bad Raw Main File Directory')
            file.close()
        except Exception as e:
            file = open("Training_Logs/nameValidationLog.txt", 'a+')
            self.logger.log(file, "Error occurred while validating Main FileName %s" % e)
            file.close()
            raise e

    def validationFileNameRaw_AdditionalFile(self, additionalfile_Regex,
                                             additionalfile_lengthofdatestampinfile,
                                             additionalfile_lengthoftimestampinfile):
        self.deleteExistingBadDataTrainingDir_AdditionalFile()
        self.deleteExistingGoodDataTrainingDir_AdditionalFile()
        self.createDirectoryFor_GoodBadRawData_AdditionalFile()
        onlyfiles = [f for f in os.listdir(self.batch_directory_AdditionalFile)]
        try:
            file = open("Training_Logs/nameValidationLog.txt", 'a+')
            for filename in onlyfiles:
                if re.match(additionalfile_Regex, filename):
                    split = re.split(r'\.csv', filename)
                    split = re.split('_', split[0])
                    if (len(split[3]) == additionalfile_lengthofdatestampinfile
                            and len(split[4]) == additionalfile_lengthoftimestampinfile):
                        shutil.copy("Training_Batch_Files/Additional_File/" + filename,
                                    "Training_Raw_Validated_File/Good_Raw_AdditionalFile")
                        self.logger.log(file, 'Valid File Name !! File moved to GoodRaw_Additional Directory ::%s' % filename)
                    else:
                        shutil.copy("Training_Batch_Files/Additional_File/" + filename,
                                    "Training_Raw_Validated_File/Bad_Raw_AdditionalFile")
                        self.logger.log(file, 'Invalid File Name!! File moved to Bad Raw Additional File Directory')
                else:
                    shutil.copy("Training_Batch_Files/Additional_File/" + filename,
                                "Training_Raw_Validated_File/Bad_Raw_AdditionalFile")
                    self.logger.log(file, 'Invalid File Name!! File moved to Bad Raw Additional File Directory')
            file.close()
        except Exception as e:
            file = open("Training_Logs/nameValidationLog.txt", 'a+')
            self.logger.log(file, "Error occurred while validating Additional FileName %s" % e)
            file.close()
            raise e

    def validate_NoOfCol_MainFile(self, noofcol_mainfile):
        try:
            f = open("Training_Logs/columnValidationLog.txt", 'a+')
            for file in os.listdir('Training_Raw_Validated_File/Good_Raw_MainFile/'):
                csv = pd.read_csv('Training_Raw_Validated_File/Good_Raw_MainFile/' + file)
                if csv.shape[1] != noofcol_mainfile:
                    # '/' was missing before the file name in the original source path.
                    shutil.move('Training_Raw_Validated_File/Good_Raw_MainFile/' + file,
                                'Training_Raw_Validated_File/Bad_Raw_MainFile')
                    self.logger.log(f, 'Invalid Column length for the file !! File moved to bad raw main Directory :: %s' % file)
            self.logger.log(f, 'Main File Columns Length Validated Successfully')
            f.close()
        except OSError as e:
            f = open("Training_Logs/columnValidationLog.txt", 'a+')
            self.logger.log(f, 'Error Occurred while moving file :: %s' % str(e))
            f.close()
            raise
        except Exception as e:
            f = open("Training_Logs/columnValidationLog.txt", 'a+')
            self.logger.log(f, "Error Occurred:: %s" % e)
            f.close()
            raise e

    def validate_NoOfCol_AdditionalFile(self, noofcol_additionalfile):
        try:
            f = open("Training_Logs/columnValidationLog.txt", 'a+')
            for file in os.listdir('Training_Raw_Validated_File/Good_Raw_AdditionalFile/'):
                csv = pd.read_csv('Training_Raw_Validated_File/Good_Raw_AdditionalFile/' + file)
                if csv.shape[1] != noofcol_additionalfile:
                    shutil.move('Training_Raw_Validated_File/Good_Raw_AdditionalFile/' + file,
                                'Training_Raw_Validated_File/Bad_Raw_AdditionalFile')
                    self.logger.log(f, 'Invalid Column length for the file !! File moved to bad raw additional Directory :: %s' % file)
            self.logger.log(f, 'Additional File Columns Length Validated Successfully')
            f.close()
        except OSError as e:
            f = open("Training_Logs/columnValidationLog.txt", 'a+')
            self.logger.log(f, 'Error Occurred while moving file :: %s' % str(e))
            f.close()
            raise
        except Exception as e:
            f = open("Training_Logs/columnValidationLog.txt", 'a+')
            self.logger.log(f, "Error Occurred:: %s" % e)
            f.close()
            raise e
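# Illustration (not from the source): the filename convention that
# validationFileNameRaw_MainFile enforces. The sample name is made up; the two
# lengths printed are what get compared against the schema's stamp lengths.
import re

name = "Fraud_Data_01012021_120000.csv"
if re.match(r"Fraud_Data_\d+_\d+\.csv", name):
    parts = re.split(r'\.csv', name)[0].split('_')  # ['Fraud', 'Data', '01012021', '120000']
    print(len(parts[2]), len(parts[3]))             # 8 6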
class Prediction:
    def __init__(self, mainFilePath, additionalFilePath):
        self.log_writer = App_Logger()
        self.file_object = open('Prediction_Logs/PredictionLog.txt', 'a+')
        if mainFilePath is not None and additionalFilePath is not None:
            self.pred_data_val = Raw_Data_Validation(mainFilePath, additionalFilePath)

    def predict_from_model(self):
        self.log_writer.log(self.file_object, 'Start of Prediction')
        try:
            self.pred_data_val.deletePredictionFile()  # clear any previous prediction output
            data_getter = data_loader_prediction.Data_Getter(self.file_object, self.log_writer)
            main_data, additional_data = data_getter.get_data()

            preprocessor = data_preprocessing.PreProcessor(self.file_object, self.log_writer)
            if preprocessor.is_null_present(main_data):
                main_data = preprocessor.impute_missing_values(main_data)
            main_data = preprocessor.map_ip_to_country(main_data, additional_data)
            main_data = preprocessor.difference_signup_and_purchase(main_data)
            main_data = preprocessor.encoding_browser(main_data)
            main_data = preprocessor.encoding_source(main_data)
            main_data = preprocessor.encoding_sex(main_data)
            main_data = preprocessor.count_frequency_encoding_country(main_data)
            # Keep the dropped identifier columns so they can be re-attached to the output.
            main_data, unwanted_data = preprocessor.remove_unwanted_cols(main_data, return_unwanted_data=True)

            # Training-only steps (split, SMOTE, model tuning, saving) are handled in TrainModel:
            # x, y = preprocessor.separate_label_feature(main_data, 'class')
            # x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
            # x_train, y_train = preprocessor.over_sampling_smote(x_train, y_train)
            # model_finder = tuner.Model_Finder(self.file_object, self.log_writer)
            # best_model_name, best_model = model_finder.get_best_model(x_train, y_train, x_test, y_test)
            # save_model = file_op.save_model(best_model, best_model_name)

            file_loader = file_methods.File_Operation(self.file_object, self.log_writer)
            model_name = file_loader.find_correct_model_file()
            model = file_loader.load_model(model_name)
            result = list(model.predict(main_data))

            data = list(zip(unwanted_data['user_id'], unwanted_data['signup_time'],
                            unwanted_data['purchase_time'], unwanted_data['device_id'],
                            unwanted_data['source'], unwanted_data['browser'],
                            unwanted_data['sex'], unwanted_data['ip_address'],
                            unwanted_data['Country'], result))
            result = pd.DataFrame(data, columns=['user_id', 'signup_time', 'purchase_time',
                                                 'device_id', 'source', 'browser', 'sex',
                                                 'ip_address', 'Country', 'Prediction'])
            path = "Prediction_Output_File/Prediction.csv"
            result.to_csv(path, header=True, mode='a+')
            self.log_writer.log(self.file_object, 'Successful End of Prediction')
            self.file_object.close()
        except Exception as e:
            self.log_writer.log(self.file_object,
                                'Error Occurred while doing the Prediction !! Error :: %s' % str(e))
            self.file_object.close()
            raise e
        return path, result.head().to_json(orient="records")
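# Usage sketch (not from the source): a batch-prediction driver. The module name
# `predictFromModel` and the batch-file paths are assumptions.
from predictFromModel import Prediction

pred = Prediction('Prediction_Batch_Files/Main_File', 'Prediction_Batch_Files/Additional_File')
csv_path, preview_json = pred.predict_from_model()
print(csv_path)      # Prediction_Output_File/Prediction.csv
print(preview_json)  # first five predictions as JSON records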
class Train_Validation:
    def __init__(self, mainFilepath, additionalFilepath):
        self.raw_data = Raw_Data_Validation(mainFilepath, additionalFilepath)
        self.preprocessing_beforeDB = preprocessing_beforeDB()  # typo "preproccesing" fixed
        self.DbOperation = DBOperations()
        self.file_object = open('Training_Logs/Training_Main_Log.txt', 'a+')
        self.log_writer = App_Logger()

    def training_validation(self):
        try:
            self.log_writer.log(self.file_object, 'Start of Raw Data Validation on Files !!')
            (main_file, additional_file,
             mainFile_LengthofDateStampInFile, mainFile_LengthofTimeStampInFile,
             additional_LengthofDateStampInFile, additional_LengthofTimeStampInFile,
             mainFile_ColName, additionalFile_ColName,
             NoCol_MainFile, NoCol_AdditionalFile) = self.raw_data.fetch_values_from_schema()

            mainFile_regex = self.raw_data.mainfile_manualRegexCreation()
            additionalFile_regex = self.raw_data.additionalfile_manualRegexCreation()
            self.raw_data.validationFileNameRaw_MainFile(mainFile_regex,
                                                         mainFile_LengthofDateStampInFile,
                                                         mainFile_LengthofTimeStampInFile)
            self.raw_data.validationFileNameRaw_AdditionalFile(additionalFile_regex,
                                                               additional_LengthofDateStampInFile,
                                                               additional_LengthofTimeStampInFile)
            self.raw_data.validate_NoOfCol_MainFile(NoCol_MainFile)
            self.raw_data.validate_NoOfCol_AdditionalFile(NoCol_AdditionalFile)
            self.log_writer.log(self.file_object, 'Raw Data Validation Completed !!')

            self.log_writer.log(self.file_object, 'Start of Data Preprocessing before DB')
            self.preprocessing_beforeDB.replaceMissingWithNull_MainFile()
            self.preprocessing_beforeDB.replaceMissingWithNull_AdditionalFile()
            self.log_writer.log(self.file_object, 'Data Preprocessing before DB Completed !!')

            self.log_writer.log(self.file_object,
                                'Start of Creating TrainingDatabase and Table based on given schema!!!')
            self.DbOperation.createTable_MainFile('Training', mainFile_ColName)
            self.DbOperation.createTable_AdditionalFile('Training', additionalFile_ColName)
            self.log_writer.log(self.file_object, 'Creation of Table in Database Successful !!!')

            self.log_writer.log(self.file_object, "Insertion of Data into Table started!!!!")
            self.DbOperation.InsertIntoTableGoodData_MainFile('Training')
            self.DbOperation.InsertIntoTableGoodData_AdditionalFile('Training')
            self.log_writer.log(self.file_object, "Insertion of Data into Tables Completed!!!!")

            self.log_writer.log(self.file_object, "Deleting Main and Additional File Good Data Folder!!!")
            self.raw_data.deleteExistingGoodDataTrainingDir_MainFile()
            self.raw_data.deleteExistingGoodDataTrainingDir_AdditionalFile()
            self.log_writer.log(self.file_object, 'Main and Additional Good File Directory Deleted !!!')

            self.log_writer.log(self.file_object,
                                'Starting moving bad files to Archive and deleting bad data directory')
            self.raw_data.moveBadFilesToArchiveBad_MainFile()
            self.raw_data.moveBadFilesToArchiveBad_AdditionalFile()
            self.log_writer.log(self.file_object, 'Bad Files moved to Archive!! and Bad Directory Deleted !!')
            self.log_writer.log(self.file_object, 'Raw Data Validation Completed Successfully')

            self.log_writer.log(self.file_object, 'Exporting Data Into CSV File Started')
            self.DbOperation.SelectingDataFromTableIntoCSV_MainFile('Training')
            self.DbOperation.SelectingDataFromTableIntoCSV_AdditionalFile('Training')
            self.log_writer.log(self.file_object, 'Data Exported to CSV File Successfully')
            self.log_writer.log(self.file_object, 'End of Raw Data Validation!!!')
            self.file_object.close()
        except Exception as e:
            self.log_writer.log(self.file_object, 'Training validation failed :: %s' % str(e))
            self.file_object.close()
            raise e
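# Usage sketch (not from the source): the validation/DB-insertion stage. The module
# name `training_Validation_Insertion` and the batch paths are assumptions; once it
# finishes, TrainModel (above) consumes the exported CSVs.
from training_Validation_Insertion import Train_Validation

val = Train_Validation('Training_Batch_Files/Main_File', 'Training_Batch_Files/Additional_File')
val.training_validation()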
def preprocess_and_split(config_path):
    file_object = open('Training_log.txt', 'a+')
    logger = App_Logger()
    config = read_params(config_path)
    train_data_path = config["split_data"]["train_path"]
    raw_train_data_path = config["load_data"]["raw_train_data_csv"]
    logger.log(file_object, "Training Data load was successful")

    train_df = pd.read_csv(raw_train_data_path)
    logger.log(file_object, "Data reading successful")

    # 1. Extract features from the date column.
    train_df = date_process(train_df)
    logger.log(file_object, "Datetime Processing in train data completed")

    # 2. Validate which columns in the train dataset hold JSON payloads.
    train_json_columns = column_validator(train_df)
    logger.log(file_object, "Column_validator successful")

    # 2.1 Flatten the JSON columns and merge them back into the original dataset.
    if train_json_columns is not None:
        train_df = json_to_df(train_df, train_json_columns)
        target = train_df['transactionRevenue']  # note: only set when JSON columns are present
        logger.log(file_object, "Normalizing the json columns completed")

    # 3. Drop columns with more than 50% null values and columns that do not
    #    contribute to the target variable.
    train_df = remove_nan_cols(train_df)
    logger.log(file_object, "50% NAN value columns are removed")
    train_df.drop('sessionId', axis=1, inplace=True)       # combination of fullVisitorId and visitId
    train_df.drop('visitStartTime', axis=1, inplace=True)  # already extracted into visitHour
    train_df.drop('fullVisitorId', axis=1, inplace=True)   # long ID with little contribution to the target
    # drop_columns = ['visitId', 'weekday', 'day', 'bounces', 'keyword']
    drop_columns = ['visitId', 'weekday', 'day']
    train_df.drop(drop_columns, axis=1, inplace=True)
    logger.log(file_object, 'Dropped columns which are not contributing to the transaction revenue')

    # 4. Impute null values: transactionRevenue is re-attached so its NaNs are filled with 0 too.
    train_df = pd.concat([train_df, target], axis=1)
    train_df = impute_na(train_df)
    logger.log(file_object, "Imputing NAN values with 0 is completed")

    # 5. Convert datatypes from object to the desired ones.
    train_df = data_type_convert(train_df)
    logger.log(file_object, "Conversion of Datatype to int completed")

    # 6. Remove constant-value (zero standard deviation) columns.
    train_df = remove_zero_std_cols(train_df)
    logger.log(file_object, "Zero standard deviation columns are removed")

    # 7. Gather categorical columns and label-encode them.
    label_cols = categorical_cols(train_df)
    logger.log(file_object, "Gathering of label_cols in train data completed")
    train_df = label_encoding(train_df, label_cols)
    logger.log(file_object, "Label_encoding in train data completed")

    # 8. Impute the pageviews column with KNNImputer.
    from sklearn.impute import KNNImputer  # ideally imported at module level
    imputer = KNNImputer()
    imputer_train_df = imputer.fit_transform(train_df[['pageviews']])
    train_df['pageviews'] = imputer_train_df
    logger.log(file_object, "Pageviews column imputed with KNNimputer")

    # Store the processed train data.
    train_df.to_csv(train_data_path, sep=",", index=False, encoding="utf-8")
    logger.log(file_object, "Training data is processed and stored as data/processed/train_processed.csv")
    file_object.close()
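# Usage sketch (not from the source): invoking the preprocessing step. The config
# file name `params.yaml` is an assumption; it must carry the two keys read above:
#   split_data.train_path            -> where the processed CSV is written
#   load_data.raw_train_data_csv     -> the raw input CSV
preprocess_and_split("params.yaml")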
# Imports needed by this module (restored; the original dump omitted them).
import csv
import os
import shutil
import sqlite3
# from application_logging.logger import App_Logger  # assumed project location


class DBOperations:
    def __init__(self):
        self.path = "Prediction_Database/"
        self.goodRaw_MainFile_path = "Prediction_Raw_Validated_File/Good_Raw_MainFile"
        self.badRaw_MainFile_path = "Prediction_Raw_Validated_File/Bad_Raw_MainFile"
        self.goodRaw_AdditionalFile_path = "Prediction_Raw_Validated_File/Good_Raw_AdditionalFile"
        self.badRaw_AdditionalFile_path = "Prediction_Raw_Validated_File/Bad_Raw_AdditionalFile"
        self.logger = App_Logger()

    def DatabaseConnection(self, database_name):
        try:
            con = sqlite3.connect(self.path + database_name + '.db')
            file = open('Prediction_Logs/DataBaseConnection.txt', 'a+')
            self.logger.log(file, 'Connected to database %s.db successfully' % database_name)
            file.close()
        except sqlite3.Error as e:
            # sqlite3.connect raises sqlite3.Error, not ConnectionError as the original assumed.
            file = open('Prediction_Logs/DataBaseConnection.txt', 'a+')
            self.logger.log(file, 'Error while connecting to database:: %s' % str(e))
            file.close()
            raise e
        return con

    def createTable_MainFile(self, database, colname_MainFile):
        try:
            con = self.DatabaseConnection(database)
            c = con.cursor()
            c.execute("SELECT count(name) FROM sqlite_master WHERE type = 'table' AND name = 'MainFile_Good_Raw_Data'")
            if c.fetchone()[0] == 1:
                con.close()
                file = open('Prediction_Logs/DbTableCreateLog.txt', 'a+')
                # The original logged 'Table Created Successfully' on this branch,
                # but it is reached when the table already exists.
                self.logger.log(file, 'MainFile_Good_Raw_Data Table already exists !!')
                file.close()
            else:
                for key, dtype in colname_MainFile.items():  # `type` renamed; it shadows the builtin
                    try:
                        # ALTER fails with OperationalError until the table exists; then CREATE runs once.
                        con.execute('ALTER TABLE MainFile_Good_Raw_Data ADD COLUMN "{column_name}" {dataType}'
                                    .format(column_name=key, dataType=dtype))
                    except sqlite3.OperationalError:
                        con.execute('CREATE TABLE MainFile_Good_Raw_Data ({column_name} {dataType})'
                                    .format(column_name=key, dataType=dtype))
                con.close()
        except Exception as e:
            file = open("Prediction_Logs/DbTableCreateLog.txt", 'a+')
            self.logger.log(file, 'Error while creating Table:: %s' % e)
            file.close()
            con.close()
            file = open("Prediction_Logs/DataBaseConnectionLog.txt", 'a+')
            self.logger.log(file, "Closed %s database successfully" % database)
            file.close()
            raise e

    def createTable_AdditionalFile(self, database, colname_AdditionalFile):
        try:
            con = self.DatabaseConnection(database)
            c = con.cursor()
            c.execute("SELECT count(name) FROM sqlite_master WHERE type = 'table' AND name = 'AdditionalFile_Good_Raw_Data'")
            if c.fetchone()[0] == 1:
                con.close()
                file = open('Prediction_Logs/DbTableCreateLog.txt', 'a+')
                self.logger.log(file, 'AdditionalFile_Good_Raw_Data Table already exists !!')
                file.close()
            else:
                for key, dtype in colname_AdditionalFile.items():
                    try:
                        con.execute('ALTER TABLE AdditionalFile_Good_Raw_Data ADD COLUMN "{column_name}" {dataType}'
                                    .format(column_name=key, dataType=dtype))
                    except sqlite3.OperationalError:
                        con.execute('CREATE TABLE AdditionalFile_Good_Raw_Data ({column_name} {dataType})'
                                    .format(column_name=key, dataType=dtype))
                con.close()
        except Exception as e:
            file = open("Prediction_Logs/DbTableCreateLog.txt", 'a+')
            self.logger.log(file, 'Error while creating Table:: %s' % e)
            file.close()
            con.close()
            file = open("Prediction_Logs/DataBaseConnectionLog.txt", 'a+')
            self.logger.log(file, "Closed %s database successfully" % database)
            file.close()
            raise e

    def InsertIntoTableGoodData_MainFile(self, database):
        con = self.DatabaseConnection(database)
        c = con.cursor()
        MainFile_goodDataPath = self.goodRaw_MainFile_path
        only_files = [f for f in os.listdir(MainFile_goodDataPath)]
        log_file = open("Prediction_Logs/DbInsertLog.txt", 'a+')
        for file in only_files:
            try:
                with open(MainFile_goodDataPath + '/' + file, 'r') as f:
                    dr = csv.DictReader(f)
                    rows = [(int(i['user_id']), i['signup_time'], i['purchase_time'],
                             int(i['purchase_value']), i['device_id'], i['source'],
                             i['browser'], i['sex'], int(i['age']), float(i['ip_address']))
                            for i in dr]
                try:
                    insert = """INSERT INTO MainFile_Good_Raw_Data VALUES (?,?,?,?,?,?,?,?,?,?);"""
                    c.executemany(insert, rows)
                    con.commit()
                    self.logger.log(log_file, "%s File Loaded Successfully in MainFile_Good_Raw_Data Table" % file)
                except Exception as e:
                    self.logger.log(log_file, 'Error while Inserting into MainFile_Good_Raw_Data Table %s' % str(e))
            except Exception as e:
                con.rollback()
                self.logger.log(log_file, 'Error while Inserting into MainFile_Good_Raw_Data %s' % str(e))
                shutil.move(self.goodRaw_MainFile_path + '/' + file, self.badRaw_MainFile_path)
                self.logger.log(log_file, 'Main File Moved Successfully after Error in Insertion into Database')
        # Close once, after the loop; the original also closed inside the except
        # branch, which broke logging for any subsequent file.
        con.close()
        log_file.close()

    def InsertIntoTableGoodData_AdditionalFile(self, database):
        con = self.DatabaseConnection(database)
        c = con.cursor()
        AdditionalFile_goodDataPath = self.goodRaw_AdditionalFile_path
        only_files = [f for f in os.listdir(AdditionalFile_goodDataPath)]
        log_file = open("Prediction_Logs/DbInsertLog.txt", 'a+')
        for file in only_files:
            try:
                with open(AdditionalFile_goodDataPath + '/' + file, 'r') as f:
                    dr = csv.DictReader(f)
                    rows = [(float(i['lower_bound_ip_address']), float(i['upper_bound_ip_address']),
                             i['country']) for i in dr]
                try:
                    insert = """INSERT INTO AdditionalFile_Good_Raw_Data VALUES (?,?,?);"""
                    c.executemany(insert, rows)
                    con.commit()
                    self.logger.log(log_file, "%s File Loaded Successfully in AdditionalFile_Good_Raw_Data Table" % file)
                except Exception as e:
                    # The original message named the MainFile table here.
                    self.logger.log(log_file, 'Error while Inserting into AdditionalFile_Good_Raw_Data Table %s' % str(e))
            except Exception as e:
                con.rollback()
                self.logger.log(log_file, 'Error while Inserting into AdditionalFile_Good_Raw_Data %s' % str(e))
                shutil.move(self.goodRaw_AdditionalFile_path + '/' + file, self.badRaw_AdditionalFile_path)
                self.logger.log(log_file, 'Additional File Moved Successfully after Error in Insertion into Database')
        con.close()
        log_file.close()

    def SelectingDataFromTableIntoCSV_MainFile(self, database):
        self.TrainingfileFromDB_Dir = 'PredictionFileFromDB'
        self.MainFilePath = 'MainFile'
        self.MainFile_Name = 'InputFile.csv'
        log_file = open("Prediction_Logs/ExportToCsv.txt", 'a+')
        try:
            con = self.DatabaseConnection(database)
            cursor = con.cursor()
            cursor.execute('SELECT * FROM MainFile_Good_Raw_Data')
            results = cursor.fetchall()
            header = [i[0] for i in cursor.description]
            if not os.path.isdir(self.TrainingfileFromDB_Dir + '/' + self.MainFilePath):
                os.makedirs(os.path.join(self.TrainingfileFromDB_Dir, self.MainFilePath))
            # Open via a context manager so the export file is flushed and closed;
            # the original passed an unclosed open() directly to csv.writer.
            with open(self.TrainingfileFromDB_Dir + '/' + self.MainFilePath + '/' + self.MainFile_Name,
                      'w', newline='') as out:
                csvFile = csv.writer(out, delimiter=',', lineterminator='\r\n',
                                     quoting=csv.QUOTE_ALL, escapechar='\\')
                csvFile.writerow(header)
                csvFile.writerows(results)
            self.logger.log(log_file, 'MainFile Exported as .csv Format Successfully')
            log_file.close()
        except Exception as e:
            self.logger.log(log_file, 'MainFile Exporting Failed:: %s' % e)
            log_file.close()

    def SelectingDataFromTableIntoCSV_AdditionalFile(self, database):
        self.TrainingfileFromDB_Dir = 'PredictionFileFromDB'
        self.AdditionalFilePath = 'AdditionalFile'
        self.AdditionalFile_Name = 'AdditionalFile.csv'
        log_file = open("Prediction_Logs/ExportToCsv.txt", 'a+')
        try:
            con = self.DatabaseConnection(database)
            cursor = con.cursor()
            cursor.execute('SELECT * FROM AdditionalFile_Good_Raw_Data')
            results = cursor.fetchall()
            header = [i[0] for i in cursor.description]
            if not os.path.isdir(self.TrainingfileFromDB_Dir + '/' + self.AdditionalFilePath):
                os.makedirs(os.path.join(self.TrainingfileFromDB_Dir, self.AdditionalFilePath))
            with open(self.TrainingfileFromDB_Dir + '/' + self.AdditionalFilePath + '/' + self.AdditionalFile_Name,
                      'w', newline='') as out:
                csvFile = csv.writer(out, delimiter=',', lineterminator='\r\n',
                                     quoting=csv.QUOTE_ALL, escapechar='\\')
                csvFile.writerow(header)
                csvFile.writerows(results)
            self.logger.log(log_file, 'AdditionalFile Exported as .csv Format Successfully')
            log_file.close()
        except Exception as e:
            self.logger.log(log_file, 'AdditionalFile Exporting Failed:: %s' % e)
            log_file.close()
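# Usage sketch (not from the source): driving DBOperations end to end for a
# prediction run. The column dict is an illustrative subset of the schema, and
# Prediction_Database/ plus the Good Raw folders are assumed to exist.
db = DBOperations()
cols = {'user_id': 'INTEGER', 'signup_time': 'TEXT', 'purchase_value': 'INTEGER'}
db.createTable_MainFile('Prediction', cols)
db.InsertIntoTableGoodData_MainFile('Prediction')
db.SelectingDataFromTableIntoCSV_MainFile('Prediction')  # -> PredictionFileFromDB/MainFile/InputFile.csv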