class dataTransform:

    def __init__(self):
        self.connectionstrings = "DefaultEndpointsProtocol=https;AccountName=trainingbatchfiles;AccountKey=JPHQiUP+0kPN4UlfW+jXZm9EaPg0nsSUd9MZMLnhpjmJZnO7OXiemYqM+vosRjXA8MLOTqV2fsDEAmz6tIjGFw==;EndpointSuffix=core.windows.net"
        self.good_raw = Azure_Functions(self.connectionstrings)
        self.logger = App_Logger()

    def replaceMissingWithNull(self):
        try:
            onlyfiles = [f for f in self.good_raw.gettingcsvfile("goodraw")]
            for file in onlyfiles:
                csv = self.good_raw.readingcsvfile("goodraw", file)
                # csv = pandas.read_csv(self.goodDataPath + "/" + file)
                csv.fillna('NULL', inplace=True)
                # csv.update("'" + csv['Wafer'] + "'")
                # csv.update(csv['Wafer'].astype(str))
                csv['Wafer'] = csv['Wafer'].str[6:]
                self.good_raw.saveDataFrameTocsv("goodraw", file, csv, index=None, header=True)
                self.logger.log("Training_Logs", "dataTransformLog",
                                " %s: File Transformed successfully!!" % file)
                # csv.to_csv(self.goodDataPath + "/" + file, index=None, header=True)
        except Exception as e:
            self.logger.log("Training_Logs", "dataTransformLog",
                            "Data Transformation failed because:: %s" % e)
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K
from logger import App_Logger
# project-local helpers assumed available in this module's package: model.create_model(), get_data()


def train():
    strategy = tf.distribute.MirroredStrategy()
    (x_train, y_train), (x_test, y_test) = get_data()
    with open('DataLoading.txt', 'a+') as f:
        App_Logger.log(f, 'Loaded data successfully...')

    callbacks = [keras.callbacks.TensorBoard(log_dir='./logs'),
                 keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, verbose=1),
                 keras.callbacks.ReduceLROnPlateau(monitor='accuracy', factor=0.01, verbose=1)]
    try:
        with strategy.scope():
            K.clear_session()
            myModel = model.create_model()
            myModel.compile(loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                            optimizer=keras.optimizers.Adam(),
                            metrics=['accuracy'])
        with open('DataLoading.txt', 'a+') as f:
            # summary() returns None, so capture the summary text line by line before logging it
            summary_lines = []
            myModel.summary(print_fn=summary_lines.append)
            App_Logger.log(f, 'Created and compiled model....\n' + '\n'.join(summary_lines))

        history = myModel.fit(x_train, y_train, validation_split=0.25, callbacks=callbacks, verbose=1)
        with open('train.txt', 'a+') as f:
            App_Logger.log(f, 'Training successful ' + str(history.history))
    except Exception as e:
        with open('Error.txt', 'a+') as f:
            App_Logger.log(f, str(e))
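# Illustrative entry point (not in the original source): a minimal way to run train()
# above as a script. It assumes the project-local modules referenced inside train()
# (model.create_model, get_data, App_Logger) resolve from this file's location.
if __name__ == "__main__":
    train()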
class prediction:

    def __init__(self, path):
        self.log_writer = App_Logger()
        self.file_object = "Prediction_Logs"
        self.connectionstrings = "DefaultEndpointsProtocol=https;AccountName=trainingbatchfiles;AccountKey=JPHQiUP+0kPN4UlfW+jXZm9EaPg0nsSUd9MZMLnhpjmJZnO7OXiemYqM+vosRjXA8MLOTqV2fsDEAmz6tIjGFw==;EndpointSuffix=core.windows.net"
        self.AzureFunc = Azure_Functions(self.connectionstrings)
        if path is not None:
            self.pred_data_val = Prediction_Data_validation(path)

    def predictionFromModel(self):
        try:
            self.pred_data_val.deletePredictionFile()
            self.log_writer.log(self.file_object, "Prediction_Log", 'Start of Prediction')
            data_getter = data_loader_prediction.Data_Getter_Pred(self.file_object, self.log_writer)
            data = data_getter.get_data()

            """doing the data preprocessing"""
            preprocessor = Preprocessor(self.file_object, self.log_writer)
            data = preprocessor.remove_columns(data, ["Unnamed: 0"])  # remove the unnamed column as it doesn't contribute to prediction.
            is_null_present = preprocessor.is_null_present(data)
            # if missing values are there, replace them appropriately.
            if is_null_present:
                data = preprocessor.impute_missing_values(data)  # missing value imputation

            # check further which columns do not contribute to predictions:
            # if the standard deviation for a column is zero, the column has constant values
            # and gives the same output for both good and bad sensors,
            # so prepare the list of such columns to drop
            cols_to_drop = preprocessor.get_columns_with_zero_std_deviation(data)

            # drop the columns obtained above
            data = preprocessor.remove_columns(data, cols_to_drop)

            """ Applying the clustering approach"""
            file_loader = file_methods.File_Operation(self.file_object, self.log_writer)
            kmeans = file_loader.load_model('KMeans')

            # Code changed
            # pred_data = data.drop(['Wafer'], axis=1)
            clusters = kmeans.predict(data.drop(['Wafer'], axis=1))  # drops the first column for cluster prediction
            data['clusters'] = clusters
            clusters = data['clusters'].unique()
            for i in clusters:
                cluster_data = data[data['clusters'] == i]
                wafer_names = list(cluster_data['Wafer'])
                # drop from the cluster slice (not the full dataset) so rows line up with wafer_names
                cluster_data = cluster_data.drop(labels=['Wafer'], axis=1)
                cluster_data = cluster_data.drop(['clusters'], axis=1)
                model_name = file_loader.find_correct_model_file(i)
                model = file_loader.load_model(model_name)
                result = list(model.predict(cluster_data))
                result = pandas.DataFrame(list(zip(wafer_names, result)), columns=['Wafer', 'Prediction'])
                # path = "Predictions.csv"
                pred_result = result.to_csv(header=True)
                self.AzureFunc.uploadBlob("predictionoutputfile", "predictions.csv", pred_result)
                output = self.AzureFunc.readingcsvfile("predictionoutputfile", "predictions.csv")
                # result.to_csv("Predictions.csv", header=True, mode='a+')  # appends result to prediction file
            self.log_writer.log(self.file_object, "Prediction_Log", 'End of Prediction')
        except Exception as ex:
            self.log_writer.log(self.file_object, "Prediction_Log",
                                'Error occurred while running the prediction!! Error:: %s' % ex)
            raise ex
        return output.head().to_json(orient="records")
class MongodBOperation:

    def __init__(self):
        self.logger = App_Logger()
        self.connectionstrings = "DefaultEndpointsProtocol=https;AccountName=trainingbatchfiles;AccountKey=JPHQiUP+0kPN4UlfW+jXZm9EaPg0nsSUd9MZMLnhpjmJZnO7OXiemYqM+vosRjXA8MLOTqV2fsDEAmz6tIjGFw==;EndpointSuffix=core.windows.net"
        self.AzureFunc = Azure_Functions(self.connectionstrings)
        self.client = pymongo.MongoClient(
            "mongodb+srv://demo:[email protected]/myFirstDatabase?retryWrites=true&w=majority")
        self.db = self.client.test

    def dataBaseConnection(self):
        try:
            self.client = pymongo.MongoClient(
                "mongodb+srv://demo:[email protected]/myFirstDatabase?retryWrites=true&w=majority")
            self.db = self.client.test
            self.logger.log("Prediction_Log", "DataBaseConnectionLog",
                            "Opened %s database successfully" % self.db.name)
        except Exception as e:
            self.logger.log("Prediction_Log", "DataBaseConnectionLog",
                            "Error while connecting to database: %s" % e)

    def createcollectionDB(self, DatabaseName):
        try:
            db = self.client[DatabaseName]  # use the name passed in, not a string literal
            self.collection = db["predgood_data"]
            self.logger.log("Prediction_Log", "DbcollectionCreateLog",
                            "collection created successfully!!")
        except Exception as e:
            self.logger.log("Prediction_Log", "DbcollectionCreateLog",
                            "Error while creating collection: %s " % e)
            raise e

    def insertIntoTableGoodData(self):
        try:
            filename = self.AzureFunc.gettingcsvfile("predictiongoodraw")
            for file in filename:
                df = self.AzureFunc.readingcsvfile("predictiongoodraw", file)
                js = df.to_json()
                jsdata = json.loads(js)
                self.collection.insert_one(jsdata)
                self.logger.log("Prediction_Log", "DbInsertLog",
                                " %s: File loaded successfully!!" % file)
        except Exception as e:
            self.logger.log("Prediction_Log", "DbInsertLog",
                            "Error while inserting data into collection: %s " % e)
        # for files in self.AzureFunc.gettingcsvfile("predictiongoodraw"):
        #     csv = self.AzureFunc.readingcsvfile("predictiongoodraw", files)
        #     js = csv.to_json()
        #     jsdata = json.loads(js)
        #     self.collection.insert_one(jsdata)

    def selectingDatafromtableintocsv(self):
        try:
            li = []
            for i in self.collection.find():
                di = dict(i)
                di.pop('_id')
                jsda = json.dumps(di)
                json_data = json.loads(jsda)
                jfdf = pd.DataFrame(json_data.values()).T
                jfdf.columns = json_data.keys()
                li.append(jfdf)
            frame = pd.concat(li, ignore_index=True)
            upload = frame.to_csv()
            self.AzureFunc.uploadBlob("predinputdata", "predinputdata.csv", upload)
            self.collection.drop()
            print(frame)
            self.logger.log("Prediction_Log", "ExportToCsv",
                            "File exported successfully on Azure !!!")
        except Exception as e:
            self.logger.log("Prediction_Log", "ExportToCsv",
                            "File exporting failed. Error : %s" % e)
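# Illustrative usage sketch (not in the original source): the round trip that pred_validation
# drives further below -- create the collection, push the validated prediction CSVs into MongoDB,
# then export them back to a single CSV on Azure. The database name "predictionfiles" matches
# the value pred_validation passes in.
if __name__ == "__main__":
    db_op = MongodBOperation()
    db_op.createcollectionDB("predictionfiles")    # opens the predgood_data collection
    db_op.insertIntoTableGoodData()                # one document per CSV in "predictiongoodraw"
    db_op.selectingDatafromtableintocsv()          # concatenates documents and uploads predinputdata.csv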
class Raw_Data_validation:

    def __init__(self, path):
        self.Batch_Directory = path
        self.schema_path = 'schema_training.json'
        self.connectionstrings = "DefaultEndpointsProtocol=https;AccountName=trainingbatchfiles;AccountKey=JPHQiUP+0kPN4UlfW+jXZm9EaPg0nsSUd9MZMLnhpjmJZnO7OXiemYqM+vosRjXA8MLOTqV2fsDEAmz6tIjGFw==;EndpointSuffix=core.windows.net"
        self.move_rawfile = Azure_Functions(self.connectionstrings)
        self.logger = App_Logger()
        self.mail = mail("*****@*****.**")

    def valuesfromschema(self):
        """
        Method Name: valuesFromSchema
        Description: This method extracts all the relevant information from the pre-defined "Schema" file.
        Output: LengthOfDateStampInFile, LengthOfTimeStampInFile, column_names, Number of Columns
        On Failure: Raise ValueError, KeyError, Exception

        Written By: Rahul Kashyap
        Version: 1.0
        Revisions: None
        """
        try:
            client = pymongo.MongoClient(
                "mongodb+srv://demo:[email protected]/myFirstDatabase?retryWrites=true&w=majority")
            db = client["Json_data"]
            if "schema_training" in db.list_collection_names():
                collection = db["schema_training"]
                for i in collection.find():
                    d = dict(i)
                    d.pop("_id")
                    js = json.dumps(d)
                    dic = json.loads(js)
                    pattern = dic['SampleFileName']
                    LengthOfDateStampInFile = dic['LengthOfDateStampInFile']
                    LengthOfTimeStampInFile = dic['LengthOfTimeStampInFile']
                    column_names = dic['ColName']
                    NumberofColumns = dic['NumberofColumns']
                print("Successfully loaded schema from MongoDB")
            else:
                with open(self.schema_path, 'r') as f:
                    js = json.load(f)
                collection = db["schema_training"]
                collection.insert_one(js)
                for i in collection.find():
                    d = dict(i)
                    d.pop("_id")
                    js = json.dumps(d)
                    dic = json.loads(js)
                    pattern = dic['SampleFileName']
                    LengthOfDateStampInFile = dic['LengthOfDateStampInFile']
                    LengthOfTimeStampInFile = dic['LengthOfTimeStampInFile']
                    column_names = dic['ColName']
                    NumberofColumns = dic['NumberofColumns']
                print("Inserted the schema into MongoDB and loaded it into the local system")

            message = ("LengthOfDateStampInFile:: %s" % LengthOfDateStampInFile + "\t" +
                       "LengthOfTimeStampInFile:: %s" % LengthOfTimeStampInFile + "\t " +
                       "NumberofColumns:: %s" % NumberofColumns + "\n")
            self.logger.log("Training_Logs", "valuesfromSchemaValidationLog", message)
        except ValueError:
            self.logger.log("Training_Logs", "valuesfromSchemaValidationLog",
                            "ValueError: Value not found inside schema_training.json")
            raise ValueError
        except KeyError:
            self.logger.log("Training_Logs", "valuesfromSchemaValidationLog",
                            "KeyError: Key value error, incorrect key passed")
            raise KeyError
        except Exception as e:
            self.logger.log("Training_Logs", "valuesfromSchemaValidationLog", str(e))
            raise e
        return LengthOfDateStampInFile, LengthOfTimeStampInFile, column_names, NumberofColumns

    def manualRegexCreation(self):
        """
        Method Name: manualRegexCreation
        Description: This method contains a manually defined regex based on the "FileName" given in the "Schema" file.
                     This regex is used to validate the filename of the training data.
        Output: Regex pattern
        On Failure: None

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        regex = r"['wafer']+['\_'']+[\d_]+[\d]+\.csv"
        return regex

    def validationFileNameRaw(self, regex, LengthOfDateStampInFile, LengthOfTimeStampInFile, files):
        """
        Method Name: validationFileNameRaw
        Description: This function validates the name of the training csv files as per the name given in the schema.
                     The regex pattern is used to do the validation. If the name format does not match, the file is
                     moved to the Bad Raw Data folder, otherwise to the Good Raw Data folder.
        Output: None
        On Failure: Exception

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        # pattern = "['Wafer']+['\_'']+[\d_]+[\d]+\.csv"
        # delete the directories for good and bad data in case the last run was unsuccessful and folders were not deleted.
        # create new directories
        onlyfiles = [f for f in self.Batch_Directory]
        try:
            for filename in onlyfiles:
                if re.match(regex, filename):
                    splitAtDot = re.split('.csv', filename)
                    splitAtDot = re.split('_', splitAtDot[0])
                    if len(splitAtDot[1]) == LengthOfDateStampInFile:
                        if len(splitAtDot[2]) == LengthOfTimeStampInFile:
                            self.move_rawfile.copytoanother(filename, "trainingfiles", "goodraw")
                            self.logger.log("Training_Logs", "nameValidationLog",
                                            "Valid File name!! File moved to GoodRaw Folder :: %s" % filename)
                            # shutil.copy("Training_Batch_Files/" + filename, "Training_Raw_files_validated/Good_Raw")
                        else:
                            self.move_rawfile.copytoanother(filename, "trainingfiles", "badraw")
                            self.logger.log("Training_Logs", "nameValidationLog",
                                            "Invalid File Name!! File moved to Bad Raw Folder :: %s" % filename)
                            self.mail.Gmail("File is not Valid",
                                            "Invalid File Name!! File moved to Bad Raw Folder", filename)
                            # shutil.copy("Training_Batch_Files/" + filename, "Training_Raw_files_validated/Bad_Raw")
                    else:
                        self.move_rawfile.copytoanother(filename, "trainingfiles", "badraw")
                        self.logger.log("Training_Logs", "nameValidationLog",
                                        "Invalid File Name!! File moved to Bad Raw Folder :: %s" % filename)
                        self.mail.Gmail("File is not Valid",
                                        "Invalid File Name!! File moved to Bad Raw Folder", filename)
                        # shutil.copy("Training_Batch_Files/" + filename, "Training_Raw_files_validated/Bad_Raw")
                else:
                    self.move_rawfile.copytoanother(filename, "trainingfiles", "badraw")
                    self.logger.log("Training_Logs", "nameValidationLog",
                                    "Invalid File Name!! File moved to Bad Raw Folder :: %s" % filename)
                    self.mail.Gmail("File is not Valid",
                                    "Invalid File Name!! File moved to Bad Raw Folder", filename)
                    # shutil.copy("Training_Batch_Files/" + filename, "Training_Raw_files_validated/Bad_Raw")
        except Exception as e:
            self.logger.log("Training_Logs", "nameValidationLog",
                            "Error occurred while validating FileName %s" % e)
            raise e

    def validateColumnLength(self, NumberofColumns):
        """
        Method Name: validateColumnLength
        Description: This function validates the number of columns in the csv files. It should be the same as given
                     in the schema file. If not, the file is not suitable for processing and is moved to the Bad Raw
                     Data folder. If the column number matches, the file is kept in Good Raw Data for processing.
                     The csv file is missing the first column name; this function changes the missing name to "Wafer".
        Output: None
        On Failure: Exception

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        try:
            self.logger.log("Training_Logs", "nameValidationLog", "Column Length Validation Started!!")
            for files in self.move_rawfile.gettingcsvfile("goodraw"):
                csv = self.move_rawfile.readingcsvfile("goodraw", files)
                if csv.shape[1] == NumberofColumns:
                    pass
                else:
                    self.move_rawfile.copyandDel(files, 'goodraw', 'badraw')
                    # shutil.move("Training_Raw_files_validated/Good_Raw/" + file, "Training_Raw_files_validated/Bad_Raw")
                    self.logger.log("Training_Logs", "nameValidationLog",
                                    "Invalid Column Length for the file!! File moved to Bad Raw Folder :: %s" % files)
                    self.mail.Gmail("File is not Valid",
                                    "Invalid Column Length for the file!! File moved to Bad Raw Folder", files)
            self.logger.log("Training_Logs", "nameValidationLog", "Column Length Validation Completed!!")
        except OSError:
            self.logger.log("Training_Logs", "nameValidationLog",
                            "Error Occurred while moving the file :: %s" % OSError)
            raise OSError
        except Exception as e:
            self.logger.log("Training_Logs", "nameValidationLog", "Error Occurred:: %s" % e)
            raise e

    def validateMissingValuesInWholeColumn(self):
        """
        Method Name: validateMissingValuesInWholeColumn
        Description: This function validates if any column in the csv file has all values missing.
                     If all the values are missing, the file is not suitable for processing.
                     Such files are moved to bad raw data.
        Output: None
        On Failure: Exception

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        try:
            self.logger.log("Training_Logs", "missingValuesInColumn", "Missing Values Validation Started!!")
            for files in self.move_rawfile.gettingcsvfile("goodraw"):
                csv = self.move_rawfile.readingcsvfile("goodraw", files)
                count = 0
                for columns in csv:
                    if (len(csv[columns]) - csv[columns].count()) == len(csv[columns]):
                        count += 1
                        self.move_rawfile.copyandDel(files, "goodraw", "badraw")
                        # shutil.move("Training_Raw_files_validated/Good_Raw/" + file,
                        #             "Training_Raw_files_validated/Bad_Raw")
                        break
                if count == 0:
                    csv.rename(columns={"Unnamed: 0": "Wafer"}, inplace=True)
                    self.move_rawfile.saveDataFrameTocsv("goodraw", files, csv, index=None, header=True)
        except OSError:
            self.logger.log("Training_Logs", "missingValuesInColumn",
                            "Error Occurred while moving the file :: %s" % OSError)
            raise OSError
        except Exception as e:
            self.logger.log("Training_Logs", "missingValuesInColumn", "Error Occurred:: %s" % e)
            raise e
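# Illustrative usage sketch (not in the original source): the order in which the Raw_Data_validation
# methods above are meant to be called, mirroring the sequence pred_validation uses below.
# "trainingfiles" as the batch path is an assumption; the last argument to validationFileNameRaw is
# unused by the method body shown above, which iterates self.Batch_Directory instead.
if __name__ == "__main__":
    validator = Raw_Data_validation("trainingfiles")
    date_len, time_len, column_names, n_columns = validator.valuesfromschema()
    regex = validator.manualRegexCreation()
    validator.validationFileNameRaw(regex, date_len, time_len, validator.Batch_Directory)
    validator.validateColumnLength(n_columns)
    validator.validateMissingValuesInWholeColumn()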
class GUI:

    def __init__(self):
        self.window = tk.Tk()
        self.window.title("Web Automation Tool Edge")
        # driverpath = "./chromedriver"
        self.driverpath = "./msedgedriver"
        self.url = "http://bptravel.blueprism.com"
        self.file_object = open("logs.txt", 'a+')
        self.log_writer = App_Logger()
        self.success = False
        # this removes the maximize button
        self.window.resizable(0, 0)

        window_height = 300
        window_width = 800
        screen_width = self.window.winfo_screenwidth()
        screen_height = self.window.winfo_screenheight()
        x_cordinate = int((screen_width / 2) - (window_width / 2))
        y_cordinate = int((screen_height / 2) - (window_height / 2))
        self.window.geometry("{}x{}+{}+{}".format(window_width, window_height, x_cordinate, y_cordinate))
        # self.window.geometry('880x600')
        self.window.configure(background='#ffffff')
        # self.window.attributes('-fullscreen', True)
        self.window.grid_rowconfigure(0, weight=1)
        self.window.grid_columnconfigure(0, weight=1)

        header = tk.Label(self.window, text="R&D Microsoft Edge Automation with Python",
                          width=60, height=1, fg="white", bg="#363e75", font=('times', 18, 'bold'))
        header.place(x=0, y=0)

        self.button()
        self.filetxt = tk.Entry(self.window, width=50, textvariable="Select the file path",
                                bg="white", fg="black", font=('times', 15))
        self.start_button()
        self.filetxt.place(x=205, y=80)
        self.window.mainloop()

    def button(self):
        self.button = tk.Button(self.window, text="Browse File", command=self.file_dialog,
                                width=15, height=1, fg="white", bg="#363e75", font=('times', 10))
        self.button.place(x=80, y=80)

    def start_button(self):
        self.Start = tk.Button(self.window, text="Start", command=self.create_quotes,
                               fg="white", bg="#363e75", width=10, height=1,
                               activebackground="#118ce1", font=('times', 12, 'bold'))
        self.Start.place(x=320, y=150)

    def file_dialog(self):
        self.FilePath = filedialog.askopenfilenames(
            filetypes=[('Excel File', 'xlsx .xls .csv')],
            initialdir='/', title='Please select Input file')
        if len(self.FilePath) > 0:
            self.filetxt.insert(0, self.FilePath[0])

    def create_quotes(self):
        try:
            if not os.path.exists(self.filetxt.get()):
                messagebox.showinfo("File", "File does not exist")
            else:
                self.bt = BPTravel(driver_path=self.driverpath, url=self.url)
                self.bt.login_bp_travel()
                self.bt.create_quotes(exl_path=self.FilePath[0])
                self.success = True
                if self.success:
                    messagebox.showinfo("BP Travel", "Done")
        except Exception as ex:
            self.log_writer.log(self.file_object,
                                "Error occurred while web processing BP Travel quotes!! Error:: %s" % ex)
            self.file_object.close()
            raise ex
class pred_validation:

    def __init__(self, path):
        self.raw_data = Prediction_Data_validation(path)
        self.dataTransform = dataTransformPredict()
        self.dBOperation = MongodBOperation()
        self.file_object = "Prediction_Logs"
        self.mail = mail("*****@*****.**")
        self.log_writer = App_Logger()

    def prediction_validation(self):
        try:
            self.log_writer.log(self.file_object, "Prediction_Log",
                                'Start of Validation on files for prediction!!')
            # extracting values from prediction schema
            LengthOfDateStampInFile, LengthOfTimeStampInFile, column_names, noofcolumns = self.raw_data.valuesfromschema()
            # getting the regex defined to validate filename
            regex = self.raw_data.manualRegexCreation()
            # validating filename of prediction files
            self.raw_data.validationFileNameRaw(regex, LengthOfDateStampInFile, LengthOfTimeStampInFile, self.raw_data)
            # validating column length in the file
            self.raw_data.validateColumnLength(noofcolumns)
            self.raw_data.validateMissingValuesInWholeColumn()
            self.log_writer.log(self.file_object, "Prediction_Log", "Raw Data Validation Complete!!")

            self.log_writer.log(self.file_object, "Prediction_Log", "Starting Data Transformation!!")
            self.dataTransform.replaceMissingWithNull()
            self.log_writer.log(self.file_object, "Prediction_Log", "DataTransformation Completed!!!")

            self.log_writer.log(self.file_object, "Prediction_Log",
                                "Creating Prediction_Database and collection on the basis of given schema!!!")
            self.dBOperation.createcollectionDB("predictionfiles")
            self.log_writer.log(self.file_object, "Prediction_Log", "collection creation Completed!!")
            self.log_writer.log(self.file_object, "Prediction_Log", "Insertion of Data into collection started!!!!")
            self.dBOperation.insertIntoTableGoodData()
            self.log_writer.log(self.file_object, "Prediction_Log", "Insertion in collection completed!!!")
            self.log_writer.log(self.file_object, "Prediction_Log", "Validation Operation completed!!")
            self.log_writer.log(self.file_object, "Prediction_Log", "Extracting csv file from collection")
            self.dBOperation.selectingDatafromtableintocsv()
        except Exception as e:
            raise e
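# Illustrative end-to-end sketch (not in the original source): how the prediction side is usually
# wired together -- validate and stage the batch files first, then run the model predictions.
# "predictionbatchfiles" is an assumed path; both classes used here are defined in this section
# (prediction above, pred_validation just preceding this block).
if __name__ == "__main__":
    path = "predictionbatchfiles"
    pred_validation(path).prediction_validation()          # validation, transformation, MongoDB staging
    result_json = prediction(path).predictionFromModel()   # clustering + per-cluster model predictions
    print(result_json)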
    def __init__(self, path):
        self.raw_data = Raw_Data_validation(path)
        self.dataTransform = dataTransform()
        self.dBOperation = MongodBOperation()
        self.file_object = "Training_Logs"
        self.log_writer = App_Logger()
import numpy as np
import pandas as pd  # needed for pd.read_csv in get_data below
import matplotlib.pyplot as plt
import seaborn as sns
import graphviz
# from IPython.display import Image
from sklearn.tree import export_graphviz
from sklearn import tree
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from logger import App_Logger
import pickle

log_writer = App_Logger()
# file_object = open("logs/TrainingLogs.txt", 'a+')


def get_data(log_writer, file_object):
    log_writer.log(file_object, 'Started getting the data')
    data = pd.read_csv('winequality_red.csv')
    return data


def check_data(data, log_writer, file_object):
    print(data.head())
    print(data.columns)
    print(data.info())
    print(data.describe())
    print(type(data))
    print(data.shape)
class trainModel:

    def __init__(self):
        self.log_writer = App_Logger()
        self.file_object = "Training_Logs"

    def trainingModel(self):
        # Logging the start of Training
        self.log_writer.log(self.file_object, "ModelTrainingLog", 'Start of Training')
        # the try/except belongs inside the method so that self is in scope when logging a failure
        try:
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.file_object, self.log_writer)
            data = data_getter.get_data()

            """doing the data preprocessing"""
            preprocessor = Preprocessor(self.file_object, self.log_writer)
            data = preprocessor.remove_columns(data, ['Wafer', "Unnamed: 0"])  # remove the unnamed column as it doesn't contribute to prediction.

            # create separate features and labels
            X, Y = preprocessor.separate_label_feature(data, label_column_name='Good/Bad')

            is_null_present = preprocessor.is_null_present(X)
            # if missing values are there, replace them appropriately.
            if is_null_present:
                X = preprocessor.impute_missing_values(X)  # missing value imputation

            # check further which columns do not contribute to predictions:
            # if the standard deviation for a column is zero, the column has constant values
            # and gives the same output for both good and bad sensors,
            # so prepare the list of such columns to drop
            cols_to_drop = preprocessor.get_columns_with_zero_std_deviation(X)

            # drop the columns obtained above
            X = preprocessor.remove_columns(X, cols_to_drop)

            """ Applying the clustering approach"""
            kmeans = clustering.KMeansClustering(self.file_object, self.log_writer)  # object initialization
            number_of_clusters = kmeans.elbow_plot(X)  # using the elbow plot to find the optimum number of clusters

            # Divide the data into clusters
            X = kmeans.create_clusters(X, number_of_clusters)

            # create a new column in the dataset consisting of the corresponding cluster assignments.
            X['Labels'] = Y

            # getting the unique clusters from our dataset
            list_of_clusters = X['Cluster'].unique()

            """parsing all the clusters and looking for the best ML algorithm to fit on individual cluster"""
            for i in list_of_clusters:
                cluster_data = X[X['Cluster'] == i]  # filter the data for one cluster

                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'], axis=1)
                cluster_label = cluster_data['Labels']

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(cluster_features, cluster_label,
                                                                    test_size=1 / 3, random_state=355)

                model_finder = tuner.Model_Finder(self.file_object, self.log_writer)  # object initialization

                # getting the best model for each of the clusters
                best_model_name, best_model = model_finder.get_best_model(x_train, y_train, x_test, y_test)

                # saving the best model to the directory.
                file_op = file_methods.File_Operation(self.file_object, self.log_writer)
                save_model = file_op.save_model(best_model, best_model_name + str(i))

            # logging the successful Training
            self.log_writer.log(self.file_object, "ModelTrainingLog", 'Successful End of Training')
        except Exception:
            # logging the unsuccessful Training
            self.log_writer.log(self.file_object, "ModelTrainingLog", 'Unsuccessful End of Training')
            raise Exception
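# Illustrative sketch (not in the original source): kicking off training. In the full pipeline the
# raw-data validation and MongoDB insertion step runs first via the constructor shown earlier
# (Raw_Data_validation, dataTransform, MongodBOperation); its enclosing class is not included in
# this section, so only the trainModel call is shown here.
if __name__ == "__main__":
    trainer = trainModel()
    trainer.trainingModel()   # preprocess, cluster, tune and persist one model per cluster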