class dataTransform:
    """Normalise the CSV files stored in the ``goodraw`` container.

    NOTE(review): the storage connection string below embeds the account
    key in source; it should be rotated and loaded from configuration.
    """

    def __init__(self):
        self.connectionstrings = "DefaultEndpointsProtocol=https;AccountName=trainingbatchfiles;AccountKey=JPHQiUP+0kPN4UlfW+jXZm9EaPg0nsSUd9MZMLnhpjmJZnO7OXiemYqM+vosRjXA8MLOTqV2fsDEAmz6tIjGFw==;EndpointSuffix=core.windows.net"
        self.good_raw = Azure_Functions(self.connectionstrings)
        self.logger = App_Logger()

    def replaceMissingWithNull(self):
        """Fill missing cells with ``'NULL'`` and trim the ``Wafer`` column.

        Every file listed by ``gettingcsvfile("goodraw")`` is read, its
        missing values are replaced with the string ``'NULL'``, the first
        six characters of each ``Wafer`` value are dropped, and the frame is
        saved back under the same name.

        Any exception is logged to ``dataTransformLog`` and swallowed, so
        the method always returns ``None``.
        """
        container = "goodraw"
        try:
            # Materialise the listing first, exactly as the files are rewritten below.
            names = list(self.good_raw.gettingcsvfile(container))
            for name in names:
                frame = self.good_raw.readingcsvfile(container, name)
                frame.fillna('NULL', inplace=True)
                frame['Wafer'] = frame['Wafer'].str.slice(start=6)
                self.good_raw.saveDataFrameTocsv(
                    container, name, frame, index=None, header=True)
                success = " %s: File Transformed successfully!!" % name
                self.logger.log("Training_Logs", "dataTransformLog", success)
        except Exception as e:
            failure = "Data Transformation failed because:: %s" % e
            self.logger.log("Training_Logs", "dataTransformLog", failure)
# Example #2
# 0
class Data_Getter_Pred:
    """Load the prediction input CSV through ``Azure_Functions``.

    NOTE(review): the storage connection string below embeds the account
    key in source; it should be rotated and loaded from configuration.
    """

    def __init__(self, file_object, logger_object):
        """Store the logging handles and create the storage helper.

        Args:
            file_object: First argument passed to every ``logger_object.log`` call.
            logger_object: Object exposing ``log(file_object, name, message)``.
        """
        self.connectionstrings = "DefaultEndpointsProtocol=https;AccountName=trainingbatchfiles;AccountKey=JPHQiUP+0kPN4UlfW+jXZm9EaPg0nsSUd9MZMLnhpjmJZnO7OXiemYqM+vosRjXA8MLOTqV2fsDEAmz6tIjGFw==;EndpointSuffix=core.windows.net"
        self.AzureFunc = Azure_Functions(self.connectionstrings)
        self.prediction_file = "predinputdata.csv"
        self.file_object = file_object
        self.logger_object = logger_object

    def get_data(self):
        """Read ``self.prediction_file`` from the ``predinputdata`` container.

        Returns:
            Whatever ``AzureFunc.readingcsvfile`` returns; it is also stored
            on ``self.data``.

        Raises:
            Exception: if reading (or the success log call) fails. The
                exception carries the original error's message and has the
                original error as its ``__cause__``.
        """
        self.logger_object.log(
            self.file_object, "Prediction_Log",
            'Entered the get_data method of the Data_Getter class')
        try:
            self.data = self.AzureFunc.readingcsvfile("predinputdata",
                                                      self.prediction_file)
            self.logger_object.log(
                self.file_object, "Prediction_Log",
                'Data Load Successful.Exited the get_data method of the Data_Getter class'
            )
            return self.data
        except Exception as e:
            self.logger_object.log(
                self.file_object, "Prediction_Log",
                'Exception occured in get_data method of the Data_Getter class. Exception message: '
                + str(e))
            self.logger_object.log(
                self.file_object, "Prediction_Log",
                'Data Load Unsuccessful.Exited the get_data method of the Data_Getter class'
            )
            # Keep the plain ``Exception`` type callers already see, but do not
            # throw away the message and cause as a bare ``Exception()`` did.
            raise Exception(str(e)) from e
# Example #3
# 0
class mail:
    """Send a notification e-mail with a ``badraw`` file attached.

    NOTE(review): the storage connection string below embeds the account
    key in source; it should be rotated and loaded from configuration.
    """

    def __init__(self, receiveraddr):
        """Remember the recipient and create the storage helper.

        Args:
            receiveraddr: Address placed in the ``To`` header and passed to
                ``sendmail`` as the recipient.
        """
        self.email_user = "******"
        self.email_password = "******"
        self.receiveraddr = receiveraddr
        self.Azurefunc = Azure_Functions(
            "DefaultEndpointsProtocol=https;AccountName=trainingbatchfiles;AccountKey=JPHQiUP+0kPN4UlfW+jXZm9EaPg0nsSUd9MZMLnhpjmJZnO7OXiemYqM+vosRjXA8MLOTqV2fsDEAmz6tIjGFw==;EndpointSuffix=core.windows.net"
        )

    def Gmail(self, subject, body, filename):
        """Mail ``filename`` from the ``badraw`` container as an attachment.

        Args:
            subject: Value of the ``Subject`` header.
            body: Plain-text message body.
            filename: Name of the file read from ``badraw``; also used as
                the attachment's file name.

        Raises:
            Exception: any error from reading the file, building the message
                or talking to the SMTP server propagates unchanged.
        """
        msg = MIMEMultipart()
        msg['From'] = self.email_user
        msg['To'] = self.receiveraddr
        msg['Subject'] = subject
        msg.attach(MIMEText(body, 'plain'))

        df = self.Azurefunc.readingcsvfile("badraw", filename)
        # NOTE(review): to_csv returns a str when no path is given, so the
        # encoding argument likely has no effect here - confirm.
        attachment = df.to_csv(encoding="ISO-8859-1")

        part = MIMEBase('application', 'octet-stream')
        part.set_payload(attachment)
        encoders.encode_base64(part)
        part.add_header('Content-Disposition',
                        "attachment; filename= " + filename)
        msg.attach(part)
        text = msg.as_string()

        # The context manager sends QUIT and closes the socket even when
        # starttls/login/sendmail raises, so the connection is never leaked.
        with smtplib.SMTP('smtp.gmail.com', 587) as server:
            server.starttls()
            server.login(self.email_user, self.email_password)
            server.sendmail(self.email_user, self.receiveraddr, text)
# Example #4
# 0
class prediction:
    """Run the stored clustering model and per-cluster models on new data.

    NOTE(review): the storage connection string below embeds the account
    key in source; it should be rotated and loaded from configuration.
    """

    def __init__(self, path):
        """Create the logger, the storage helper and (optionally) the validator.

        Args:
            path: Passed to ``Prediction_Data_validation``. When ``None`` no
                validator is created and ``predictionFromModel`` will fail
                on its first step.
        """
        self.log_writer = App_Logger()
        self.file_object = "Prediction_Logs"
        self.connectionstrings = "DefaultEndpointsProtocol=https;AccountName=trainingbatchfiles;AccountKey=JPHQiUP+0kPN4UlfW+jXZm9EaPg0nsSUd9MZMLnhpjmJZnO7OXiemYqM+vosRjXA8MLOTqV2fsDEAmz6tIjGFw==;EndpointSuffix=core.windows.net"
        self.AzureFunc = Azure_Functions(self.connectionstrings)
        if path is not None:
            self.pred_data_val = Prediction_Data_validation(path)

    def _predict_clusters(self, data, file_loader):
        """Assign each row to a cluster and predict it with that cluster's model.

        Args:
            data: DataFrame holding a ``Wafer`` column plus the feature
                columns. A ``clusters`` column is added to it in place.
            file_loader: Object exposing ``load_model(name)`` and
                ``find_correct_model_file(cluster)``.

        Returns:
            DataFrame with columns ``Wafer`` and ``Prediction`` covering the
            rows of every cluster (empty when ``data`` has no rows).
        """
        kmeans = file_loader.load_model('KMeans')
        # The wafer name is an identifier, not a feature.
        data['clusters'] = kmeans.predict(data.drop(['Wafer'], axis=1))

        frames = []
        for cluster_id in data['clusters'].unique():
            cluster_rows = data[data['clusters'] == cluster_id]
            wafer_names = list(cluster_rows['Wafer'])
            # Predict only on this cluster's rows so that predictions line up
            # with ``wafer_names`` (dropping from the full frame misaligned them).
            features = cluster_rows.drop(labels=['Wafer', 'clusters'], axis=1)
            model_name = file_loader.find_correct_model_file(cluster_id)
            model = file_loader.load_model(model_name)
            predictions = list(model.predict(features))
            frames.append(
                pandas.DataFrame(list(zip(wafer_names, predictions)),
                                 columns=['Wafer', 'Prediction']))

        if not frames:
            return pandas.DataFrame(columns=['Wafer', 'Prediction'])
        return pandas.concat(frames, ignore_index=True)

    def predictionFromModel(self):
        """Predict for the current input and publish ``predictions.csv``.

        Steps: delete the previous prediction file, load the input data,
        drop the ``Unnamed: 0`` column, impute missing values when the
        preprocessor reports any, drop the columns the preprocessor reports
        as having zero standard deviation, predict per cluster, upload the
        combined result once and read it back.

        Returns:
            JSON string (``orient="records"``) of the first rows of the
            uploaded prediction file.

        Raises:
            Exception: any failure is logged to ``Prediction_Log`` and
                re-raised unchanged.
        """
        try:
            self.pred_data_val.deletePredictionFile()
            self.log_writer.log(self.file_object, "Prediction_Log",
                                'Start of Prediction')
            data_getter = data_loader_prediction.Data_Getter_Pred(
                self.file_object, self.log_writer)
            data = data_getter.get_data()

            preprocessor = Preprocessor(self.file_object, self.log_writer)
            data = preprocessor.remove_columns(data, ["Unnamed: 0"])
            if preprocessor.is_null_present(data):
                data = preprocessor.impute_missing_values(data)
            cols_to_drop = preprocessor.get_columns_with_zero_std_deviation(
                data)
            data = preprocessor.remove_columns(data, cols_to_drop)

            file_loader = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
            result = self._predict_clusters(data, file_loader)

            # Upload once with every cluster's rows; uploading inside the
            # per-cluster loop left only the last cluster in the blob.
            self.AzureFunc.uploadBlob("predictionoutputfile",
                                      "predictions.csv",
                                      result.to_csv(header=True))
            output = self.AzureFunc.readingcsvfile("predictionoutputfile",
                                                   "predictions.csv")
            self.log_writer.log(self.file_object, "Prediction_Log",
                                'End of Prediction')
        except Exception as ex:
            self.log_writer.log(
                self.file_object, "Prediction_Log",
                'Error occured while running the prediction!! Error:: %s' % ex)
            raise
        return output.head().to_json(orient="records")
# Example #5
# 0
class MongodBOperation:
    """Stage validated prediction files in MongoDB and export them as one CSV.

    NOTE(review): the storage and MongoDB connection strings below embed
    credentials in source; they should be rotated and loaded from
    configuration.
    """

    def __init__(self):
        self.logger = App_Logger()
        self.connectionstrings = "DefaultEndpointsProtocol=https;AccountName=trainingbatchfiles;AccountKey=JPHQiUP+0kPN4UlfW+jXZm9EaPg0nsSUd9MZMLnhpjmJZnO7OXiemYqM+vosRjXA8MLOTqV2fsDEAmz6tIjGFw==;EndpointSuffix=core.windows.net"
        self.AzureFunc = Azure_Functions(self.connectionstrings)
        self.client = pymongo.MongoClient(
            "mongodb+srv://demo:[email protected]/myFirstDatabase?retryWrites=true&w=majority"
        )
        self.db = self.client.test

    def dataBaseConnection(self):
        """Recreate the client, select the ``test`` database and log the outcome.

        Failures are logged to ``DataBaseConnectionLog`` and swallowed.
        """
        try:
            self.client = pymongo.MongoClient(
                "mongodb+srv://demo:[email protected]/myFirstDatabase?retryWrites=true&w=majority"
            )
            self.db = self.client.test
            # Log the database actually selected; the previous code referenced
            # an undefined name here and therefore always fell into the handler.
            self.logger.log("Prediction_Log", "DataBaseConnectionLog",
                            "Opened %s database successfully" % self.db.name)
        except Exception as e:
            # Report the caught error, not the builtin ConnectionError class.
            self.logger.log(
                "Prediction_Log", "DataBaseConnectionLog",
                "Error while connecting to database: %s" % e)

    def createcollectionDB(self, DatabaseName):
        """Select ``predgood_data`` in ``DatabaseName`` and keep it on ``self.collection``.

        Args:
            DatabaseName: Name of the database that holds the collection.

        Raises:
            Exception: any failure is logged to ``DbcollectionCreateLog``
                and re-raised unchanged.
        """
        try:
            # Use the caller's database name rather than the literal "DatabaseName".
            db = self.client[DatabaseName]
            self.collection = db["predgood_data"]
            self.logger.log("Prediction_Log", "DbcollectionCreateLog",
                            "collection created successfully!!")
        except Exception as e:
            self.logger.log("Prediction_Log", "DbcollectionCreateLog",
                            "Error while creating collection: %s " % e)
            raise

    def insertIntoTableGoodData(self):
        """Insert every file of ``predictiongoodraw`` as one document.

        Each file is read, converted with ``to_json`` and inserted into
        ``self.collection`` (set by ``createcollectionDB``). Failures are
        logged to ``DbInsertLog`` and swallowed, which stops the loop at the
        first failing file.
        """
        try:
            for file in self.AzureFunc.gettingcsvfile("predictiongoodraw"):
                df = self.AzureFunc.readingcsvfile("predictiongoodraw", file)
                document = json.loads(df.to_json())
                self.collection.insert_one(document)
                self.logger.log("Prediction_Log", "DbInsertLog",
                                " %s: File loaded successfully!!" % file)
        except Exception as e:
            self.logger.log(
                "Prediction_Log", "DbInsertLog",
                "Error while inserting data into collection: %s " % e)

    def selectingDatafromtableintocsv(self):
        """Combine all stored documents into ``predinputdata.csv`` and drop the collection.

        Each document (minus ``_id``) is turned back into a DataFrame, the
        frames are concatenated, the CSV text is uploaded to the
        ``predinputdata`` container and the collection is dropped. Failures
        (including an empty collection) are logged to ``ExportToCsv`` and
        swallowed.
        """
        try:
            frames = []
            for document in self.collection.find():
                fields = dict(document)
                fields.pop('_id')
                # Round-trip through JSON, as before, before rebuilding the frame.
                json_data = json.loads(json.dumps(fields))
                frame = pd.DataFrame(json_data.values()).T
                frame.columns = json_data.keys()
                frames.append(frame)

            frame = pd.concat(frames, ignore_index=True)
            self.AzureFunc.uploadBlob("predinputdata", "predinputdata.csv",
                                      frame.to_csv())
            self.collection.drop()
            print(frame)
            self.logger.log("Prediction_Log", "ExportToCsv",
                            "File exported successfully on Azure !!!")
        except Exception as e:
            self.logger.log("Prediction_Log", "ExportToCsv",
                            "File exporting failed. Error : %s" % e)
class Raw_Data_validation:
    """Validate raw training files by name, column count and missing columns.

    NOTE(review): the storage and MongoDB connection strings below embed
    credentials in source; they should be rotated and loaded from
    configuration.
    """

    def __init__(self, path):
        """Store the batch listing and create the storage, logging and mail helpers.

        Args:
            path: Iterable of file names to validate; it is iterated by
                ``validationFileNameRaw``.
        """
        self.Batch_Directory = path
        self.schema_path = 'schema_training.json'
        self.connectionstrings = "DefaultEndpointsProtocol=https;AccountName=trainingbatchfiles;AccountKey=JPHQiUP+0kPN4UlfW+jXZm9EaPg0nsSUd9MZMLnhpjmJZnO7OXiemYqM+vosRjXA8MLOTqV2fsDEAmz6tIjGFw==;EndpointSuffix=core.windows.net"
        self.move_rawfile = Azure_Functions(self.connectionstrings)
        self.logger = App_Logger()
        self.mail = mail("*****@*****.**")

    def valuesfromschema(self):
        """Load the training schema values from the ``schema_training`` collection.

        When the ``Json_data`` database has no ``schema_training``
        collection, the local file ``self.schema_path`` is inserted into it
        first. The values of the last document returned by ``find()`` win.

        Returns:
            Tuple ``(LengthOfDateStampInFile, LengthOfTimeStampInFile,
            column_names, NumberofColumns)``.

        Raises:
            ValueError: a value error occurred, or the collection holds no
                schema document.
            KeyError: a schema document lacks one of the expected keys.
            Exception: any other failure. All are logged and re-raised.
        """
        client = None
        try:
            client = pymongo.MongoClient("mongodb+srv://demo:[email protected]/myFirstDatabase?retryWrites=true&w=majority")
            db = client["Json_data"]
            if "schema_training" in db.list_collection_names():
                collection = db["schema_training"]
                loaded_message = "Successfully loaded data in local"
            else:
                with open(self.schema_path, 'r') as f:
                    schema = json.load(f)
                collection = db["schema_training"]
                collection.insert_one(schema)
                loaded_message = "inerted data in mongo db and load in local system"

            found = False
            for document in collection.find():
                fields = dict(document)
                fields.pop("_id")
                dic = json.loads(json.dumps(fields))
                LengthOfDateStampInFile = dic['LengthOfDateStampInFile']
                LengthOfTimeStampInFile = dic['LengthOfTimeStampInFile']
                column_names = dic['ColName']
                NumberofColumns = dic['NumberofColumns']
                found = True
                print(loaded_message)

            if not found:
                # Fail with a clear error instead of an unbound-variable crash.
                raise ValueError("schema_training collection contains no schema document")

            message = "LengthOfDateStampInFile:: %s\tLengthOfTimeStampInFile:: %s\t NumberofColumns:: %s\n" % (
                LengthOfDateStampInFile, LengthOfTimeStampInFile, NumberofColumns)
            self.logger.log("Training_Logs", "valuesfromSchemaValidationLog", message)

        # NOTE(review): the log names below end with a trailing space, unlike
        # the success path above - confirm whether that is intentional.
        except ValueError:
            self.logger.log("Training_Logs", "valuesfromSchemaValidationLog ", "ValueError:Value not found inside schema_training.json")
            raise
        except KeyError:
            self.logger.log("Training_Logs", "valuesfromSchemaValidationLog ", "KeyError:Key value error incorrect key passed")
            raise
        except Exception as e:
            self.logger.log("Training_Logs", "valuesfromSchemaValidationLog ", str(e))
            raise
        finally:
            # Release the connection opened for this lookup.
            if client is not None:
                client.close()

        return LengthOfDateStampInFile, LengthOfTimeStampInFile, column_names, NumberofColumns

    def manualRegexCreation(self):
        """Return the hand-written regex used to validate training file names.

        The pattern text is unchanged; it is written as a raw string so the
        backslashes are no longer invalid string escape sequences.
        """
        regex = r"['wafer']+['\_'']+[\d_]+[\d]+\.csv"
        return regex

    def _rejectInvalidName(self, filename):
        """Copy ``filename`` to ``badraw``, log the rejection and mail a notice."""
        self.move_rawfile.copytoanother(filename, "trainingfiles", "badraw")
        self.logger.log("Training_Logs", "nameValidationLog", "Invalid File Name!! File moved to Bad Raw Folder :: %s" % filename)
        self.mail.Gmail("File is not Valid", "Invalid File Name!! File moved to Bad Raw Folder", filename)

    def validationFileNameRaw(self, regex, LengthOfDateStampInFile, LengthOfTimeStampInFile, files):
        """Sort the batch files into ``goodraw`` or ``badraw`` by file name.

        A name is valid when it matches ``regex`` at its start and, after
        removing ``.csv`` and splitting on ``_``, its second and third parts
        have the expected date and time stamp lengths. A matching name with
        fewer than three parts is rejected rather than raising.

        Args:
            regex: Pattern the file name must match (``re.match``).
            LengthOfDateStampInFile: Required length of the date stamp part.
            LengthOfTimeStampInFile: Required length of the time stamp part.
            files: Unused; the names come from ``self.Batch_Directory``.

        Raises:
            Exception: any failure while copying, logging or mailing is
                logged to ``nameValidationLog`` and re-raised unchanged.
        """
        onlyfiles = list(self.Batch_Directory)
        try:
            for filename in onlyfiles:
                is_valid = False
                if re.match(regex, filename):
                    parts = re.split(r'\.csv', filename)[0].split('_')
                    is_valid = (len(parts) > 2
                                and len(parts[1]) == LengthOfDateStampInFile
                                and len(parts[2]) == LengthOfTimeStampInFile)

                if is_valid:
                    self.move_rawfile.copytoanother(filename, "trainingfiles", "goodraw")
                    self.logger.log("Training_Logs", "nameValidationLog", "Valid File name!! File moved to GoodRaw Folder :: %s" % filename)
                else:
                    self._rejectInvalidName(filename)

        except Exception as e:
            self.logger.log("Training_Logs", "nameValidationLog", "Error occured while validating FileName %s" % e)
            raise

    def validateColumnLength(self, NumberofColumns):
        """Move ``goodraw`` files with the wrong number of columns to ``badraw``.

        Args:
            NumberofColumns: Expected column count of every file.

        Raises:
            OSError: logged and re-raised unchanged.
            Exception: any other failure, logged and re-raised unchanged.
        """
        try:
            self.logger.log("Training_Logs", "nameValidationLog", "Column Length Validation Started!!")
            for files in self.move_rawfile.gettingcsvfile("goodraw"):
                csv = self.move_rawfile.readingcsvfile("goodraw", files)
                if csv.shape[1] == NumberofColumns:
                    continue
                self.move_rawfile.copyandDel(files, 'goodraw', 'badraw')
                self.logger.log("Training_Logs", "nameValidationLog", "Invalid Column Length for the file!! File moved to Bad Raw Folder :: %s" % files)
                self.mail.Gmail("File is not Valid", "Invalid Column Length for the file!! File moved to Bad Raw Folder", files)

            self.logger.log("Training_Logs", "nameValidationLog", "Column Length Validation Completed!!")
        except OSError as e:
            # Log the actual error instance and keep its details when re-raising.
            self.logger.log("Training_Logs", "nameValidationLog", "Error Occured while moving the file :: %s" % e)
            raise
        except Exception as e:
            self.logger.log("Training_Logs", "nameValidationLog", "Error Occured:: %s" % e)
            raise

    def validateMissingValuesInWholeColumn(self):
        """Move ``goodraw`` files that contain a fully empty column to ``badraw``.

        Files that pass have their ``Unnamed: 0`` column renamed to
        ``Wafer`` and are saved back to ``goodraw``.

        Raises:
            OSError: logged and re-raised unchanged.
            Exception: any other failure, logged and re-raised unchanged.
        """
        try:
            self.logger.log("Training_Logs", "missingValuesInColumn", "Missing Values Validation Started!!")

            for files in self.move_rawfile.gettingcsvfile("goodraw"):
                csv = self.move_rawfile.readingcsvfile("goodraw", files)
                # A column whose non-null count is zero has every value missing.
                has_empty_column = any(csv[column].count() == 0 for column in csv)
                if has_empty_column:
                    self.move_rawfile.copyandDel(files, "goodraw", "badraw")
                else:
                    csv.rename(columns={"Unnamed: 0": "Wafer"}, inplace=True)
                    self.move_rawfile.saveDataFrameTocsv("goodraw", files, csv, index=None, header=True)

        except OSError as e:
            # Log the actual error instance and keep its details when re-raising.
            self.logger.log("Training_Logs", "missingValuesInColumn", "Error Occured while moving the file :: %s" % e)
            raise
        except Exception as e:
            self.logger.log("Training_Logs", "missingValuesInColumn", "Error Occured:: %s" % e)
            raise