Example #1
class Data_Getter_Pred:
    """
    This class shall be used for obtaining the data from the source for prediction.

    Written By: iNeuron Intelligence
    Version: 1.0
    Revisions: None

    """
    def __init__(self, log_database, log_collection, execution_id):
        #self.prediction_file='Prediction_FileFromDB/InputFile.csv'
        #self.file_object=file_object
        #self.logger_object=logger_object

        self.log_database = log_database
        self.log_collection = log_collection
        self.prediction_directory = "prediction-file-from-db"
        self.filename = "prediction-inputfile.csv"
        self.log_db_writer = App_LoggerDB(execution_id=execution_id)
        self.az_blob_mgt = AzureBlobManagement()

    def get_data(self):
        """
        Method Name: get_data
        Description: This method reads the data from source.
        Output: A pandas DataFrame.
        On Failure: Raise Exception

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None

        """
        self.log_db_writer.log(
            self.log_database, self.log_collection,
            'Entered the get_data method of the Data_Getter_Pred class')
        print("Entered the get_data method of the Data_Getter_Pred class")
        try:
            #self.data= pd.read_csv(self.filename) # reading the data file
            self.data = self.az_blob_mgt.readCSVFilefromDir(
                self.prediction_directory, self.filename)
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Data Load Successful. Exited the get_data method of the Data_Getter_Pred class'
            )
            return self.data
        except Exception as e:
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Exception occurred in get_data method of the Data_Getter_Pred class. Exception message: '
                + str(e))
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Data Load Unsuccessful. Exited the get_data method of the Data_Getter_Pred class'
            )
            raise e
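
A minimal usage sketch (hedged: the database/collection names and execution id below are illustrative assumptions, and the project-local App_LoggerDB and AzureBlobManagement helpers must be importable):

data_getter = Data_Getter_Pred(log_database="wafer_prediction_log",
                               log_collection="data_loader_log",
                               execution_id="exec-0001")
df = data_getter.get_data()  # pandas DataFrame read from the blob directory
print(df.shape)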
Example #2
# third-party import used below; project-local helpers (App_LoggerDB,
# data_loader, preprocessing, clustering, tuner, file_methods) are not shown
# in the source, so their import paths are omitted here.
from sklearn.model_selection import train_test_split


class trainModel:
    def __init__(self, execution_id):
        #self.log_writer = logger.App_Logger()
        #self.file_object = open("Training_Logs/ModelTrainingLog.txt", 'a+')
        self.log_db_writer = App_LoggerDB(execution_id=execution_id)
        self.log_database = "wafer_training_log1"
        self.log_collection = "model_training_log1"
        self.execution_id = execution_id

    def trainingModel(self):
        # Logging the start of Training
        self.log_db_writer.log(self.log_database, self.log_collection,
                               "Start of Training")
        print("training started")
        #self.log_writer.log(self.file_object, 'Start of Training')
        try:
            # Getting the data from the source (data from csv stored in mongodb)
            #data_getter=data_loader.Data_Getter(self.file_object,self.log_writer)
            data_getter = data_loader.Data_Getter(self.log_database,
                                                  self.log_collection,
                                                  self.execution_id)

            data = data_getter.get_data()

            if len(data) == 0:
                self.log_db_writer.log(self.log_database, self.log_collection,
                                       "No record found to train model")
                print("No previous file available")
                return 0
            """doing the data preprocessing"""

            preprocessor = preprocessing.Preprocessor(self.log_database,
                                                      self.log_collection,
                                                      self.execution_id)
            data = preprocessor.remove_columns(
                data, 'Wafer'
            )  # remove the unnamed column as it doesn't contribute to prediction.

            # create separate features and labels
            X, Y = preprocessor.separate_label_feature(
                data, label_column_name='Output')

            # check if missing values are present in the dataset
            is_null_present = preprocessor.is_null_present(X)

            # if missing values are there, replace them appropriately.
            if (is_null_present):
                X = preprocessor.impute_missing_values(
                    X)  # missing value imputation

            # check further which columns do not contribute to predictions
            # if the standard deviation for a column is zero, it means that the column has constant values
            # and they are giving the same output both for good and bad sensors
            # prepare the list of such columns to drop
            cols_to_drop = preprocessor.get_columns_with_zero_std_deviation(X)

            # drop the columns obtained above
            X = preprocessor.remove_columns(X, cols_to_drop)
            print("print X values")
            print(X)
            """ Applying the clustering approach"""

            kmeans = clustering.KMeansClustering(
                execution_id=self.execution_id)  # object initialization.
            number_of_clusters = kmeans.elbow_plot(
                X)  # using the elbow plot to find the optimum number of clusters

            # Divide the data into clusters
            X = kmeans.create_clusters(X, number_of_clusters)
            print("cluster shape details")
            print(X)

            # add the target labels back as a new column in the clustered dataset
            X['Labels'] = Y

            # getting the unique clusters from our dataset
            list_of_clusters = X['Cluster'].unique()
            """parsing all the clusters and looking for the best ML algorithm to fit on individual cluster"""

            for i in list_of_clusters:
                cluster_data = X[X['Cluster'] ==
                                 i]  # filter the data for one cluster

                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                                                     axis=1)
                cluster_label = cluster_data['Labels']

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(
                    cluster_features,
                    cluster_label,
                    test_size=1 / 3,
                    random_state=355)
                print("printing shape of X_train")
                print(x_train.shape)

                model_finder = tuner.Model_Finder(
                    self.log_database, self.log_collection,
                    self.execution_id)  # object initialization

                #getting the best model for each of the clusters
                best_model_name, best_model = model_finder.get_best_model(
                    x_train, y_train, x_test, y_test)

                #saving the best model to the directory.
                file_op = file_methods.File_Operation(self.log_database,
                                                      self.log_collection,
                                                      self.execution_id)
                save_model = file_op.save_model(best_model,
                                                best_model_name + str(i))

            # logging the successful Training
            self.log_db_writer.log(self.log_database, self.log_collection,
                                   'Successful End of Training')
            # self.file_object.close()

        except Exception:
            # logging the unsuccessful Training
            self.log_db_writer.log(self.log_database, self.log_collection,
                                   'Unsuccessful End of Training')
            #self.file_object.close()
            raise Exception
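
A hedged usage sketch for this class (the execution id value is an illustrative assumption; compare the commented-out calls at the end of Example #6):

train_model = trainModel(execution_id="1111")
train_model.trainingModel()  # trains and saves one best model per cluster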
Example #3
import re
from datetime import datetime

# project-local helpers (App_LoggerDB, MongodbOperation, AzureBlobManagement)
# are not shown in the source, so their import paths are omitted here.


class Prediction_Data_validation:
    """
               This class shall be used for handling all the validation done on the Raw Prediction Data!!.

               Written By: iNeuron Intelligence
               Version: 1.0
               Revisions: None

               """
    def __init__(self, path, execution_id):
        self.Batch_Directory = path
        self.execution_id = execution_id
        self.collection_name = "strength_schema_prediction"  #code added by Avnish yadav
        self.database_name = "Wafer-sys"  #code added by Avnish yadav
        self.logger_db_writer = App_LoggerDB(
            execution_id=execution_id)  #code added by Avnish yadav
        self.mongodb = MongodbOperation()
        self.az_blob_mgt = AzureBlobManagement()
        self.good_directory_path = "good-raw-file-prediction-validated"
        self.bad_directory_path = "bad-raw-file-prediction-validated"

    def valuesFromSchema(self):
        """
                                Method Name: valuesFromSchema
                                Description: This method extracts all the relevant information from the pre-defined "Schema" file.
                                Output: LengthOfDateStampInFile, LengthOfTimeStampInFile, column_names, Number of Columns
                                On Failure: Raise ValueError,KeyError,Exception

                                 Written By: iNeuron Intelligence
                                Version: 1.0
                                Revisions: None

                                        """
        log_database = "strength_prediction_log"
        log_collection = "values_from_schema_validation"
        try:
            log_database = "wafer_prediction_log"
            log_collection = "values_from_schema_validation"
            df_schema_training = self.mongodb.getDataFrameofCollection(
                self.database_name, self.collection_name)
            dic = {
                i: df_schema_training.loc[0, i]
                for i in df_schema_training.columns
            }
            del df_schema_training
            #with open(self.schema_path, 'r') as f:
            #    dic = json.load(f)
            #    f.close()
            pattern = dic['SampleFileName']
            LengthOfDateStampInFile = dic['LengthOfDateStampInFile']
            LengthOfTimeStampInFile = dic['LengthOfTimeStampInFile']
            column_names = dic['ColName']
            NumberofColumns = dic['NumberofColumns']

            #file = open("Training_Logs/valuesfromSchemaValidationLog.txt", 'a+')
            message = "LengthOfDateStampInFile:: %s" % LengthOfDateStampInFile + "\t" + "LengthOfTimeStampInFile:: %s" % LengthOfTimeStampInFile + "\t " + "NumberofColumns:: %s" % NumberofColumns + "\n"
            self.logger_db_writer.log(log_database, log_collection, message)

            #file.close()

        except ValueError:
            self.logger_db_writer.log(
                log_database, log_collection,
                "ValueError: value not found inside the schema collection")
            raise ValueError

        except KeyError:
            self.logger_db_writer.log(
                log_database, log_collection,
                "KeyError: incorrect key passed")
            raise KeyError
        except Exception as e:
            self.logger_db_writer.log(log_database, log_collection, str(e))
            raise e

        return LengthOfDateStampInFile, LengthOfTimeStampInFile, column_names, NumberofColumns
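
    # For reference, the schema collection read above is expected to hold a
    # single document whose fields mirror the keys accessed in valuesFromSchema.
    # A hypothetical example (values are illustrative assumptions, not taken
    # from the source):
    #
    #   {"SampleFileName": "cement_strength_08012020_120000.csv",
    #    "LengthOfDateStampInFile": 8,
    #    "LengthOfTimeStampInFile": 6,
    #    "ColName": {"Cement_component_1": "FLOAT", ...},
    #    "NumberofColumns": 9}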

    def manualRegexCreation(self):
        """
                                      Method Name: manualRegexCreation
                                      Description: This method contains a manually defined regex based on the "FileName" given in "Schema" file.
                                                  This Regex is used to validate the filename of the prediction data.
                                      Output: Regex pattern
                                      On Failure: None

                                       Written By: iNeuron Intelligence
                                      Version: 1.0
                                      Revisions: None

                                              """
        regex = "['cement_strength']+['\_'']+[\d_]+[\d]+\.csv"
        return regex
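
    # Note: the character classes in the regex above are loose (each bracketed
    # group matches any one of the listed characters), but the pattern does
    # accept names such as "cement_strength_08012020_120000.csv". A stricter
    # equivalent, assuming an 8-digit date stamp and a 6-digit time stamp as
    # per the schema, would be:
    #
    #   regex = r"cement_strength_\d{8}_\d{6}\.csv"
    #
    #   re.match(regex, "cement_strength_08012020_120000.csv")  # -> match
    #   re.match(regex, "wrong_name.csv")                       # -> None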

    def createDirectoryForGoodBadRawData(self):
        """
                                        Method Name: createDirectoryForGoodBadRawData
                                        Description: This method creates directories to store the Good Data and Bad Data
                                                      after validating the prediction data.

                                        Output: None
                                        On Failure: OSError

                                         Written By: iNeuron Intelligence
                                        Version: 1.0
                                        Revisions: None

                                                """
        log_database = "strength_prediction_log"
        log_collection = "general_log"
        try:
            log_database = "strength_prediction_log"
            log_collection = "general_log"
            self.az_blob_mgt.createDir(self.good_directory_path,
                                       is_replace=True)
            self.az_blob_mgt.createDir(self.bad_directory_path,
                                       is_replace=True)
            msg = self.good_directory_path + " and " + self.bad_directory_path + " created successfully."
            self.logger_db_writer.log(log_database, log_collection, msg)
        except Exception as e:
            msg = "Error Occured in class Prediction_Data_validation method:createDirectoryForGoodBadRawData error: Failed to create directory " + self.good_directory_path + " and " + self.bad_directory_path
            self.logger_db_writer.log(log_database, log_collection, msg)
            raise e

    def deleteExistingGoodDataTrainingFolder(self):
        """
                                            Method Name: deleteExistingGoodDataTrainingFolder
                                            Description: This method deletes the directory made to store the Good Data
                                                          after loading the data in the table. Once the good files are
                                                          loaded in the DB,deleting the directory ensures space optimization.
                                            Output: None
                                            On Failure: OSError

                                             Written By: iNeuron Intelligence
                                            Version: 1.0
                                            Revisions: None

                                                    """
        log_database = "strength_prediction_log"
        log_collection = "general_log"
        try:
            log_database = "strength_prediction_log"
            log_collection = "general_log"
            self.az_blob_mgt.deleteDir(self.good_directory_path)
            self.logger_db_writer.log(
                log_database, log_collection,
                self.good_directory_path + " deleted successfully!!")
        except Exception as e:
            msg = "Error Occured in class Raw_Data_validation method:deleteExistingGoodDataTrainingFolder Error occured while deleting :" + self.good_directory_path
            self.logger_db_writer.log(log_database, log_collection, msg)
            raise e

    def deleteExistingBadDataTrainingFolder(self):
        """
                                            Method Name: deleteExistingBadDataTrainingFolder
                                            Description: This method deletes the directory made to store the bad Data.
                                            Output: None
                                            On Failure: OSError

                                             Written By: iNeuron Intelligence
                                            Version: 1.0
                                            Revisions: None

                                                    """
        log_database = "strength_prediction_log"
        log_collection = "general_log"

        try:
            self.az_blob_mgt.deleteDir(self.bad_directory_path)
            self.logger_db_writer.log(
                log_database, log_collection,
                self.bad_directory_path + " deleted successfully!!")

        except Exception as e:
            msg = "Error Occured in class Raw_Data_validation method:deleteExistingGoodDataTrainingFolder Error occured while deleting :" + self.good_directory_path
            self.logger_db_writer.log(log_database, log_collection, msg)
            raise e

    def moveBadFilesToArchiveBad(self):
        """
                                            Method Name: moveBadFilesToArchiveBad
                                            Description: This method deletes the directory made  to store the Bad Data
                                                          after moving the data in an archive folder. We archive the bad
                                                          files to send them back to the client for invalid data issue.
                                            Output: None
                                            On Failure: OSError

                                             Written By: iNeuron Intelligence
                                            Version: 1.0
                                            Revisions: None

                                                    """
        now = datetime.now()
        date = now.date()
        time = now.strftime("%H%M%S")
        log_database = "strength_prediction_log"
        log_collection = "general_log"

        try:

            # source = 'Training_Raw_files_validated/Bad_Raw/'
            source = self.bad_directory_path
            destination = "lap-" + self.execution_id
            self.logger_db_writer.log(log_database, log_collection,
                                      "Started moving bad raw data..")
            for file in self.az_blob_mgt.getAllFileNameFromDirectory(source):
                self.az_blob_mgt.moveFileinDir(source, destination, file)
                self.logger_db_writer.log(
                    log_database, log_collection, "File:" + file +
                    " moved to directory:" + destination + " successfully.")

            self.logger_db_writer.log(
                log_database, log_collection,
                "All bad raw files moved to directory: " + destination)

            self.az_blob_mgt.deleteDir(source)
            self.logger_db_writer.log(log_database, log_collection,
                                      "Deleted bad raw directory: " + source)

        except Exception as e:
            self.logger_db_writer.log(
                log_database, log_collection,
                "class Raw_Data_validation method:moveBadFilesToArchiveBad Error while moving bad files to archive:"
                + str(e))
            raise e

    def validationFileNameRaw(self, regex, LengthOfDateStampInFile,
                              LengthOfTimeStampInFile):
        """
            Method Name: validationFileNameRaw
            Description: This function validates the name of the prediction csv file as per given name in the schema!
                         Regex pattern is used to do the validation.If name format do not match the file is moved
                         to Bad Raw Data folder else in Good raw data.
            Output: None
            On Failure: Exception

             Written By: iNeuron Intelligence
            Version: 1.0
            Revisions: None

        """
        # recreate the good/bad directories (is_replace=True) so leftovers from an unsuccessful previous run are cleared.
        self.createDirectoryForGoodBadRawData()
        onlyfiles = self.az_blob_mgt.getAllFileNameFromDirectory(
            self.Batch_Directory)
        try:
            log_database = "strength_prediction_log"
            log_collection = "name_validation_log"
            for filename in onlyfiles:
                if (re.match(regex, filename)):
                    splitAtDot = re.split('.csv', filename)
                    splitAtDot = (re.split('_', splitAtDot[0]))
                    if len(splitAtDot[2]) == LengthOfDateStampInFile:
                        if len(splitAtDot[3]) == LengthOfTimeStampInFile:
                            self.az_blob_mgt.CopyFileinDir(
                                self.Batch_Directory, self.good_directory_path,
                                filename)
                            self.logger_db_writer.log(
                                log_database, log_collection,
                                "Valid File name!! File copied to " +
                                self.good_directory_path + "/" + filename)

                        else:
                            self.az_blob_mgt.CopyFileinDir(
                                self.Batch_Directory, self.bad_directory_path,
                                filename)
                            msg = "Invalid File Name !! File moved to " + self.bad_directory_path + filename
                            self.logger_db_writer.log(log_database,
                                                      log_collection, msg)
                    else:
                        self.az_blob_mgt.CopyFileinDir(self.Batch_Directory,
                                                       self.bad_directory_path,
                                                       filename)
                        msg = "Invalid File Name !! File moved to " + self.bad_directory_path + filename
                        self.logger_db_writer.log(log_database, log_collection,
                                                  msg)

                else:
                    self.az_blob_mgt.CopyFileinDir(self.Batch_Directory,
                                                   self.bad_directory_path,
                                                   filename)
                    msg = "Invalid File Name !! File moved to " + self.bad_directory_path + filename
                    self.logger_db_writer.log(log_database, log_collection,
                                              msg)
        except Exception as e:
            msg = "Error occured while validating FileName " + str(e)
            self.logger_db_writer.log(log_database, log_collection, msg)
            raise e

    def validateColumnLength(self, NumberofColumns):
        """
                    Method Name: validateColumnLength
                    Description: This function validates the number of columns in the csv files.
                                 It is should be same as given in the schema file.
                                 If not same file is not suitable for processing and thus is moved to Bad Raw Data folder.
                                 If the column number matches, file is kept in Good Raw Data for processing.
                                The csv file is missing the first column name, this function changes the missing name to "Wafer".
                    Output: None
                    On Failure: Exception

                     Written By: iNeuron Intelligence
                    Version: 1.0
                    Revisions: None

             """
        try:
            log_database = "strength_prediction_log"
            log_collection = "column_validation_log"
            self.logger_db_writer.log(log_database, log_collection,
                                      "Column length validation Started!!")
            #for file in listdir('Prediction_Raw_Files_Validated/Good_Raw/'):
            for file in self.az_blob_mgt.getAllFileNameFromDirectory(
                    self.good_directory_path):
                #csv = pd.read_csv("Prediction_Raw_Files_Validated/Good_Raw/" + file)
                csv = self.az_blob_mgt.readCSVFilefromDir(
                    self.good_directory_path, file)
                print(csv.shape)
                if csv.shape[1] == NumberofColumns:
                    #csv.rename(columns={"Unnamed: 0": "Wafer"}, inplace=True)
                    print(csv)
                    #csv.to_csv("Prediction_Raw_Files_Validated/Good_Raw/" + file, index=None, header=True)
                    self.az_blob_mgt.saveDataFrametoCSV(
                        self.good_directory_path,
                        file,
                        csv,
                        index=None,
                        header=True)
                else:
                    self.az_blob_mgt.moveFileinDir(self.good_directory_path,
                                                   self.bad_directory_path,
                                                   file)
                    self.logger_db_writer.log(
                        log_database, log_collection,
                        "Invalid Column Length for the file!! "
                        "File moved to Bad Raw Folder :: %s" % file)
            self.logger_db_writer.log(log_database, log_collection,
                                      "Column Length Validation Completed!!")

        except Exception as e:
            self.logger_db_writer.log(log_database, log_collection,
                                      'Error occurred:: ' + str(e))
            raise e

    def deletePredictionFile(self):
        try:
            log_database = "strenth_prediction_log"
            log_collection = "general_log"
            directory = "prediction-file"
            filename = "Prediction.csv"
            if directory in self.az_blob_mgt.dir_list:
                filenames = self.az_blob_mgt.getAllFileNameFromDirectory(
                    directory_name=directory)
                if filename in filenames:
                    self.az_blob_mgt.deleteFilefromDir(
                        directory_name=directory, filename=filename)
                    self.logger_db_writer.log(
                        log_database, log_collection, filename +
                        " is deleted from dir:" + directory + " successfully")
        except Exception as e:
            self.logger_db_writer.log(
                log_database, log_collection,
                "Error occurred while deleting prediction file from prediction-file directory: "
                + str(e))
            raise e

    def validateMissingValuesInWholeColumn(self):
        """
                                  Method Name: validateMissingValuesInWholeColumn
                                  Description: This function validates if any column in the csv file has all values missing.
                                               If all the values are missing, the file is not suitable for processing.
                                               SUch files are moved to bad raw data.
                                  Output: None
                                  On Failure: Exception

                                   Written By: iNeuron Intelligence
                                  Version: 1.0
                                  Revisions: None

                              """
        try:
            log_database = "strength_prediction_log"
            log_collection = "missing_values_in_column"
            #f = open("Prediction_Logs/missingValuesInColumn.txt", 'a+')
            #self.logger.log(f, "Missing Values Validation Started!!")
            self.logger_db_writer.log(log_database, log_collection,
                                      "Missing Values Validation Started!!")

            #for file in listdir('Prediction_Raw_Files_Validated/Good_Raw/'):
            for file in self.az_blob_mgt.getAllFileNameFromDirectory(
                    self.good_directory_path):
                #csv = pd.read_csv("Prediction_Raw_Files_Validated/Good_Raw/" + file)
                csv = self.az_blob_mgt.readCSVFilefromDir(
                    self.good_directory_path, file)
                print(csv)
                count = 0
                for columns in csv:
                    if (len(csv[columns]) - csv[columns].count()) == len(
                            csv[columns]):
                        count += 1
                        #shutil.move("Prediction_Raw_Files_Validated/Good_Raw/" + file,
                        #            "Prediction_Raw_Files_Validated/Bad_Raw")
                        self.az_blob_mgt.moveFileinDir(
                            self.good_directory_path, self.bad_directory_path,
                            file)
                        #self.logger.log(f,"Invalid Column Length for the file!! File moved to Bad Raw Folder :: %s" % file)
                        self.logger_db_writer.log(
                            log_database, log_collection,
                            "Column with all values missing!! File moved to Bad Raw Folder :: %s"
                            % file)

                        break
                if count == 0:
                    csv.rename(columns={"Unnamed: 0": "Wafer"}, inplace=True)
                    print("column unnamed may not be present")
                    self.az_blob_mgt.saveDataFrametoCSV(
                        self.good_directory_path,
                        file,
                        csv,
                        index=None,
                        header=True)
                    #csv.to_csv("Prediction_Raw_Files_Validated/Good_Raw/" + file, index=None, header=True)
        except Exception as e:
            self.logger_db_writer.log(log_database, log_collection,
                                      "Error occurred: " + str(e))
            raise e
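
Taken together, a hedged sketch of the order these validation methods are meant to run in (the batch directory name and execution id are illustrative assumptions):

validator = Prediction_Data_validation("prediction-batch-files", "exec-0001")
LengthOfDateStampInFile, LengthOfTimeStampInFile, column_names, NumberofColumns = \
    validator.valuesFromSchema()
regex = validator.manualRegexCreation()
validator.validationFileNameRaw(regex, LengthOfDateStampInFile,
                                LengthOfTimeStampInFile)
validator.validateColumnLength(NumberofColumns)
validator.validateMissingValuesInWholeColumn()
validator.deletePredictionFile()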
Example #4
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from kneed import KneeLocator

# project-local helpers (App_LoggerDB, MongodbOperation, AzureBlobManagement,
# file_methods) are not shown in the source, so their import paths are omitted.


class KMeansClustering:
    """
            This class shall  be used to divide the data into clusters before training.


            Written By: iNeuron Intelligence
            Version: 1.0
            Revisions: None

            """

    def __init__(self, execution_id):
        #self.file_object = file_object
        #self.logger_object = logger_object
        self.log_database = "strength_training_log"
        self.log_collection = "stg-training_main_log"
        self.execution_id = execution_id
        self.log_db_writer = App_LoggerDB(execution_id=execution_id)
        self.mongoDBObject = MongodbOperation()
        self.az_blob_mgt = AzureBlobManagement()

    def elbow_plot(self, data):
        """
        Method Name: elbow_plot
        Description: This method saves the plot to decide the optimum number of clusters to the file.
        Output: A picture saved to the directory
        On Failure: Raise Exception

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None

        """
        self.log_db_writer.log(
            self.log_database, self.log_collection,
            'Entered the elbow_plot method of the KMeansClustering class')
        wcss = []  # initializing an empty list for the within-cluster sum of squares
        try:
            for i in range(1, 11):
                kmeans = KMeans(n_clusters=i, init='k-means++',
                                random_state=42)  # initializing the KMeans object
                kmeans.fit(data)  # fitting the data to the KMeans algorithm
                wcss.append(kmeans.inertia_)
            plt.plot(range(1, 11), wcss)  # WCSS against the number of clusters
            plt.title('The Elbow Method')
            plt.xlabel('Number of clusters')
            plt.ylabel('WCSS')
            #plt.show()
            plt.savefig('preprocessing_data/K-Means_Elbow.PNG')  # saving the elbow plot locally
            # finding the optimum number of clusters programmatically
            self.kn = KneeLocator(range(1, 11), wcss, curve='convex',
                                  direction='decreasing')
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'The optimum number of clusters is: ' + str(self.kn.knee) +
                '. Exited the elbow_plot method of the KMeansClustering class')
            return self.kn.knee

        except Exception as e:
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Exception occurred in elbow_plot method of the KMeansClustering class. Exception message: '
                + str(e))
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Finding the number of clusters failed. Exited the elbow_plot method of the KMeansClustering class')
            raise Exception()
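
    # KneeLocator picks the "elbow" of the WCSS curve programmatically. A
    # self-contained illustration with synthetic WCSS values (assumed for this
    # example; requires the kneed package):
    #
    #   from kneed import KneeLocator
    #   wcss = [1000, 400, 200, 150, 130, 120, 115, 112, 110, 109]
    #   kn = KneeLocator(range(1, 11), wcss, curve='convex',
    #                    direction='decreasing')
    #   print(kn.knee)  # an integer near the bend of the curve, e.g. 3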

    def create_clusters(self, data, number_of_clusters):
        """
        Method Name: create_clusters
        Description: Create a new dataframe consisting of the cluster information.
        Output: A dataframe with a cluster column
        On Failure: Raise Exception

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None

        """
        self.log_db_writer.log(
            self.log_database, self.log_collection,
            'Entered the create_clusters method of the KMeansClustering class')
        self.data = data
        try:
            self.kmeans = KMeans(n_clusters=number_of_clusters,
                                 init='k-means++', random_state=42)
            #self.data = self.data[~self.data.isin([np.nan, np.inf, -np.inf]).any(1)]
            self.y_kmeans = self.kmeans.fit_predict(data)  # divide data into clusters

            self.file_op = file_methods.File_Operation(self.log_database,
                                                       self.log_collection,
                                                       self.execution_id)
            # saving the fitted KMeans model to the model directory under the name 'kkmeans'
            self.save_model = self.file_op.save_model(self.kmeans, 'kkmeans')

            self.data['Cluster'] = self.y_kmeans  # new column storing each row's cluster assignment
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Successfully created ' + str(number_of_clusters) +
                ' clusters. Exited the create_clusters method of the KMeansClustering class')
            return self.data
        except Exception as e:
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Exception occurred in create_clusters method of the KMeansClustering class. Exception message: '
                + str(e))
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Fitting the data to clusters failed. Exited the create_clusters method of the KMeansClustering class')
            raise Exception()
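
A short usage sketch tying the two methods together (hedged: X is assumed to be a preprocessed pandas DataFrame of features prepared upstream):

# X: preprocessed feature DataFrame (not shown here)
kmeans = KMeansClustering(execution_id="exec-0001")
number_of_clusters = kmeans.elbow_plot(X)
clustered = kmeans.create_clusters(X, number_of_clusters)
print(clustered['Cluster'].value_counts())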
Example #5
class dataTransformPredict:
    """
    This class shall be used for transforming the Good Raw Prediction Data before loading it in the Database.

    Written By: iNeuron Intelligence
    Version: 1.0
    Revisions: None

    """
    def __init__(self, execution_id):
        self.execution_id = execution_id
        #self.goodDataPath = "Prediction_Raw_Files_Validated/Good_Raw"
        self.goodDataPath = "good-raw-file-prediction-validated"
        #self.logger = App_Logger()
        self.log_db_writer = App_LoggerDB(execution_id=execution_id)
        self.log_database = "wafer_prediction_log"
        self.az_blob_mgt = AzureBlobManagement()

    def replaceMissingWithNull(self):
        """
                                  Method Name: replaceMissingWithNull
                                  Description: This method replaces the missing values in columns with "NULL" to
                                               store in the table. We are using substring in the first column to
                                               keep only "Integer" data for ease up the loading.
                                               This column is anyways going to be removed during prediction.

                                   Written By: iNeuron Intelligence
                                  Version: 1.0
                                  Revisions: None

                                          """

        try:
            log_collection = "data_transform_log"
            #log_file = open("Prediction_Logs/dataTransformLog.txt", 'a+')
            #onlyfiles = [f for f in listdir(self.goodDataPath)]
            onlyfiles = self.az_blob_mgt.getAllFileNameFromDirectory(
                self.goodDataPath)
            for file in onlyfiles:
                #csv = pandas.read_csv(self.goodDataPath+"/" + file)
                csv = self.az_blob_mgt.readCSVFilefromDir(
                    self.goodDataPath, file)
                csv.fillna('NULL', inplace=True)
                # #csv.update("'"+ csv['Wafer'] +"'")
                # csv.update(csv['Wafer'].astype(str))
                csv['Wafer'] = csv['Wafer'].str[6:]
                #csv.to_csv(self.goodDataPath+ "/" + file, index=None, header=True)
                self.az_blob_mgt.saveDataFrametoCSV(self.goodDataPath,
                                                    file,
                                                    csv,
                                                    index=None,
                                                    header=True)
                #self.logger.log(log_file," %s: File Transformed successfully!!" % file)
                self.log_db_writer.log(
                    self.log_database, log_collection,
                    "File {0} transformed successfully".format(file))
            #log_file.write("Current Date :: %s" %date +"\t" + "Current time:: %s" % current_time + "\t \t" +  + "\n")

        except Exception as e:
            #self.logger.log(log_file, "Data Transformation failed because:: %s" % e)
            #log_file.write("Current Date :: %s" %date +"\t" +"Current time:: %s" % current_time + "\t \t" + "Data Transformation failed because:: %s" % e + "\n")
            #log_file.close()
            self.log_db_writer.log(
                self.log_database, log_collection,
                'Data Transformation failed because:' + str(e))
            raise e
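
A hedged usage sketch (assumes the validated good-raw files already sit in the "good-raw-file-prediction-validated" directory; the execution id is illustrative):

transformer = dataTransformPredict(execution_id="exec-0001")
transformer.replaceMissingWithNull()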
Example #6
class trainModel:
    def __init__(self, execution_id):
        #self.log_writer = logger.App_Logger()
        #self.file_object = open("Training_Logs/ModelTrainingLog.txt", 'a+')

        self.log_db_writer = App_LoggerDB(execution_id=execution_id)
        self.log_database = "strength_training_log"
        self.log_collection = "stg_training_main_log"
        self.execution_id = execution_id

    def trainingModel(self):
        # Logging the start of Training
        self.log_db_writer.log(self.log_database, self.log_collection,
                               "Start of Training")
        print("training started")
        try:
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.log_database,
                                                  self.log_collection,
                                                  self.execution_id)
            data = data_getter.get_data()

            if len(data) == 0:
                self.log_db_writer.log(self.log_database, self.log_collection,
                                       "No record found to train model")
                print("No previous file available")
                return 0
            """doing the data preprocessing"""

            preprocessor = preprocessing.Preprocessor(self.log_database,
                                                      self.log_collection,
                                                      self.execution_id)

            # check if missing values are present in the dataset
            is_null_present, cols_with_missing_values = preprocessor.is_null_present(
                data)

            # if missing values are there, replace them appropriately.
            if (is_null_present):
                data = preprocessor.impute_missing_values(
                    data)  # missing value imputation

            # get encoded values for categorical data

            #data = preprocessor.encodeCategoricalValues(data)

            # create separate features and labels
            X, Y = preprocessor.separate_label_feature(
                data, label_column_name='Concrete_compressive _strength')
            # drop the columns obtained above
            #X=preprocessor.remove_columns(X,cols_to_drop)

            X = preprocessor.logTransformation(X)
            print(X)
            """ Applying the clustering approach"""

            kmeans = clustering.KMeansClustering(
                self.execution_id)  # object initialization.
            number_of_clusters = kmeans.elbow_plot(
                X)  # using the elbow plot to find the optimum number of clusters

            # Divide the data into clusters
            X = kmeans.create_clusters(X, number_of_clusters)
            print("cluster shape details")
            print(X)

            # add the target labels back as a new column in the clustered dataset
            X['Labels'] = Y

            # getting the unique clusters from our dataset
            list_of_clusters = X['Cluster'].unique()
            """parsing all the clusters and looking for the best ML algorithm to fit on individual cluster"""

            for i in list_of_clusters:
                cluster_data = X[X['Cluster'] ==
                                 i]  # filter the data for one cluster

                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                                                     axis=1)
                cluster_label = cluster_data['Labels']

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(
                    cluster_features,
                    cluster_label,
                    test_size=1 / 3,
                    random_state=36)

                x_train_scaled = preprocessor.standardScalingData(x_train)
                x_test_scaled = preprocessor.standardScalingData(x_test)

                model_finder = tuner.Model_Finder(
                    self.log_database, self.log_collection,
                    self.execution_id)  # object initialization

                #getting the best model for each of the clusters
                best_model_name, best_model = model_finder.get_best_model(
                    x_train_scaled, y_train, x_test_scaled, y_test)

                model_metrics = model_finder.get_model_metrics(
                    best_model_name + str(i))

                #saving the best model to the directory.
                file_op = file_methods.File_Operation(self.log_database,
                                                      self.log_collection,
                                                      self.execution_id)
                print(best_model_name + str(i))
                save_model = file_op.save_model(best_model,
                                                best_model_name + str(i))

            # logging the successful Training
            self.log_db_writer.log(self.log_database, self.log_collection,
                                   'Successful End of Training')
            #self.log_database.close()

        except Exception:
            # logging the unsuccessful Training
            self.log_db_writer.log(self.log_database, self.log_collection,
                                   'Unsuccessful End of Training')
            #self.log_database.close()
            raise Exception


#trainModelObj = trainModel(1111)  # object initialization
#trainModelObj.trainingModel()  # training the model for the files in the table
Example #7
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor

# project-local helpers (App_LoggerDB, AzureBlobManagement, MongodbOperation)
# are not shown in the source, so their import paths are omitted here.


class Model_Finder:
    """
                This class shall  be used to find the model with best accuracy and AUC score.
                Written By: iNeuron Intelligence
                Version: 1.0
                Revisions: None

                """
    def __init__(self, log_database, log_collection, execution_id):
        #self.file_object = file_object
        #self.logger_object = logger_object

        self.execution_id = execution_id
        self.log_db_writer = App_LoggerDB(execution_id=execution_id)
        self.log_database = log_database
        self.log_collection = log_collection
        self.az_blob_mgt = AzureBlobManagement()
        self.mongoDBObject = MongodbOperation()

        self.linearReg = LinearRegression()
        self.RandomForestReg = RandomForestRegressor()
        self.DecisionTreeReg = DecisionTreeRegressor()
        self.XGBoostReg = XGBRegressor()
        self.AdaboostReg = AdaBoostRegressor()
        self.svm = SVC()
        #self.mse = mean_squared_error()
        #self.mae = mean_absolute_error()

    def get_best_params_for_Random_Forest_Regressor(self, train_x, train_y):
        """
                                                Method Name: get_best_params_for_Random_Forest_Regressor
                                                Description: get the parameters for Random_Forest_Regressor Algorithm which give the best accuracy.
                                                             Use Hyper Parameter Tuning.
                                                Output: The model with the best parameters
                                                On Failure: Raise Exception

                                                Written By: iNeuron Intelligence
                                                Version: 1.0
                                                Revisions: None

                                        """
        self.log_db_writer.log(
            self.log_database, self.log_collection,
            'Entered the get_best_params_for_Random_Forest_Regressor method of the Model_Finder class'
        )
        try:
            # initializing with different combination of parameters
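            # note (version assumption): max_features="auto" was deprecated in
            # scikit-learn 1.1 and removed in 1.3; this grid assumes an older
            # release (swap in "sqrt" or 1.0 on newer versions).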
            self.param_grid_Random_forest_Tree = {
                "n_estimators": [10, 20, 30],
                "max_features": ["auto", "sqrt", "log2"],
                "min_samples_split": [2, 4, 8],
                "bootstrap": [True, False]
            }

            # Creating an object of the Grid Search class
            self.grid = GridSearchCV(self.RandomForestReg,
                                     self.param_grid_Random_forest_Tree,
                                     verbose=3,
                                     cv=5)
            # finding the best parameters
            self.grid.fit(train_x, train_y)

            # extracting the best parameters
            self.n_estimators = self.grid.best_params_['n_estimators']
            self.max_features = self.grid.best_params_['max_features']
            self.min_samples_split = self.grid.best_params_[
                'min_samples_split']
            self.bootstrap = self.grid.best_params_['bootstrap']

            # creating a new model with the best parameters
            self.randomForestReg = RandomForestRegressor(
                n_estimators=self.n_estimators,
                max_features=self.max_features,
                min_samples_split=self.min_samples_split,
                bootstrap=self.bootstrap)
            # training the new model
            self.randomForestReg.fit(train_x, train_y)
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'RandomForestReg best params: ' + str(self.grid.best_params_) +
                '. Exited the get_best_params_for_Random_Forest_Regressor method of the Model_Finder class'
            )
            return self.randomForestReg
        except Exception as e:
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Exception occurred in get_best_params_for_Random_Forest_Regressor method of the Model_Finder class. Exception message: '
                + str(e))
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'RandomForestReg parameter tuning failed. Exited the get_best_params_for_Random_Forest_Regressor method of the Model_Finder class'
            )
            raise Exception()

    def get_best_params_for_xgboost(self, train_x, train_y):
        self.log_db_writer.log(
            self.log_database, self.log_collection,
            'Entered the get_best_params_for_xgboost method of the Model_Finder class')
        try:
            # initializing with different combination of parameters
            self.param_XGboost = {
                'learning_rate': [.001, 0.01, .1],
                'max_depth': [2, 6, 8, 10, 14],
                'min_child_weight': [1, 3, 5, 7]
            }
            #'gamma': [0.0,0.1,0.2]

            # Creating an object of the Grid Search class
            self.grid = GridSearchCV(self.XGBoostReg,
                                     self.param_XGboost,
                                     verbose=3,
                                     cv=5)
            # finding the best parameters
            self.grid.fit(train_x, train_y)

            # extracting the best parameters
            self.learning_rate = self.grid.best_params_['learning_rate']
            self.max_depth = self.grid.best_params_['max_depth']
            self.min_child_weight = self.grid.best_params_['min_child_weight']
            #self.gamma = self.grid.best_params_['gamma']

            # creating a new model with the best parameters
            self.xgboostReg = XGBRegressor(
                learning_rate=self.learning_rate,
                max_depth=self.max_depth,
                min_child_weight=self.min_child_weight)

            # training the new model
            self.xgboostReg.fit(train_x, train_y)
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'xgboostReg best params: ' + str(self.grid.best_params_) +
                '. Exited the get_best_params_for_xgboost method of the Model_Finder class'
            )
            return self.xgboostReg
        except Exception as e:
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Exception occurred in xgboostReg method of the Model_Finder class. Exception message: '
                + str(e))
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'xgboostReg parameter tuning failed. Exited the get_best_params_for_xgboost method of the Model_Finder class'
            )
            raise Exception()

    def get_best_params_for_decisionTree(self, train_x, train_y):
        self.log_db_writer.log(
            self.log_database, self.log_collection,
            'Entered the get_best_params_for_decisionTree method of the Model_Finder class')
        try:
            # initializing with different combination of parameters
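            # note (version assumption): the 'mse'/'mae' criteria were renamed
            # to 'squared_error'/'absolute_error' in scikit-learn 1.0 and
            # removed in 1.2; this grid assumes an older release.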
            self.param_decision_Tree = {
                'criterion': ['mse', 'mae'],
                'max_depth': [2, 6, 8, 10, 14, 18, 20],
                'min_samples_leaf': [20, 40, 100],
                'min_samples_split': [10, 20, 40]
            }

            # Creating an object of the Grid Search class
            self.grid = GridSearchCV(self.DecisionTreeReg,
                                     self.param_decision_Tree,
                                     verbose=3,
                                     n_jobs=1,
                                     cv=5)
            # finding the best parameters
            self.grid.fit(train_x, train_y)

            # extracting the best parameters
            self.criterion = self.grid.best_params_['criterion']
            self.max_depth = self.grid.best_params_['max_depth']
            self.min_samples_leaf = self.grid.best_params_['min_samples_leaf']
            self.min_samples_split = self.grid.best_params_[
                'min_samples_split']

            # creating a new model with the best parameters
            self.decisionTreeReg = DecisionTreeRegressor(
                criterion=self.criterion,
                max_depth=self.max_depth,
                min_samples_leaf=self.min_samples_leaf,
                min_samples_split=self.min_samples_split,
            )
            # training the new model
            self.decisionTreeReg.fit(train_x, train_y)
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'DecisionTreeReg best params: ' + str(self.grid.best_params_) +
                '. Exited the get_best_params_for_decisionTree method of the Model_Finder class'
            )
            return self.decisionTreeReg
        except Exception as e:
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Exception occurred in DecisionTreeReg method of the Model_Finder class. Exception message: '
                + str(e))
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'DecisionTreeReg parameter tuning failed. Exited the get_best_params_for_decisionTree method of the Model_Finder class'
            )
            raise Exception()

    def get_best_params_for_adaboostReg(self, train_x, train_y):

        self.log_db_writer.log(
            self.log_database, self.log_collection,
            'Entered the get_best_params_for_adaboostReg method of the Model_Finder class')
        try:
            # initializing with different combination of parameters
            self.param_ada_boost = {
                'n_estimators': [500, 1000, 2000],
                'learning_rate': [.001, 0.01, .1],
                'random_state': [1]
            }

            # Creating an object of the Grid Search class
            self.grid = GridSearchCV(self.AdaboostReg,
                                     self.param_ada_boost,
                                     scoring='neg_mean_squared_error',
                                     verbose=3,
                                     n_jobs=1,
                                     cv=5)
            # finding the best parameters
            self.grid.fit(train_x, train_y)

            # extracting the best parameters
            self.n_estimators = self.grid.best_params_['n_estimators']
            self.learning_rate = self.grid.best_params_['learning_rate']

            # creating a new model with the best parameters; AdaBoost's default base estimator is a depth-3 decision tree
            self.adaboostReg = AdaBoostRegressor(
                n_estimators=self.n_estimators,
                learning_rate=self.learning_rate,
                random_state=1)

            # training the new model
            self.adaboostReg.fit(train_x, train_y)
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'AdaBoost best params: ' + str(self.grid.best_params_) +
                '. Exited the get_best_params_for_adaboostReg method of the Model_Finder class')
            return self.adaboostReg
        except Exception as e:
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Exception occurred in AdaBoostReg method of the Model_Finder class. Exception message: '
                + str(e))
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'AdaBoostReg parameter tuning failed. Exited the get_best_params_for_adaboostReg method of the Model_Finder class'
            )
            raise Exception()

    def get_best_params_for_linearReg(self, train_x, train_y):
        """
                                        Method Name: get_best_params_for_linearReg
                                        Description: get the parameters for LinearReg Algorithm which give the best accuracy.
                                                     Use Hyper Parameter Tuning.
                                        Output: The model with the best parameters
                                        On Failure: Raise Exception

                                        Written By: iNeuron Intelligence
                                        Version: 1.0
                                        Revisions: None

                                """
        self.log_db_writer.log(
            self.log_database, self.log_collection,
            'Entered the get_best_params_for_linearReg method of the Model_Finder class'
        )
        try:
            # initializing with different combination of parameters
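            # note (version assumption): the 'normalize' parameter was removed
            # from LinearRegression in scikit-learn 1.2; this grid assumes an
            # older release.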
            self.param_grid_linearReg = {
                'fit_intercept': [True, False],
                'normalize': [True, False],
                'copy_X': [True, False]
            }
            # Creating an object of the Grid Search class
            self.grid = GridSearchCV(self.linearReg,
                                     self.param_grid_linearReg,
                                     verbose=3,
                                     cv=5)
            # finding the best parameters
            self.grid.fit(train_x, train_y)

            # extracting the best parameters
            self.fit_intercept = self.grid.best_params_['fit_intercept']
            self.normalize = self.grid.best_params_['normalize']
            self.copy_X = self.grid.best_params_['copy_X']

            # creating a new model with the best parameters
            self.linReg = LinearRegression(fit_intercept=self.fit_intercept,
                                           normalize=self.normalize,
                                           copy_X=self.copy_X)
            # training the new model
            self.linReg.fit(train_x, train_y)
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'LinearRegression best params: ' +
                str(self.grid.best_params_) +
                '. Exited the get_best_params_for_linearReg method of the Model_Finder class'
            )
            return self.linReg
        except Exception as e:
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Exception occurred in get_best_params_for_linearReg method of the Model_Finder class. Exception message:  '
                + str(e))
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'LinearReg parameter tuning failed. Exited the get_best_params_for_linearReg method of the Model_Finder class'
            )
            raise Exception()
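    # The 'normalize' argument used above was deprecated in scikit-learn 1.0 and
    # removed in 1.2. On newer versions the same search can be expressed with a
    # pipeline; a minimal sketch, assuming scikit-learn >= 1.2:
    #
    #     from sklearn.pipeline import Pipeline
    #     from sklearn.preprocessing import StandardScaler
    #     pipe = Pipeline([('scale', StandardScaler()),
    #                      ('reg', LinearRegression())])
    #     grid = GridSearchCV(pipe, {'reg__fit_intercept': [True, False]}, cv=5)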

    def get_model_metrics(self, name):

        self.log_db_writer.log(
            self.log_database, self.log_collection,
            'Entered the get_model_metrics method of the Model_Finder class')

        self.Reg_metrics = pd.DataFrame(self.Regression_score)
        self.Reg_metrics.to_excel(name + '.xlsx')

        return self.Reg_metrics
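    # get_model_metrics reads self.Regression_score, which is only assigned inside
    # get_best_model, so it must be called after a successful model search; note
    # that DataFrame.to_excel also needs an Excel backend such as openpyxl
    # installed. A minimal usage sketch, assuming hypothetical train/test splits:
    #
    #     name, model = finder.get_best_model(train_x, train_y, test_x, test_y)
    #     metrics = finder.get_model_metrics('regression-metrics')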

    def get_best_model(self, train_x, train_y, test_x, test_y):
        """
                                                Method Name: get_best_model
                                                Description: Find out the model which has the best R2 score.
                                                Output: The best model name and the model object
                                                On Failure: Raise Exception

                                                Written By: iNeuron Intelligence
                                                Version: 1.0
                                                Revisions: None

                                        """
        self.log_db_writer.log(
            self.log_database, self.log_collection,
            'Entered the get_best_model method of the Model_Finder class')

        try:

            self.decisionTreeReg = self.get_best_params_for_decisionTree(
                train_x, train_y)
            self.prediction_decisionTreeReg = self.decisionTreeReg.predict(
                test_x)  # Predictions using the decisiontreeReg Model
            self.prediction_decisionTreeReg_error = r2_score(
                test_y, self.prediction_decisionTreeReg)
            self.decisionTreeReg_mse = mean_squared_error(
                test_y, self.prediction_decisionTreeReg)
            self.decisionTreeReg_mae = mean_absolute_error(
                test_y, self.prediction_decisionTreeReg)

            # create best model for Linear Regression
            self.LinearReg = self.get_best_params_for_linearReg(
                train_x, train_y)
            self.prediction_LinearReg = self.LinearReg.predict(
                test_x)  # Predictions using the LinearReg Model
            self.LinearReg_error = r2_score(test_y, self.prediction_LinearReg)
            self.LinearReg_mse = mean_squared_error(test_y,
                                                    self.prediction_LinearReg)
            self.LinearReg_mae = mean_absolute_error(test_y,
                                                     self.prediction_LinearReg)

            # create best model for randomforest
            self.randomForestReg = self.get_best_params_for_Random_Forest_Regressor(
                train_x, train_y)
            self.prediction_randomForestReg = self.randomForestReg.predict(
                test_x)  # Predictions using the randomForestReg Model
            self.prediction_randomForestReg_error = r2_score(
                test_y, self.prediction_randomForestReg)
            self.randomForestReg_mse = mean_squared_error(
                test_y, self.prediction_randomForestReg)
            self.randomForestReg_mae = mean_absolute_error(
                test_y, self.prediction_randomForestReg)

            # create best model for XGBoost
            self.XGBoostReg = self.get_best_params_for_xgboost(
                train_x, train_y)
            self.prediction_xgboostReg = self.XGBoostReg.predict(
                test_x)  # Predictions using the xgboostReg Model
            self.prediction_xgboostReg_error = r2_score(
                test_y, self.prediction_xgboostReg)
            self.XGBoostReg_mse = mean_squared_error(
                test_y, self.prediction_xgboostReg)
            self.XGBoostReg_mae = mean_absolute_error(
                test_y, self.prediction_xgboostReg)

            # create best model for Ada boost
            self.adaboostReg = self.get_best_params_for_adaboostReg(
                train_x, train_y)
            self.prediction_adaboostReg = self.adaboostReg.predict(
                test_x)  # Predictions using the adaboostReg Model
            self.prediction_adaboostReg_error = r2_score(
                test_y, self.prediction_adaboostReg)
            self.adaboostReg_mse = mean_squared_error(
                test_y, self.prediction_adaboostReg)
            self.adaboostReg_mae = mean_absolute_error(
                test_y, self.prediction_adaboostReg)

            self.Regression_score = {
                "LinearRegression": [
                    self.LinearReg_error, self.LinearReg, self.LinearReg_mse,
                    self.LinearReg_mae
                ],
                "randomForestRegressor": [
                    self.prediction_randomForestReg_error,
                    self.randomForestReg, self.randomForestReg_mse,
                    self.randomForestReg_mae
                ],
                "xg-BoostRegressor": [
                    self.prediction_xgboostReg_error, self.XGBoostReg,
                    self.XGBoostReg_mse, self.XGBoostReg_mae
                ],
                "decisionTreeRegressor": [
                    self.prediction_decisionTreeReg_error,
                    self.decisionTreeReg, self.decisionTreeReg_mse,
                    self.decisionTreeReg_mae
                ],
                "ada-BoostRegressor": [
                    self.prediction_adaboostReg_error, self.adaboostReg,
                    self.adaboostReg_mse, self.adaboostReg_mae
                ]
            }

            # metrics table

            self.select = list(self.Regression_score.values())[0][
                0]  # assigning the first r2 score as the initial best value
            #self.name = list(self.Regression_score.keys())[0] # assigning the first key as the initial best name

            for i in self.Regression_score.items(
            ):  # iterate over (model name, [r2, model, mse, mae]) pairs
                if i[1][0] >= self.select:
                    self.select = i[1][0]
                    self.name = i[0]
                    self.model = i[1][1]
            print(self.name, self.model, " r2 score=", self.select)
            return self.name, self.model

            #comparing the two models
            #if(self.LinearReg_error <  self.prediction_randomForestReg_error):
            #    return 'RandomForestRegressor',self.randomForestReg
            #else:
        # #    return 'LinearRegression',self.LinearReg

        except Exception as e:
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Exception occurred in get_best_model method of the Model_Finder class. Exception message:  '
                + str(e))
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Model Selection Failed. Exited the get_best_model method of the Model_Finder class'
            )
            raise Exception()
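The selection loop in get_best_model keeps the entry with the highest r2 score. A more compact equivalent, assuming the same Regression_score layout of [r2, model, mse, mae] per model name:

    best_name, best_entry = max(self.Regression_score.items(),
                                key=lambda kv: kv[1][0])
    best_model = best_entry[1]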
Exemple #8
class Preprocessor:
    """
        This class shall be used to clean and transform the data before training.

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None

        """
    def __init__(self, log_database, log_collection, execution_id):
        self.log_database = log_database
        self.log_collection = log_collection
        self.execution_id = execution_id
        self.mongoDBObject = MongodbOperation()
        self.log_db_writer = App_LoggerDB(execution_id=execution_id)
        self.az_blob_mgt = AzureBlobManagement()

    def remove_columns(self, data, columns):
        """
                Method Name: remove_columns
                Description: This method removes the given columns from a pandas dataframe.
                Output: A pandas DataFrame after removing the specified columns.
                On Failure: Raise Exception

                Written By: iNeuron Intelligence
                Version: 1.0
                Revisions: None

        """
        self.log_db_writer.log(
            self.log_database, self.log_collection,
            "Entered the remove_columns method of the Preprocessor class")
        self.data = data
        self.columns = columns
        try:
            self.useful_data = self.data.drop(
                labels=self.columns,
                axis=1)  # drop the labels specified in the columns
            print("Useful data in dataframe")
            print(self.useful_data)
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                "Column removal Successful.Exited the "
                "remove_columns method of the Preprocessor class")

            return self.useful_data

        except Exception as e:
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                "Exception occured in remove_columns method"
                " of the Preprocessor class. Exception message:  " + str(e))
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Column removal Unsuccessful. Exited the '
                'remove_columns method of the Preprocessor class')

            raise Exception()

    def separate_label_feature(self, data, label_column_name):
        """
                        Method Name: separate_label_feature
                        Description: This method separates the feature columns and the label column.
                        Output: Returns two separate DataFrames, one containing features and the other containing labels.
                        On Failure: Raise Exception

                        Written By: iNeuron Intelligence    
                        Version: 1.0
                        Revisions: None

                """
        self.log_db_writer.log(
            self.log_database, self.log_collection,
            'Entered the separate_label_feature method of the Preprocessor class'
        )
        try:
            self.X = data.drop(
                labels=label_column_name, axis=1
            )  # drop the specified label column and keep the feature columns
            self.Y = data[label_column_name]  # Filter the Label columns
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Label Separation Successful. Exited the separate_label_feature method of the Preprocessor class'
            )
            print(self.X, self.Y)
            return self.X, self.Y
        except Exception as e:
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Exception occurred in separate_label_feature method of the Preprocessor class. Exception message:  '
                + str(e))
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Label Separation Unsuccessful. Exited the separate_label_feature'
                ' method of the Preprocessor class')
            raise Exception()

    def is_null_present(self, data):
        """
                                Method Name: is_null_present
                                Description: This method checks whether there are null values present in the pandas Dataframe or not.
                                Output: Returns a Boolean Value. True if null values are present in the DataFrame, False if they are not present.
                                On Failure: Raise Exception

                                Written By: iNeuron Intelligence
                                Version: 1.0
                                Revisions: None

                        """
        self.log_db_writer.log(
            self.log_database, self.log_collection,
            'Entered the is_null_present method of the Preprocessor class')
        self.null_present = False
        try:
            self.null_counts = data.isna().sum(
            )  # check for the count of null values per column
            for i in self.null_counts:
                if i > 0:
                    self.null_present = True
                    break
            if (self.null_present
                ):  # write the logs to see which columns have null values
                dataframe_with_null = pd.DataFrame()
                dataframe_with_null['columns'] = data.columns
                dataframe_with_null['missing values count'] = np.asarray(
                    data.isna().sum())
                print(dataframe_with_null)
                #dataframe_with_null.to_csv('preprocessing_data/null_values.csv') # storing the null column information to file
                self.az_blob_mgt.saveDataFrametoCSV(
                    "preprocessing-data",
                    "null_values.csv",
                    data_frame=dataframe_with_null)
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Finding missing values is a success. Data written '
                'to the null values file in [preprocessing-data]. '
                'Exited the is_null_present method of the Preprocessor class')
            return self.null_present
        except Exception as e:
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Exception occurred in is_null_present method of the Preprocessor class. Exception message:  '
                + str(e))
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Finding missing values failed. Exited the is_null_present method of the Preprocessor class'
            )
            raise Exception()
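    # The per-column loop above is equivalent to a single pandas expression;
    # a minimal sketch, assuming the same DataFrame:
    #
    #     null_present = data.isna().any().any()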

    def impute_missing_values(self, data):
        """
                                        Method Name: impute_missing_values
                                        Description: This method replaces all the missing values in the Dataframe using KNN Imputer.
                                        Output: A Dataframe which has all the missing values imputed.
                                        On Failure: Raise Exception

                                        Written By: iNeuron Intelligence
                                        Version: 1.0
                                        Revisions: None
                     """
        self.log_db_writer.log(
            self.log_database, self.log_collection,
            'Entered the impute_missing_values method of the Preprocessor class'
        )
        self.data = data
        try:
            imputer = KNNImputer(n_neighbors=3,
                                 weights='uniform',
                                 missing_values=np.nan)
            self.new_array = imputer.fit_transform(
                self.data)  # impute the missing values
            # convert the nd-array returned in the step above to a Dataframe
            self.new_data = pd.DataFrame(data=self.new_array,
                                         columns=self.data.columns)
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Imputing missing values Successful. Exited the impute_missing_values method of the Preprocessor class'
            )
            return self.new_data
        except Exception as e:
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Exception occurred in impute_missing_values method of the Preprocessor class. Exception message:  '
                + str(e))
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Imputing missing values failed. Exited the impute_missing_values method of the Preprocessor class'
            )
            raise Exception()
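    # KNNImputer.fit_transform returns a plain ndarray, so the DataFrame rebuilt
    # above gets a fresh default index. A minimal sketch that preserves the
    # original row index, assuming the same inputs:
    #
    #     self.new_data = pd.DataFrame(self.new_array,
    #                                  columns=self.data.columns,
    #                                  index=self.data.index)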

    def get_columns_with_zero_std_deviation(self, data):
        """
                                                Method Name: get_columns_with_zero_std_deviation
                                                Description: This method finds out the columns which have a standard deviation of zero.
                                                Output: List of the columns with standard deviation of zero
                                                On Failure: Raise Exception

                                                Written By: iNeuron Intelligence
                                                Version: 1.0
                                                Revisions: None
                             """
        self.log_db_writer.log(
            self.log_database, self.log_collection,
            'Entered the get_columns_with_zero_std_deviation method of the Preprocessor class'
        )
        self.columns = data.columns
        self.data_n = data.describe()
        self.col_to_drop = []
        try:
            for x in self.columns:
                if (self.data_n[x]['std'] == 0
                    ):  # check if standard deviation is zero
                    self.col_to_drop.append(
                        x
                    )  # prepare the list of columns with standard deviation zero
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Column search for Standard Deviation of Zero Successful. Exited the get_columns_with_zero_std_deviation method of the Preprocessor class'
            )
            print(self.col_to_drop)
            return self.col_to_drop

        except Exception as e:
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Exception occurred in get_columns_with_zero_std_deviation method of the Preprocessor class. Exception message:  '
                + str(e))
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Column search for Standard Deviation of Zero Failed. Exited the get_columns_with_zero_std_deviation method of the Preprocessor class'
            )
            raise Exception()
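The zero-variance scan above can also be expressed directly with pandas; a minimal sketch, assuming a numeric DataFrame named data:

    stds = data.std(numeric_only=True)
    col_to_drop = stds[stds == 0].index.tolist()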
Exemple #9
class Preprocessor:
    """
        This class shall be used to clean and transform the data before training.

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None

        """
    def __init__(self, log_database, log_collection, execution_id):
        #self.file_object = file_object
        #self.logger_object = logger_object

        self.log_database = log_database
        self.log_collection = log_collection
        #self.execution_id=execution_id
        self.mongoDBObject = MongodbOperation()
        self.log_db_writer = App_LoggerDB(execution_id=execution_id)
        self.az_blob_mgt = AzureBlobManagement()

    def remove_columns(self, data, columns):
        """
                Method Name: remove_columns
                Description: This method removes the given columns from a pandas dataframe.
                Output: A pandas DataFrame after removing the specified columns.
                On Failure: Raise Exception

                Written By: iNeuron Intelligence
                Version: 1.0
                Revisions: None

        """
        self.log_db_writer.log(
            self.log_database, self.log_collection,
            'Entered the remove_columns method of the Preprocessor class')
        self.data = data
        self.columns = columns
        try:
            self.useful_data = self.data.drop(
                labels=self.columns,
                axis=1)  # drop the labels specified in the columns
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Column removal Successful. Exited the remove_columns method of the Preprocessor class'
            )
            return self.useful_data
        except Exception as e:
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Exception occurred in remove_columns method of the Preprocessor class. Exception message:  '
                + str(e))
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Column removal Unsuccessful. Exited the remove_columns method of the Preprocessor class'
            )
            raise Exception()

    def separate_label_feature(self, data, label_column_name):
        """
                        Method Name: separate_label_feature
                        Description: This method separates the feature columns and the label column.
                        Output: Returns two separate DataFrames, one containing features and the other containing labels.
                        On Failure: Raise Exception

                        Written By: iNeuron Intelligence
                        Version: 1.0
                        Revisions: None

                """
        self.log_db_writer.log(
            self.log_database, self.log_collection,
            'Entered the separate_label_feature method of the Preprocessor class'
        )
        try:
            self.X = data.drop(
                labels=label_column_name, axis=1
            )  # drop the columns specified and separate the feature columns
            self.Y = data[label_column_name]  # Filter the Label columns
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Label Separation Successful. Exited the separate_label_feature method of the Preprocessor class'
            )
            return self.X, self.Y
        except Exception as e:
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Exception occurred in separate_label_feature method of the Preprocessor class. Exception message:  '
                + str(e))
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Label Separation Unsuccessful. Exited the separate_label_feature method of the Preprocessor class'
            )
            raise Exception()

    def dropUnnecessaryColumns(self, data, columnNameList):
        """
                        Method Name: dropUnnecessaryColumns
                        Description: This method drops the unwanted columns as discussed in EDA section.

                        Written By: iNeuron Intelligence
                        Version: 1.0
                        Revisions: None

                                """
        data = data.drop(columnNameList, axis=1)
        return data

    def replaceInvalidValuesWithNull(self, data):
        """
                               Method Name: replaceInvalidValuesWithNull
                               Description: This method replaces invalid values i.e. '?' with null, as discussed in EDA.

                               Written By: iNeuron Intelligence
                               Version: 1.0
                               Revisions: None

                                       """

        for column in data.columns:
            count = data[column][data[column] == '?'].count()
            if count != 0:
                data[column] = data[column].replace('?', np.nan)
        return data
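    # The column-by-column loop above is equivalent to one vectorised call;
    # a minimal sketch, assuming the same '?' placeholder:
    #
    #     data = data.replace('?', np.nan)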

    def is_null_present(self, data):
        """
                                Method Name: is_null_present
                                Description: This method checks whether there are null values present in the pandas Dataframe or not.
                                Output: Returns True if null values are present in the DataFrame, False if they are not present and
                                        returns the list of columns for which null values are present.
                                On Failure: Raise Exception

                                Written By: iNeuron Intelligence
                                Version: 1.0
                                Revisions: None

                        """
        self.log_db_writer.log(
            self.log_database, self.log_collection,
            'Entered the is_null_present method of the Preprocessor class')
        self.null_present = False
        self.cols_with_missing_values = []
        self.cols = data.columns
        try:
            self.null_counts = data.isna().sum(
            )  # check for the count of null values per column
            for col, count in self.null_counts.items():
                if count > 0:
                    self.null_present = True
                    self.cols_with_missing_values.append(col)
            if (self.null_present
                ):  # write the logs to see which columns have null values
                dataframe_with_null = pd.DataFrame()
                dataframe_with_null['columns'] = data.columns
                dataframe_with_null['missing values count'] = np.asarray(
                    data.isna().sum())
                print(dataframe_with_null)
                #dataframe_with_null.to_csv('preprocessing_data/null_values.csv') # storing the null column information to file
                self.az_blob_mgt.saveDataFrametoCSV(
                    "preprocessing-data",
                    "null_values.csv",
                    data_frame=dataframe_with_null)
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Finding missing values is a success. Data written to the null values file. Exited the is_null_present method of the Preprocessor class'
            )
            return self.null_present, self.cols_with_missing_values
        except Exception as e:
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Exception occurred in is_null_present method of the Preprocessor class. Exception message:  '
                + str(e))
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Finding missing values failed. Exited the is_null_present method of the Preprocessor class'
            )
            raise Exception()

    def encodeCategoricalValues(self, data):
        """
                                        Method Name: encodeCategoricalValues
                                        Description: This method encodes all the categorical values in the training set.
                                        Output: A Dataframe which has all the categorical values encoded.
                                        On Failure: Raise Exception

                                        Written By: iNeuron Intelligence
                                        Version: 1.0
                                        Revisions: None
                     """
        data["class"] = data["class"].map({'p': 1, 'e': 2})

        for column in data.drop(['class'], axis=1).columns:
            data = pd.get_dummies(data, columns=[column])

        return data
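    # Because the training and prediction sets are one-hot encoded independently
    # (see encodeCategoricalValuesPrediction below), a category missing from one
    # set produces mismatched columns. A minimal guard, assuming the training
    # dummy columns were saved in a hypothetical train_columns list:
    #
    #     data = data.reindex(columns=train_columns, fill_value=0)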

    def encodeCategoricalValuesPrediction(self, data):
        """
                                               Method Name: encodeCategoricalValuesPrediction
                                               Description: This method encodes all the categorical values in the prediction set.
                                               Output: A Dataframe which has all the categorical values encoded.
                                               On Failure: Raise Exception

                                               Written By: iNeuron Intelligence
                                               Version: 1.0
                                               Revisions: None
                            """

        for column in data.columns:
            data = pd.get_dummies(data, columns=[column])

        return data

    # def handleImbalanceDataset(self,X,Y):
    #     """
    #                                                   Method Name: handleImbalanceDataset
    #                                                   Description: This method handles the imbalance in the dataset by oversampling.
    #                                                   Output: A Dataframe which is balanced now.
    #                                                   On Failure: Raise Exception
    #
    #                                                   Written By: iNeuron Intelligence
    #                                                   Version: 1.0
    #                                                   Revisions: None
    #                                """
    #
    #
    #
    #     rdsmple = RandomOverSampler()
    #     x_sampled, y_sampled = rdsmple.fit_sample(X, Y)
    #
    #     return x_sampled,y_sampled

    def standardScalingData(self, X):

        scalar = StandardScaler()
        X_scaled = scalar.fit_transform(X)

        return X_scaled
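    # fit_transform refits the scaler on whatever data it receives, so at
    # prediction time the inputs are scaled with their own statistics rather
    # than the training statistics. A minimal fix sketch, assuming the scaler
    # is persisted between training and prediction (e.g. via pickle):
    #
    #     scaler = StandardScaler().fit(X_train)  # during training
    #     X_scaled = scaler.transform(X_new)      # during prediction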

    def logTransformation(self, X):

        for column in X.columns:
            X[column] += 1
            X[column] = np.log(X[column])

        return X
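    # The loop above mutates the caller's DataFrame in place; numpy provides the
    # same log(1 + x) transform as one vectorised, copy-returning call. A minimal
    # sketch, assuming an all-numeric DataFrame:
    #
    #     return np.log1p(X)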

    def impute_missing_values(self, data):
        """
                                        Method Name: impute_missing_values
                                        Description: This method replaces all the missing values in the Dataframe using KNN Imputer.
                                        Output: A Dataframe which has all the missing values imputed.
                                        On Failure: Raise Exception

                                        Written By: iNeuron Intelligence
                                        Version: 1.0
                                        Revisions: None
                     """
        self.log_db_writer.log(
            self.log_database, self.log_collection,
            'Entered the impute_missing_values method of the Preprocessor class'
        )
        self.data = data
        try:
            imputer = KNNImputer(n_neighbors=3,
                                 weights='uniform',
                                 missing_values=np.nan)
            self.new_array = imputer.fit_transform(
                self.data)  # impute the missing values
            # convert the nd-array returned in the step above to a Dataframe
            self.new_data = pd.DataFrame(data=self.new_array,
                                         columns=self.data.columns)
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Imputing missing values Successful. Exited the impute_missing_values method of the Preprocessor class'
            )
            print(self.new_data)
            return self.new_data
        except Exception as e:
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Exception occurred in impute_missing_values method of the Preprocessor class. Exception message:  '
                + str(e))
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Imputing missing values failed. Exited the impute_missing_values method of the Preprocessor class'
            )
            raise Exception()

    def get_columns_with_zero_std_deviation(self, data):
        """
                                                Method Name: get_columns_with_zero_std_deviation
                                                Description: This method finds out the columns which have a standard deviation of zero.
                                                Output: List of the columns with standard deviation of zero
                                                On Failure: Raise Exception

                                                Written By: iNeuron Intelligence
                                                Version: 1.0
                                                Revisions: None
                             """
        self.log_db_writer.log(
            self.log_database, self.log_collection,
            'Entered the get_columns_with_zero_std_deviation method of the Preprocessor class'
        )
        self.columns = data.columns
        self.data_n = data.describe()
        self.col_to_drop = []
        try:
            for x in self.columns:
                if (self.data_n[x]['std'] == 0
                    ):  # check if standard deviation is zero
                    self.col_to_drop.append(
                        x
                    )  # prepare the list of columns with standard deviation zero
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Column search for Standard Deviation of Zero Successful. Exited the get_columns_with_zero_std_deviation method of the Preprocessor class'
            )
            print(self.col_to_drop)
            return self.col_to_drop

        except Exception as e:
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Exception occurred in get_columns_with_zero_std_deviation method of the Preprocessor class. Exception message:  '
                + str(e))
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Column search for Standard Deviation of Zero Failed. Exited the get_columns_with_zero_std_deviation method of the Preprocessor class'
            )
            raise Exception()
Exemple #10
class pred_validation:
    def __init__(self, path, execution_id):
        self.raw_data = Prediction_Data_validation(path, execution_id)
        self.dataTransform = dataTransformPredict(execution_id)
        #self.dBOperation = dBOperation()
        #self.file_object = open("Prediction_Logs/Prediction_Log.txt", 'a+')
        #self.log_writer = logger.App_Logger()
        self.dBOperationMongoDB = DbOperationMongoDB(execution_id)
        self.log_database = "strength_prediction_log"
        self.log_collection = "stg-prediction_main_log"
        self.execution_id = execution_id
        self.logDB_write = App_LoggerDB(execution_id=execution_id)
        self.az_blob_mgt = AzureBlobManagement()

    def prediction_validation(self):

        try:

            self.logDB_write.log(
                self.log_database, self.log_collection,
                'Start of Validation on files for prediction!!')
            #extracting values from prediction schema
            LengthOfDateStampInFile, LengthOfTimeStampInFile, column_names, noofcolumns = self.raw_data.valuesFromSchema(
            )
            #getting the regex defined to validate filename
            regex = self.raw_data.manualRegexCreation()
            #validating filename of prediction files
            self.raw_data.validationFileNameRaw(regex, LengthOfDateStampInFile,
                                                LengthOfTimeStampInFile)
            #validating column length in the file
            self.raw_data.validateColumnLength(noofcolumns)
            #validating if any column has all values missing
            self.raw_data.validateMissingValuesInWholeColumn()
            self.logDB_write.log(self.log_database, self.log_collection,
                                 "Raw Data Validation Complete!!")

            self.logDB_write.log(
                self.log_database, self.log_collection,
                "Creating Prediction_Database and tables on the basis of given schema!!!"
            )
            #create database with given name, if present open the connection! Create table with columns given in schema
            self.dBOperationMongoDB.insertIntoTableGoodData(column_names)
            self.logDB_write.log(self.log_database, self.log_collection,
                                 "Table creation Completed!!")
            self.logDB_write.log(self.log_database, self.log_collection,
                                 "Insertion of Data into Table started!!!!")
            #insert csv files in the table
            #self.dBOperationMongoDB.insertIntoTableGoodData('Prediction') #***************************** NEED TO CHECK BEFORE RUNNING******
            #self.logDB_write.log(self.log_database,"Insertion in Table completed!!!")
            #self.logDB_write.log(self.log_database,"Deleting Good Data Folder!!!")
            #Delete the good data folder after loading files in table
            self.raw_data.deleteExistingGoodDataTrainingFolder()
            self.logDB_write.log(self.log_database, self.log_collection,
                                 "Good_Data folder deleted!!!")
            self.logDB_write.log(
                self.log_database, self.log_collection,
                "Moving bad files to Archive and deleting Bad_Data folder!!!")
            #Move the bad files to archive folder
            self.raw_data.moveBadFilesToArchiveBad()
            self.logDB_write.log(
                self.log_database, self.log_collection,
                "Bad files moved to archive!! Bad folder Deleted!!")
            self.logDB_write.log(self.log_database, self.log_collection,
                                 "Validation Operation completed!!")
            self.logDB_write.log(self.log_database, self.log_collection,
                                 "Extracting csv file from table")
            #export data in table to csvfile
            self.dBOperationMongoDB.selectingDatafromtableintocsv()

        except Exception as e:
            raise e
class DbOperationMongoDB:
    """
          This class shall be used for handling all the mongodb operations.

          Written By: iNeuron Intelligence
          Version: 1.0
          Revisions: None

          """

    def __init__(self, execution_id):
        self.mongodb = MongodbOperation()
        self.az_blob_mgt = AzureBlobManagement()
        self.logger_db_writer = App_LoggerDB(execution_id=execution_id)
        self.good_file_path = "good-raw-file-prediction-validated"
        self.bad_file_path = "bad-raw-file-prediction-validated"


    def insertIntoTableGoodData(self,column_names):
        """
        Description: Load all csv files from the good-data directory in Azure storage into the MongoDB
        database "prediction_database", collection "Good_Raw_Data", and clear the storage space.

        :return:
        """
        try:
            prediction_database="prediction_database" # mongodb name
            prediction_collection="Good_Raw_Data" # mongodb name
            database_name = "wafer_prediction_log" ## logger name
            collection_name = "db_insert_log" ## logger name
            self.mongodb.dropCollection(prediction_database,prediction_collection)
            self.logger_db_writer.log(database_name,collection_name,"Droping collection:"+prediction_collection+" from database:"+prediction_database)
            self.logger_db_writer.log(database_name, collection_name,"Starting loading of good files in database:training_database and collection: Good_Raw_Data")
            files = self.az_blob_mgt.getAllFileNameFromDirectory(self.good_file_path)
            self.logger_db_writer.log(database_name, collection_name,"No of file found in good-raw-file-train-validated " + str(len(files)))
            for file in files:
                try:
                    self.logger_db_writer.log(database_name, collection_name,
                                              "Insertion of file " + file + " started...")
                    df = self.az_blob_mgt.readCSVFilefromDir(self.good_file_path, file)
                    df.columns=column_names
                    print("dataframe before insertion")
                    print(df)
                    self.mongodb.insertDataFrame(prediction_database, prediction_collection, df)
                    self.logger_db_writer.log(database_name, collection_name,
                                              "File: {0} loaded successfully".format(file))
                except Exception as e:
                    self.logger_db_writer.log(database_name, collection_name, str(e))
                    self.az_blob_mgt.moveFileinDir(self.good_file_path, self.bad_file_path, file)
                    self.logger_db_writer.log(database_name, collection_name,
                                              "File: " + file + " was not loaded successfully hence moved to dir:" + self.bad_file_path)

        except Exception as e:
            error_message = "Error occured in class:DbOperationMongoDB method:insertIntoTableGoodData error:" + str(e)
            self.logger_db_writer.log(database_name, collection_name, error_message)

    def selectingDatafromtableintocsv(self,):
        """

        :return:
        """
        try:
            directory_name="prediction-file-from-db" # azure storage name
            file_name="InputFile.csv" # azure storage name
            database_name = "wafer_prediction_log" # logger name
            collection_name = "export_to_csv" # logger name
            prediction_database="prediction_database" # mongodb name
            prediction_collection="Good_Raw_Data" # mongodb name
            msg="starting of loading of database:"+prediction_database+",collection:"+prediction_collection+" records into file:"+file_name
            self.logger_db_writer.log(database_name,collection_name,msg)
            df=self.mongodb.getDataFrameofCollection(prediction_database,prediction_collection)
            print("after dataframe from db extraction")
            print(df)
            msg="Good_Raw_data has been loaded into pandas dataframe"
            print(msg)
            self.logger_db_writer.log(database_name,collection_name,msg)
            self.az_blob_mgt.saveDataFrametoCSV(directory_name,file_name,df,index=None,header=True)
            # InputFile.csv previously picked up an unnamed index column; writing with index=None avoids the "Unnamed: 0.1" column handled in predictionDatavalidation.py ln 410.
            msg = "InputFile.csv created successfully in directory"+directory_name
            print(msg)
            self.logger_db_writer.log(database_name, collection_name, msg)
        except Exception as e:
            msg="Error occured in class:DbOperationMongoDB method:insertIntoTableGoodData error:"+str(e)
            self.logger_db_writer.log(database_name,collection_name,msg)
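The two methods above form a round trip: validated good-data CSVs move from Azure storage into MongoDB, then back out as a single InputFile.csv. A minimal usage sketch, assuming a hypothetical execution id and the column names read from the prediction schema:

    db_op = DbOperationMongoDB(execution_id="exec-001")
    db_op.insertIntoTableGoodData(column_names)
    db_op.selectingDatafromtableintocsv()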
Exemple #12
class DbOperationMongoDB:
    """
      This class shall be used for handling all the MongoDB operations.

      Written By: iNeuron Intelligence
      Version: 1.0
      Revisions: None

      """
    def __init__(self, execution_id):
        self.mongodb = MongodbOperation()
        self.az_blob_mgt = AzureBlobManagement()
        self.logger_db_writer = App_LoggerDB(execution_id=execution_id)
        self.good_file_path = "good-raw-file-train-validated"
        self.bad_file_path = "bad-raw-file-train-validated"

    def insertIntoTableGoodData(self, column_name):
        """
        Description: Load all csv files from Azure storage into the MongoDB database "training_database", collection "Good_Raw_Data"

        :return:
        """
        database_name = "wafer_training_log"
        collection_name = "db_insert_log"

        try:
            database_name = "wafer_training_log"  # logger name
            collection_name = "db_insert_log"  # logger name
            self.logger_db_writer.log(
                database_name, collection_name,
                "Droping existing collection if present in database training_database"
            )
            self.mongodb.dropCollection("training_database", "Good_Raw_Data")

            self.logger_db_writer.log(
                database_name, collection_name,
                "Starting loading of good files in database:training_database and collection: Good_Raw_Data"
            )
            files = self.az_blob_mgt.getAllFileNameFromDirectory(
                self.good_file_path)
            self.logger_db_writer.log(
                database_name, collection_name,
                "No of file found in good-raw-file-train-validated " +
                str(len(files)))
            for file in files:
                try:
                    self.logger_db_writer.log(
                        database_name, collection_name,
                        "Insertion of file " + file + " started...")
                    df = self.az_blob_mgt.readCSVFilefromDir(
                        self.good_file_path, file)
                    df.columns = column_name
                    self.mongodb.insertDataFrame("training_database",
                                                 "Good_Raw_Data", df)
                    self.logger_db_writer.log(
                        database_name, collection_name,
                        "File: {0} loaded successfully".format(file))
                except Exception as e:
                    self.logger_db_writer.log(database_name, collection_name,
                                              str(e))
                    self.az_blob_mgt.moveFileinDir(self.good_file_path,
                                                   self.bad_file_path, file)
                    self.logger_db_writer.log(
                        database_name, collection_name, "File " + file +
                        " was not loaded successfully hence moved tp dir:" +
                        self.bad_file_path)

        except Exception as e:
            error_message = "Error occured in class:DbOperationMongoDB method:insertIntoTableGoodData error:" + str(
                e)
            self.logger_db_writer.log(database_name, collection_name,
                                      error_message)

    def selectingDatafromtableintocsv(self, ):
        """

        :return:
        """
        database_name = "wafer_training_log"
        collection_name = "export_to_csv"
        try:
            directory_name = "training-file-from-db"
            file_name = "InputFile"

            training_database = "training_database"
            training_collection = "Good_Raw_Data"
            msg = "starting of loading of database:training_database,collection:Good_Raw_Data records into InputFile.csv"
            print(msg)
            self.logger_db_writer.log(database_name, collection_name, msg)
            df = self.mongodb.getDataFrameofCollection(training_database,
                                                       training_collection)
            print(df)
            msg = "Good_Raw_data has been loaded into pandas dataframe"
            self.logger_db_writer.log(database_name, collection_name, msg)
            self.az_blob_mgt.saveDataFrametoCSV(directory_name, file_name, df)
            msg = "InputFile.csv created successfully in directory " + directory_name
            self.logger_db_writer.log(database_name, collection_name, msg)
        except Exception as e:
            msg = "Error occured in class:DbOperationMongoDB method:insertIntoTableGoodData error:" + str(
                e)
            self.logger_db_writer.log(database_name, collection_name, msg)
class prediction:
    def __init__(self, path, execution_id):
        #self.file_object = open("Prediction_Logs/Prediction_Log.txt", 'a+')
        #self.log_writer = logger.App_Logger()
        #self.pred_data_val = Prediction_Data_validation(path)
        self.execution_id = execution_id
        self.log_database = "strength_prediction_log"
        self.log_collection = "prediction_log"
        self.log_db_writer = App_LoggerDB(execution_id)
        self.az_blob_mgt = AzureBlobManagement()
        if path is not None:
            self.pred_data_val = Prediction_Data_validation(path, execution_id)

    def predictionFromModel(self):

        try:
            self.pred_data_val.deletePredictionFile(
            )  #deletes the existing prediction file from last run!
            self.log_db_writer.log(self.log_database, self.log_collection,
                                   'Start of Prediction')
            print("start of prediction")
            data_getter = data_loader_prediction.Data_Getter_Pred(
                self.log_database, self.log_collection, self.execution_id)
            data = data_getter.get_data()

            path = ""
            if data.__len__() == 0:
                self.log_db_writer.log(
                    self.log_database, self.log_collection,
                    "No data was present to perform prediction existing prediction method"
                )
                return path, "No data was present to perform prediction"

            #code change
            # wafer_names=data['Wafer']
            # data=data.drop(labels=['Wafer'],axis=1)

            preprocessor = preprocessing.Preprocessor(self.log_database,
                                                      self.log_collection,
                                                      self.execution_id)

            is_null_present, cols_with_missing_values = preprocessor.is_null_present(
                data)
            if (is_null_present):
                data = preprocessor.impute_missing_values(data)

            data = preprocessor.logTransformation(data)
            print("after log Transformation")
            print(data)

            #scale the prediction data
            data_scaled = pandas.DataFrame(
                preprocessor.standardScalingData(data), columns=data.columns)

            print("standard scaling for data completed")
            print(data_scaled)

            #data=data.to_numpy()
            file_loader = file_methods.File_Operation(self.log_database,
                                                      self.log_collection,
                                                      self.execution_id)
            kmeans = file_loader.load_model('kkmeans')  # name must match the clustering model file saved during training

            ##Code changed
            #pred_data = data.drop(['Wafer'],axis=1)
            clusters = kmeans.predict(
                data_scaled)  # assign each record to a cluster
            data_scaled['clusters'] = clusters
            clusters = data_scaled['clusters'].unique()
            result = []  # initialize blank list for storing predictions
            # with open('EncoderPickle/enc.pickle', 'rb') as file: #let's load the encoder pickle file to decode the values
            #     encoder = pickle.load(file)

            for i in clusters:
                cluster_data = data_scaled[data_scaled['clusters'] == i]
                cluster_data = cluster_data.drop(['clusters'], axis=1)
                model_name = file_loader.find_correct_model_file(i)
                print(model_name)
                model = file_loader.load_model(model_name)
                for val in (model.predict(cluster_data.values)):
                    result.append(val)

            result = pandas.DataFrame(result, columns=['strength-Predictions'])

            #result = list(model.predict(cluster_data))
            #self.result = pandas.DataFrame(list(zip(result)), columns=['Prediction'])
            #for val in (model.predict(cluster_data.values)):
            #    result.append(val)
            #print(self.result.shape)
            print("results after prediction with prediction columns")
            print(result)

            path = "Prediction-Output-File"
            #result.to_csv("Prediction_Output_File/Predictions.csv",header=True) #appends result to prediction file
            self.az_blob_mgt.saveDataFrametoCSV(
                path,
                "cement-strength-prediction.csv",
                result,
                header=True,
                mode="a+")

            self.log_db_writer.log(self.log_database, self.log_collection,
                                   'End of Prediction')
        except Exception as ex:
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Error occurred while running the prediction!! Error:: %s' % ex)
            raise ex
        return path, result.head().to_json(orient="records")
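A minimal usage sketch for the prediction flow above, assuming a hypothetical prediction-batch directory and execution id:

    pred = prediction("prediction-batch-files", execution_id="exec-001")
    path, preview_json = pred.predictionFromModel()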
class train_validation:
    def __init__(self, path, execution_id):
        self.raw_data = Raw_Data_validation(path, execution_id)
        self.dataTransform = dataTransform(execution_id)

        self.dBOperationMongoDB = DbOperationMongoDB(execution_id)
        #self.file_object = open("Training_Logs/Training_Main_Log.txt", 'a+')
        self.log_database = "wafer_training_log"
        self.log_collection = "training_main_log"
        self.execution_id = execution_id
        #self.log_writer = logger.App_Logger()
        self.logDB_write = App_LoggerDB(execution_id=execution_id)
        self.az_blob_mgt = AzureBlobManagement()

    def train_validation(self):
        try:
            self.logDB_write.log(self.log_database, self.log_collection,
                                 'Start of Validation on files!!')
            # extracting values from prediction schema
            LengthOfDateStampInFile, LengthOfTimeStampInFile, column_names, noofcolumns = self.raw_data.valuesFromSchema(
            )
            # getting the regex defined to validate filename
            regex = self.raw_data.manualRegexCreation()
            # validating filename of prediction files
            self.raw_data.validationFileNameRaw(regex, LengthOfDateStampInFile,
                                                LengthOfTimeStampInFile)
            # validating column length in the file
            self.raw_data.validateColumnLength(noofcolumns)
            # validating if any column has all values missing
            self.raw_data.validateMissingValuesInWholeColumn()
            self.logDB_write.log(self.log_database, self.log_collection,
                                 "Raw Data Validation Complete!!")

            self.logDB_write.log(self.log_database, self.log_collection,
                                 "Starting Data Transforamtion!!")
            # replacing blanks in the csv file with "Null" values to insert in table
            self.dataTransform.replaceMissingWithNull()
            print("Missing value with NULL completed")

            self.logDB_write.log(self.log_database, self.log_collection,
                                 "DataTransformation Completed!!!")

            self.logDB_write.log(
                self.log_database, self.log_collection,
                "Creating database and collection if not exist then insert record"
            )
            # create database with given name, if present open the connection! Create table with columns given in schema
            #self.dBOperationMongoDB.insertIntoTableGoodData(column_names)
            #self.logDB_write.log(self.log_database, self.log_collection, "Table creation Completed!!")
            #self.logDB_write.log(self.log_database, self.log_collection, "Insertion of Data into Table started!!!!")
            # insert csv files stored in azure storage in the table in mongodb location
            self.dBOperationMongoDB.insertIntoTableGoodData(column_names)
            self.logDB_write.log(self.log_database, self.log_collection,
                                 "Insertion in Table completed!!!")
            self.logDB_write.log(self.log_database, self.log_collection,
                                 "Deleting Good Data Folder!!!")
            # Delete the good data folder after loading files in table
            #self.raw_data.deleteExistingGoodDataTrainingFolder()
            self.logDB_write.log(self.log_database, self.log_collection,
                                 "Good_Data folder deleted!!!")
            self.logDB_write.log(
                self.log_database, self.log_collection,
                "Moving bad files to Archive and deleting Bad_Data folder!!!")
            # Move the bad files to archive folder
            self.raw_data.moveBadFilesToArchiveBad()
            self.logDB_write.log(
                self.log_database, self.log_collection,
                "Bad files moved to archive!! Bad folder Deleted!!")
            self.logDB_write.log(self.log_database, self.log_collection,
                                 "Validation Operation completed!!")
            self.logDB_write.log(self.log_database, self.log_collection,
                                 "Extracting csv file from table")
            # export the data in the MongoDB collection to a csv file
            self.dBOperationMongoDB.selectingDatafromtableintocsv()
            #self.file_object.close()

        except Exception as e:
            raise e
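
# --- Hedged usage sketch (not part of the original snippet) ---
# Assuming the enclosing class is the training-validation entry point of
# this project, with the (path, execution_id) constructor shown above,
# the whole batch-validation flow could be driven like this:
#
#     tv = train_validation("training-batch-files", "exec-001")
#     tv.train_validation()  # validate names/columns, insert good rows
#                            # into MongoDB, export the merged csv to blob
#
# The class name train_validation and the container name
# "training-batch-files" are assumptions for illustration.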
Exemple #15
0
class Model_Finder:
    """
    This class shall be used to find the model with the best accuracy and AUC score.

    Written By: iNeuron Intelligence
    Version: 1.0
    Revisions: None

    """
    def __init__(self, log_database, log_collection, execution_id):
        #self.file_object = file_object
        #self.logger_object = logger_object

        self.execution_id = execution_id
        self.log_db_writer = App_LoggerDB(execution_id=execution_id)
        self.log_database = log_database
        self.log_collection = log_collection
        self.az_blob_mgt = AzureBlobManagement()
        self.mongoDBObject = MongodbOperation()

        self.clf = RandomForestClassifier()
        self.xgb = XGBClassifier(objective='binary:logistic')

    def get_best_params_for_random_forest(self, train_x, train_y):
        """
        Method Name: get_best_params_for_random_forest
        Description: Get the parameters for the Random Forest algorithm which give the best accuracy.
                     Uses hyperparameter tuning.
        Output: The model with the best parameters
        On Failure: Raise Exception

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        self.log_db_writer.log(
            self.log_database, self.log_collection,
            'Entered the get_best_params_for_random_forest method of the Model_Finder class'
        )
        try:
            # initializing with different combination of parameters
            self.param_grid = {
                "n_estimators": [10, 50, 100, 130],
                "criterion": ['gini', 'entropy'],
                "max_depth": range(2, 4, 1),
                # note: 'auto' was removed from RandomForestClassifier in scikit-learn 1.3; use 'sqrt' there
                "max_features": ['auto', 'log2']
            }

            #Creating an object of the Grid Search class
            self.grid = GridSearchCV(estimator=self.clf,
                                     param_grid=self.param_grid,
                                     cv=5,
                                     verbose=3)
            #finding the best parameters
            self.grid.fit(train_x, train_y)

            #extracting the best parameters
            self.criterion = self.grid.best_params_['criterion']
            self.max_depth = self.grid.best_params_['max_depth']
            self.max_features = self.grid.best_params_['max_features']
            self.n_estimators = self.grid.best_params_['n_estimators']

            #creating a new model with the best parameters
            self.clf = RandomForestClassifier(n_estimators=self.n_estimators,
                                              criterion=self.criterion,
                                              max_depth=self.max_depth,
                                              max_features=self.max_features)
            # training the new model
            self.clf.fit(train_x, train_y)
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Random Forest best params: ' + str(self.grid.best_params_) +
                '. Exited the get_best_params_for_random_forest method of the Model_Finder class'
            )

            return self.clf
        except Exception as e:
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Exception occurred in get_best_params_for_random_forest method of the Model_Finder class. Exception message: '
                + str(e))
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Random Forest parameter tuning failed. Exited the get_best_params_for_random_forest method of the Model_Finder class'
            )
            raise Exception()

    def get_best_params_for_xgboost(self, train_x, train_y):
        """
        Method Name: get_best_params_for_xgboost
        Description: Get the parameters for the XGBoost algorithm which give the best accuracy.
                     Uses hyperparameter tuning.
        Output: The model with the best parameters
        On Failure: Raise Exception

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        self.log_db_writer.log(
            self.log_database, self.log_collection,
            'Entered the get_best_params_for_xgboost method of the Model_Finder class'
        )
        try:
            # initializing with different combination of parameters
            self.param_grid_xgboost = {
                'learning_rate': [0.5, 0.1, 0.01, 0.001],
                'max_depth': [3, 5, 10, 20],
                'n_estimators': [10, 50, 100, 200]
            }
            # Creating an object of the Grid Search class
            self.grid = GridSearchCV(
                XGBClassifier(objective='binary:logistic'),
                self.param_grid_xgboost,
                verbose=3,
                cv=5)
            # finding the best parameters
            self.grid.fit(train_x, train_y)

            # extracting the best parameters
            self.learning_rate = self.grid.best_params_['learning_rate']
            self.max_depth = self.grid.best_params_['max_depth']
            self.n_estimators = self.grid.best_params_['n_estimators']

            # creating a new model with the best parameters
            self.xgb = XGBClassifier(learning_rate=self.learning_rate,
                                     max_depth=self.max_depth,
                                     n_estimators=self.n_estimators)
            # training the new model
            self.xgb.fit(train_x, train_y)
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'XGBoost best params: ' + str(self.grid.best_params_) +
                '. Exited the get_best_params_for_xgboost method of the Model_Finder class'
            )
            return self.xgb
        except Exception as e:
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Exception occurred in get_best_params_for_xgboost method of the Model_Finder class. Exception message: '
                + str(e))
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'XGBoost parameter tuning failed. Exited the get_best_params_for_xgboost method of the Model_Finder class'
            )
            raise Exception()

    def get_best_model(self, train_x, train_y, test_x, test_y):
        """
        Method Name: get_best_model
        Description: Find out the model which has the best AUC score.
        Output: The best model name and the model object
        On Failure: Raise Exception

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        self.log_db_writer.log(
            self.log_database, self.log_collection,
            'Entered the get_best_model method of the Model_Finder class')
        # create best model for XGBoost
        try:
            self.xgboost = self.get_best_params_for_xgboost(train_x, train_y)
            self.prediction_xgboost = self.xgboost.predict(
                test_x)  # Predictions using the XGBoost Model

            if len(test_y.unique()) == 1:
                # roc_auc_score raises an error when y has only one label; use accuracy instead
                self.xgboost_score = accuracy_score(test_y,
                                                    self.prediction_xgboost)
                self.log_db_writer.log(self.log_database, self.log_collection,
                                       'Accuracy for XGBoost:' +
                                       str(self.xgboost_score))  # Log accuracy
            else:
                self.xgboost_score = roc_auc_score(
                    test_y, self.prediction_xgboost)  # AUC for XGBoost
                self.log_db_writer.log(self.log_database, self.log_collection,
                                       'AUC for XGBoost:' +
                                       str(self.xgboost_score))  # Log AUC

            # create best model for Random Forest
            self.random_forest = self.get_best_params_for_random_forest(
                train_x, train_y)
            self.prediction_random_forest = self.random_forest.predict(
                test_x)  # prediction using the Random Forest Algorithm

            if len(test_y.unique()) == 1:
                # roc_auc_score raises an error when y has only one label; use accuracy instead
                self.random_forest_score = accuracy_score(
                    test_y, self.prediction_random_forest)
                self.log_db_writer.log(
                    self.log_database, self.log_collection,
                    'Accuracy for RF:' + str(self.random_forest_score))
            else:
                self.random_forest_score = roc_auc_score(
                    test_y,
                    self.prediction_random_forest)  # AUC for Random Forest
                self.log_db_writer.log(
                    self.log_database, self.log_collection,
                    'AUC for RF:' + str(self.random_forest_score))

            # comparing the two models on the same held-out score
            if self.random_forest_score < self.xgboost_score:
                return 'XGBoost', self.xgboost
            else:
                return 'RandomForest', self.random_forest

        except Exception as e:
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Exception occurred in get_best_model method of the Model_Finder class. Exception message: '
                + str(e))
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Model Selection Failed. Exited the get_best_model method of the Model_Finder class'
            )
            raise Exception()
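
# --- Hedged standalone sketch (not part of the original snippet) ---
# A minimal, self-contained illustration of the selection logic used by
# get_best_model above: tune a classifier with GridSearchCV, score it on
# held-out data with roc_auc_score, and fall back to accuracy_score when
# the test labels contain only one class. The synthetic data and the tiny
# parameter grid are assumptions made for brevity.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split

X, y = make_classification(n_samples=300, n_features=10, random_state=0)
train_x, test_x, train_y, test_y = train_test_split(X, y, random_state=0)

grid = GridSearchCV(RandomForestClassifier(random_state=0),
                    {"n_estimators": [10, 50], "max_depth": [2, 3]}, cv=3)
grid.fit(train_x, train_y)
preds = grid.best_estimator_.predict(test_x)

# Same guard as in get_best_model: roc_auc_score raises on a single class.
if len(set(test_y)) == 1:
    score = accuracy_score(test_y, preds)
else:
    score = roc_auc_score(test_y, preds)
print("RF score:", score)
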
class DbOperationMongoDB:
    def __init__(self, execution_id):
        self.mongodb = MongoDBOperation()
        self.az_blob_mgt = AzureBlobManagement()
        self.logger_db_writer = App_LoggerDB(execution_id=execution_id)
        self.good_file_path = "good-raw-file-prediction-validated"
        self.bad_file_path = "bad-raw-file-prediction-validated"

    def insertIntoTableGoodData(self, column_names):
        """
        Description: Load all csv files from the good-file directory into the MongoDB
                     database "prediction_database", collection "Good_Raw_Data".

        :return: None
        """
        try:
            prediction_database = "prediction_database"
            prediction_collection = "Good_Raw_Data"
            database_name = "wafer_prediction_log"
            collection_name = "db_insert_log"
            self.mongodb.dropCollection(prediction_database,
                                        prediction_collection)
            self.logger_db_writer.log(
                database_name, collection_name,
                "Dropping collection:" + prediction_collection +
                " from database:" + prediction_database)
            self.logger_db_writer.log(
                database_name, collection_name,
                "Starting loading of good files into database:" +
                prediction_database + " and collection:" + prediction_collection)
            files = self.az_blob_mgt.getAllFileNameFromDirectory(
                self.good_file_path)
            self.logger_db_writer.log(
                database_name, collection_name,
                "No of files found in " + self.good_file_path + ": " +
                str(len(files)))
            for file in files:
                try:
                    self.logger_db_writer.log(
                        database_name, collection_name,
                        "Insertion of file " + file + " started...")
                    df = self.az_blob_mgt.readCsvFileFromDirectory(
                        self.good_file_path, file)
                    df.columns = column_names
                    self.mongodb.insertDataFrame(prediction_database,
                                                 prediction_collection, df)
                    self.logger_db_writer.log(
                        database_name, collection_name,
                        "File: {0} loaded successfully".format(file))
                except Exception as e:
                    self.logger_db_writer.log(database_name, collection_name,
                                              str(e))
                    self.az_blob_mgt.moveFileInDirectory(
                        self.good_file_path, self.bad_file_path, file)
                    self.logger_db_writer.log(
                        database_name, collection_name, "File: " + file +
                        " was not loaded successfully, hence moved to dir: " +
                        self.bad_file_path)

        except Exception as e:
            error_message = ("Error occurred in class:DbOperationMongoDB "
                             "method:insertIntoTableGoodData error:" + str(e))
            self.logger_db_writer.log(database_name, collection_name,
                                      error_message)

    def selectingDatafromtableintocsv(self):
        """
        Description: Export the records in database "prediction_database",
                     collection "Good_Raw_Data" to InputFile.csv in blob storage.

        :return: None
        """
        try:
            directory_name = "prediction-file-from-db"
            file_name = "InputFile.csv"
            database_name = "wafer_prediction_log"
            collection_name = "export_to_csv"
            prediction_database = "prediction_database"
            prediction_collection = "Good_Raw_Data"
            msg = "Starting load of records from database:" + prediction_database + ", collection:" + prediction_collection + " into file:" + file_name
            self.logger_db_writer.log(database_name, collection_name, msg)
            df = self.mongodb.getDataFrameofCollection(prediction_database,
                                                       prediction_collection)
            msg = "Good_Raw_data has been loaded into pandas dataframe"
            self.logger_db_writer.log(database_name, collection_name, msg)
            self.az_blob_mgt.saveDataFrameTocsv(directory_name, file_name, df)
            msg = "InputFile.csv created successfully in directory " + directory_name
            self.logger_db_writer.log(database_name, collection_name, msg)
        except Exception as e:
            msg = ("Error occurred in class:DbOperationMongoDB "
                   "method:selectingDatafromtableintocsv error:" + str(e))
            self.logger_db_writer.log(database_name, collection_name, msg)
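
# --- Hedged sketch (not part of the original snippet) ---
# The MongoDBOperation helper used above is not shown in this listing; an
# insertDataFrame of the kind it provides could look roughly like this
# with pymongo (the connection URI and function name are assumptions):
import pandas as pd
from pymongo import MongoClient

def insert_dataframe(database: str, collection: str, df: pd.DataFrame,
                     uri: str = "mongodb://localhost:27017") -> int:
    """Insert each row of df as one document; return the inserted count."""
    client = MongoClient(uri)
    records = df.to_dict(orient="records")
    result = client[database][collection].insert_many(records)
    client.close()
    return len(result.inserted_ids)
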
class File_Operation:
    """
    This class shall be used to save the model after training
    and load the saved model for prediction.

    Written By: iNeuron Intelligence
    Version: 1.0
    Revisions: None

    """
    def __init__(self, log_database, log_collection, execution_id):
        #self.file_object = file_object
        #self.logger_object = logger_object
        #self.model_directory='models/'

        self.log_database = log_database
        self.log_collection = log_collection
        self.execution_id = execution_id
        self.log_db_writer = App_LoggerDB(execution_id=self.execution_id)
        self.model_directory = 'model'
        self.az_blob_mgt = AzureBlobManagement()

    def save_model(self, model, filename):
        """
        Method Name: save_model
        Description: Save the model file to the model directory
        Outcome: File gets saved
        On Failure: Raise Exception

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        self.log_db_writer.log(
            self.log_database, self.log_collection,
            'Entered the save_model method of the File_Operation class')
        directory_name = self.model_directory + '-' + filename
        try:
            self.az_blob_mgt.createDir(
                directory_name, is_replace=True)  # create or replace directory
            # path = os.path.join(self.model_directory,filename) #create seperate directory for each cluster
            # if os.path.isdir(path): #remove previously existing models for each clusters
            #     shutil.rmtree(self.model_directory)
            #     os.makedirs(path)
            #else:
            #     os.makedirs(path) #

            # with open(path +'/' + filename+'.sav',
            #           'wb') as f:
            #     pickle.dump(model, f) # save the model to file
            self.az_blob_mgt.saveObject(directory_name=directory_name,
                                        filename=filename + '.sav',
                                        object_name=model)
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Model File ' + filename +
                ' saved. Exited the save_model method of the File_Operation class'
            )

            return 'success'
        except Exception as e:
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Exception occurred in save_model method of the'
                ' File_Operation class. Exception message: ' + str(e))
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Model File ' + filename +
                ' could not be saved. Exited the save_model method of the File_Operation class'
            )
            raise Exception()

    def load_model(self, filename):
        """
        Method Name: load_model
        Description: Load the model file into memory
        Output: The model file loaded in memory
        On Failure: Raise Exception

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        self.log_db_writer.log(
            self.log_database, self.log_collection,
            'Entered the load_model method of the File_Operation class')
        try:
            directory = self.model_directory + '-' + filename
            filename = filename + '.sav'
            object_model = self.az_blob_mgt.loadObject(directory, filename)
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Model File ' + filename + ' loaded. Exited '
                'the load_model method of the File_Operation class')

            #with open(self.model_directory + filename + '/' + filename + '.sav',
            #          'rb') as f:
            #    self.log_db_writer.log(self.log_database,self.log_collection,
            #                           'Model File ' + filename + ' loaded. Exited the load_model method of the Model_Finder class')
            return object_model
        except Exception as e:
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Exception occurred in load_model method of the File_Operation class. Exception message: '
                + str(e))
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Model File ' + filename +
                ' could not be loaded. Exited the load_model method of the File_Operation class'
            )
            raise Exception()

    def find_correct_model_file(self, cluster_number):
        """
        Method Name: find_correct_model_file
        Description: Select the correct model based on the cluster number
        Output: The model file name
        On Failure: Raise Exception

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        self.log_db_writer.log(
            self.log_database, self.log_collection,
            'Entered the find_correct_model_file method of the File_Operation class'
        )
        try:
            self.cluster_number = cluster_number
            self.folder_name = self.model_directory
            self.list_of_model_files = []
            self.required_files = self.az_blob_mgt.dir_list
            self.list_of_files = []
            # selecting model directory only
            #self.list_of_files = os.listdir(self.folder_name)
            for dir in self.required_files:
                # [a-zA-Z]: the original [a-zA-z] also matched punctuation between Z and a
                if re.search("^model[-][a-zA-Z]{2,17}[0-9]", dir):
                    self.list_of_files.append(dir)

            for self.file in self.list_of_files:
                try:
                    # selecting the model file names inside each model directory
                    models = self.az_blob_mgt.getAllFileNameFromDirectory(
                        self.file)
                    for model_name_ in models:
                        # membership test: str.index raises ValueError when absent,
                        # so the original "!= -1" check could never trigger
                        if str(self.cluster_number) in model_name_:
                            self.model_name = model_name_
                except Exception:
                    continue
            self.model_name = self.model_name.split('.')[0]
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Exited the find_correct_model_file method of the File_Operation class.'
            )
            return self.model_name
        except Exception as e:
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Exception occurred in find_correct_model_file method of the File_Operation class. Exception message: '
                + str(e))
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Exited the find_correct_model_file method of the File_Operation class with Failure'
            )
            raise Exception()
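
# --- Hedged sketch (not part of the original snippet) ---
# saveObject/loadObject on AzureBlobManagement are not shown in this
# listing; locally, the same round trip reduces to pickle (the helper
# names below are assumptions for illustration):
import pickle

def save_model_locally(model, path: str) -> None:
    # serialize the fitted estimator to disk
    with open(path, "wb") as f:
        pickle.dump(model, f)

def load_model_locally(path: str):
    # restore the estimator exactly as it was saved
    with open(path, "rb") as f:
        return pickle.load(f)
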
class Raw_Data_validation:
    """
    This class shall be used for handling all the validation done on the raw training data.

    Written By: iNeuron Intelligence
    Version: 1.0
    Revisions: None

    """
    def __init__(self, path, execution_id):
        self.Batch_Directory = path
        self.execution_id = execution_id
        #self.schema_path = 'schema_training.json'
        self.collection_name = "schema_training"  #code added by Avnish yadav
        self.database_name = "wafer_sys"  #code added by Avnish yadav
        #self.logger = App_Logger()
        self.logger_db_writer = App_LoggerDB(
            execution_id=execution_id)  #code added by Avnish yadav
        self.mongodb = MongoDBOperation()
        self.az_blob_mgt = AzureBlobManagement()
        self.good_directory_path = "good-raw-file-train-validated"
        self.bad_directory_path = "bad-raw-file-train-validated"

    def valuesFromSchema(self):
        """
        Method Name: valuesFromSchema
        Description: This method extracts all the relevant information from the pre-defined "Schema" file.
        Output: LengthOfDateStampInFile, LengthOfTimeStampInFile, column_names, NumberofColumns
        On Failure: Raise ValueError, KeyError, Exception

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        try:
            """code commented by Avnish Yadav
            with open(self.schema_path, 'r') as f:
                dic = json.load(f)
                f.close()
            """
            #code started by Avnish Yadav
            log_database = "wafer_training_log"
            log_collection = "values_from_schema_validation"

            df_schema_training = self.mongodb.getDataFrameofCollection(
                self.database_name, self.collection_name)
            dic = {}
            for i in df_schema_training.columns:
                dic[i] = df_schema_training.loc[0, i]
            del df_schema_training
            #code ended by Avnish Yadav

            pattern = dic['SampleFileName']
            LengthOfDateStampInFile = dic['LengthOfDateStampInFile']
            LengthOfTimeStampInFile = dic['LengthOfTimeStampInFile']
            column_names = dic['ColName']
            NumberofColumns = dic['NumberofColumns']

            #file = open("Training_Logs/valuesfromSchemaValidationLog.txt", 'a+')
            message = ("LengthOfDateStampInFile:: %s" % LengthOfDateStampInFile + "\t" +
                       "LengthOfTimeStampInFile:: %s" % LengthOfTimeStampInFile + "\t" +
                       "NumberofColumns:: %s" % NumberofColumns + "\n")
            #self.logger.log(file,message) code commented by Avnish Yadav
            self.logger_db_writer.log(log_database, log_collection, message)

            #file.close()

        except ValueError:
            #file = open("Training_Logs/valuesfromSchemaValidationLog.txt", 'a+')
            #self.logger.log(file,"ValueError:Value not found inside schema_training.json")
            #file.close()
            self.logger_db_writer.log(
                log_database, log_collection,
                "Error occurred in class:Raw_Data_validation method:valuesFromSchema ValueError: value not found inside collection schema_training"
            )
            raise ValueError

        except KeyError:
            #file = open("Training_Logs/valuesfromSchemaValidationLog.txt", 'a+')
            #self.logger.log(file, "KeyError:Key value error incorrect key passed")
            #file.close()
            self.logger_db_writer.log(
                log_database, log_collection,
                "Error occurred in class:Raw_Data_validation method:valuesFromSchema KeyError: incorrect key passed"
            )

            raise KeyError

        except Exception as e:
            #file = open("Training_Logs/valuesfromSchemaValidationLog.txt", 'a+')
            #self.logger.log(file, str(e))
            #file.close()
            self.logger_db_writer.log(
                log_database, log_collection,
                "Error occurred in class:Raw_Data_validation method:valuesFromSchema error: "
                + str(e))
            raise e

        return LengthOfDateStampInFile, LengthOfTimeStampInFile, column_names, NumberofColumns
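    # Hedged aside (not in the original snippet): the schema_training
    # collection is assumed to hold a single document shaped roughly like
    #   {"SampleFileName": "wafer_08012020_120000.csv",
    #    "LengthOfDateStampInFile": 8, "LengthOfTimeStampInFile": 6,
    #    "NumberofColumns": 592, "ColName": {...}},
    # which the loop in valuesFromSchema flattens into the plain dict `dic`.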

    def manualRegexCreation(self):
        """
        Method Name: manualRegexCreation
        Description: This method contains a manually defined regex based on the "FileName" given in the "Schema" file.
                     This regex is used to validate the filename of the training data.
        Output: Regex pattern
        On Failure: None

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        # raw string avoids invalid-escape warnings; the pattern itself is unchanged
        regex = r"['wafer']+['\_'']+[\d_]+[\d]+\.csv"
        return regex
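    # Hedged aside (not in the original snippet): with this pattern,
    # re.match accepts names like "wafer_08012020_120000.csv"; the exact
    # date-stamp and time-stamp lengths are validated separately in
    # validationFileNameRaw using the values read from the schema.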

    def createDirectoryForGoodBadRawData(self):
        """
        Method Name: createDirectoryForGoodBadRawData
        Description: This method creates directories to store the Good Data and Bad Data
                     after validating the training data.
        Output: None
        On Failure: OSError

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        """
        try:
            path = os.path.join("Training_Raw_files_validated/", "Good_Raw/")
            if not os.path.isdir(path):
                os.makedirs(path)
            path = os.path.join("Training_Raw_files_validated/", "Bad_Raw/")
            if not os.path.isdir(path):
                os.makedirs(path)

        except OSError as ex:
            file = open("Training_Logs/GeneralLog.txt", 'a+')
            self.logger.log(file,"Error while creating Directory %s:" % ex)
            file.close()
            raise OSError
        """
        try:
            log_database = "wafer_training_log"
            log_collection = "general_log"
            self.az_blob_mgt.createDirectory(self.good_directory_path,
                                             is_replace=True)
            self.az_blob_mgt.createDirectory(self.bad_directory_path,
                                             is_replace=True)
            msg = self.good_directory_path + " and " + self.bad_directory_path + " created successfully."
            self.logger_db_writer.log(log_database, log_collection, msg)
        except Exception as e:
            msg = ("Error occurred in class Raw_Data_validation method:createDirectoryForGoodBadRawData "
                   "error: failed to create directory " + self.good_directory_path +
                   " and " + self.bad_directory_path + " : " + str(e))
            self.logger_db_writer.log(log_database, log_collection, msg)
            raise e

    def deleteExistingGoodDataTrainingFolder(self):
        """
        Method Name: deleteExistingGoodDataTrainingFolder
        Description: This method deletes the directory made to store the Good Data
                     after loading the data in the table. Once the good files are
                     loaded in the DB, deleting the directory ensures space optimization.
        Output: None
        On Failure: OSError

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """

        try:
            """
            path = 'Training_Raw_files_validated/'
            # if os.path.isdir("ids/" + userName):
            # if os.path.isdir(path + 'Bad_Raw/'):
            #     shutil.rmtree(path + 'Bad_Raw/')
            if os.path.isdir(path + 'Good_Raw/'):
                shutil.rmtree(path + 'Good_Raw/')
                file = open("Training_Logs/GeneralLog.txt", 'a+')
                self.logger.log(file,"GoodRaw directory deleted successfully!!!")
                file.close()
            """
            log_database = "wafer_training_log"
            log_collection = "general_log"
            self.az_blob_mgt.deleteDirectory(self.good_directory_path)
            self.logger_db_writer.log(
                log_database, log_collection,
                self.good_directory_path + " deleted successfully!!")
            """
        except OSError as s:
            file = open("Training_Logs/GeneralLog.txt", 'a+')
            self.logger.log(file,"Error while Deleting Directory : %s" %s)
            file.close()
            raise OSError
            """
        except Exception as e:
            msg = ("Error occurred in class Raw_Data_validation method:deleteExistingGoodDataTrainingFolder "
                   "error occurred while deleting: " + self.good_directory_path + " : " + str(e))
            self.logger_db_writer.log(log_database, log_collection, msg)
            raise e

    def deleteExistingBadDataTrainingFolder(self):
        """
        Method Name: deleteExistingBadDataTrainingFolder
        Description: This method deletes the directory made to store the bad Data.
        Output: None
        On Failure: OSError

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """

        try:
            """
            path = 'Training_Raw_files_validated/'
            if os.path.isdir(path + 'Bad_Raw/'):
                shutil.rmtree(path + 'Bad_Raw/')
                file = open("Training_Logs/GeneralLog.txt", 'a+')
                self.logger.log(file,"BadRaw directory deleted before starting validation!!!")
                file.close()
        except OSError as s:
            file = open("Training_Logs/GeneralLog.txt", 'a+')
            self.logger.log(file,"Error while Deleting Directory : %s" %s)
            file.close()
            raise OSError"""
            log_database = "wafer_training_log"
            log_collection = "general_log"
            self.az_blob_mgt.deleteDirectory(self.bad_directory_path)
            self.logger_db_writer.log(
                log_database, log_collection,
                self.bad_directory_path + " deleted successfully!!")

        except Exception as e:
            msg = ("Error occurred in class Raw_Data_validation method:deleteExistingBadDataTrainingFolder "
                   "error occurred while deleting: " + self.bad_directory_path + " : " + str(e))
            self.logger_db_writer.log(log_database, log_collection, msg)
            raise e

    def moveBadFilesToArchiveBad(self):
        """
        Method Name: moveBadFilesToArchiveBad
        Description: This method deletes the directory made to store the Bad Data
                     after moving the data to an archive folder. We archive the bad
                     files to send them back to the client to flag the invalid data.
        Output: None
        On Failure: OSError

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        now = datetime.now()
        date = now.date()
        time = now.strftime("%H%M%S")
        try:
            log_database = "wafer_training_log"
            log_collection = "general_log"

            #source = 'Training_Raw_files_validated/Bad_Raw/'
            source = self.bad_directory_path
            destination = "lat-" + self.execution_id
            self.logger_db_writer.log(log_database, log_collection,
                                      "Started moving bad raw data..")
            for file in self.az_blob_mgt.getAllFileNameFromDirectory(source):

                self.az_blob_mgt.moveFileInDirectory(source, destination, file)
                self.logger_db_writer.log(
                    log_database, log_collection, "File:" + file +
                    " moved to directory:" + destination + " successfully.")

            self.logger_db_writer.log(
                log_database, log_collection,
                "All bad raw files moved to directory:" + destination)

            self.az_blob_mgt.deleteDirectory(source)
            self.logger_db_writer.log(log_database, log_collection,
                                      "Deleted bad raw directory:" + source)
            """
            if os.path.isdir(source):

                path = "TrainingArchiveBadData"

                if not os.path.isdir(path):
                    os.makedirs(path)
                dest = 'TrainingArchiveBadData/BadData_' + str(date)+"_"+str(time)
                if not os.path.isdir(dest):
                    os.makedirs(dest)
                files = os.listdir(source)
                for f in files:
                    if f not in os.listdir(dest):
                        shutil.move(source + f, dest)
                file = open("Training_Logs/GeneralLog.txt", 'a+')
                self.logger.log(file,"Bad files moved to archive")
                path = 'Training_Raw_files_validated/'
                if os.path.isdir(path + 'Bad_Raw/'):
                    shutil.rmtree(path + 'Bad_Raw/')
                self.logger.log(file,"Bad Raw Data Folder Deleted successfully!!")
                file.close()
                """
        except Exception as e:
            """
            file = open("Training_Logs/GeneralLog.txt", 'a+')
            self.logger.log(file, "Error while moving bad files to archive:: %s" % e)
            file.close()
            """
            self.logger_db_writer.log(
                log_database, log_collection,
                "class Raw_Data_validation method:moveBadFilesToArchiveBad Error while moving bad files to archive:"
                + str(e))
            raise e

    def validationFileNameRaw(self, regex, LengthOfDateStampInFile,
                              LengthOfTimeStampInFile):
        """
        Method Name: validationFileNameRaw
        Description: This function validates the names of the training csv files against the name given in the schema.
                     A regex pattern is used for the validation. If the name format does not match, the file is moved
                     to the Bad Raw Data folder; otherwise it goes to the Good Raw Data folder.
        Output: None
        On Failure: Exception

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """

        #pattern = "['Wafer']+['\_'']+[\d_]+[\d]+\.csv"
        # delete the directories for good and bad data in case last run was unsuccessful and folders were not deleted.
        """
        self.deleteExistingBadDataTrainingFolder()
        self.deleteExistingGoodDataTrainingFolder()
        #create new directories
        """
        self.createDirectoryForGoodBadRawData()
        #onlyfiles = [f for f in listdir(self.Batch_Directory)]
        onlyfiles = self.az_blob_mgt.getAllFileNameFromDirectory(
            self.Batch_Directory)
        try:
            log_database = "wafer_training_log"
            log_collection = "name_validation_log"
            #f = open("Training_Logs/nameValidationLog.txt", 'a+')
            for filename in onlyfiles:
                if (re.match(regex, filename)):
                    splitAtDot = re.split('.csv', filename)
                    splitAtDot = (re.split('_', splitAtDot[0]))
                    if len(splitAtDot[1]) == LengthOfDateStampInFile:
                        if len(splitAtDot[2]) == LengthOfTimeStampInFile:
                            """
                            shutil.copy("Training_Batch_Files/" + filename, "Training_Raw_files_validated/Good_Raw")
                            self.logger.log(f,"Valid File name!! File moved to GoodRaw Folder :: %s" % filename)
"""
                            self.az_blob_mgt.copyFileInDirectory(
                                self.Batch_Directory, self.good_directory_path,
                                filename)
                            self.logger_db_writer.log(
                                log_database, log_collection,
                                "Valid File name!! File moved to " +
                                self.good_directory_path + " :: " + filename)

                        else:
                            """
                            shutil.copy("Training_Batch_Files/" + filename, "Training_Raw_files_validated/Bad_Raw")
                            self.logger.log(f,"Invalid File Name!! File moved to Bad Raw Folder :: %s" % filename)
                            """
                            self.az_blob_mgt.copyFileInDirectory(
                                self.Batch_Directory, self.bad_directory_path,
                                filename)
                            msg = "Invalid File Name!! File moved to " + self.bad_directory_path + " :: " + filename
                            self.logger_db_writer.log(log_database,
                                                      log_collection, msg)
                    else:
                        """
                        shutil.copy("Training_Batch_Files/" + filename, "Training_Raw_files_validated/Bad_Raw")
                        self.logger.log(f,"Invalid File Name!! File moved to Bad Raw Folder :: %s" % filename)
                        """
                        self.az_blob_mgt.copyFileInDirectory(
                            self.Batch_Directory, self.bad_directory_path,
                            filename)
                        msg = "Invalid File Name!! File moved to " + self.bad_directory_path + " :: " + filename
                        self.logger_db_writer.log(log_database, log_collection,
                                                  msg)

                else:
                    """
                    shutil.copy("Training_Batch_Files/" + filename, "Training_Raw_files_validated/Bad_Raw")
                    self.logger.log(f, "Invalid File Name!! File moved to Bad Raw Folder :: %s" % filename)
                    """
                    self.az_blob_mgt.copyFileInDirectory(
                        self.Batch_Directory, self.bad_directory_path,
                        filename)
                    msg = "Invalid File Name!! File moved to " + self.bad_directory_path + " :: " + filename
                    self.logger_db_writer.log(log_database, log_collection,
                                              msg)

            #f.close()

        except Exception as e:
            """
            f = open("Training_Logs/nameValidationLog.txt", 'a+')
            self.logger.log(f, "Error occured while validating FileName %s" % e)
            f.close()
            """
            msg = "Error occurred while validating FileName " + str(e)
            self.logger_db_writer.log(log_database, log_collection, msg)
            raise e

    def validateColumnLength(self, NumberofColumns):
        """
        Method Name: validateColumnLength
        Description: This function validates the number of columns in the csv files.
                     It should be the same as given in the schema file.
                     If not, the file is not suitable for processing and is moved to the Bad Raw Data folder.
                     If the column number matches, the file is kept in Good Raw Data for processing.
                     The csv files are missing the first column name; this function renames the missing name to "Wafer".
        Output: None
        On Failure: Exception

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        try:
            log_collection = "column_validation_log"
            log_database = "wafer_training_log"
            #f = open("Training_Logs/columnValidationLog.txt", 'a+')
            #self.logger.log(f,"Column Length Validation Started!!")
            self.logger_db_writer.log(log_database, log_collection,
                                      "Column Length Validation Started!!")
            #for file in listdir('Training_Raw_files_validated/Good_Raw/'):
            for file in self.az_blob_mgt.getAllFileNameFromDirectory(
                    self.good_directory_path):
                #csv = pd.read_csv("Training_Raw_files_validated/Good_Raw/" + file)
                csv = self.az_blob_mgt.readCsvFileFromDirectory(
                    self.good_directory_path, file)
                if csv.shape[1] != NumberofColumns:
                    # legacy local-filesystem variant:
                    #   shutil.move("Training_Raw_files_validated/Good_Raw/" + file, "Training_Raw_files_validated/Bad_Raw")
                    #   self.logger.log(f, "Invalid Column Length for the file!! File moved to Bad Raw Folder :: %s" % file)
                    self.az_blob_mgt.moveFileInDirectory(
                        self.good_directory_path, self.bad_directory_path,
                        file)
                    msg = "Invalid Column Length for the file!! File moved to " + self.bad_directory_path + " file: " + file
                    self.logger_db_writer.log(log_database, log_collection,
                                              msg)
            #self.logger.log(f, "Column Length Validation Completed!!")
            self.logger_db_writer.log(log_database, log_collection,
                                      "Column Length Validation Completed!!")
            """ except OSError:
            f = open("Training_Logs/columnValidationLog.txt", 'a+')
            self.logger.log(f, "Error Occured while moving the file :: %s" % OSError)
            f.close()
            raise OSError
            """
        except Exception as e:
            """
            f = open("Training_Logs/columnValidationLog.txt", 'a+')
            self.logger.log(f, "Error Occured:: %s" % e)
            f.close()
            """
            self.logger_db_writer.log(
                log_database, log_collection,
                "Error occurred in class Raw_Data_validation method:validateColumnLength error: "
                + str(e))

            raise e
        #f.close()

    def validateMissingValuesInWholeColumn(self):
        """
        Method Name: validateMissingValuesInWholeColumn
        Description: This function validates whether any column in the csv file has all values missing.
                     If all the values are missing, the file is not suitable for processing.
                     Such files are moved to the bad raw data folder.
        Output: None
        On Failure: Exception

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        try:
            log_database = "wafer_training_log"
            log_collection = "missing_values_in_column"
            #f = open("Training_Logs/missingValuesInColumn.txt", 'a+')
            #self.logger.log(f,"Missing Values Validation Started!!")
            self.logger_db_writer.log(log_database, log_collection,
                                      "Missing Values Validation Started!!")

            #for file in listdir('Training_Raw_files_validated/Good_Raw/'):
            for file in self.az_blob_mgt.getAllFileNameFromDirectory(
                    self.good_directory_path):
                """
                csv = pd.read_csv("Training_Raw_files_validated/Good_Raw/" + file)
                count = 0
                """
                csv = self.az_blob_mgt.readCsvFileFromDirectory(
                    self.good_directory_path,
                    file,
                )
                count = 0
                for columns in csv:
                    if (len(csv[columns]) - csv[columns].count()) == len(
                            csv[columns]):
                        count += 1
                        """
                        shutil.move("Training_Raw_files_validated/Good_Raw/" + file,
                                    "Training_Raw_files_validated/Bad_Raw")
                        self.logger.log(f,"Invalid Column Length for the file!! File moved to Bad Raw Folder :: %s" % file)
                        """

                        self.az_blob_mgt.moveFileInDirectory(
                            self.good_directory_path, self.bad_directory_path,
                            file)
                        msg = "All values missing in a column!! File moved to " + self.bad_directory_path + " :: " + file
                        self.logger_db_writer.log(log_database, log_collection,
                                                  msg)
                        break
                if count == 0:
                    # pandas names a missing first header "Unnamed: 0"
                    csv.rename(columns={"Unnamed: 0": "Wafer"}, inplace=True)
                    self.az_blob_mgt.saveDataFrameTocsv(
                        self.good_directory_path,
                        file,
                        csv,
                        index=None,
                        header=True)

                    #csv.to_csv("Training_Raw_files_validated/Good_Raw/" + file, index=None, header=True)
                """
        except OSError:
            f = open("Training_Logs/missingValuesInColumn.txt", 'a+')
            self.logger.log(f, "Error Occured while moving the file :: %s" % OSError)
            f.close()
            raise OSError
            """
        except Exception as e:
            #f = open("Training_Logs/missingValuesInColumn.txt", 'a+')
            #self.logger.log(f, "Error Occured:: %s" % e)
            #f.close()
            self.logger_db_writer.log(
                log_database, log_collection,
                "Error occurred in class Raw_Data_validation method:validateMissingValuesInWholeColumn error: "
                + str(e))
            raise e
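
# --- Hedged usage sketch (not part of the original snippet) ---
# The methods above are typically chained in the order used by
# train_validation earlier in this listing:
#
#     rdv = Raw_Data_validation("training-batch-files", "exec-001")
#     d_len, t_len, cols, n_cols = rdv.valuesFromSchema()
#     regex = rdv.manualRegexCreation()
#     rdv.validationFileNameRaw(regex, d_len, t_len)
#     rdv.validateColumnLength(n_cols)
#     rdv.validateMissingValuesInWholeColumn()
#
# The container name "training-batch-files" is a placeholder; the real
# path is supplied by the caller.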
class prediction:
    def __init__(self, path, execution_id):
        self.execution_id = execution_id
        #self.file_object = open("Prediction_Logs/Prediction_Log.txt", 'a+')
        #self.log_writer = logger.App_Logger()
        self.log_database = "wafer_prediction_log"
        self.log_collection = "prediction_log"
        self.log_db_writer = App_LoggerDB(execution_id)
        self.az_blob_mgt = AzureBlobManagement()
        if path is not None:
            self.pred_data_val = Prediction_Data_validation(path, execution_id)

    def predictionFromModel(self):

        try:
            # delete the existing prediction file from the last run
            self.pred_data_val.deletePredictionFile()
            self.log_db_writer.log(self.log_database, self.log_collection,
                                   'Start of Prediction')
            data_getter = data_loader_prediction.Data_Getter_Pred(
                self.log_database, self.log_collection, self.execution_id)
            data = data_getter.get_data()
            path = ""
            if len(data) == 0:
                self.log_db_writer.log(
                    self.log_database, self.log_collection,
                    "No data was present to perform prediction; exiting prediction method"
                )
                return path, "No data was present to perform prediction"

            #code change
            # wafer_names=data['Wafer']
            # data=data.drop(labels=['Wafer'],axis=1)

            preprocessor = preprocessing.Preprocessor(self.log_database,
                                                      self.log_collection,
                                                      self.execution_id)
            is_null_present = preprocessor.is_null_present(data)
            if is_null_present:
                data = preprocessor.impute_missing_values(data)

            cols_to_drop = preprocessor.get_columns_with_zero_std_deviation(
                data)
            data = preprocessor.remove_columns(data, cols_to_drop)

            #data=data.to_numpy()
            file_loader = file_methods.File_Operation(self.log_database,
                                                      self.log_collection,
                                                      self.execution_id)
            kmeans = file_loader.load_model('KMeans')
            #print(kmeans)

            ##Code changed
            pred_data = data.drop(['Wafer'], axis=1)  # drop the id column before cluster prediction
            clusters = kmeans.predict(pred_data)
            data['clusters'] = clusters
            clust = data['clusters'].unique()
            for i in clust:
                cluster_data = data[data['clusters'] == i]  # filtering rows belonging to cluster i
                wafer_names = list(cluster_data['Wafer'])
                # drop from the cluster subset, not from the full frame (bug fix:
                # the original dropped from `data`, discarding the cluster filter)
                cluster_data = cluster_data.drop(labels=['Wafer'], axis=1)
                cluster_data = cluster_data.drop(['clusters'], axis=1)
                model_name = file_loader.find_correct_model_file(i)
                model = file_loader.load_model(model_name)
                result = list(model.predict(cluster_data))
                self.result = pandas.DataFrame(list(zip(wafer_names, result)),
                                               columns=['Wafer', 'Prediction'])
                print(self.result.shape)
                print(self.result)
                #path="Prediction_Output_File/Predictions.csv"
                path = "prediction-output-file"
                self.az_blob_mgt.saveDataFrameTocsv(path,
                                                    "prediction.csv",
                                                    self.result,
                                                    header=True,
                                                    mode="a+")

                #result.to_csv("Prediction_Output_File/Predictions.csv",header=True,mode='a+') #appends result to prediction file
            #self.log_writer.log(self.file_object,'End of Prediction')
            self.log_db_writer.log(self.log_database, self.log_collection,
                                   'End of prediction')
        except Exception as ex:

            #self.log_writer.log(self.file_object, 'Error occured while running the prediction!! Error:: %s' % ex)
            self.log_db_writer.log(
                self.log_database, self.log_collection,
                'Error occurred while running the prediction!! Error:: %s' % ex)
            raise ex

        return path, self.result.head().to_json(orient="records")
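
# --- Hedged usage sketch (not part of the original snippet) ---
# Assuming the surrounding project wires this class the same way as the
# training flow, a prediction run could look like:
#
#     pred = prediction("prediction-batch-files", "exec-001")
#     out_path, preview_json = pred.predictionFromModel()
#
# out_path names the "prediction-output-file" directory in blob storage
# and preview_json holds the first result rows as JSON records; the
# container name "prediction-batch-files" is a placeholder.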