def __init__(self):
     """Set the folder names this object works with and create its helper objects.

     NOTE(review): the enclosing class is not visible in this excerpt; the
     attribute meanings below are inferred from their names only.
     """
     self.path = 'Training_Database'
     # Names of the folders holding rejected / accepted raw training files
     # (presumably storage folder names - confirm against the methods using them).
     self.badFilePath = "Training_Bad_Raw_Files_Validated"
     self.goodFilePath = "Training_Good_Raw_Files_Validated"
     # Project helpers: logging, AWS storage access and MongoDB access.
     self.logger = App_Logger()
     self.awsObj = AwsStorageManagement()
     self.dbObj = mongoDBOperation()
 def __init__(self, path):
     """Create the validation, transformation, database, logging, email and storage helpers.

     path: handed unchanged to Raw_Data_validation.
     """
     self.raw_data = Raw_Data_validation(path)
     self.dataTransform = dataTransform()
     self.dBOperation = dBOperation()
     # presumably the log name passed to log_writer - confirm against callers
     self.file_object = 'Training_Main_Log'
     self.log_writer = logger.App_Logger()
     self.emailObj = email()
     self.awsObj = AwsStorageManagement()
 def __init__(self):
     """Create the logging, email, storage and database helpers and an empty performance list."""
     self.log_writer = logger.App_Logger()
     # presumably the log name passed to log_writer - confirm against callers
     self.file_object = 'ModelTrainingLog'
     self.emailObj = email()
     self.awsObj = AwsStorageManagement()
     # Starts empty; what gets appended is decided by code outside this excerpt.
     self.performance_list = []
     self.dbObj = mongoDBOperation()
class Data_Getter:
    """
    This class shall be used for obtaining the data from the source for training.
    """
    def __init__(self, file_object, logger_object):
        """
            Method Name: __init__
            Description: Stores the log target and the logger and creates the storage helper.
            file_object: first argument of every logger_object.log call made by this class
            logger_object: object exposing log(file_object, message)
        """
        self.training_file = 'InputFile.csv'  # file name read by get_data
        self.file_object = file_object
        self.logger_object = logger_object
        self.awsObj = AwsStorageManagement()

    def get_data(self):
        """
            Method Name: get_data
            Description: Reads self.training_file from the 'Training_FileFromDB' folder through
                         the storage helper's csvToDataframe.
            Output: The value returned by csvToDataframe (presumably a pandas DataFrame -
                    confirm against AwsStorageManagement); it is also kept on self.data.
            On Failure: Logs the error and raises Exception chained to the original error.
        """
        self.logger_object.log(
            self.file_object,
            'Entered the get_data method of the Data_Getter class')
        try:
            print('Loading Dataframe')
            self.data = self.awsObj.csvToDataframe('Training_FileFromDB',
                                                   self.training_file)
            print('Dataframe Loaded')
            self.logger_object.log(
                self.file_object,
                'Data Load Successful.Exited the get_data method of the Data_Getter class'
            )
            return self.data
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occured in get_data method of the Data_Getter class. Exception message: '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'Data Load Unsuccessful.Exited the get_data method of the Data_Getter class'
            )
            # Keep the exception type callers already catch, but chain the
            # real cause so it is not lost from the traceback.
            raise Exception() from e
Exemple #5
0
 def __init__(self):
     """Set the good-data folder name and create the logging and storage helpers."""
     # presumably the folder holding validated prediction files - confirm against callers
     self.goodDataPath = "Prediction_Good_Raw_Files_Validated"
     self.logger = App_Logger()
     self.awsObj = AwsStorageManagement()
Exemple #6
0
 def __init__(self, path):
     """Store the batch directory and schema file name and create the helper objects.

     path: kept unchanged as self.Batch_Directory.
     """
     self.Batch_Directory = path
     self.schema_path = 'schema_prediction.json'
     # Project helpers: logging, AWS storage access and MongoDB access.
     self.logger = App_Logger()
     self.awsObj = AwsStorageManagement()
     self.dbObj = mongoDBOperation()
Exemple #7
0
class Prediction_Data_validation:
    """
        This class shall be used for handling all the validation done on the Raw Prediction Data!!.
    """
    def __init__(self, path):
        """
            Method Name: __init__
            Description: Stores the batch directory and schema file name and creates the
                         logging, storage and database helpers.
            path: directory of the raw prediction files; kept as self.Batch_Directory.
        """
        self.Batch_Directory = path
        # Local JSON file read by valuesFromSchema when the schema is not yet in the database.
        self.schema_path = 'schema_prediction.json'
        self.logger = App_Logger()
        self.awsObj = AwsStorageManagement()
        self.dbObj = mongoDBOperation()

    def valuesFromSchema(self):
        """
            Method Name: valuesFromSchema
            Description: This method extracts all the relevant information from the pre-defined "Schema" file.
            Output: LengthOfDateStampInFile, LengthOfTimeStampInFile, column_names, Number of Columns
            On Failure: Raise ValueError,KeyError,Exception
        """
        try:
            if not self.dbObj.isCollectionPresent('mushroomClassifierDB',
                                                  'predict_schema'):
                with open(self.schema_path, 'r') as f:
                    dic = json.load(f)
                    f.close()
                self.dbObj.insertOneRecord('mushroomClassifierDB',
                                           'predict_schema', dic)
            dic = self.dbObj.getRecords('mushroomClassifierDB',
                                        'predict_schema')
            pattern = dic['SampleFileName']
            LengthOfDateStampInFile = dic['LengthOfDateStampInFile']
            LengthOfTimeStampInFile = dic['LengthOfTimeStampInFile']
            column_names = dic['ColName']
            NumberofColumns = dic['NumberofColumns']

            file = 'valuesfromSchemaValidationLog'
            message = "LengthOfDateStampInFile:: %s" % LengthOfDateStampInFile + "\t" + "LengthOfTimeStampInFile:: %s" % LengthOfTimeStampInFile + "\t " + "NumberofColumns:: %s" % NumberofColumns + "\n"
            self.logger.log(file, message)

        except ValueError:
            file = 'valuesfromSchemaValidationLog'
            self.logger.log(
                file, "ValueError:Value not found inside schema_training.json")
            raise ValueError

        except KeyError:
            file = 'valuesfromSchemaValidationLog'
            self.logger.log(file,
                            "KeyError:Key value error incorrect key passed")
            raise KeyError

        except Exception as e:
            file = 'valuesfromSchemaValidationLog'
            self.logger.log(file, str(e))
            raise e

        return LengthOfDateStampInFile, LengthOfTimeStampInFile, column_names, NumberofColumns

    def manualRegexCreation(self):
        """
          Method Name: manualRegexCreation
          Description: Returns the manually defined regex used to validate the filename of the
                      prediction data: the literal prefix "mushroom", an underscore, a run of
                      digits (date stamp), an underscore, a run of digits (time stamp) and the
                      ".csv" extension.
          Output: Regex pattern
          On Failure: None
        """
        # The previous pattern wrapped the literals in character classes
        # ("['mushroom']+"), so names such as "room_12.csv" or
        # "mushroom_123.csv" (no time stamp) were accepted. A raw string also
        # avoids the invalid-escape warnings for "\d", "\_" and "\.".
        regex = r"mushroom_\d+_\d+\.csv"
        return regex

    def createDirectoryForGoodBadRawData(self):
        """
            Method Name: createDirectoryForGoodBadRawData
            Description: This method creates directories to store the Good Data and Bad Data
                          after validating the prediction data.

            Output: None
            On Failure: Exception
        """
        try:
            self.awsObj.createS3Directory(
                'Prediction_Good_Raw_Files_Validated')
            self.awsObj.createS3Directory('Prediction_Bad_Raw_Files_Validated')
        except Exception as ex:
            file = 'GeneralLog'
            self.logger.log(file, "Error while creating Directory %s:" % ex)

    def deleteExistingGoodDataTrainingFolder(self):
        """
            Method Name: deleteExistingGoodDataTrainingFolder
            Description: This method deletes the directory made to store the Good Data
                          after loading the data in the table. Once the good files are
                          loaded in the DB,deleting the directory ensures space optimization.
            Output: None
            On Failure: Exception
        """
        try:
            file = 'GeneralLog'
            self.logger.log(file, "GoodRaw directory deleted successfully!!!")
            self.awsObj.deleteDirectory('Prediction_Good_Raw_Files_Validated')
        except Exception as s:
            file = 'GeneralLog'
            self.logger.log(file, "Error while Deleting Directory : %s" % s)
            raise s

    def deleteExistingBadDataTrainingFolder(self):
        """
            Method Name: deleteExistingBadDataTrainingFolder
            Description: This method deletes the directory made to store the bad Data.
            Output: None
            On Failure: Exception
        """

        try:
            file = 'GeneralLog'
            self.logger.log(
                file, "BadRaw directory deleted before starting validation!!!")
            self.awsObj.deleteDirectory('Prediction_Bad_Raw_Files_Validated')
        except Exception as s:
            file = 'GeneralLog'
            self.logger.log(file, "Error while Deleting Directory : %s" % s)
            raise s

    def moveBadFilesToArchiveBad(self):
        """
            Method Name: moveBadFilesToArchiveBad
            Description: This method deletes the directory made  to store the Bad Data
                          after moving the data in an archive folder. We archive the bad
                          files to send them back to the client for invalid data issue.
            Output: None
            On Failure: Exception
        """
        now = datetime.now()
        date = now.date()
        time = now.strftime("%H%M%S")
        try:
            target_folder = 'PredictionArchivedBadData/BadData_' + str(
                date) + "_" + str(time)
            self.awsObj.copyFileToFolder('Prediction_Bad_Raw_Files_Validated',
                                         target_folder)

            file = 'GeneralLog'
            self.logger.log(file, "Bad files moved to archive")

            self.logger.log(file, "Bad Raw Data Folder Deleted successfully!!")
        except Exception as e:
            file = 'GeneralLog'
            self.logger.log(file,
                            "Error while moving bad files to archive:: %s" % e)
            raise e

    def validationFileNameRaw(self, regex, LengthOfDateStampInFile,
                              LengthOfTimeStampInFile):
        """
            Method Name: validationFileNameRaw
            Description: This function validates the name of the prediction csv file as per given name in the schema!
                         Regex pattern is used to do the validation.If name format do not match the file is moved
                         to Bad Raw Data folder else in Good raw data.
            Output: None
            On Failure: Exception
        """
        # delete the directories for good and bad data in case last run was unsuccessful and folders were not deleted.
        self.deleteExistingBadDataTrainingFolder()
        self.deleteExistingGoodDataTrainingFolder()
        self.createDirectoryForGoodBadRawData()
        batch_dir = self.Batch_Directory.strip('/').strip('\\')
        print('Prediction File Path: ', batch_dir)
        self.awsObj.uploadFiles(batch_dir, batch_dir)
        onlyfiles = self.awsObj.listDirFiles(batch_dir)
        try:
            f = 'nameValidationLog'
            for filename in onlyfiles:
                if (re.match(regex, filename)):
                    splitAtDot = re.split('.csv', filename)
                    splitAtDot = (re.split('_', splitAtDot[0]))
                    if len(splitAtDot[1]) == LengthOfDateStampInFile:
                        if len(splitAtDot[2]) == LengthOfTimeStampInFile:
                            self.awsObj.copyFileToFolder(
                                batch_dir,
                                'Prediction_Good_Raw_Files_Validated',
                                filename)
                            self.logger.log(
                                f,
                                "Valid File name!! File moved to GoodRaw Folder :: %s"
                                % filename)

                        else:
                            self.awsObj.copyFileToFolder(
                                self.Batch_Directory,
                                'Prediction_Bad_Raw_Files_Validated', filename)
                            self.logger.log(
                                f,
                                "Invalid File Name!! File moved to Bad Raw Folder :: %s"
                                % filename)
                    else:
                        self.awsObj.copyFileToFolder(
                            self.Batch_Directory,
                            'Prediction_Bad_Raw_Files_Validated', filename)
                        self.logger.log(
                            f,
                            "Invalid File Name!! File moved to Bad Raw Folder :: %s"
                            % filename)
                else:
                    self.awsObj.copyFileToFolder(
                        self.Batch_Directory,
                        'Prediction_Bad_Raw_Files_Validated', filename)
                    self.logger.log(
                        f,
                        "Invalid File Name!! File moved to Bad Raw Folder :: %s"
                        % filename)

        except Exception as e:
            f = 'nameValidationLog'
            self.logger.log(f,
                            "Error occured while validating FileName %s" % e)
            raise e

    def validateColumnLength(self, NumberofColumns):
        """
            Method Name: validateColumnLength
            Description: This function validates the number of columns in the csv files.
                         It is should be same as given in the schema file.
                         If not same file is not suitable for processing and thus is moved to Bad Raw Data folder.
                         If the column number matches, file is kept in Good Raw Data for processing.
                        The csv file is missing the first column name, this function changes the missing name to "Wafer".
            Output: None
            On Failure: Exception
        """
        try:
            f = 'columnValidationLog'
            self.logger.log(f, "Column Length Validation Started!!")
            file_list = self.awsObj.listDirFiles(
                'Prediction_Good_Raw_Files_Validated')
            for file in file_list:
                csv = self.awsObj.csvToDataframe(
                    'Prediction_Good_Raw_Files_Validated', file)
                if csv.shape[1] == NumberofColumns:
                    self.awsObj.saveDataframeToCsv(
                        'Prediction_Good_Raw_Files_Validated', file, csv)
                else:
                    self.awsObj.moveFileToFolder(
                        'Prediction_Good_Raw_Files_Validated',
                        'Prediction_Bad_Raw_Files_Validated', file)
                    self.logger.log(
                        f,
                        "Invalid Column Length for the file!! File moved to Bad Raw Folder :: %s"
                        % file)

            self.logger.log(f, "Column Length Validation Completed!!")
        except OSError:
            f = 'columnValidationLog'
            self.logger.log(
                f, "Error Occurred while moving the file :: %s" % OSError)
            raise OSError
        except Exception as e:
            f = 'columnValidationLog'
            self.logger.log(f, "Error Occurred:: %s" % e)
            raise e

    def deletePredictionFile(self):

        self.awsObj.deleteFile('Prediction_Output_File', 'Predictions.csv')

    def validateMissingValuesInWholeColumn(self):
        """
              Method Name: validateMissingValuesInWholeColumn
              Description: This function validates if any column in the csv file has all values missing.
                           If all the values are missing, the file is not suitable for processing.
                           SUch files are moved to bad raw data.
              Output: None
              On Failure: Exception
        """
        try:
            f = 'missingValuesInColumn'
            self.logger.log(f, "Missing Values Validation Started!!")
            file_list = self.awsObj.listDirFiles(
                'Prediction_Good_Raw_Files_Validated')
            for file in file_list:
                csv = self.awsObj.csvToDataframe(
                    'Prediction_Good_Raw_Files_Validated', file)
                count = 0
                for columns in csv:
                    if (len(csv[columns]) - csv[columns].count()) == len(
                            csv[columns]):
                        count += 1
                        self.awsObj.moveFileToFolder(
                            'Prediction_Good_Raw_Files_Validated',
                            'Prediction_Bad_Raw_Files_Validated', file)
                        self.logger.log(
                            f,
                            "Invalid Column Length for the file!! File moved to Bad Raw Folder :: %s"
                            % file)
                        break
                if count == 0:
                    self.awsObj.saveDataframeToCsv(
                        'Prediction_Good_Raw_Files_Validated', file, csv)
        except OSError:
            f = 'missingValuesInColumn'
            self.logger.log(
                f, "Error Occurred while moving the file :: %s" % OSError)
            raise OSError
        except Exception as e:
            f = 'missingValuesInColumn'
            self.logger.log(f, "Error Occurred:: %s" % e)
            raise e
Exemple #8
0
class KMeansClustering:
    """
        This class shall  be used to divide the data into clusters before training.
    """
    def __init__(self, file_object, logger_object):
        """
            Method Name: __init__
            Description: Stores the log target and the logger and creates the storage helper.
            file_object: first argument of every logger_object.log call made by this class
            logger_object: object exposing log(file_object, message)
        """
        self.file_object = file_object
        self.logger_object = logger_object
        self.awsObj = AwsStorageManagement()

    def elbow_plot(self, data):
        """
            Method Name: elbow_plot
            Description: This method saves the plot to decide the optimum number of clusters to the file.
            Output: A picture saved to the directory
            On Failure: Raise Exception
        """
        self.logger_object.log(
            self.file_object,
            'Entered the elbow_plot method of the KMeansClustering class')
        wcss = []  # initializing an empty list
        try:
            for i in range(1, 11):
                kmeans = KMeans(
                    n_clusters=i, init='k-means++',
                    random_state=42)  # initializing the KMeans object
                kmeans.fit(data)  # fitting the data to the KMeans Algorithm
                wcss.append(kmeans.inertia_)
            plt.plot(
                range(1, 11), wcss
            )  # creating the graph between WCSS and the number of clusters
            plt.title('The Elbow Method')
            plt.xlabel('Number of clusters')
            plt.ylabel('WCSS')
            #plt.show()
            img_buffer = io.BytesIO()
            plt.savefig(img_buffer, format='png')
            img_buffer.seek(0)
            self.awsObj.deleteFile('models')
            self.awsObj.deleteFile('preprocessing_data')
            self.awsObj.saveObject('preprocessing_data', 'K-Means_Elbow.PNG',
                                   img_buffer, 'image/png')
            print('Saving kneeplot to aws')
            #plt.savefig('preprocessing_data/K-Means_Elbow.PNG') # saving the elbow plot locally
            # finding the value of the optimum cluster programmatically
            self.kn = KneeLocator(range(1, 11),
                                  wcss,
                                  curve='convex',
                                  direction='decreasing')
            self.logger_object.log(
                self.file_object,
                'The optimum number of clusters is: ' + str(self.kn.knee) +
                ' . Exited the elbow_plot method of the KMeansClustering class'
            )
            return self.kn.knee

        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occured in elbow_plot method of the KMeansClustering class. Exception message:  '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'Finding the number of clusters failed. Exited the elbow_plot method of the KMeansClustering class'
            )
            raise Exception()

    def create_clusters(self, data, number_of_clusters):
        """
            Method Name: create_clusters
            Description: Create a new dataframe consisting of the cluster information.
            Output: A dataframe with cluster column
            On Failure: Raise Exception
        """
        self.logger_object.log(
            self.file_object,
            'Entered the create_clusters method of the KMeansClustering class')
        self.data = data
        try:
            self.kmeans = KMeans(n_clusters=number_of_clusters,
                                 init='k-means++',
                                 random_state=42)
            #self.data = self.data[~self.data.isin([np.nan, np.inf, -np.inf]).any(1)]
            self.y_kmeans = self.kmeans.fit_predict(
                data)  #  divide data into clusters

            self.file_op = file_methods.File_Operation(self.file_object,
                                                       self.logger_object)
            print('Saving Kmeans Model')
            self.save_model = self.file_op.save_model(
                self.kmeans, 'KMeans')  # saving the KMeans model to directory
            # passing 'Model' as the functions need three parameters

            self.data[
                'Cluster'] = self.y_kmeans  # create a new column in dataset for storing the cluster information
            self.logger_object.log(
                self.file_object, 'successfully created ' + str(self.kn.knee) +
                'clusters. Exited the create_clusters method of the KMeansClustering class'
            )
            return self.data
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occurred in create_clusters method of the KMeansClustering class. Exception message:  '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'Fitting the data to clusters failed. Exited the create_clusters method of the KMeansClustering class'
            )
            raise Exception()
Exemple #9
0
 def __init__(self, file_object, logger_object):
     """Store the log target and logger and create the storage helper.

     file_object and logger_object are kept unchanged on the instance.
     """
     self.file_object = file_object
     self.logger_object = logger_object
     self.awsObj = AwsStorageManagement()
 def __init__(self, file_object, logger_object):
     """Store the log target and logger, set the training file name and create the storage helper.

     file_object and logger_object are kept unchanged on the instance.
     """
     self.training_file = 'InputFile.csv'
     self.file_object = file_object
     self.logger_object = logger_object
     self.awsObj = AwsStorageManagement()
class Preprocessor:
    """
        This class shall  be used to clean and transform the data before training.
    """

    def __init__(self, file_object, logger_object):
        """
            Method Name: __init__
            Description: Stores the log target and the logger and creates the storage helper.
            file_object: first argument of every logger_object.log call made by this class
            logger_object: object exposing log(file_object, message)
        """
        self.file_object = file_object
        self.logger_object = logger_object
        self.awsObj = AwsStorageManagement()

    def remove_columns(self,data,columns):
        """
            Method Name: remove_columns
            Description: This method removes the given columns from a pandas dataframe.
            Output: A pandas DataFrame after removing the specified columns.
            On Failure: Raise Exception
        """
        self.logger_object.log(self.file_object, 'Entered the remove_columns method of the Preprocessor class')
        self.data=data
        self.columns=columns
        try:
            self.useful_data=self.data.drop(labels=self.columns, axis=1) # drop the labels specified in the columns
            self.logger_object.log(self.file_object,
                                   'Column removal Successful.Exited the remove_columns method of the Preprocessor class')
            return self.useful_data
        except Exception as e:
            self.logger_object.log(self.file_object,'Exception occured in remove_columns method of the Preprocessor class. Exception message:  '+str(e))
            self.logger_object.log(self.file_object,
                                   'Column removal Unsuccessful. Exited the remove_columns method of the Preprocessor class')
            raise Exception()

    def separate_label_feature(self, data, label_column_name):
        """
            Method Name: separate_label_feature
            Description: This method separates the features and a Label Coulmns.
            Output: Returns two separate Dataframes, one containing features and the other containing Labels .
            On Failure: Raise Exception
        """
        self.logger_object.log(self.file_object, 'Entered the separate_label_feature method of the Preprocessor class')
        try:
            self.X=data.drop(label_column_name,axis=1) # drop the columns specified and separate the feature columns
            self.Y=data[label_column_name] # Filter the Label columns
            print(self.Y)
            self.logger_object.log(self.file_object,
                                   'Label Separation Successful. Exited the separate_label_feature method of the Preprocessor class')
            return self.X,self.Y
        except Exception as e:
            self.logger_object.log(self.file_object,'Exception occured in separate_label_feature method of the Preprocessor class. Exception message:  ' + str(e))
            self.logger_object.log(self.file_object, 'Label Separation Unsuccessful. Exited the separate_label_feature method of the Preprocessor class')
            raise Exception()

    def dropUnnecessaryColumns(self,data,columnNameList):
        """
            Method Name: is_null_present
            Description: This method drops the unwanted columns as discussed in EDA section.
        """
        data = data.drop(columnNameList,axis=1)
        return data


    def replaceInvalidValuesWithNull(self,data):

        """
           Method Name: is_null_present
           Description: This method replaces invalid values i.e. '?' with null, as discussed in EDA.
        """

        for column in data.columns:
            count = data[column][data[column] == '?'].count()
            if count != 0:
                data[column] = data[column].replace('?', np.nan)
        return data

    def is_null_present(self,data):
        """
            Method Name: is_null_present
            Description: This method checks whether there are null values present in the pandas Dataframe or not.
            Output: Returns True if null values are present in the DataFrame, False if they are not present and
                    returns the list of columns for which null values are present.
            On Failure: Raise Exception
        """
        self.logger_object.log(self.file_object, 'Entered the is_null_present method of the Preprocessor class')
        self.null_present = False
        self.cols_with_missing_values=[]
        self.cols = data.columns
        try:
            self.null_counts=data.isna().sum() # check for the count of null values per column
            for i in range(len(self.null_counts)):
                if self.null_counts[i]>0:
                    self.null_present=True
                    self.cols_with_missing_values.append(self.cols[i])
            if(self.null_present): # write the logs to see which columns have null values
                self.dataframe_with_null = pd.DataFrame()
                self.dataframe_with_null['columns'] = data.columns
                self.dataframe_with_null['missing values count'] = np.asarray(data.isna().sum())
                self.awsObj.saveDataframeToCsv('preprocessing_data', 'null_values.csv', self.dataframe_with_null)
                #self.dataframe_with_null.to_csv('preprocessing_data/null_values.csv') # storing the null column information to file
            self.logger_object.log(self.file_object,'Finding missing values is a success.Data written to the null values file. Exited the is_null_present method of the Preprocessor class')
            return self.null_present, self.cols_with_missing_values
        except Exception as e:
            self.logger_object.log(self.file_object,'Exception occured in is_null_present method of the Preprocessor class. Exception message:  ' + str(e))
            self.logger_object.log(self.file_object,'Finding missing values failed. Exited the is_null_present method of the Preprocessor class')
            raise Exception()

    def encodeCategoricalValues(self,data):
     """
        Method Name: encodeCategoricalValues
        Description: This method encodes all the categorical values in the training set.
        Output: A Dataframe which has all the categorical values encoded.
        On Failure: Raise Exception
    """
     print('Encode Categorical Values')
     print(data['class'].value_counts())
     data['class'] = data['class'].map({"'p'": 1, "'e'": 2})
     print(data['class'].value_counts())
     for column in data.drop(['class'],axis=1).columns:
            data = pd.get_dummies(data, columns=[column])

     return data


    def encodeCategoricalValuesPrediction(self,data):
        """
           Method Name: encodeCategoricalValuesPrediction
           Description: This method encodes all the categorical values in the prediction set.
           Output: A Dataframe which has all the categorical values encoded.
           On Failure: Raise Exception
        """

        for column in data.columns:
            data = pd.get_dummies(data, columns=[column],drop_first=True)

        return data

    def impute_missing_values(self, data, cols_with_missing_values):
        """
            Method Name: impute_missing_values
            Description: This method replaces all the missing values in the Dataframe using KNN Imputer.
            Output: A Dataframe which has all the missing values imputed.
            On Failure: Raise Exception
        """
        self.logger_object.log(self.file_object, 'Entered the impute_missing_values method of the Preprocessor class')
        self.data= data
        self.cols_with_missing_values=cols_with_missing_values
        try:
            self.imputer = CategoricalImputer()
            for col in self.cols_with_missing_values:
                self.data[col] = self.imputer.fit_transform(self.data[col])
            self.logger_object.log(self.file_object, 'Imputing missing values Successful. Exited the impute_missing_values method of the Preprocessor class')
            return self.data
        except Exception as e:
            self.logger_object.log(self.file_object,'Exception occured in impute_missing_values method of the Preprocessor class. Exception message:  ' + str(e))
            self.logger_object.log(self.file_object,'Imputing missing values failed. Exited the impute_missing_values method of the Preprocessor class')
            raise Exception()

    def get_columns_with_zero_std_deviation(self,data):
        """
            Method Name: get_columns_with_zero_std_deviation
            Description: This method finds out the columns which have a standard deviation of zero.
            Output: List of the columns with standard deviation of zero
            On Failure: Raise Exception
        """
        self.logger_object.log(self.file_object, 'Entered the get_columns_with_zero_std_deviation method of the Preprocessor class')
        self.columns=data.columns
        self.data_n = data.describe()
        self.col_to_drop=[]
        try:
            for x in self.columns:
                if (self.data_n[x]['std'] == 0): # check if standard deviation is zero
                    self.col_to_drop.append(x)  # prepare the list of columns with standard deviation zero
            self.logger_object.log(self.file_object, 'Column search for Standard Deviation of Zero Successful. Exited the get_columns_with_zero_std_deviation method of the Preprocessor class')
            return self.col_to_drop

        except Exception as e:
            self.logger_object.log(self.file_object,'Exception occured in get_columns_with_zero_std_deviation method of the Preprocessor class. Exception message:  ' + str(e))
            self.logger_object.log(self.file_object, 'Column search for Standard Deviation of Zero Failed. Exited the get_columns_with_zero_std_deviation method of the Preprocessor class')
            raise Exception()
class File_Operation:
    """
        This class shall be used to save the model after training
        and load the saved model for prediction.

        Models are written and read through AwsStorageManagement, under the
        'models' directory, using the object name '<filename>.sav'.
    """
    def __init__(self, file_object, logger_object):
        self.file_object = file_object
        self.logger_object = logger_object
        self.model_directory = 'models'
        self.awsObj = AwsStorageManagement()

    def save_model(self, model, filename):
        """
            Method Name: save_model
            Description: Pickle the model into memory and store it as '<filename>.sav'
                         in the model directory.
            Outcome: File gets saved, the string 'success' is returned
            On Failure: Raise Exception
        """
        self.logger_object.log(
            self.file_object,
            'Entered the save_model method of the File_Operation class')
        try:
            print('Start Saving Model')
            with io.BytesIO() as f:
                pickle.dump(model, f)  # serialise the model into the in-memory buffer
                f.seek(0)  # rewind so the upload reads from the first byte
                # NOTE(review): 'text/html' looks like a content type and is wrong
                # for a pickle; kept as-is because saveObject's contract is not
                # visible here - confirm before changing.
                self.awsObj.saveObject(self.model_directory, filename + '.sav',
                                       f, 'text/html')
            print('Model Saved')
            self.logger_object.log(
                self.file_object, 'Model File ' + filename +
                ' saved. Exited the save_model method of the File_Operation class'
            )

            return 'success'
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occured in save_model method of the File_Operation class. Exception message:  '
                + str(e))
            self.logger_object.log(
                self.file_object, 'Model File ' + filename +
                ' could not be saved. Exited the save_model method of the File_Operation class'
            )
            raise Exception(str(e)) from e

    def load_model(self, filename):
        """
            Method Name: load_model
            Description: Load '<filename>.sav' from the model directory
            Output: Whatever awsObj.loadObject returns for that object
            On Failure: Raise Exception
        """
        self.logger_object.log(
            self.file_object,
            'Entered the load_model method of the File_Operation class')
        try:
            f = self.awsObj.loadObject(self.model_directory, filename + '.sav')
            print('Model load done')
            self.logger_object.log(
                self.file_object, 'Model File ' + filename +
                ' loaded. Exited the load_model method of the File_Operation class'
            )
            return f
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occured in load_model method of the File_Operation class. Exception message:  '
                + str(e))
            self.logger_object.log(
                self.file_object, 'Model File ' + filename +
                ' could not be loaded. Exited the load_model method of the File_Operation class'
            )
            raise Exception(str(e)) from e

    def find_correct_model_file(self, cluster_number):
        """
            Method Name: find_correct_model_file
            Description: Select the model file whose name contains the cluster number.
                         The number must not be adjacent to another digit, so cluster 1
                         does not pick up a file that belongs to cluster 10. If several
                         files match, the last one listed wins.
            Output: The model file name up to its first '.'
            On Failure: Raise Exception (also when no file matches)
        """
        # Local import keeps this block self-contained; the file's import
        # section is not touched.
        import re

        self.logger_object.log(
            self.file_object,
            'Entered the find_correct_model_file method of the File_Operation class'
        )
        try:
            self.cluster_number = cluster_number
            self.folder_name = self.model_directory
            self.list_of_files = self.awsObj.listDirFiles(self.folder_name)

            whole_number = re.compile(
                r'(?<!\d)' + re.escape(str(self.cluster_number)) + r'(?!\d)')
            matched_file = None
            for candidate in self.list_of_files:
                # Non-string entries cannot name a model file; ignore them.
                if isinstance(candidate, str) and whole_number.search(candidate):
                    matched_file = candidate

            if matched_file is None:
                # Fail loudly rather than reuse a model name left over from an
                # earlier call on the same instance.
                raise FileNotFoundError(
                    'No model file found for cluster ' +
                    str(self.cluster_number) + ' in ' + str(self.folder_name))

            self.model_name = matched_file.split('.')[0]
            self.logger_object.log(
                self.file_object,
                'Exited the find_correct_model_file method of the File_Operation class.'
            )
            return self.model_name
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occured in find_correct_model_file method of the File_Operation class. Exception message:  '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'Exited the find_correct_model_file method of the File_Operation class with Failure'
            )
            raise Exception(str(e)) from e
 def __init__(self, file_object, logger_object):
     """
         Keep the log target and logger, point at the 'models' directory
         and create the storage helper.
     """
     self.file_object, self.logger_object = file_object, logger_object
     self.model_directory = 'models'
     self.awsObj = AwsStorageManagement()
Exemple #14
0
class prediction:
    """
        Runs the prediction flow: load the prediction data, preprocess it,
        assign each row to a cluster, predict with the model saved for that
        cluster, store the predictions as CSV and send a notification mail.
    """
    def __init__(self, path):
        self.file_object = 'Prediction_Log'
        self.log_writer = logger.App_Logger()
        self.awsObj = AwsStorageManagement()
        self.emailObj = email()
        # pred_data_val only exists when a path is supplied;
        # predictionFromModel relies on it.
        if path is not None:
            self.pred_data_val = Prediction_Data_validation(path)

    def _predict_per_cluster(self, data, file_loader):
        """
            Predict every row of data with the model belonging to its cluster.

            data must carry a 'clusters' column; it is dropped before the rows
            are handed to the model. file_loader must provide
            find_correct_model_file(cluster) and load_model(name).

            Output: DataFrame with a single 'Predictions' column whose rows are
                    in the same order as the rows of data.
        """
        # Remember where each cluster's rows sit, so the predictions can be
        # written back in input order rather than grouped by cluster.
        positions_by_cluster = {}
        for position, label in enumerate(data['clusters']):
            positions_by_cluster.setdefault(label, []).append(position)

        ordered = [None] * len(data)
        for cluster_id, positions in positions_by_cluster.items():
            cluster_data = data[data['clusters'] == cluster_id]
            cluster_data = cluster_data.drop(['clusters'], axis=1)
            model_name = file_loader.find_correct_model_file(cluster_id)
            model = file_loader.load_model(model_name)
            # The boolean filter keeps row order, so the n-th prediction
            # belongs to the n-th recorded position of this cluster.
            for position, value in zip(positions, model.predict(cluster_data)):
                ordered[position] = value
        return pandas.DataFrame(ordered, columns=['Predictions'])

    def predictionFromModel(self):
        """
            Method Name: predictionFromModel
            Description: Runs the complete prediction and stores the result as
                         'Predictions.csv' in 'Prediction_Output_File'.
            Output: The string "Prediction_Output_File/Predictions.csv"
            On Failure: The error is logged and the original exception re-raised
        """
        try:
            # deletes the existing prediction file from last run!
            self.pred_data_val.deletePredictionFile()
            self.log_writer.log(self.file_object, 'Start of Prediction')
            data_getter = data_loader_prediction.Data_Getter_Pred(
                self.file_object, self.log_writer)
            data = data_getter.get_data()

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)
            data = preprocessor.dropUnnecessaryColumns(data, ['veil-type'])

            # replacing '?' values with np.nan as discussed in the EDA part
            data = preprocessor.replaceInvalidValuesWithNull(data)

            is_null_present, cols_with_missing_values = preprocessor.is_null_present(
                data)
            if is_null_present:
                data = preprocessor.impute_missing_values(
                    data, cols_with_missing_values)

            # get encoded values for categorical data
            data = preprocessor.encodeCategoricalValuesPrediction(data)

            file_loader = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
            kmeans = file_loader.load_model('KMeans')
            data['clusters'] = kmeans.predict(data)

            result = self._predict_per_cluster(data, file_loader)
            path = "Prediction_Output_File/Predictions.csv"
            self.awsObj.saveDataframeToCsv('Prediction_Output_File',
                                           'Predictions.csv', result)
            self.log_writer.log(self.file_object, 'End of Prediction')

            msg = MIMEMultipart()
            msg['Subject'] = 'MushroomTypeClassifier - Prediction Done | ' + str(
                datetime.now())
            body = 'Model Prediction Done Successfully... <br><br> Thanks and Regards, <br> Rahul Garg'
            msg.attach(MIMEText(body, 'html'))
            to_addr = ['*****@*****.**']
            self.emailObj.trigger_mail(to_addr, [], msg)
        except Exception as ex:
            self.log_writer.log(
                self.file_object,
                'Error occured while running the prediction!! Error:: %s' % ex)
            # Bare raise keeps the original traceback intact.
            raise
        return path
class dBOperation:
    """
      This class shall be used for handling all the database operations of the
      training flow. Records are stored through mongoDBOperation and files are
      read and written through AwsStorageManagement.
    """
    def __init__(self):
        self.path = 'Training_Database'
        self.badFilePath = "Training_Bad_Raw_Files_Validated"
        self.goodFilePath = "Training_Good_Raw_Files_Validated"
        self.logger = App_Logger()
        self.awsObj = AwsStorageManagement()
        self.dbObj = mongoDBOperation()

    def createTableDb(self, DatabaseName, column_names):
        """
            Method Name: createTableDb
            Description: This method creates (or fetches) the 'Good_Raw_Data' collection in the
                         given database, which will be used to insert the Good data after raw
                         data validation. column_names is accepted for compatibility but not used.
            Output: None
            On Failure: Raise Exception
        """
        try:
            self.dbObj.createOrGetCollection(DatabaseName, 'Good_Raw_Data')
            self.logger.log('DbTableCreateLog', "Tables created successfully!!")
        except Exception as e:
            self.logger.log('DbTableCreateLog',
                            "Error while creating table: %s " % e)
            raise
        finally:
            # Logged on success and on failure alike, as before; nothing is
            # closed explicitly in this method.
            self.logger.log('DataBaseConnectionLog',
                            "Closed %s database successfully" % DatabaseName)

    def insertIntoTableGoodData(self, Database):
        """
           Method Name: insertIntoTableGoodData
           Description: This method drops the 'Good_Raw_Data' collection and then inserts every
                        file found in the Good_Raw folder into it. A file that cannot be read or
                        inserted is logged and moved to the Bad_Raw folder; the remaining files
                        are still processed.
           Output: None
           On Failure: Raise Exception (only for errors outside the per-file handling)
        """

        goodFilePath = self.goodFilePath
        badFilePath = self.badFilePath
        onlyfiles = self.awsObj.listDirFiles(goodFilePath)
        log_file = 'DbInsertLog'
        self.dbObj.dropCollection(Database, 'Good_Raw_Data')
        print(onlyfiles)
        for file in onlyfiles:
            try:
                df_csv = self.awsObj.csvToDataframe(goodFilePath, file)
                print('df_csv: ', df_csv.shape)
                self.dbObj.dataframeToRecords(Database, 'Good_Raw_Data',
                                              df_csv)

            except Exception as e:
                # Name the file and the real operation; the earlier text
                # wrongly reported a table-creation error.
                self.logger.log(
                    log_file,
                    "Error while inserting file %s into table: %s " % (file, e))
                self.awsObj.moveFileToFolder(goodFilePath, badFilePath, file)
                self.logger.log(log_file, "File Moved Successfully %s" % file)
        print('Data pushed to mongodb...')

    def selectingDatafromtableintocsv(self, Database):
        """
           Method Name: selectingDatafromtableintocsv
           Description: This method exports the data of the 'Good_Raw_Data' collection as
                        'InputFile.csv' into the 'Training_FileFromDB' directory.
           Output: None
           On Failure: Errors while reading the records or saving the CSV are logged to
                       'ExportToCsv' and NOT re-raised. Errors from creating the target
                       directory are raised to the caller.
        """

        self.fileFromDb = 'Training_FileFromDB'
        self.fileName = 'InputFile.csv'
        self.awsObj.createS3Directory(self.fileFromDb)
        log_file = 'ExportToCsv'
        try:
            tmp_csv = self.dbObj.recordsToDataFrame(Database, 'Good_Raw_Data')
            self.awsObj.saveDataframeToCsv(self.fileFromDb, self.fileName,
                                           tmp_csv)

            self.logger.log(log_file, "File exported successfully!!!")
            print('Saving data to final csv')

        except Exception as e:
            # NOTE(review): the failure is swallowed here, so callers continue
            # as if the export worked - confirm this best-effort is intended.
            self.logger.log(log_file, "File exporting failed. Error : %s" % e)
class train_validation:
    """
        Drives validation of the raw training files: file-name and column
        checks, data transformation, loading the good files into the database,
        cleaning up the good/bad folders, exporting the data to CSV and sending
        a summary mail.
    """
    def __init__(self, path):
        self.raw_data = Raw_Data_validation(path)
        self.dataTransform = dataTransform()
        self.dBOperation = dBOperation()
        self.file_object = 'Training_Main_Log'
        self.log_writer = logger.App_Logger()
        self.emailObj = email()
        self.awsObj = AwsStorageManagement()

    def train_validation(self):
        """
            Method Name: train_validation
            Description: Runs every validation step in order and mails the list of files
                         left in 'Training_Bad_Raw_Files_Validated'.
            Output: None
            On Failure: The error is logged and the original exception re-raised
        """
        database_name = 'mushroomClassifierDB'
        try:
            self.log_writer.log(self.file_object,
                                'Start of Validation on files for Training!!')
            # extracting values from the training schema
            LengthOfDateStampInFile, LengthOfTimeStampInFile, column_names, noofcolumns = self.raw_data.valuesFromSchema(
            )
            # getting the regex defined to validate filename
            regex = self.raw_data.manualRegexCreation()
            # validating filename of the training files
            self.raw_data.validationFileNameRaw(regex, LengthOfDateStampInFile,
                                                LengthOfTimeStampInFile)
            # validating column length in the file
            self.raw_data.validateColumnLength(noofcolumns)
            # validating if any column has all values missing
            self.raw_data.validateMissingValuesInWholeColumn()
            self.log_writer.log(self.file_object,
                                "Raw Data Validation Complete!!")

            self.log_writer.log(self.file_object,
                                "Starting Data Transforamtion!!")
            # below function adds quotes to the '?' values in some columns.
            self.dataTransform.addQuotesToStringValuesInColumn()

            self.log_writer.log(self.file_object,
                                "DataTransformation Completed!!!")

            # NOTE(review): the next two messages announce table creation, but
            # no creation call is made in this method - confirm that is intended.
            self.log_writer.log(
                self.file_object,
                "Creating Training_Database and tables on the basis of given schema!!!"
            )
            self.log_writer.log(self.file_object, "Table creation Completed!!")
            self.log_writer.log(self.file_object,
                                "Insertion of Data into Table started!!!!")
            # insert csv files in the table
            self.dBOperation.insertIntoTableGoodData(database_name)
            self.log_writer.log(self.file_object,
                                "Insertion in Table completed!!!")
            self.log_writer.log(self.file_object,
                                "Deleting Good Data Folder!!!")
            # Delete the good data folder after loading files in table
            self.raw_data.deleteExistingGoodDataTrainingFolder()
            self.log_writer.log(self.file_object,
                                "Good_Data folder deleted!!!")
            self.log_writer.log(
                self.file_object,
                "Moving bad files to Archive and deleting Bad_Data folder!!!")
            # Move the bad files to archive folder
            self.raw_data.moveBadFilesToArchiveBad()
            self.log_writer.log(
                self.file_object,
                "Bad files moved to archive!! Bad folder Deleted!!")
            self.log_writer.log(self.file_object,
                                "Validation Operation completed!!")
            self.log_writer.log(self.file_object,
                                "Extracting csv file from table")
            # export data in table to csvfile
            self.dBOperation.selectingDatafromtableintocsv(database_name)

            # Triggering Email
            msg = MIMEMultipart()
            msg['Subject'] = 'MushroomTypeClassifier - Train Validation | ' + str(
                datetime.now())
            file_list = self.awsObj.listDirFiles(
                'Training_Bad_Raw_Files_Validated')
            if len(file_list) >= 1:
                file_str = ','.join(file_list)
            else:
                file_str = 'No Bad Files'
            body = 'Model Train Validation Done Successfully... <br><br> Fault File List: <br>' + file_str + '<br><br>Thanks and Regards, <br> Rahul Garg'
            msg.attach(MIMEText(body, 'html'))
            to_addr = ['*****@*****.**']
            self.emailObj.trigger_mail(to_addr, [], msg)

        except Exception as e:
            # Record the failure in the main training log before propagating,
            # in line with how the prediction flow reports its errors.
            self.log_writer.log(
                self.file_object,
                'Error occured while running the train validation!! Error:: %s'
                % e)
            raise