Example #1
0
 def __init__(self, path):
     """Open the shared prediction log and set up raw-data validation."""
     # Append-mode handle shared with collaborators; not closed here.
     self.file_object = open("Prediction_Logs/Prediction_Log.txt", 'a+')
     self.log_writer = App_Logger()
     # Validator for the raw prediction batch located at *path*.
     self.pred_data_val = Prediction_Data_Validation(path)
class dBOperation:
    """
      This class shall be used for handling all the SQL operations.

      It owns the SQLite database used during training: opening
      connections, creating/extending the Good_Raw_Data table, loading
      validated raw CSV files into it, and exporting the table to CSV.
      """
    def __init__(self):
        # Directory holding the per-run SQLite database files.
        self.path = 'Training_Database/'
        # Quarantine / source folders produced by raw-data validation.
        self.badFilePath = "Training_Raw_files_validated/Bad_Raw"
        self.goodFilePath = "Training_Raw_files_validated/Good_Raw"
        self.logger = App_Logger()

    def dataBaseConnection(self, DatabaseName):
        """
                Method Name: dataBaseConnection
                Description: This method creates the database with the given name and if Database already exists then opens the connection to the DB.
                Output: Connection to the DB
                On Failure: Raise ConnectionError


                """
        try:
            conn = sqlite3.connect(self.path + DatabaseName + '.db')
        except Exception as e:
            # sqlite3.connect raises sqlite3.Error subclasses, never
            # ConnectionError, so the original `except ConnectionError`
            # could not fire; catch broadly and convert for callers.
            file = open("Training_Logs/DataBaseConnectionLog.txt", 'a+')
            self.logger.log(
                file,
                "Error while connecting to database: %s" % e)
            file.close()
            raise ConnectionError(str(e))
        file = open("Training_Logs/DataBaseConnectionLog.txt", 'a+')
        self.logger.log(file,
                        "Opened %s database successfully" % DatabaseName)
        file.close()
        return conn

    def createTableDb(self, DatabaseName, column_names):
        """
                        Method Name: createTableDb
                        Description: This method creates a table in the given database which will be used to insert the Good data after raw data validation.
                        Output: None
                        On Failure: Raise Exception

                        column_names is a dict mapping column name -> SQLite type.
                        """
        conn = None  # so the except-branch can safely test whether a connection was opened
        try:
            conn = self.dataBaseConnection(DatabaseName)
            c = conn.cursor()
            c.execute(
                "SELECT count(name)  FROM sqlite_master WHERE type = 'table' AND name = 'Good_Raw_Data'"
            )
            if c.fetchone()[0] == 1:
                # Table already exists: nothing to create.
                conn.close()
                file = open("Training_Logs/DbTableCreateLog.txt", 'a+')
                self.logger.log(file, "Tables created successfully!!")
                file.close()

                file = open("Training_Logs/DataBaseConnectionLog.txt", 'a+')
                self.logger.log(
                    file, "Closed %s database successfully" % DatabaseName)
                file.close()

            else:
                for key, column_type in column_names.items():
                    # First try to add the column to an existing table; if
                    # the table does not exist yet the ALTER fails and we
                    # create the table with this first column instead.
                    try:
                        conn.execute(
                            'ALTER TABLE Good_Raw_Data ADD COLUMN "{column_name}" {dataType}'
                            .format(column_name=key, dataType=column_type))
                    except Exception:
                        conn.execute(
                            'CREATE TABLE  Good_Raw_Data ({column_name} {dataType})'
                            .format(column_name=key, dataType=column_type))

                conn.close()

                file = open("Training_Logs/DbTableCreateLog.txt", 'a+')
                self.logger.log(file, "Tables created successfully!!")
                file.close()

                file = open("Training_Logs/DataBaseConnectionLog.txt", 'a+')
                self.logger.log(
                    file, "Closed %s database successfully" % DatabaseName)
                file.close()

        except Exception as e:
            file = open("Training_Logs/DbTableCreateLog.txt", 'a+')
            self.logger.log(file, "Error while creating table: %s " % e)
            file.close()
            # Guard: the connection may never have been opened if
            # dataBaseConnection itself raised.
            if conn is not None:
                conn.close()
                file = open("Training_Logs/DataBaseConnectionLog.txt", 'a+')
                self.logger.log(file,
                                "Closed %s database successfully" % DatabaseName)
                file.close()
            raise

    def insertIntoTableGoodData(self, Database):
        """
                               Method Name: insertIntoTableGoodData
                               Description: This method inserts the Good data files from the Good_Raw folder into the
                                            above created table. Files that fail to load are moved to Bad_Raw.
                               Output: None
                               On Failure: Raise Exception


        """

        conn = self.dataBaseConnection(Database)
        goodFilePath = self.goodFilePath
        badFilePath = self.badFilePath
        onlyfiles = [f for f in listdir(goodFilePath)]
        log_file = open("Training_Logs/DbInsertLog.txt", 'a+')

        try:
            for file in onlyfiles:
                try:
                    with open(goodFilePath + '/' + file, "r") as f:
                        next(f)  # skip the CSV header row
                        reader = csv.reader(f, delimiter="\n")
                        for line in enumerate(reader):
                            for list_ in (line[1]):
                                # NOTE(review): values are interpolated into
                                # the SQL text; acceptable only because the
                                # files already passed raw-data validation.
                                conn.execute(
                                    'INSERT INTO Good_Raw_Data values ({values})'
                                    .format(values=(list_)))
                                self.logger.log(
                                    log_file,
                                    " %s: File loaded successfully!!" % file)
                                conn.commit()
                except Exception as e:
                    # Keep the connection and log file OPEN here: the
                    # original closed both mid-loop, which broke every
                    # subsequent iteration and caused a double-close below.
                    conn.rollback()
                    self.logger.log(log_file,
                                    "Error while inserting data: %s " % e)
                    shutil.move(goodFilePath + '/' + file, badFilePath)
                    self.logger.log(log_file,
                                    "File Moved Successfully %s" % file)
        finally:
            conn.close()
            log_file.close()

    def selectingDatafromtableintocsv(self, Database):
        """
                               Method Name: selectingDatafromtableintocsv
                               Description: This method exports the data in GoodData table as a CSV file. in a given location.
                                            above created .
                               Output: None
                               On Failure: Raise Exception


        """

        self.fileFromDb = 'Training_FileFromDB/'
        self.fileName = 'InputFile.csv'
        log_file = open("Training_Logs/ExportToCsv.txt", 'a+')
        try:
            conn = self.dataBaseConnection(Database)
            sqlSelect = "SELECT *  FROM Good_Raw_Data"
            cursor = conn.cursor()

            cursor.execute(sqlSelect)

            results = cursor.fetchall()
            # Get the headers of the csv file
            headers = [i[0] for i in cursor.description]
            conn.close()  # the original leaked this connection

            #Make the CSV ouput directory
            if not os.path.isdir(self.fileFromDb):
                os.makedirs(self.fileFromDb)

            # `with` guarantees the export file is flushed and closed
            # (the original never closed this handle).
            with open(self.fileFromDb + self.fileName, 'w', newline='') as out:
                csvFile = csv.writer(out,
                                     delimiter=',',
                                     lineterminator='\r\n',
                                     quoting=csv.QUOTE_ALL,
                                     escapechar='\\')
                # Add the headers and data to the CSV file.
                csvFile.writerow(headers)
                csvFile.writerows(results)

            self.logger.log(log_file, "File exported successfully!!!")
            log_file.close()

        except Exception as e:
            # NOTE(review): failures are logged but swallowed, although the
            # docstring promises a raise — confirm what callers expect.
            self.logger.log(log_file, "File exporting failed. Error : %s" % e)
            log_file.close()
 def __init__(self, path):
     """Record the batch directory and the training schema location."""
     self.Batch_Directory = path
     #my_file = rootProjPath+'\\schema_training.json'
     #self.schema_path = my_file
     # Schema path resolves relative to the current working directory.
     self.schema_path = 'schema_training.json'
     self.logger = App_Logger()
 def __init__(self, path):
     """Record the batch directory and the prediction schema location."""
     self.Batch_Directory = path
     self.schema_path = 'schema_prediction.json'
     self.logger = App_Logger()
 def __init__(self, path):
     """Record the batch directory and the training schema location."""
     self.Batch_Directory = path
     self.schema_path = 'schema_training.json'
     self.logger = App_Logger()
Example #6
0
 def __init__(self):
     """Configure the prediction database and validated raw-file folders."""
     self.path = 'Prediction_Database/'
     self.badFilePath = "Prediction_Raw_Files_Validated/Bad_Raw"
     self.goodFilePath = "Prediction_Raw_Files_Validated/Good_Raw"
     self.logger = App_Logger()
class Training_Model:
    """Train, evaluate, and persist the best text-classification model.

    NOTE(review): the log handle opened in __init__ is closed only on the
    failure path of train_model(); on success it stays open — confirm
    whether a caller is responsible for closing it.
    """
    def __init__(
        self,
        models_list,
        sampling_method,
    ):
        self.logger_object = App_Logger()
        # Shared log handle passed to every pipeline collaborator below.
        self.file_object = open('Training_Logs/ModelTrainingLog.txt', 'a+')
        # 'us' = under-sampling, 'os' = over-sampling; any other value
        # skips resampling (see the dispatch in train_model).
        self.sampling_method = sampling_method
        # Subset of {'svm', 'rf', 'xg', 'mnb'} selecting models to tune.
        self.models_list = models_list

    def train_model(self):
        """Run the full training pipeline and return the save status.

        Steps: fetch data, preprocess and vectorize, optionally resample
        the training split, tune each requested model, evaluate, and save
        the model with the highest f1_score.  Returns whatever
        File_Operation.save_model returns (compared against 'success');
        re-raises any pipeline exception after logging.
        """
        file = open('Training_Logs/General_Log.txt', 'a+')
        self.logger_object.log(
            file, 'Entered train_model() method of Training_Model class')
        file.close()

        try:
            data_getter = Data_Getter(self.file_object, self.logger_object)
            data = data_getter.get_data()

            # Text cleaning / normalisation pipeline.
            preprocessor = PreProcessor(self.file_object, self.logger_object,
                                        self.sampling_method)
            data = preprocessor.remove_null(data)
            data = preprocessor.clean_reviews(data)
            data = preprocessor.remove_StopWords(data)
            data = preprocessor.remove_punctuations(data)
            data = preprocessor.pos_tagging_lemmatizeText(data)
            data = preprocessor.encode_label(data)
            x, y = preprocessor.separate_feature_label(data)
            x_train, x_test, y_train, y_test = train_test_split(
                x, y, test_size=0.3, random_state=100)
            # Vectorize: token counts followed by a TF-IDF transform.
            x_train, x_test = preprocessor.count_vectorizer(x_train, x_test)
            #x_train,x_test = preprocessor.tfidf_vectorizer(x_train,x_test)
            x_train, x_test = preprocessor.tfidfTransformer_vectorizer(
                x_train, x_test)

            # Optional class rebalancing on the training split only.
            if self.sampling_method == 'us':
                x_train, y_train = preprocessor.under_sampling(
                    x_train, y_train)
            elif self.sampling_method == 'os':
                x_train, y_train = preprocessor.over_sampling(x_train, y_train)
            elif self.sampling_method == 'no':
                pass
            else:
                pass

            tuner = Model_Tuner(self.file_object, self.logger_object)
            # Models not listed in self.models_list remain None.
            self.trained_models_dict = {
                'svm': None,
                'rf': None,
                'xg': None,
                'mnb': None
            }

            for m in self.models_list:
                if m == 'svm':
                    self.trained_models_dict['svm'] = tuner.get_params_svm(
                        x_train, y_train)
                elif m == 'rf':
                    self.trained_models_dict[
                        'rf'] = tuner.get_params_for_RandomForest(
                            x_train, y_train)
                elif m == 'xg':
                    self.trained_models_dict[
                        'xg'] = tuner.get_best_params_for_XGBoost(
                            x_train, y_train)
                elif m == 'mnb':
                    self.trained_models_dict[
                        'mnb'] = tuner.get_params_bagging_naive_bayes(
                            x_train, y_train)
                else:
                    pass

            model_evaluation = Model_Evaluation(self.trained_models_dict,
                                                x_test, y_test,
                                                self.file_object,
                                                self.logger_object)
            self.model_evaluation_report_dict = model_evaluation.generate_models_evaluation_report_dict(
                self.trained_models_dict)
            # (model_name, metrics) tuples sorted by f1_score, best first.
            self.ordered_model_evaluation_report_dict = sorted(
                self.model_evaluation_report_dict.items(),
                key=lambda x: x[1]['f1_score'],
                reverse=True)

            # The first entry of the ordered report is the best model.
            for m in self.ordered_model_evaluation_report_dict:
                model_to_save = m[0]
                break

            file_operation = File_Operation(self.file_object,
                                            self.logger_object)
            is_model_saved = file_operation.save_model(
                self.trained_models_dict[model_to_save], model_to_save)

            if (is_model_saved == 'success'):
                self.logger_object.log(self.file_object,
                                       'Successfull End of Training')
            else:
                self.logger_object.log(
                    self.file_object,
                    'Error while saving model to models directory')
            return is_model_saved
        except Exception as e:
            self.logger_object.log(self.file_object,
                                   'Unsuccessfull End of Training')
            self.file_object.close()
            raise e
class Prediction_Data_validation:
    """
               This class shall be used for handling all the validation done on the Raw Prediction Data!!.

               Written By: Rajat Bisoi
               Version: 1.0
               Revisions: None

               """
    def __init__(self):
        """Set up schema path, logger, MongoDB and S3 bucket helpers."""
        # self.Batch_Directory = path
        self.schema_path = 'schema_prediction.json'
        self.logger = App_Logger()
        # MongoDB helper bound to the 'wafer' database.
        self.mongo = To_mongo_db('wafer')
        # S3 helper; the bucket name is read from config/bucket_name.
        self.aws = Aws_Bucket_operation(
            local_file_name_address='config/bucket_name')

    def valuesFromSchema(self):
        """
                                Method Name: valuesFromSchema
                                Description: This method extracts all the relevant information from the pre-defined "Schema" file.
                                Output: LengthOfDateStampInFile, LengthOfTimeStampInFile, column_names, Number of Columns
                                On Failure: Raise ValueError,KeyError,Exception

                                 Written By: Rajat Bisoi
                                Version: 1.0
                                Revisions: None

                                        """
        try:
            # with open(self.schema_path, 'r') as f:
            #     dic = json.load(f)
            #     f.close()

            id = self.mongo.Get_ID('schema_wafer_prediction', 'temp_db')
            dic = self.mongo.downlaod_from_mongo_raw('schema_wafer_prediction',
                                                     'temp_db', id[0])
            pattern = dic['SampleFileName']
            LengthOfDateStampInFile = dic['LengthOfDateStampInFile']
            LengthOfTimeStampInFile = dic['LengthOfTimeStampInFile']
            column_names = dic['ColName']
            NumberofColumns = dic['NumberofColumns']

            # file = open("Training_Logs/valuesfromSchemaValidationLog.txt", 'a+')
            message = "LengthOfDateStampInFile:: %s" % LengthOfDateStampInFile + "\t" + "LengthOfTimeStampInFile:: %s" % LengthOfTimeStampInFile + "\t " + "NumberofColumns:: %s" % NumberofColumns + "\n"
            self.logger.log('wafer_log', message)

            # file.close()

        except ValueError:
            # 'wafer_log' = open("Prediction_Logs/valuesfromSchemaValidationLog.txt", 'a+')
            self.logger.log(
                'wafer_log',
                "ValueError:Value not found inside schema_training.json")
            # 'wafer_log'.close()
            raise ValueError

        except KeyError:
            # 'wafer_log' = open("Prediction_Logs/valuesfromSchemaValidationLog.txt", 'a+')
            self.logger.log('wafer_log',
                            "KeyError:Key value error incorrect key passed")
            # 'wafer_log'.close()
            raise KeyError

        except Exception as e:
            # 'wafer_log' = open("Prediction_Logs/valuesfromSchemaValidationLog.txt", 'a+')
            self.logger.log('wafer_log', str(e))
            # 'wafer_log'.close()
            raise e

        return LengthOfDateStampInFile, LengthOfTimeStampInFile, column_names, NumberofColumns

    def manualRegexCreation(self):
        """
                                      Method Name: manualRegexCreation
                                      Description: Returns the hand-written regex, based on the "FileName" given in
                                                   the "Schema" file, used to validate prediction data file names.
                                      Output: Regex pattern
                                      On Failure: None

                                       Written By: Rajat Bisoi
                                      Version: 1.0
                                      Revisions: None

                                              """
        # The pattern is fixed by hand rather than derived from the schema.
        return "['wafer']+['\_'']+[\d_]+[\d]+\.csv"

    # Not used
    def createDirectoryForGoodBadRawData(self):
        """
                                        Method Name: createDirectoryForGoodBadRawData
                                        Description: Creates the Good_Raw and Bad_Raw directories that hold
                                                      prediction data after validation.
                                        Output: None
                                        On Failure: OSError

                                         Written By: Rajat Bisoi
                                        Version: 1.0
                                        Revisions: None

                                                """
        try:
            # Both target folders are created the same way; loop over them.
            for subdir in ("Good_Raw/", "Bad_Raw/"):
                target = os.path.join("Prediction_Raw_Files_Validated/",
                                      subdir)
                if not os.path.isdir(target):
                    os.makedirs(target)

        except OSError as ex:
            file = open("Prediction_Logs/GeneralLog.txt", 'a+')
            self.logger.log(file, "Error while creating Directory %s:" % ex)
            file.close()
            raise OSError

    # Not used
    def deleteExistingGoodDataTrainingFolder(self):
        """
                                            Method Name: deleteExistingGoodDataTrainingFolder
                                            Description: This method deletes the directory made to store the Good Data
                                                          after loading the data in the table. Once the good files are
                                                          loaded in the DB,deleting the directory ensures space optimization.
                                            Output: None
                                            On Failure: OSError

                                             Written By: Rajat Bisoi
                                            Version: 1.0
                                            Revisions: None

                                                    """
        try:
            path = 'Prediction_Raw_Files_Validated/'
            # if os.path.isdir("ids/" + userName):
            # if os.path.isdir(path + 'Bad_Raw/'):
            #     shutil.rmtree(path + 'Bad_Raw/')
            if os.path.isdir(path + 'Good_Raw/'):
                shutil.rmtree(path + 'Good_Raw/')
                file = open("Prediction_Logs/GeneralLog.txt", 'a+')
                self.logger.log(file,
                                "GoodRaw directory deleted successfully!!!")
                file.close()
        except OSError as s:
            file = open("Prediction_Logs/GeneralLog.txt", 'a+')
            self.logger.log(file, "Error while Deleting Directory : %s" % s)
            file.close()
            raise OSError
        # Not used
    def deleteExistingBadDataTrainingFolder(self):
        """
                                            Method Name: deleteExistingBadDataTrainingFolder
                                            Description: This method deletes the directory made to store the bad Data.
                                            Output: None
                                            On Failure: OSError

                                             Written By: Rajat Bisoi
                                            Version: 1.0
                                            Revisions: None

                                                    """

        try:
            path = 'Prediction_Raw_Files_Validated/'
            if os.path.isdir(path + 'Bad_Raw/'):
                shutil.rmtree(path + 'Bad_Raw/')
                file = open("Prediction_Logs/GeneralLog.txt", 'a+')
                self.logger.log(
                    file,
                    "BadRaw directory deleted before starting validation!!!")
                file.close()
        except OSError as s:
            file = open("Prediction_Logs/GeneralLog.txt", 'a+')
            self.logger.log(file, "Error while Deleting Directory : %s" % s)
            file.close()
            raise OSError
        #Not used
    def moveBadFilesToArchiveBad(self):
        """
                                            Method Name: moveBadFilesToArchiveBad
                                            Description: This method deletes the directory made  to store the Bad Data
                                                          after moving the data in an archive folder. We archive the bad
                                                          files to send them back to the client for invalid data issue.
                                            Output: None
                                            On Failure: OSError

                                             Written By: Rajat Bisoi
                                            Version: 1.0
                                            Revisions: None

                                                    """
        now = datetime.now()
        date = now.date()
        time = now.strftime("%H%M%S")
        try:
            path = "PredictionArchivedBadData"
            if not os.path.isdir(path):
                os.makedirs(path)
            source = 'Prediction_Raw_Files_Validated/Bad_Raw/'
            dest = 'PredictionArchivedBadData/BadData_' + str(
                date) + "_" + str(time)
            if not os.path.isdir(dest):
                os.makedirs(dest)
            files = os.listdir(source)
            for f in files:
                if f not in os.listdir(dest):
                    shutil.move(source + f, dest)
            file = open("Prediction_Logs/GeneralLog.txt", 'a+')
            self.logger.log(file, "Bad files moved to archive")
            path = 'Prediction_Raw_Files_Validated/'
            if os.path.isdir(path + 'Bad_Raw/'):
                shutil.rmtree(path + 'Bad_Raw/')
            self.logger.log(file, "Bad Raw Data Folder Deleted successfully!!")
            file.close()
        except OSError as e:
            file = open("Prediction_Logs/GeneralLog.txt", 'a+')
            self.logger.log(file,
                            "Error while moving bad files to archive:: %s" % e)
            file.close()
            raise OSError

    def validationFileNameRaw(self, regex, LengthOfDateStampInFile,
                              LengthOfTimeStampInFile):
        """
            Method Name: validationFileNameRaw
            Description: This function validates the name of the prediction csv file as per given name in the schema!
                         Regex pattern is used to do the validation.If name format do not match the file is moved
                         to Bad Raw Data folder else in Good raw data.
            Output: None
            On Failure: Exception

             Written By: Rajat Bisoi
            Version: 1.0
            Revisions: None

        """
        # delete the directories for good and bad data in case last run was unsuccessful and folders were not deleted.
        # self.deleteExistingBadDataTrainingFolder()
        # self.deleteExistingGoodDataTrainingFolder()
        # self.createDirectoryForGoodBadRawData()
        # Reset the MongoDB staging collections so this run starts clean.
        self.mongo.Delete_collection('temp_db', 'wafer_bad_data_prediction')
        self.mongo.Delete_collection('temp_db', 'wafer_good_data_prediction')

        # onlyfiles = [f for f in listdir(self.Batch_Directory)]
        '''
        try:
            f = open("Prediction_Logs/nameValidationLog.txt", 'a+')
            for filename in onlyfiles:
                if (re.match(regex, filename)):
                    splitAtDot = re.split('.csv', filename)
                    splitAtDot = (re.split('_', splitAtDot[0]))
                    if len(splitAtDot[1]) == LengthOfDateStampInFile:
                        if len(splitAtDot[2]) == LengthOfTimeStampInFile:
                            shutil.copy("Prediction_Batch_files/" + filename, "Prediction_Raw_Files_Validated/Good_Raw")
                            self.logger.log(f,"Valid File name!! File moved to GoodRaw Folder :: %s" % filename)

                        else:
                            shutil.copy("Prediction_Batch_files/" + filename, "Prediction_Raw_Files_Validated/Bad_Raw")
                            self.logger.log(f,"Invalid File Name!! File moved to Bad Raw Folder :: %s" % filename)
                    else:
                        shutil.copy("Prediction_Batch_files/" + filename, "Prediction_Raw_Files_Validated/Bad_Raw")
                        self.logger.log(f,"Invalid File Name!! File moved to Bad Raw Folder :: %s" % filename)
                else:
                    shutil.copy("Prediction_Batch_files/" + filename, "Prediction_Raw_Files_Validated/Bad_Raw")
                    self.logger.log(f, "Invalid File Name!! File moved to Bad Raw Folder :: %s" % filename)

            f.close()
            
            '''

        # Batch files are now pulled from the S3 bucket rather than local disk.
        bucket_inst = self.aws.Create_S3_Bucket_Instance(
            bucket_prefix='wafer-prediction')
        try:
            # f = open("Training_Logs/nameValidationLog.txt", 'a+')
            for obj in bucket_inst.objects.all():
                # NOTE(review): Download_From_S3 appears to return a
                # DataFrame-like object (it has .to_json below) — confirm.
                data = self.aws.Download_From_S3(obj.key)
                if (re.match(regex, obj.key)):
                    # Expected key shape: <name>_<datestamp>_<timestamp>.csv
                    splitAtDot = re.split('.csv', obj.key)
                    splitAtDot = (re.split('_', splitAtDot[0]))
                    if len(splitAtDot[1]) == LengthOfDateStampInFile:
                        if len(splitAtDot[2]) == LengthOfTimeStampInFile:
                            # shutil.copy("Training_Batch_Files/" + filename, "Training_Raw_files_validated/Good_Raw")
                            self.mongo.send_to_mongo(
                                'wafer_good_data_prediction', 'temp_db', data)
                            self.logger.log(
                                'wafer_log',
                                f'file {obj.key} uploaded to collection wafer_good_data'
                            )

                        else:
                            # shutil.copy("Training_Batch_Files/" + filename, "Training_Raw_files_validated/Bad_Raw")
                            data = data.to_json()
                            data = json.loads(data)
                            self.mongo.send_to_mongo_raw(
                                'wafer_bad_data_prediction', 'temp_db', data)
                            self.logger.log(
                                'wafer_log',
                                f'invalid file name  {obj.key} uploaded to collection wafer_bad_data'
                            )
                    else:
                        # shutil.copy("Training_Batch_Files/" + filename, "Training_Raw_files_validated/Bad_Raw")
                        data = data.to_json()
                        data = json.loads(data)
                        self.mongo.send_to_mongo_raw(
                            'wafer_bad_data_prediction', 'temp_db', data)
                        self.logger.log(
                            'wafer_log',
                            f'invalid file name  {obj.key} uploaded to collection wafer_bad_data'
                        )
                else:
                    # shutil.copy("Training_Batch_Files/" + filename, "Training_Raw_files_validated/Bad_Raw")
                    data = data.to_json()
                    data = json.loads(data)
                    self.mongo.send_to_mongo_raw('wafer_bad_data_prediction',
                                                 'temp_db', data)
                    self.logger.log(
                        'wafer_log',
                        f'invalid file name  {obj.key} uploaded to collection wafer_bad_data'
                    )

        except Exception as e:
            # f = open("Prediction_Logs/nameValidationLog.txt", 'a+')
            self.logger.log('wafer_log',
                            "Error occured while validating FileName %s" % e)
            # f.close()
            raise e

    def validateColumnLength(self, NumberofColumns):
        """
        Validate that every good-data prediction record set has exactly
        `NumberofColumns` columns, as given by the schema file.

        Records whose column count differs are moved from the
        'wafer_good_data_prediction' collection to the
        'wafer_bad_data_prediction' collection in 'temp_db'.

        :param NumberofColumns: expected number of columns per the schema file
        :return: None
        :raises Exception: re-raised after logging on any failure
        """
        try:
            self.logger.log('wafer_log', "Column Length Validation Started!!")
            idx = self.mongo.Get_ID('wafer_good_data_prediction', 'temp_db')
            for file in idx:
                try:
                    testfile = self.mongo.downlaod_one_from_mongo(
                        'wafer_good_data_prediction', 'temp_db', file)
                except Exception as err:
                    # Fall back to the raw collection when the record is not
                    # available from the good-data collection.
                    try:
                        testfile = self.mongo.downlaod_from_mongo_raw(
                            'wafer_good_raw_prediction', 'temp_db', file)
                    except Exception as err1:
                        # Neither collection yielded the record: quarantine it.
                        # NOTE(review): the original moved from 'wafer_good_data'
                        # (no '_prediction' suffix), unlike every other call in
                        # this method — treated as a copy/paste slip.
                        self.mongo.Move_data_in_collections(
                            'wafer_good_data_prediction',
                            'wafer_bad_data_prediction', 'temp_db', file)
                        self.logger.log(
                            'wafer_log',
                            "Invalid Column Length for the file !! File moved to "
                            "wafer_Bad_Raw_prediction collection ")
                        # Original did `raise [err, err1]`, which itself raises
                        # TypeError (a list is not an exception); chain instead.
                        raise err1 from err
                testfile = pd.DataFrame(testfile)
                if testfile.shape[1] != NumberofColumns:
                    self.mongo.Move_data_in_collections(
                        'wafer_good_data_prediction',
                        'wafer_bad_data_prediction', 'temp_db', file)
                    self.logger.log(
                        'wafer_log',
                        "Invalid Column Length for the file !! File moved to "
                        "wafer_Bad_Raw_prediction collection ")
            self.logger.log('wafer_log',
                            "Column Length Validation Completed!!")
        except OSError as os_err:
            # Bind the instance so the log shows the actual error, not the class.
            self.logger.log('wafer_log',
                            f"Error Occured while moving the file {os_err}")
            raise
        except Exception as e:
            self.logger.log("wafer_log", f"Error Occured {e}")
            raise

    def deletePredictionFile(self):
        """
        Remove any previous prediction output by dropping the
        'prediction_output' collection from the 'temp_db' database.
        """
        self.mongo.Delete_collection('temp_db', 'prediction_output')

    def validateMissingValuesInWholeColumn(self):
        """
        Validate that no column of a good-data prediction record set is
        entirely empty.

        Each record id in 'wafer_good_data_prediction' (db 'temp_db') is
        loaded into a DataFrame; if any column has every value missing the
        record is moved to 'wafer_bad_data_prediction', otherwise the data
        is written back via send_to_mongo.

        :return: None
        :raises Exception: re-raised after logging on any failure
        """
        try:
            self.logger.log('wafer_log', "Missing Values Validation Started!!")

            idx = self.mongo.Get_ID('wafer_good_data_prediction', 'temp_db')
            for file in idx:
                testfile = self.mongo.downlaod_one_from_mongo(
                    'wafer_good_data_prediction', 'temp_db', file)
                testfile = pd.DataFrame(testfile)
                count = 0
                for column in testfile:
                    # A column is fully missing when its non-null count is 0,
                    # i.e. len(col) - col.count() == len(col).
                    if (len(testfile[column]) -
                            testfile[column].count()) == len(
                                testfile[column]):
                        count += 1
                        self.mongo.Move_data_in_collections(
                            'wafer_good_data_prediction',
                            'wafer_bad_data_prediction', 'temp_db', file)
                        # Original message said "Invalid Column Length" here,
                        # which belongs to the column-length check, not this one.
                        self.logger.log(
                            'wafer_log',
                            f"Column with all missing values!! File moved to wafer_bad_data_prediction :: {file}"
                        )
                        break
                if count == 0:
                    # NOTE(review): this re-uploads the unchanged data to the
                    # same collection it came from — presumably to normalise
                    # its stored form; confirm send_to_mongo deduplicates by id.
                    self.mongo.send_to_mongo('wafer_good_data_prediction',
                                             'temp_db', testfile)
        except OSError as os_err:
            self.logger.log(
                'wafer_log',
                "Error Occured while moving the file :: %s" % os_err)
            raise
        except Exception as e:
            self.logger.log('wafer_log', "Error Occured:: %s" % e)
            raise
Exemple #9
0
class Prediction_Data_validation:
    """
    Handles all validation done on the raw prediction data.

    Incoming batch files are mirrored to S3, checked against the schema
    stored in MongoDB ('mushroomClassifierDB.predict_schema'), and sorted
    into the 'Prediction_Good_Raw_Files_Validated' /
    'Prediction_Bad_Raw_Files_Validated' S3 folders.
    """

    def __init__(self, path):
        # Directory holding the incoming prediction batch files.
        self.Batch_Directory = path
        # Local schema file; cached into MongoDB on first use.
        self.schema_path = 'schema_prediction.json'
        self.logger = App_Logger()
        self.awsObj = AwsStorageManagement()
        self.dbObj = mongoDBOperation()

    def valuesFromSchema(self):
        """
        Extract the relevant validation parameters from the schema.

        The schema JSON is inserted into MongoDB once (first run), then
        always read back from there.

        :return: (LengthOfDateStampInFile, LengthOfTimeStampInFile,
                  column_names, NumberofColumns)
        :raises ValueError: value missing inside the schema
        :raises KeyError: expected key missing from the schema
        :raises Exception: any other failure, logged and re-raised
        """
        log_file = 'valuesfromSchemaValidationLog'
        try:
            if not self.dbObj.isCollectionPresent('mushroomClassifierDB',
                                                  'predict_schema'):
                # Seed MongoDB from the local schema file on first run.
                # (The original called f.close() inside the `with`; redundant.)
                with open(self.schema_path, 'r') as f:
                    dic = json.load(f)
                self.dbObj.insertOneRecord('mushroomClassifierDB',
                                           'predict_schema', dic)
            dic = self.dbObj.getRecords('mushroomClassifierDB',
                                        'predict_schema')
            LengthOfDateStampInFile = dic['LengthOfDateStampInFile']
            LengthOfTimeStampInFile = dic['LengthOfTimeStampInFile']
            column_names = dic['ColName']
            NumberofColumns = dic['NumberofColumns']

            message = ("LengthOfDateStampInFile:: %s" % LengthOfDateStampInFile
                       + "\t" +
                       "LengthOfTimeStampInFile:: %s" % LengthOfTimeStampInFile
                       + "\t " + "NumberofColumns:: %s" % NumberofColumns +
                       "\n")
            self.logger.log(log_file, message)

        except ValueError:
            self.logger.log(
                log_file,
                "ValueError:Value not found inside schema_training.json")
            raise
        except KeyError:
            self.logger.log(log_file,
                            "KeyError:Key value error incorrect key passed")
            raise
        except Exception as e:
            self.logger.log(log_file, str(e))
            raise

        return LengthOfDateStampInFile, LengthOfTimeStampInFile, column_names, NumberofColumns

    def manualRegexCreation(self):
        """
        Return the manually defined regex used to validate the names of the
        prediction data files (based on the "FileName" in the schema file).
        """
        # Raw string avoids invalid-escape warnings; pattern text unchanged.
        return r"['mushroom']+['\_'']+[\d_]+[\d]+\.csv"

    def createDirectoryForGoodBadRawData(self):
        """
        Create the S3 folders that receive the validated good/bad raw
        prediction data. Failures are logged but not raised (matching the
        original best-effort behaviour).
        """
        try:
            for folder in ('Prediction_Good_Raw_Files_Validated',
                           'Prediction_Bad_Raw_Files_Validated'):
                self.awsObj.createS3Directory(folder)
        except Exception as ex:
            self.logger.log('GeneralLog',
                            "Error while creating Directory %s:" % ex)

    def deleteExistingGoodDataTrainingFolder(self):
        """
        Delete the good-data S3 folder once its files are loaded into the
        DB, to keep storage lean.

        :raises Exception: logged and re-raised on failure
        """
        try:
            self.awsObj.deleteDirectory('Prediction_Good_Raw_Files_Validated')
            self.logger.log('GeneralLog',
                            "GoodRaw directory deleted successfully!!!")
        except Exception as s:
            self.logger.log('GeneralLog',
                            "Error while Deleting Directory : %s" % s)
            raise

    def deleteExistingBadDataTrainingFolder(self):
        """
        Delete the bad-data S3 folder before a fresh validation run.

        :raises Exception: logged and re-raised on failure
        """
        try:
            self.awsObj.deleteDirectory('Prediction_Bad_Raw_Files_Validated')
            self.logger.log(
                'GeneralLog',
                "BadRaw directory deleted before starting validation!!!")
        except Exception as s:
            self.logger.log('GeneralLog',
                            "Error while Deleting Directory : %s" % s)
            raise

    def moveBadFilesToArchiveBad(self):
        """
        Archive the bad-data S3 folder into a timestamped
        'PredictionArchivedBadData/BadData_<date>_<time>' folder so the bad
        files can be returned to the client.

        :raises Exception: logged and re-raised on failure
        """
        now = datetime.now()
        stamp = "%s_%s" % (now.date(), now.strftime("%H%M%S"))
        try:
            target_folder = 'PredictionArchivedBadData/BadData_' + stamp
            self.awsObj.copyFileToFolder('Prediction_Bad_Raw_Files_Validated',
                                         target_folder)

            self.logger.log('GeneralLog', "Bad files moved to archive")
            self.logger.log('GeneralLog',
                            "Bad Raw Data Folder Deleted successfully!!")
        except Exception as e:
            self.logger.log('GeneralLog',
                            "Error while moving bad files to archive:: %s" % e)
            raise

    def validationFileNameRaw(self, regex, LengthOfDateStampInFile,
                              LengthOfTimeStampInFile):
        """
        Validate each batch file name against the schema regex and the
        expected date/time-stamp lengths, copying it into the good or bad
        validated S3 folder accordingly.

        :param regex: file-name pattern from manualRegexCreation()
        :param LengthOfDateStampInFile: expected length of the date stamp
        :param LengthOfTimeStampInFile: expected length of the time stamp
        :raises Exception: logged and re-raised on failure
        """
        # Reset folders in case the last run was unsuccessful and did not
        # clean up after itself.
        self.deleteExistingBadDataTrainingFolder()
        self.deleteExistingGoodDataTrainingFolder()
        self.createDirectoryForGoodBadRawData()
        batch_dir = self.Batch_Directory.strip('/').strip('\\')
        print('Prediction File Path: ', batch_dir)
        self.awsObj.uploadFiles(batch_dir, batch_dir)
        onlyfiles = self.awsObj.listDirFiles(batch_dir)
        log_file = 'nameValidationLog'
        try:
            for filename in onlyfiles:
                is_valid = False
                if re.match(regex, filename):
                    # e.g. "mushroom_08012020_120000.csv" ->
                    # ["mushroom", "08012020", "120000"]
                    parts = re.split('_', re.split('.csv', filename)[0])
                    is_valid = (len(parts[1]) == LengthOfDateStampInFile
                                and len(parts[2]) == LengthOfTimeStampInFile)
                if is_valid:
                    self.awsObj.copyFileToFolder(
                        batch_dir, 'Prediction_Good_Raw_Files_Validated',
                        filename)
                    self.logger.log(
                        log_file,
                        "Valid File name!! File moved to GoodRaw Folder :: %s"
                        % filename)
                else:
                    # The original copied bad files from the un-stripped
                    # self.Batch_Directory; use the stripped batch_dir for
                    # consistency with the upload/list calls above.
                    self.awsObj.copyFileToFolder(
                        batch_dir, 'Prediction_Bad_Raw_Files_Validated',
                        filename)
                    self.logger.log(
                        log_file,
                        "Invalid File Name!! File moved to Bad Raw Folder :: %s"
                        % filename)

        except Exception as e:
            self.logger.log(log_file,
                            "Error occured while validating FileName %s" % e)
            raise

    def validateColumnLength(self, NumberofColumns):
        """
        Validate that every good csv has exactly `NumberofColumns` columns;
        files that do not match are moved to the bad-data folder, matching
        files are re-saved in place.

        :param NumberofColumns: expected number of columns per the schema
        :raises Exception: logged and re-raised on failure
        """
        log_file = 'columnValidationLog'
        try:
            self.logger.log(log_file, "Column Length Validation Started!!")
            for file in self.awsObj.listDirFiles(
                    'Prediction_Good_Raw_Files_Validated'):
                csv = self.awsObj.csvToDataframe(
                    'Prediction_Good_Raw_Files_Validated', file)
                if csv.shape[1] == NumberofColumns:
                    self.awsObj.saveDataframeToCsv(
                        'Prediction_Good_Raw_Files_Validated', file, csv)
                else:
                    self.awsObj.moveFileToFolder(
                        'Prediction_Good_Raw_Files_Validated',
                        'Prediction_Bad_Raw_Files_Validated', file)
                    self.logger.log(
                        log_file,
                        "Invalid Column Length for the file!! File moved to Bad Raw Folder :: %s"
                        % file)

            self.logger.log(log_file, "Column Length Validation Completed!!")
        except OSError as os_err:
            self.logger.log(
                log_file,
                "Error Occurred while moving the file :: %s" % os_err)
            raise
        except Exception as e:
            self.logger.log(log_file, "Error Occurred:: %s" % e)
            raise

    def deletePredictionFile(self):
        """Remove the previous prediction output csv from S3."""
        self.awsObj.deleteFile('Prediction_Output_File', 'Predictions.csv')

    def validateMissingValuesInWholeColumn(self):
        """
        Validate that no column of a good csv is entirely missing; such
        files are moved to the bad-data folder, the rest are re-saved in
        place.

        :raises Exception: logged and re-raised on failure
        """
        log_file = 'missingValuesInColumn'
        try:
            self.logger.log(log_file, "Missing Values Validation Started!!")
            for file in self.awsObj.listDirFiles(
                    'Prediction_Good_Raw_Files_Validated'):
                csv = self.awsObj.csvToDataframe(
                    'Prediction_Good_Raw_Files_Validated', file)
                fully_missing = False
                for column in csv:
                    # Column is fully missing when its non-null count is 0.
                    if (len(csv[column]) - csv[column].count()) == len(
                            csv[column]):
                        fully_missing = True
                        self.awsObj.moveFileToFolder(
                            'Prediction_Good_Raw_Files_Validated',
                            'Prediction_Bad_Raw_Files_Validated', file)
                        self.logger.log(
                            log_file,
                            "Invalid Column Length for the file!! File moved to Bad Raw Folder :: %s"
                            % file)
                        break
                if not fully_missing:
                    self.awsObj.saveDataframeToCsv(
                        'Prediction_Good_Raw_Files_Validated', file, csv)
        except OSError as os_err:
            self.logger.log(
                log_file,
                "Error Occurred while moving the file :: %s" % os_err)
            raise
        except Exception as e:
            self.logger.log(log_file, "Error Occurred:: %s" % e)
            raise
Exemple #10
0
import pandas as pd
from utility import TextSumarization
import argparse
from application_logging.logger import App_Logger
import time

# Shared logger and log sinks for this script: the three files are opened once
# at import time in append+read mode ('+a') and reused by run() below.
logger_object = App_Logger()
general_logs = open("Logs/generallogs.txt", '+a')
success_file = open("Logs/successlogs.txt", '+a')
error_file = open("Logs/errorlogs.txt", '+a')


def run():
    parser = argparse.ArgumentParser(
        description='Summarization of TextData using pretrained models')
    parser.add_argument('-path',
                        dest='path',
                        default='Final_news.csv',
                        help='File path')
    parser.add_argument(
        '-model',
        dest='model',
        default='bert-base-uncased',
        help='[xlnet-base-cased,distilbert-base-uncased,albert-base-v1]')
    args = parser.parse_args()
    try:
        logger_object.log(
            general_logs,
            "Entered the runner file --> Running the script now !")
        if not args.path:
            raise RuntimeError("Must supply text path.")
Exemple #11
0
class trainValidation:
    """
    Orchestrates the end-to-end validation of the raw training batch files
    and their insertion into the training database.
    """

    def __init__(self, path):
        """
        :param path: directory containing the raw training batch files
        """
        self.raw_data = Raw_Data_Validation(path)
        self.dataTransform = dataTransform()  # kept for compatibility; unused here
        self.dBOperation = dBOperation()
        self.file_object = open('Training_Logs/Training_Main_Log.txt', 'a+')
        self.log_writer = App_Logger()

    def train_validation(self):
        """
        Run the full raw-data validation pipeline: schema extraction,
        file-name / column-length / missing-value validation, DB table
        creation and insertion, cleanup of the good/bad folders, and export
        of the table back to csv.

        :raises Exception: re-raised to the caller on any failure
        """
        try:
            self.log_writer.log(self.file_object,
                                'Start of Validation on files for prediction!!')
            # Extract the validation parameters from the schema.
            LengthOfDateStampInFile, LengthOfTimeStampInFile, column_names, noofcolumns = \
                self.raw_data.valuesFromSchema()
            # Regex used to validate file names.
            regex = self.raw_data.manualRegexCreation()
            # Validate the file names of the raw files.
            self.raw_data.validationFileNameRaw(regex, LengthOfDateStampInFile,
                                                LengthOfTimeStampInFile)
            # Validate the column length in each file.
            # (Debug print statements from the original were removed here.)
            self.raw_data.validateColumnLength(noofcolumns)
            # Validate that no column has all values missing.
            self.raw_data.validateMissingValuesInWholeColumn()
            self.log_writer.log(self.file_object,
                                "Raw Data Validation Complete!!")

            self.log_writer.log(
                self.file_object,
                "Creating Training_Database and tables on the basis of given schema!!!")
            # Create database with given name if not present, and the table
            # with the columns given in the schema.
            self.dBOperation.createTableDb('Training', column_names)
            self.log_writer.log(self.file_object, "Table creation Completed!!")

            self.log_writer.log(self.file_object,
                                "Insertion of Data into Table started!!!!")
            self.dBOperation.insertIntoTableGoodData('Training')
            self.log_writer.log(self.file_object,
                                "Insertion in Table completed!!!")

            self.log_writer.log(self.file_object, "Deleting Good Data Folder!!!")
            # Good files are now in the table; drop the folder to save space.
            self.raw_data.deleteExistingGoodDataTrainingFolder()
            self.log_writer.log(self.file_object, "Good_Data folder deleted!!!")

            self.log_writer.log(
                self.file_object,
                "Moving bad files to Archive and deleting Bad_Data folder!!!")
            self.raw_data.movebadFilesToArchiveBad()
            self.log_writer.log(
                self.file_object,
                "Bad files moved to archive!! Bad folder Deleted!!")
            self.log_writer.log(self.file_object,
                                "Validation Operation completed!!")

            self.log_writer.log(self.file_object,
                                "Extracting csv file from table")
            # Export the table contents to a csv file.
            self.dBOperation.selectingDatafromtableintocsv('Training')
            self.file_object.close()

        except Exception as e:
            raise e
class Prediction_Data_Validation:
    '''
        This class will be used for validation on the raw Testing data or Prediction data

        @author Niranjan
        version : 1.0
        Revision : None

    '''
    def __init__(self, path):
        """Remember the batch directory and schema path; create the logger."""
        self.logger = App_Logger()
        self.schema_path = 'schema_prediction.json'
        self.Batch_Directory = path

    def valuesFromSchema(self):
        """
        Extract all relevant validation parameters from the predefined
        "Schema" file.

        :return: (LengthOfDateStampInFile, LengthOfTimeStampInFile,
                  column_names, NumberofColumns)
        :raises ValueError: value missing inside the schema
        :raises KeyError: expected key missing from the schema
        :raises Exception: any other failure, logged and re-raised
        """
        log_path = 'Prediction_Logs/valuesfromSchemaValidationLog.txt'
        try:
            # (Original called file.close() inside the `with`; redundant.)
            with open(self.schema_path, 'r') as schema_file:
                dic = json.load(schema_file)

            LengthOfDateStampInFile = dic['LengthOfDateStampInFile']
            LengthOfTimeStampInFile = dic['LengthOfTimeStampInFile']
            column_names = dic['ColName']
            NumberofColumns = dic['NumberofColumns']

            message = ("LengthOfDateStampInFile:: %s" % LengthOfDateStampInFile
                       + "\t" +
                       "LengthOfTimeStampInFile:: %s" % LengthOfTimeStampInFile
                       + "\t " + "NumberofColumns:: %s" % NumberofColumns +
                       "\n")
            with open(log_path, 'a+') as log_file:
                self.logger.log(log_file, message)

        except ValueError:
            with open(log_path, 'a+') as log_file:
                self.logger.log(log_file,
                                " ValueError: Value not found inside schema")
            raise
        except KeyError:
            # The original leaked this handle (opened, never closed) before
            # re-raising; `with` guarantees the close.
            with open(log_path, 'a+') as log_file:
                self.logger.log(
                    log_file,
                    " KeyError: Key value error incorrect key passed")
            raise
        except Exception as e:
            with open(log_path, 'a+') as log_file:
                self.logger.log(log_file, str(e))
            raise

        return LengthOfDateStampInFile, LengthOfTimeStampInFile, column_names, NumberofColumns

    def manualRegexCreation(self):
        """
        Build the hand-written regex (based on the "Filename" given in the
        Schema file) that is used to validate the names of the prediction
        data files.

        :return: regex pattern string
        """
        pattern = r"['cement_strength']+['\_'']+[\d_]+[\d]+\.csv"
        return pattern

    def createDirectoryForGoodBadRawData(self):
        """
        Create the Prediction_Raw_Files_Validated/{Good_Raw,Bad_Raw}
        directories used to store the validated prediction data.

        :raises OSError: logged and re-raised when a directory cannot be made
        """
        try:
            for sub in ("Good_Raw/", "Bad_Raw/"):
                # exist_ok collapses the original isdir-then-makedirs check.
                os.makedirs(
                    os.path.join("Prediction_Raw_Files_Validated/", sub),
                    exist_ok=True)
        except OSError as ex:
            with open("Prediction_Logs/GeneralLog.txt", 'a+') as file:
                self.logger.log(file,
                                "Error while creating Directory %s:" % ex)
            # Re-raise the caught instance rather than the bare class.
            raise

    def deleteExistingGoodDataTrainingFolder(self):
        """
        Delete Prediction_Raw_Files_validated/Good_Raw/ (when present) after
        its files have been loaded into the database, to free disk space.

        :raises OSError: logged and re-raised on failure
        """
        try:
            good_dir = 'Prediction_Raw_Files_validated/' + "Good_Raw/"
            if os.path.isdir(good_dir):
                shutil.rmtree(good_dir)
                log_handle = open("Prediction_Logs/GeneralLog.txt", 'a+')
                self.logger.log(log_handle,
                                "Good_Raw directory  delted successfully !!!")
                log_handle.close()
        except OSError as ex:
            log_handle = open("Prediction_Logs/GeneralLog.txt", 'a+')
            self.logger.log(log_handle,
                            "Error while Deleting Good_Raw Directory %s:" % ex)
            log_handle.close()
            raise OSError

    def deleteExistingBadDataTrainingFolder(self):
        """
        Delete Prediction_Raw_Files_validated/Bad_Raw/ when it exists.

        :raises OSError: logged and re-raised on failure
        """
        try:
            bad_dir = 'Prediction_Raw_Files_validated/' + "Bad_Raw/"
            if os.path.isdir(bad_dir):
                shutil.rmtree(bad_dir)
                log_handle = open("Prediction_Logs/GeneralLog.txt", 'a+')
                self.logger.log(log_handle,
                                "Bad_Raw directory  delted successfully !!!")
                log_handle.close()
        except OSError as ex:
            log_handle = open("Prediction_Logs/GeneralLog.txt", 'a+')
            self.logger.log(log_handle,
                            "Error while Deleting Bad_Raw Directory %s:" % ex)
            log_handle.close()
            raise OSError

    def moveBadFilesToArchiveBad(self):
        """
        Move the contents of Prediction_Raw_Files_validated/Bad_Raw/ into a
        timestamped PredictionArchivedBadData/BadData_<date>_<time> folder,
        then delete the Bad_Raw directory. Bad files are archived so they
        can be sent back to the client for the invalid-data issue.

        :raises OSError: logged and re-raised on failure
        """
        now = datetime.now()
        date = now.date()
        time = now.strftime("%H%M%S")

        try:
            source = 'Prediction_Raw_Files_validated/Bad_Raw/'
            if os.path.isdir(source):
                dest = 'PredictionArchivedBadData/BadData_' + str(
                    date) + '_' + str(time)
                # makedirs creates the parent archive folder too, so the
                # original's separate mkdir of 'PredictionArchivedBadData'
                # was redundant.
                os.makedirs(dest, exist_ok=True)

                # Hoist the destination listing out of the loop; the original
                # re-listed dest on every iteration.
                existing = set(os.listdir(dest))
                for f in os.listdir(source):
                    if f not in existing:
                        shutil.move(source + f, dest)

                with open("Prediction_Logs/GeneralLog.txt", 'a+') as file:
                    self.logger.log(file,
                                    "All Bad files are moved archive folder")
                    shutil.rmtree(source)
                    self.logger.log(
                        file, "Bad Raw Data Folder Deleted successfully!!")
        except OSError as e:
            with open("Prediction_Logs/GeneralLog.txt", 'a+') as file:
                self.logger.log(
                    file, "Error while moving bad files to archive:: %s" % e)
            raise e

    def validationFileNameRaw(self, regex, LengthOfDateStampInFile,
                              LengthOfTimeStampInFile):
        """
            Method Name : validationFileNameRaw
            Description : Validates the names of the prediction csv files against the
                          schema regex. Files with a valid name (and date/time stamps of
                          the expected length) are copied to Good_Raw; all others to Bad_Raw.

            Output  : None
            On Failure: Exception

            Author : Niranjan
            Version : 1.0
            Revision : None
        """
        # delete the directories for good and bad data in case last run was unsuccessful and folders were not deleted.
        self.deleteExistingBadDataTrainingFolder()
        self.deleteExistingGoodDataTrainingFolder()
        # create new directories
        self.createDirectoryForGoodBadRawData()

        onlyfiles = [f for f in os.listdir(self.Batch_Directory)]

        try:
            f = open("Prediction_Logs/nameValidationLog.txt", 'a+')

            def copy_to_bad(filename):
                # Helper: route an invalidly named file to Bad_Raw and log it.
                # BUGFIX: the original message claimed these files were moved to the
                # *Good* Raw folder even though they were copied to Bad_Raw.
                shutil.copy("Prediction_Batch_Files/" + filename,
                            "Prediction_Raw_Files_validated/Bad_Raw")
                self.logger.log(
                    f,
                    "Invalid File Name !! File moved to Bad Raw folder :: %s"
                    % filename)

            for filename in onlyfiles:
                if re.match(regex, filename):
                    splitAtDot = re.split('.csv', filename)
                    splitAtDot = re.split('_', splitAtDot[0])
                    # Expected name shape: <part>_<part>_<datestamp>_<timestamp>.csv
                    if (len(splitAtDot[2]) == LengthOfDateStampInFile
                            and len(splitAtDot[3]) == LengthOfTimeStampInFile):
                        shutil.copy(
                            "Prediction_Batch_Files/" + filename,
                            "Prediction_Raw_Files_validated/Good_Raw")
                        self.logger.log(
                            f,
                            "Valid File Name !! File moved Good Raw folder :: %s"
                            % filename)
                    else:
                        copy_to_bad(filename)
                else:
                    copy_to_bad(filename)
            f.close()

        except Exception as e:
            f = open("Prediction_Logs/nameValidationLog.txt", 'a+')
            self.logger.log(f,
                            "Error occured while validating FileName %s" % e)
            f.close()
            raise e

    def validateColumnLength(self, NumberofColumns):
        """
            Method Name : validateColumnLength
            Description : Checks that every Good_Raw csv file has exactly the number of
                          columns declared in the schema; files that do not match are
                          moved to the Bad_Raw folder.

            Output  : None
            On Failure: OSError, Exception — both are logged and swallowed, matching
                        the original behaviour.

            Author : Niranjan
            Version : 1.0
            Revision : None
        """
        try:
            f = open("Prediction_Logs/columnValidationLog.txt", 'a+')
            self.logger.log(f, "Column Length validation started !!")

            good_dir = 'Prediction_Raw_Files_validated/Good_Raw/'
            for csv_name in os.listdir(good_dir):
                frame = pd.read_csv(good_dir + csv_name)
                if frame.shape[1] != NumberofColumns:
                    # Column count differs from the schema -> reject the file.
                    shutil.move(
                        good_dir + csv_name,
                        "Prediction_Raw_Files_validated/Bad_Raw/" + csv_name)
                    self.logger.log(
                        f,
                        "Invalid column length!! File moved to Bad_Raw folder")

            self.logger.log(f, "Column Length validation completed")
            f.close()  #checkpoint

        except OSError as e:
            f = open("Prediction_Logs/columnValidationLog.txt", 'a+')
            self.logger.log(f, "Error occured while moving the file :: %s" % e)
            f.close()
        except Exception as e:
            f = open("Prediction_Logs/columnValidationLog.txt", 'a+')
            self.logger.log(f, "Error occured:: %s" % e)
            f.close()

    def deletePredictionFile(self):
        """Delete the previous prediction output file, if one exists."""
        prediction_file = 'Prediction_Output_File/Predictions.csv'
        if os.path.exists(prediction_file):
            os.remove(prediction_file)

    def validateMissingValuesInWholeColumn(self):
        """
            Method Name: validateMissingValuesInWholeColumn
            Description: Moves any Good_Raw csv file containing a column whose values
                         are all missing to the Bad_Raw folder. Surviving files get
                         their "Unnamed: 0" column renamed to "Wafer" and are rewritten.
            Output: None
            On Failure: Exception

            Written By: Niranjan
            Version: 1.0
            Revisions: None
        """
        try:
            f = open("Prediction_Logs/missingValuesInColumn.txt", 'a+')
            self.logger.log(f, "Missing Values Validation Started!!")

            for file in listdir('Prediction_Raw_Files_validated/Good_Raw/'):
                csv = pd.read_csv("Prediction_Raw_Files_validated/Good_Raw/" +
                                  file)
                count = 0
                for column in csv:
                    # A column is fully empty when its non-null count is zero.
                    if (len(csv[column]) - csv[column].count()) == len(
                            csv[column]):
                        count += 1
                        shutil.move(
                            "Prediction_Raw_Files_validated/Good_Raw/" + file,
                            "Prediction_Raw_Files_validated/Bad_Raw")
                        self.logger.log(
                            f,
                            "Invalid Column Length for the file!! File moved to Bad Raw Folder :: %s"
                            % file)
                        break
                if count == 0:
                    # File is acceptable: normalise the index column name and rewrite.
                    csv.rename(columns={"Unnamed: 0": "Wafer"}, inplace=True)
                    csv.to_csv("Prediction_Raw_Files_validated/Good_Raw/" +
                               file,
                               index=None,
                               header=True)
        except OSError as e:
            # BUGFIX: the original `except OSError:` logged the OSError *class*
            # ("%s" % OSError) and re-raised a bare OSError, discarding the real
            # error message and traceback.
            f = open("Prediction_Logs/missingValuesInColumn.txt", 'a+')
            self.logger.log(
                f, "Error Occured while moving the file :: %s" % e)
            f.close()
            raise
        except Exception as e:
            f = open("Prediction_Logs/missingValuesInColumn.txt", 'a+')
            self.logger.log(f, "Error Occured:: %s" % e)
            f.close()
            raise e
        f.close()
class dataTransform:
    """
    This class shall be used for transforming the Good Raw Training Data before
    loading it in Database!!.

    Written By: Malini
    Version: 1.0
    Revisions: None
    """

    def __init__(self):
        # Location of the validated training files to transform.
        self.goodDataPath = "Training_Raw_files_validated/Good_Raw"
        self.logger = App_Logger()

    def addQuotesToStringValuesInColumn(self):
        """
        Method Name: addQuotesToStringValuesInColumn
        Description: Wraps every value of the string-typed 'Id' and 'ActivityDate'
                     columns in single quotes so they can be inserted into a
                     varchar column without errors. Each transformed file is
                     rewritten in place.

        Written By: Malini
        Version: 1.0
        Revisions: None

        Errors are logged and deliberately swallowed (original behaviour).
        """
        log_file = open("Training_Logs/addQuotesToStringValuesInColumn.txt", 'a+')
        try:
            for file in listdir(self.goodDataPath):
                data = pd.read_csv(self.goodDataPath + "/" + file)
                data['Id'] = data["Id"].apply(lambda x: "'" + str(x) + "'")
                data['ActivityDate'] = data["ActivityDate"].apply(
                    lambda x: "'" + str(x) + "'")
                data.to_csv(self.goodDataPath + "/" + file,
                            index=None, header=True)
                self.logger.log(log_file,
                                " %s: Quotes added successfully!!" % file)
        except Exception as e:
            self.logger.log(log_file,
                            "Data Transformation failed because:: %s" % e)
        finally:
            # Single close instead of the original double close on the error path.
            log_file.close()
 def __init__(self):
     """Orphan initializer (enclosing class not visible in this chunk): sets up logging and Mongo access."""
     # self.goodDataPath = "Training_Raw_files_validated/Good_Raw"
     self.logger = App_Logger('wafer')  # project logger; 'wafer' presumably selects the log target — TODO confirm
     self.mongo = To_mongo_db()  # project MongoDB helper
Exemple #15
0
class dataTransformPredict:
    """
    Transforms the Good Raw prediction data before it is loaded into the database.

    Written By: iNeuron Intelligence
    Version: 1.0
    Revisions: None
    """

    def __init__(self):
        self.goodDataPath = "Prediction_Raw_Files_Validated/Good_Raw"
        self.logger = App_Logger()

    def addQuotesToStringValuesInColumn(self):
        """
        Method Name: addQuotesToStringValuesInColumn
        Description: Wraps values of the known string-typed columns in single
                     quotes (so they insert cleanly as varchar) and quotes the
                     '?' placeholder found in the numeric columns.

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        # Columns carrying string data in this dataset.
        string_columns = ['sex', 'on_thyroxine', 'query_on_thyroxine', 'on_antithyroid_medication', 'sick',
                          'pregnant',
                          'thyroid_surgery', 'I131_treatment', 'query_hypothyroid', 'query_hyperthyroid', 'lithium',
                          'goitre', 'tumor', 'hypopituitary', 'psych', 'TSH_measured', 'T3_measured',
                          'TT4_measured',
                          'T4U_measured', 'FTI_measured', 'TBG_measured', 'TBG', 'referral_source', 'Class']
        try:
            log_file = open("Prediction_Logs/dataTransformLog.txt", 'a+')
            for csv_name in listdir(self.goodDataPath):
                frame = pandas.read_csv(self.goodDataPath + "/" + csv_name)
                for col in frame.columns:
                    if col in string_columns:
                        # string column: enclose every value in single quotes
                        frame[col] = frame[col].apply(lambda v: "'" + str(v) + "'")
                    else:
                        # numeric column: quote only the '?' placeholder values
                        frame[col] = frame[col].replace('?', "'?'")
                frame.to_csv(self.goodDataPath + "/" + csv_name,
                             index=None, header=True)
                self.logger.log(log_file,
                                " %s: Quotes added successfully!!" % csv_name)
        except Exception as e:
            log_file = open("Prediction_Logs/dataTransformLog.txt", 'a+')
            self.logger.log(log_file,
                            "Data Transformation failed because:: %s" % e)
            log_file.close()
            raise e
        log_file.close()
Exemple #16
0
 def __init__(self, path):
     """Orphan initializer (enclosing class not visible in this chunk): wires validation helpers for a batch directory."""
     self.Batch_Directory = path  # folder containing the incoming prediction batch files
     self.schema_path = 'schema_prediction.json'  # schema used to validate those files
     self.logger = App_Logger()  # project logger
     self.awsObj = AwsStorageManagement()  # project AWS storage helper
     self.dbObj = mongoDBOperation()  # project MongoDB helper
class dataTransformPredict:
    """
    This class shall be used for transforming the Good Raw Training Data before
    loading it in Database!!.

    Written By: iNeuron Intelligence
    Version: 1.0
    Revisions: None
    """

    def __init__(self):
        # Location of the validated prediction files to transform.
        self.goodDataPath = "Prediction_Raw_Files_Validated/Good_Raw"
        self.logger = App_Logger()

    def replaceMissingWithNull(self):
        """
        Method Name: replaceMissingWithNull
        Description: Replaces the missing values in every Good_Raw csv with the
                     literal 'NULL' so they can be stored in the table, rewriting
                     each file in place.

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None

        Raises: any exception hit while transforming, after logging it.
        """
        # Open the log before the try-block so the except clause can never hit a
        # NameError on `log_file` (the original opened it inside the try).
        log_file = open("Prediction_Logs/dataTransformLog.txt", 'a+')
        try:
            for csv_name in listdir(self.goodDataPath):
                csv = pandas.read_csv(self.goodDataPath + "/" + csv_name)
                csv.fillna("'NULL'", inplace=True)
                csv.to_csv(self.goodDataPath + "/" + csv_name,
                           index=None,
                           header=True)
                self.logger.log(log_file,
                                " %s: File Transformed successfully!!" % csv_name)
        except Exception as e:
            self.logger.log(log_file,
                            "Data Transformation failed because:: %s" % e)
            raise e
        finally:
            log_file.close()

    def addQuotesToStringValuesInColumn(self):
        """
        Method Name: addQuotesToStringValuesInColumn
        Description: Encloses every value of the known string-typed columns in
                     single quotes so inserting them as varchar does not fail.

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None

        Errors are logged and swallowed (original behaviour).
        """
        # NOTE(review): this prediction-side method logs into Training_Logs/ —
        # looks like a copy-paste of the training variant; confirm intended path.
        log_file = open("Training_Logs/addQuotesToStringValuesInColumn.txt",
                        'a+')
        try:
            # Columns carrying string data in this dataset.
            str_column = [
                "potential_issue", "deck_risk", "oe_constraint",
                "ppap_risk", "stop_auto_buy", "rev_stop",
                "went_on_backorder"
            ]
            for csv_name in listdir(self.goodDataPath):
                data = pandas.read_csv(self.goodDataPath + "/" + csv_name)
                for col in data.columns:
                    if col in str_column:  # add quotes in string value
                        data[col] = data[col].apply(
                            lambda x: "'" + str(x) + "'")
                data.to_csv(self.goodDataPath + "/" + csv_name,
                            index=None,
                            header=True)
                self.logger.log(log_file,
                                " %s: Quotes added successfully!!" % csv_name)
        except Exception as e:
            # Logged and deliberately swallowed, matching the original behaviour.
            self.logger.log(log_file,
                            "Data Transformation failed because:: %s" % e)
        finally:
            # Single close instead of the original double close on the error path.
            log_file.close()
 def __init__(self):
     """Orphan initializer (enclosing class not visible in this chunk): sets the training data file and log sink."""
     self.training_file='../training_file/Churn_Modelling.csv'  # source csv for training
     self.file_object = open("../logs/filereadlogs/log.txt", 'a+')  # NOTE(review): opened here, never closed in this view
     self.logger = App_Logger()  # project logger
class Prediction_Data_validation:
    """
    This class shall be used for handling all the validation done on the Raw Prediction Data!!.
    """
    def __init__(self, path):
        """Remember the batch directory, the prediction schema path, and a logger."""
        self.schema_path = 'schema_prediction.json'
        self.Batch_Directory = path
        self.logger = App_Logger()

    def valuesFromSchema(self):
        """
            This method extract as all the relevant information from the pre-defined "Schema" file.
        :return: LengthOfDateStampInFile, LengthOfTimeStampInFile, column_names, Numberofcolumns
        """

        try:
            with open(self.schema_path, 'r') as f:
                dic = json.load(f)
                f.close()
            LengthOfDateStampInFile = dic['LengthOfDateStampInFile']
            LengthOfTimeStampInFile = dic['LengthOfTimeStampInFile']
            column_names = dic['ColName']
            NumberofColumns = dic['NumberofColumns']

            file = open("Prediction_Logs/valuesfromSchemaValidationLog.txt",
                        'a+')
            message = "LengthOfDateStampInFile:: %s" % LengthOfDateStampInFile + "\t" + "LengthOfTimeStampInFile:: %s" % LengthOfTimeStampInFile + "\t " + "NumberofColumns:: %s" % NumberofColumns + "\n"
            self.logger.log(file, message)

            file.close()

        except ValueError:
            file = open("Prediction_Logs/valuesfromSchemaValidationLog.txt",
                        'a+')
            self.logger.log(
                file, "ValueError:Value not found inside schema_training.json")
            file.close()
            raise ValueError

        except KeyError:
            file = open("Prediction_Logs/valuesfromSchemaValidationLog.txt",
                        'a+')
            self.logger.log(file,
                            "KeyError:Key value error incorrect key passed")
            file.close()
            raise KeyError

        except Exception as e:
            file = open("Prediction_Logs/valuesfromSchemaValidationLog.txt",
                        'a+')
            self.logger.log(file, str(e))
            file.close()
            raise e
        return LengthOfDateStampInFile, LengthOfTimeStampInFile, column_names, NumberofColumns

    def manualRegexCreation(self):
        """
        Build the manually defined regex used to validate the prediction file
        names, based on the "FileName" given in the "Schema" file.
        (sample: "creditCardFraud_021119920_010222.csv")

        :return: the regex pattern string
        """
        return r"['creditCardFraud']+['\_'']+[\d_]+[\d]+\.csv"

    def createDirectoryForGoodBadRawData(self):
        """
        Create the Good_Raw and Bad_Raw directories used to store the
        prediction data after validation.

        :return: None
        """
        try:
            for subdir in ("Good_Raw/", "Bad_Raw/"):
                target = os.path.join("Prediction_Raw_Files_Validated/", subdir)
                if not os.path.isdir(target):
                    os.makedirs(target)
        except OSError as ex:
            file = open("Prediction_Logs/GeneralLog.txt", 'a+')
            self.logger.log(file, "Error while creating Directory %s:" % ex)
            file.close()
            raise OSError

    def deleteExistingGoodDataPredictionFolder(self):
        """
            This method deletes the directory made to store the good data after loading in the table.
            Once the good files are loaded in DB, deleting the directory ensures space optimization.
        :return: None
        """

        try:
            path = 'Prediction_Raw_Data_Validated/Good_Raw/'

            if os.path.isdir(path):
                shutil.rmtree(path)
                file = open("Prediction_Logs/GeneralLog.txt", 'a+')
                self.logger.log(file,
                                "GoodRaw directory deleted successfully!!!")
                file.close()
        except OSError as s:
            file = open("Prediction_Logs/GeneralLog.txt", 'a+')
            self.logger.log(file, "Error while Deleting Directory : %s" % s)
            file.close()
            raise OSError

    def deleteExistingBadDataPredictionFolder(self):
        """
            This method deletes the directory made to store the good data after loading in the table.
            Once the good files are loaded in DB, deleting the directory ensures space optimization.
        :return: None
        """

        try:
            path = 'Prediction_Raw_Data_Validated/Bad_Raw/'

            if os.path.isdir(path):
                shutil.rmtree(path)
                file = open("Prediction_Logs/GeneralLog.txt", 'a+')
                self.logger.log(file,
                                "Bad_Raw directory deleted successfully!!!")
                file.close()
        except OSError as s:
            file = open("Prediction_Logs/GeneralLog.txt", 'a+')
            self.logger.log(file, "Error while Deleting Directory : %s" % s)
            file.close()
            raise OSError

    def moveBadFilesToArchiveBad(self):
        """
            This method deletes the directory made to store the Bad Data after moving the data
            in an archive folder. We archive the bad files to send them back to the client for invalid data issue.
        :return: None
        """

        now = datetime.now()
        date = now.date()
        time = now.strftime("%H%M%S")
        try:
            path = "PredictionArchiveBadData"
            if not os.path.isdir(path):
                os.makedirs(path)
            source = "Prediction_Raw_Files_Validated/Bad_Raw/"
            dest = 'PredictionArchiveBadData/BadData_' + str(time) + '_' + str(
                date)
            if not os.path.isdir(dest):
                os.makedirs(dest)

            files = os.listdir(source)
            for f in files:
                if f not in os.listdir(dest):
                    shutil.move(source + f, dest)

            file = open("Prediction_Logs/GeneralLog.txt", 'a+')
            self.logger.log(file, "Bad files moved to archive")
            if os.path.isdir(source):
                shutil.rmtree(source)
            self.logger.log(file, "Bad Raw Data Folder Deleted successfully!!")
            file.close()
        except OSError as e:
            file = open("Prediction_Logs/GeneralLog.txt", 'a+')
            self.logger.log(file,
                            "Error while moving bad files to archive:: %s" % e)
            file.close()
            raise OSError

    def validationFileNameRaw(self, regex, LengthOfDateStampInFile,
                              LengthOfTimeStampInFile):
        """
            This function validates the name of each prediction csv file as per the name given
            in the schema. A regex pattern does the validation; if the name format does not
            match, the file is copied to the Bad Raw folder, else to Good Raw.
        :param regex: filename pattern (see manualRegexCreation)
        :param LengthOfDateStampInFile: expected length of the date-stamp part of the name
        :param LengthOfTimeStampInFile: expected length of the time-stamp part of the name
        :return: None (raises the original Exception on failure, after logging it)
        """

        #delete the directories for good and bad data in case last run was unsuccessful and folders were not deleted.
        self.deleteExistingBadDataPredictionFolder()
        self.deleteExistingGoodDataPredictionFolder()
        self.createDirectoryForGoodBadRawData()
        onlyfiles = [f for f in listdir(self.Batch_Directory)]

        try:
            f = open("Prediction_Logs/nameValidationLog.txt", 'a+')
            for filename in onlyfiles:
                if (re.match(regex, filename)):
                    # Expected name shape: <name>_<datestamp>_<timestamp>.csv
                    splitAtDot = re.split('.csv', filename)
                    splitAtDot = (re.split('_', splitAtDot[0]))

                    if len(splitAtDot[1]) == LengthOfDateStampInFile:
                        if len(splitAtDot[2]) == LengthOfTimeStampInFile:
                            # Name, date stamp and time stamp all valid -> Good_Raw.
                            shutil.copy(
                                "Prediction_Batch_files/" + filename,
                                "Prediction_Raw_Files_Validated/Good_Raw")
                            self.logger.log(
                                f,
                                "Valid File name !! File moved to Good_Raw Folder :: %s"
                                % filename)
                        else:
                            # Time-stamp length mismatch -> Bad_Raw.
                            shutil.copy(
                                "Prediction_Batch_files/" + filename,
                                "Prediction_Raw_Files_Validated/Bad_Raw")
                            self.logger.log(
                                f,
                                "Invalid File name  !! File moved to Bad Raw Folder :: %s"
                                % filename)

                    else:
                        # Date-stamp length mismatch -> Bad_Raw.
                        shutil.copy("Prediction_Batch_files/" + filename,
                                    "Prediction_Raw_Files_Validated/Bad_Raw")
                        self.logger.log(
                            f,
                            "Invalid File name  !! File moved to Bad Raw Folder :: %s"
                            % filename)
                else:
                    # Regex did not match at all -> Bad_Raw.
                    shutil.copy("Prediction_Batch_files/" + filename,
                                "Prediction_Raw_Files_Validated/Bad_Raw")
                    self.logger.log(
                        f,
                        "Invalid File Name!! File moved to Bad Raw Folder :: %s"
                        % filename)

            f.close()

        except Exception as e:
            f = open("Prediction_Logs/nameValidationLog.txt", 'a+')
            self.logger.log(f,
                            "Error occured while validating FileName %s" % e)
            f.close()
            raise e

    def validateColumnLength(self, NumberofColumns):
        """
        Validate that every csv in the Good_Raw prediction folder has exactly
        ``NumberofColumns`` columns, as specified in the schema.

        Files with a mismatching column count are moved to the Bad_Raw folder;
        matching files are rewritten in place unchanged.

        :param NumberofColumns: expected number of columns per the schema
        :return: None
        :raises OSError: if a file cannot be moved to Bad_Raw
        :raises Exception: re-raised for any other failure, after logging
        """
        f = open("Prediction_Logs/columnValidationLog.txt", 'a+')
        try:
            self.logger.log(f, "Column Length Validation Started!!")
            for file in listdir('Prediction_Raw_Files_Validated/Good_Raw/'):
                # Renamed from `csv` to avoid shadowing the csv module used
                # elsewhere in this file.
                data = pd.read_csv(
                    "Prediction_Raw_Files_Validated/Good_Raw/" + file)
                if data.shape[1] == NumberofColumns:
                    data.to_csv("Prediction_Raw_Files_Validated/Good_Raw/" +
                                file,
                                index=None,
                                header=True)
                else:
                    shutil.move(
                        "Prediction_Raw_Files_Validated/Good_Raw/" + file,
                        "Prediction_Raw_Files_Validated/Bad_Raw")
                    self.logger.log(
                        f,
                        "Invalid Column Length for the file!! File moved to Bad Raw Folder :: %s"
                        % file)

            self.logger.log(f, "Column Length Validation Completed!!")
        except OSError as err:
            # The original logged the OSError class object and raised a fresh
            # bare OSError, discarding the real error; log and re-raise it.
            self.logger.log(
                f, "Error Occured while moving the file :: %s" % err)
            raise
        except Exception as e:
            self.logger.log(f, "Error Occured:: %s" % e)
            raise
        finally:
            # Single close point: the original leaked the first handle on the
            # error paths by reopening the log before closing it.
            f.close()

    def deletePredictionFile(self):
        """Delete the previous prediction output csv, if one exists."""
        prediction_file = "Prediction_Output_File/Predictions.csv"
        if os.path.exists(prediction_file):
            os.remove(prediction_file)

    def validateMissingValuesInWholeColumn(self):
        """
        Move any Good_Raw csv that contains a column with ALL values missing
        to Bad_Raw (such files are not suitable for processing); rewrite the
        remaining files in place unchanged.

        :return: None
        :raises OSError: if a file cannot be moved to Bad_Raw
        :raises Exception: re-raised for any other failure, after logging
        """
        f = open("Prediction_Logs/missingValuesInColumn.txt", 'a+')
        try:
            self.logger.log(f, "Missing Values Validation Started!!")

            for file in listdir('Prediction_Raw_Files_Validated/Good_Raw/'):
                data = pd.read_csv(
                    "Prediction_Raw_Files_Validated/Good_Raw/" + file)
                moved = False
                for column in data:
                    # Non-null count of zero means the column is entirely missing.
                    if (len(data[column]) - data[column].count()) == len(
                            data[column]):
                        moved = True
                        shutil.move(
                            "Prediction_Raw_Files_Validated/Good_Raw/" + file,
                            "Prediction_Raw_Files_Validated/Bad_Raw")
                        # Fixed copy-pasted message: the original claimed
                        # "Invalid Column Length" for a missing-values failure.
                        self.logger.log(
                            f,
                            "Column with all missing values!! File moved to Bad Raw Folder :: %s"
                            % file)
                        break
                if not moved:
                    data.to_csv("Prediction_Raw_Files_Validated/Good_Raw/" +
                                file,
                                index=None,
                                header=True)
        except OSError as err:
            # Log the caught instance (the original logged the OSError class)
            # and re-raise the real error instead of a fresh bare OSError.
            self.logger.log(
                f, "Error Occured while moving the file :: %s" % err)
            raise
        except Exception as e:
            self.logger.log(f, "Error Occurred:: %s" % e)
            raise
        finally:
            # Single close point; the original leaked the first handle on the
            # error paths by reopening the log before closing it.
            f.close()
class dBOperation:
    """
    Handles all SQLite operations for the prediction pipeline: opening the
    database, (re)creating the Good_Raw_Data table, loading the validated
    Good_Raw csv files into it, and exporting the table back to a single csv.
    """
    def __init__(self):
        # Directory where the prediction SQLite database lives.
        self.path = 'Prediction_Database/'
        self.badFilePath = "Prediction_Raw_Files_Validated/Bad_Raw"
        self.goodFilePath = "Prediction_Raw_Files_Validated/Good_Raw"
        self.logger = App_Logger()

    def dataBaseConnection(self, DatabaseName):
        """
        Open (creating it if necessary) the SQLite database
        ``<self.path><DatabaseName>.db`` and return the connection.

        :param DatabaseName: database file name without the .db extension
        :return: open sqlite3 connection
        :raises ConnectionError: on connection failure
        """
        try:
            conn = sqlite3.connect(self.path + DatabaseName + '.db')

            file = open("Prediction_Logs/DataBaseConnectionLog.txt", 'a+')
            self.logger.log(file,
                            "Opened %s database successfully" % DatabaseName)
            file.close()

        except ConnectionError:
            # NOTE(review): sqlite3.connect raises sqlite3.Error, not
            # ConnectionError, so this handler is likely dead code — kept for
            # interface compatibility; confirm and narrow to sqlite3.Error.
            file = open("Prediction_Logs/DataBaseConnectionLog.txt", 'a+')
            self.logger.log(
                file,
                "Error while connecting to database: %s" % ConnectionError)
            file.close()
            raise ConnectionError

        return conn

    def createTableDb(self, DatabaseName, column_names):
        """
        Drop and rebuild the Good_Raw_Data table with the schema columns.

        :param DatabaseName: database file name without the .db extension
        :param column_names: mapping of column name -> SQL type string
        :raises Exception: re-raised after logging
        """
        conn = None
        try:
            conn = self.dataBaseConnection(DatabaseName)
            conn.execute('DROP TABLE IF EXISTS Good_Raw_Data;')

            for column_name, dtype in column_names.items():
                # Column names/types come from the trusted schema file but are
                # still interpolated into SQL — never feed untrusted input here.
                try:
                    conn.execute(
                        'ALTER TABLE Good_Raw_Data ADD COLUMN "{column_name}" {dataType}'
                        .format(column_name=column_name, dataType=dtype))
                except Exception:
                    # First column: the table does not exist yet (it was just
                    # dropped), so create it. Narrowed from a bare except.
                    conn.execute(
                        'CREATE TABLE  Good_Raw_Data ({column_name} {dataType})'
                        .format(column_name=column_name, dataType=dtype))

            conn.close()

            file = open("Prediction_Logs/DbTableCreateLog.txt", 'a+')
            self.logger.log(file, "Tables created successfully!!")
            file.close()

            file = open("Prediction_Logs/DataBaseConnectionLog.txt", 'a+')
            self.logger.log(file,
                            "Closed %s database successfully" % DatabaseName)
            file.close()

        except Exception as e:
            file = open("Prediction_Logs/DbTableCreateLog.txt", 'a+')
            self.logger.log(file, "Error while creating table: %s " % e)
            file.close()
            # Guarded: the original called conn.close() unconditionally and hit
            # a NameError when the connection itself had failed.
            if conn is not None:
                conn.close()
            file = open("Prediction_Logs/DataBaseConnectionLog.txt", 'a+')
            self.logger.log(file,
                            "Closed %s database successfully" % DatabaseName)
            file.close()
            raise e

    def insertIntoTableGoodData(self, Database):
        """
        Insert every validated Good_Raw csv (header row skipped) into the
        Good_Raw_Data table. A file that fails to insert triggers a rollback
        and is moved to Bad_Raw.

        :param Database: database name passed to dataBaseConnection
        :raises Exception: re-raised after logging and moving the bad file
        """
        conn = self.dataBaseConnection(Database)
        goodFilePath = self.goodFilePath
        badFilePath = self.badFilePath
        onlyfiles = [f for f in listdir(goodFilePath)]
        log_file = open("Prediction_Logs/DbInsertLog.txt", 'a+')

        for file in onlyfiles:
            try:
                with open(goodFilePath + '/' + file, "r") as f:
                    next(f)  # skip the header row
                    reader = csv.reader(f, delimiter="\n")
                    for line in enumerate(reader):
                        for list_ in (line[1]):
                            # NOTE(review): the raw csv line is interpolated
                            # straight into the INSERT; values are pre-quoted
                            # by the transformation step — never feed this
                            # untrusted data. The redundant inner try/except
                            # that only re-raised has been removed.
                            conn.execute(
                                'INSERT INTO Good_Raw_Data values ({values})'
                                .format(values=(list_)))
                            self.logger.log(
                                log_file,
                                " %s: File loaded successfully!!" % file)
                            conn.commit()

            except Exception as e:
                conn.rollback()
                self.logger.log(log_file,
                                "Error while creating table: %s " % e)
                shutil.move(goodFilePath + '/' + file, badFilePath)
                self.logger.log(log_file, "File Moved Successfully %s" % file)
                log_file.close()
                conn.close()
                raise e

        conn.close()
        log_file.close()

    def selectingDatafromtableintocsv(self, Database):
        """
        Export every row of Good_Raw_Data to
        Prediction_FileFromDB/InputFile.csv.

        :param Database: database name passed to dataBaseConnection
        :raises Exception: re-raised after logging
        """
        self.fileFromDb = 'Prediction_FileFromDB/'
        self.fileName = 'InputFile.csv'
        log_file = open("Prediction_Logs/ExportToCsv.txt", 'a+')
        try:
            conn = self.dataBaseConnection(Database)
            cursor = conn.cursor()
            cursor.execute("SELECT *  FROM Good_Raw_Data")
            results = cursor.fetchall()

            # Headers of the csv file come from the cursor description.
            headers = [i[0] for i in cursor.description]

            # Make the CSV output directory.
            if not os.path.isdir(self.fileFromDb):
                os.makedirs(self.fileFromDb)

            # with-block: the original leaked this handle (open() was passed
            # inline to csv.writer and never closed).
            with open(self.fileFromDb + self.fileName, 'w', newline='') as out:
                csvFile = csv.writer(out,
                                     delimiter=',',
                                     lineterminator='\r\n',
                                     quoting=csv.QUOTE_ALL,
                                     escapechar='\\')
                csvFile.writerow(headers)
                csvFile.writerows(results)

            self.logger.log(log_file, "File exported successfully!!!")

        except Exception as e:
            self.logger.log(log_file, "File exporting failed. Error : %s" % e)
            raise e
        finally:
            # The original never closed the export log file.
            log_file.close()
class Prediction_Data_validation:
    """
    Handles all validation done on the raw prediction batch files: schema
    extraction, file-name format, column count, fully-missing columns, and
    the Good_Raw/Bad_Raw folder bookkeeping.
    """
    def __init__(self, path):
        # Directory holding the incoming prediction batch files.
        self.Batch_Directory = path
        self.schema_path = 'schema_prediction.json'
        self.logger = App_Logger()

    def valuesFromSchema(self):
        """
        Extract the relevant values from the prediction schema file.

        :return: (LengthOfDateStampInFile, LengthOfTimeStampInFile,
                  column_names, NumberofColumns)
        :raises ValueError: if a value is missing inside the schema json
        :raises KeyError: if an expected key is absent
        :raises Exception: any other failure, after logging
        """
        try:
            with open(self.schema_path, 'r') as f:
                dic = json.load(f)
            pattern = dic['SampleFileName']
            LengthOfDateStampInFile = dic['LengthOfDateStampInFile']
            LengthOfTimeStampInFile = dic['LengthOfTimeStampInFile']
            column_names = dic['ColName']
            NumberofColumns = dic['NumberofColumns']

            # Fixed: the original wrote this one success message to
            # Training_Logs/, inconsistent with every other handler here.
            file = open("Prediction_Logs/valuesfromSchemaValidationLog.txt",
                        'a+')
            message = "LengthOfDateStampInFile:: %s" % LengthOfDateStampInFile + "\t" + "LengthOfTimeStampInFile:: %s" % LengthOfTimeStampInFile + "\t " + "NumberofColumns:: %s" % NumberofColumns + "\n"
            self.logger.log(file, message)
            file.close()

        except ValueError:
            file = open("Prediction_Logs/valuesfromSchemaValidationLog.txt",
                        'a+')
            # Fixed: the original message referred to schema_training.json.
            self.logger.log(
                file,
                "ValueError:Value not found inside schema_prediction.json")
            file.close()
            raise ValueError

        except KeyError:
            file = open("Prediction_Logs/valuesfromSchemaValidationLog.txt",
                        'a+')
            self.logger.log(file,
                            "KeyError:Key value error incorrect key passed")
            file.close()
            raise KeyError

        except Exception as e:
            file = open("Prediction_Logs/valuesfromSchemaValidationLog.txt",
                        'a+')
            self.logger.log(file, str(e))
            file.close()
            raise e

        return LengthOfDateStampInFile, LengthOfTimeStampInFile, column_names, NumberofColumns

    def manualRegexCreation(self):
        """Return the regex used to validate prediction batch file names."""
        # Raw string literal: identical value to the original, without the
        # invalid-escape-sequence warnings.
        regex = r"['wafer']+['\_'']+[\d_]+[\d]+\.csv"
        return regex

    def createDirectoryForGoodBadRawData(self):
        """
        Create Prediction_Raw_Files_Validated/{Good_Raw,Bad_Raw} if absent.

        :raises OSError: if a directory cannot be created
        """
        try:
            for sub in ("Good_Raw/", "Bad_Raw/"):
                path = os.path.join("Prediction_Raw_Files_Validated/", sub)
                if not os.path.isdir(path):
                    os.makedirs(path)
        except OSError as ex:
            file = open("Prediction_Logs/GeneralLog.txt", 'a+')
            self.logger.log(file, "Error while creating Directory %s:" % ex)
            file.close()
            raise OSError

    def deleteExistingGoodDataTrainingFolder(self):
        """
        Delete the Good_Raw prediction folder (name kept for compatibility
        with existing callers, although it operates on prediction data).

        :raises OSError: if deletion fails
        """
        try:
            path = 'Prediction_Raw_Files_Validated/'
            if os.path.isdir(path + 'Good_Raw/'):
                shutil.rmtree(path + 'Good_Raw/')
                file = open("Prediction_Logs/GeneralLog.txt", 'a+')
                self.logger.log(file,
                                "GoodRaw directory deleted successfully!!!")
                file.close()
        except OSError as s:
            file = open("Prediction_Logs/GeneralLog.txt", 'a+')
            self.logger.log(file, "Error while Deleting Directory : %s" % s)
            file.close()
            raise OSError

    def deleteExistingBadDataTrainingFolder(self):
        """
        Delete the Bad_Raw prediction folder before starting validation
        (name kept for compatibility with existing callers).

        :raises OSError: if deletion fails
        """
        try:
            path = 'Prediction_Raw_Files_Validated/'
            if os.path.isdir(path + 'Bad_Raw/'):
                shutil.rmtree(path + 'Bad_Raw/')
                file = open("Prediction_Logs/GeneralLog.txt", 'a+')
                self.logger.log(
                    file,
                    "BadRaw directory deleted before starting validation!!!")
                file.close()
        except OSError as s:
            file = open("Prediction_Logs/GeneralLog.txt", 'a+')
            self.logger.log(file, "Error while Deleting Directory : %s" % s)
            file.close()
            raise OSError

    def moveBadFilesToArchiveBad(self):
        """
        Archive the Bad_Raw files into a timestamped folder under
        PredictionArchivedBadData, then delete the Bad_Raw folder.

        :raises OSError: if archiving or deletion fails
        """
        now = datetime.now()
        date = now.date()
        time = now.strftime("%H%M%S")
        try:
            path = "PredictionArchivedBadData"
            if not os.path.isdir(path):
                os.makedirs(path)
            source = 'Prediction_Raw_Files_Validated/Bad_Raw/'
            dest = 'PredictionArchivedBadData/BadData_' + str(
                date) + "_" + str(time)
            if not os.path.isdir(dest):
                os.makedirs(dest)
            # Hoisted: the original re-listed dest on every iteration (O(n^2)).
            archived = set(os.listdir(dest))
            for fname in os.listdir(source):
                if fname not in archived:
                    shutil.move(source + fname, dest)
            file = open("Prediction_Logs/GeneralLog.txt", 'a+')
            self.logger.log(file, "Bad files moved to archive")
            path = 'Prediction_Raw_Files_Validated/'
            if os.path.isdir(path + 'Bad_Raw/'):
                shutil.rmtree(path + 'Bad_Raw/')
            self.logger.log(file, "Bad Raw Data Folder Deleted successfully!!")
            file.close()
        except OSError as e:
            file = open("Prediction_Logs/GeneralLog.txt", 'a+')
            self.logger.log(file,
                            "Error while moving bad files to archive:: %s" % e)
            file.close()
            raise OSError

    def validationFileNameRaw(self, regex, LengthOfDateStampInFile,
                              LengthOfTimeStampInFile):
        """
        Validate batch file names against the regex and the date/time stamp
        lengths from the schema; copy valid files into Good_Raw and invalid
        ones into Bad_Raw.

        :param regex: file-name pattern from manualRegexCreation
        :param LengthOfDateStampInFile: expected date stamp length
        :param LengthOfTimeStampInFile: expected time stamp length
        :raises Exception: re-raised after logging
        """
        # Remove leftovers from an earlier unsuccessful run, then recreate.
        self.deleteExistingBadDataTrainingFolder()
        self.deleteExistingGoodDataTrainingFolder()
        self.createDirectoryForGoodBadRawData()
        onlyfiles = [f for f in listdir(self.Batch_Directory)]
        f = open("Prediction_Logs/nameValidationLog.txt", 'a+')
        try:
            for filename in onlyfiles:
                if re.match(regex, filename):
                    splitAtDot = re.split('.csv', filename)
                    splitAtDot = re.split('_', splitAtDot[0])
                    if (len(splitAtDot[1]) == LengthOfDateStampInFile
                            and len(splitAtDot[2]) == LengthOfTimeStampInFile):
                        shutil.copy(
                            "Prediction_Batch_files/" + filename,
                            "Prediction_Raw_Files_Validated/Good_Raw")
                        self.logger.log(
                            f,
                            "Valid File name!! File moved to GoodRaw Folder :: %s"
                            % filename)
                        continue
                # Either the regex or a stamp-length check failed; the three
                # duplicated bad-file branches of the original are merged.
                shutil.copy("Prediction_Batch_files/" + filename,
                            "Prediction_Raw_Files_Validated/Bad_Raw")
                self.logger.log(
                    f,
                    "Invalid File Name!! File moved to Bad Raw Folder :: %s"
                    % filename)
        except Exception as e:
            self.logger.log(f,
                            "Error occured while validating FileName %s" % e)
            raise e
        finally:
            # Single close point; the original leaked the first handle on the
            # error path by reopening the log before closing it.
            f.close()

    def validateColumnLength(self, NumberofColumns):
        """
        Validate the column count of each Good_Raw csv against the schema.
        Matching files are rewritten in place (with the unnamed index column
        renamed to 'Wafer'); the rest are moved to Bad_Raw.

        :param NumberofColumns: expected number of columns per the schema
        :raises OSError: if a file cannot be moved to Bad_Raw
        :raises Exception: re-raised for any other failure, after logging
        """
        f = open("Prediction_Logs/columnValidationLog.txt", 'a+')
        try:
            self.logger.log(f, "Column Length Validation Started!!")
            for file in listdir('Prediction_Raw_Files_Validated/Good_Raw/'):
                # Renamed from `csv` to avoid shadowing the csv module.
                data = pd.read_csv(
                    "Prediction_Raw_Files_Validated/Good_Raw/" + file)
                if data.shape[1] == NumberofColumns:
                    data.rename(columns={"Unnamed: 0": "Wafer"}, inplace=True)
                    data.to_csv("Prediction_Raw_Files_Validated/Good_Raw/" +
                                file,
                                index=None,
                                header=True)
                else:
                    shutil.move(
                        "Prediction_Raw_Files_Validated/Good_Raw/" + file,
                        "Prediction_Raw_Files_Validated/Bad_Raw")
                    self.logger.log(
                        f,
                        "Invalid Column Length for the file!! File moved to Bad Raw Folder :: %s"
                        % file)

            self.logger.log(f, "Column Length Validation Completed!!")
        except OSError as err:
            # Log the caught instance (the original logged the OSError class)
            # and re-raise the real error.
            self.logger.log(
                f, "Error Occured while moving the file :: %s" % err)
            raise
        except Exception as e:
            self.logger.log(f, "Error Occured:: %s" % e)
            raise
        finally:
            f.close()

    def deletePredictionFile(self):
        """Delete the previous prediction output csv, if one exists."""
        if os.path.exists('Prediction_Output_File/Predictions.csv'):
            os.remove('Prediction_Output_File/Predictions.csv')

    def validateMissingValuesInWholeColumn(self):
        """
        Move any Good_Raw csv containing a column with ALL values missing to
        Bad_Raw; rewrite the remaining files in place (with the unnamed index
        column renamed to 'Wafer').

        :raises OSError: if a file cannot be moved to Bad_Raw
        :raises Exception: re-raised for any other failure, after logging
        """
        f = open("Prediction_Logs/missingValuesInColumn.txt", 'a+')
        try:
            self.logger.log(f, "Missing Values Validation Started!!")

            for file in listdir('Prediction_Raw_Files_Validated/Good_Raw/'):
                data = pd.read_csv(
                    "Prediction_Raw_Files_Validated/Good_Raw/" + file)
                moved = False
                for column in data:
                    # Non-null count of zero means the column is all missing.
                    if (len(data[column]) - data[column].count()) == len(
                            data[column]):
                        moved = True
                        shutil.move(
                            "Prediction_Raw_Files_Validated/Good_Raw/" + file,
                            "Prediction_Raw_Files_Validated/Bad_Raw")
                        # Fixed copy-pasted message: the original claimed
                        # "Invalid Column Length" for a missing-values failure.
                        self.logger.log(
                            f,
                            "Column with all missing values!! File moved to Bad Raw Folder :: %s"
                            % file)
                        break
                if not moved:
                    data.rename(columns={"Unnamed: 0": "Wafer"}, inplace=True)
                    data.to_csv("Prediction_Raw_Files_Validated/Good_Raw/" +
                                file,
                                index=None,
                                header=True)
        except OSError as err:
            self.logger.log(
                f, "Error Occured while moving the file :: %s" % err)
            raise
        except Exception as e:
            self.logger.log(f, "Error Occured:: %s" % e)
            raise
        finally:
            f.close()
class dataTransform:
    """
    This class shall be used for transforming the Good Raw Training Data
    before loading it in Database!!.

    Written By: iNeuron Intelligence
    Version: 1.0
    Revisions: None
    """

    # Columns holding string values; their cells are wrapped in single quotes
    # before DB insert so they can be stored as varchar. Hoisted out of the
    # per-file loop and made a frozenset for O(1) membership tests.
    _STRING_COLUMNS = frozenset({
        'sex', 'on_thyroxine', 'query_on_thyroxine',
        'on_antithyroid_medication', 'sick', 'pregnant',
        'thyroid_surgery', 'I131_treatment', 'query_hypothyroid',
        'query_hyperthyroid', 'lithium', 'goitre', 'tumor',
        'hypopituitary', 'psych', 'TSH_measured', 'T3_measured',
        'TT4_measured', 'T4U_measured', 'FTI_measured',
        'TBG_measured', 'TBG', 'referral_source', 'Class'
    })

    def __init__(self):
        self.goodDataPath = "Training_Raw_files_validated/Good_Raw"
        self.logger = App_Logger()

    def addQuotesToStringValuesInColumn(self):
        """
        Wrap every value of the string-typed columns in single quotes, and
        quote the '?' placeholder in the numeric columns, rewriting each
        Good_Raw csv in place. This avoids errors when inserting string
        values into the DB as varchar.

        Failures are logged and swallowed (best-effort transform, matching
        the original behaviour).
        """
        log_file = open("Training_Logs/addQuotesToStringValuesInColumn.txt",
                        'a+')
        try:
            for file in listdir(self.goodDataPath):
                data = pd.read_csv(self.goodDataPath + "/" + file)
                for col in data.columns:
                    if col in self._STRING_COLUMNS:
                        # Add quotes around every string value.
                        data[col] = data[col].apply(
                            lambda x: "'" + str(x) + "'")
                    else:
                        # Quote '?' placeholders in integer/float columns.
                        data[col] = data[col].replace('?', "'?'")
                data.to_csv(self.goodDataPath + "/" + file,
                            index=None,
                            header=True)
                self.logger.log(log_file,
                                " %s: Quotes added successfully!!" % file)
        except Exception as e:
            # NOTE(review): the original logs and swallows the exception
            # rather than re-raising — preserved; confirm callers expect a
            # best-effort transform.
            self.logger.log(log_file,
                            "Data Transformation failed because:: %s" % e)
        finally:
            # Single close point; the original closed the handle twice on the
            # failure path.
            log_file.close()
 # NOTE(review): stray duplicate of dataTransform.__init__ left at a broken
 # one-space indent — this will not parse as written and appears to be a
 # paste artifact; confirm it is dead text and remove it.
 def __init__(self):
     self.goodDataPath = "Training_Raw_files_validated/Good_Raw"
     self.logger = App_Logger()
class train_validation:
    """
    Orchestrates the training-side validation pipeline: schema extraction,
    raw-file validation, DB table creation/load, folder cleanup, and finally
    the export of a single training input csv.
    """
    def __init__(self, path):
        # NOTE(review): Raw_Data_Validation (capital V) must be provided
        # elsewhere — the class declared later in this file is spelled
        # Raw_Data_validation; confirm which one is intended.
        self.raw_data = Raw_Data_Validation(path)
        self.dataTransform = dataTransform()
        self.dBOperation = dBOperation()
        self.file_object = open("Training_Logs/Training_Main_Log.txt", 'a+')
        self.log_writer = App_Logger()

    def train_validation(self):
        """
        Run the full validation + DB-load sequence.

        :return: whatever selectingDatafromtableintocsv returns for the
                 'Training' database
        :raises Exception: re-raised after logging the failure
        """
        try:
            self.log_writer.log(
                self.file_object,
                'Start of Validation on files for prediction!!')
            LengthOfDateStampInFile, LengthOfTimeStampInFile, column_names, noofcolumns = self.raw_data.valuesFromSchema(
            )
            regex = self.raw_data.manualRegexCreation()
            self.raw_data.validationFileNameRaw(regex, LengthOfDateStampInFile,
                                                LengthOfTimeStampInFile)
            self.raw_data.validateColumnLength(noofcolumns)
            self.raw_data.validateMissingValuesInWholeColumn()
            self.log_writer.log(self.file_object,
                                "Raw Data Validation Complete!!")
            self.log_writer.log(
                self.file_object,
                "Creating Training_Database and tables on the basis of given schema!!!"
            )
            # Positional call: the original used the keyword DataBaseName=,
            # which does not match the createTableDb parameter DatabaseName
            # and raised a TypeError at runtime.
            self.dBOperation.createTableDb("Training",
                                           column_names=column_names)
            self.log_writer.log(self.file_object, "Table creation Completed!!")
            self.log_writer.log(self.file_object,
                                "Insertion of Data into Table started!!!!")
            self.dBOperation.insertIntoTableGoodData('Training')
            self.log_writer.log(self.file_object,
                                "Insertion in Table completed!!!")
            self.log_writer.log(self.file_object,
                                "Deleting Good Data Folder!!!")
            self.raw_data.deleteExistingGoodDataTrainingFolder()
            self.log_writer.log(self.file_object,
                                "Good_Data folder deleted!!!")
            self.log_writer.log(
                self.file_object,
                "Moving bad files to Archive and deleting Bad_Data folder!!!")
            self.raw_data.moveBadFilesToArchiveBad()
            self.log_writer.log(
                self.file_object,
                "Bad files moved to archive!! Bad folder Deleted!!")
            self.log_writer.log(self.file_object,
                                "Validation Operation completed!!")
            self.log_writer.log(self.file_object,
                                "Extracting csv file from table")
            df = self.dBOperation.selectingDatafromtableintocsv('Training')
            return df
        except Exception as e:
            # Include the underlying error: the original logged a fixed
            # message and discarded the exception detail.
            self.log_writer.log(self.file_object,
                                "Conversion to input csv failed :: %s" % e)
            raise e
        finally:
            # Close the log exactly once, on success and failure alike.
            self.file_object.close()
 # NOTE(review): stray duplicate of the training dBOperation.__init__ left at
 # a broken one-space indent — this will not parse as written and appears to
 # be a paste artifact; confirm it is dead text and remove it.
 def __init__(self):
     self.path = 'Training_Database/'
     self.badFilePath = "Training_Raw_files_validated/Bad_Raw"
     self.goodFilePath = "Training_Raw_files_validated/Good_Raw"
     self.logger = App_Logger()
 # NOTE(review): stray duplicate of train_validation.__init__ left at a
 # broken one-space indent — this will not parse as written and appears to
 # be a paste artifact; confirm it is dead text and remove it.
 def __init__(self, path):
     self.raw_data = Raw_Data_Validation(path)
     self.dataTransform = dataTransform()
     self.dBOperation = dBOperation()
     self.file_object = open("Training_Logs/Training_Main_Log.txt", 'a+')
     self.log_writer = App_Logger()
class Raw_Data_validation:
    """
    This class shall be used for handling all the validation done on the
    Raw Training Data!!.

    Written By: iNeuron Intelligence
    Version: 1.0
    Revisions: None
    """

    def __init__(self, path):
        # Directory holding the incoming raw training batch files.
        self.Batch_Directory = path
        # JSON schema describing the expected file-name format and columns.
        self.schema_path = 'schema_training.json'
        self.logger = App_Logger()

    def valuesFromSchema(self):
        """
        Method Name: valuesFromSchema
        Description: Extracts all the relevant information from the
                     pre-defined "Schema" file.
        Output: LengthOfDateStampInFile, LengthOfTimeStampInFile,
                column_names, NumberofColumns
        On Failure: Raise ValueError, KeyError, Exception

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        try:
            with open(self.schema_path, 'r') as f:
                dic = json.load(f)
            # Accessing 'SampleFileName' preserves the original KeyError
            # behaviour for an incomplete schema, even though the value
            # itself is not used below.
            pattern = dic['SampleFileName']
            LengthOfDateStampInFile = dic['LengthOfDateStampInFile']
            LengthOfTimeStampInFile = dic['LengthOfTimeStampInFile']
            column_names = dic['ColName']
            NumberofColumns = dic['NumberofColumns']
            message = "LengthOfDateStampInFile:: %s" % LengthOfDateStampInFile + "\t" + "LengthOfTimeStampInFile:: %s" % LengthOfTimeStampInFile + "\t " + "NumberofColumns:: %s" % NumberofColumns + "\n"
            # 'with' guarantees the log handle is closed even if logging fails.
            with open("Training_Logs/valuesfromSchemaValidationLog.txt",
                      'a+') as file:
                self.logger.log(file, message)

        except ValueError:
            with open("Training_Logs/valuesfromSchemaValidationLog.txt",
                      'a+') as file:
                self.logger.log(
                    file,
                    "ValueError:Value not found inside schema_training.json")
            raise ValueError

        except KeyError:
            with open("Training_Logs/valuesfromSchemaValidationLog.txt",
                      'a+') as file:
                self.logger.log(
                    file, "KeyError:Key value error incorrect key passed")
            raise KeyError

        except Exception as e:
            with open("Training_Logs/valuesfromSchemaValidationLog.txt",
                      'a+') as file:
                self.logger.log(file, str(e))
            raise e

        return LengthOfDateStampInFile, LengthOfTimeStampInFile, column_names, NumberofColumns

    def manualRegexCreation(self):
        """
        Method Name: manualRegexCreation
        Description: Returns a manually defined regex based on the
                     "FileName" given in the "Schema" file. This regex is
                     used to validate the filename of the training data.
        Output: Regex pattern
        On Failure: None

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        # Raw string avoids invalid-escape SyntaxWarnings; the pattern's
        # value is byte-identical to the original literal.
        regex = r"['cardio']+['\_'']+[\d_]+[\d]+\.csv"
        return regex

    def createDirectoryForGoodBadRawData(self):
        """
        Method Name: createDirectoryForGoodBadRawData
        Description: Creates directories to store the Good Data and Bad
                     Data after validating the training data.
        Output: None
        On Failure: OSError

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        try:
            for folder in ("Good_Raw/", "Bad_Raw/"):
                path = os.path.join("Training_Raw_files_validated/", folder)
                if not os.path.isdir(path):
                    os.makedirs(path)

        except OSError as ex:
            with open("Training_Logs/GeneralLog.txt", 'a+') as file:
                self.logger.log(file,
                                "Error while creating Directory %s:" % ex)
            raise OSError

    def deleteExistingGoodDataTrainingFolder(self):
        """
        Method Name: deleteExistingGoodDataTrainingFolder
        Description: Deletes the directory made to store the Good Data
                     after loading the data in the table. Once the good
                     files are loaded in the DB, deleting the directory
                     ensures space optimization.
        Output: None
        On Failure: OSError

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        try:
            path = 'Training_Raw_files_validated/'
            if os.path.isdir(path + 'Good_Raw/'):
                shutil.rmtree(path + 'Good_Raw/')
                with open("Training_Logs/GeneralLog.txt", 'a+') as file:
                    self.logger.log(
                        file, "GoodRaw directory deleted successfully!!!")
        except OSError as s:
            with open("Training_Logs/GeneralLog.txt", 'a+') as file:
                self.logger.log(file,
                                "Error while Deleting Directory : %s" % s)
            raise OSError

    def deleteExistingBadDataTrainingFolder(self):
        """
        Method Name: deleteExistingBadDataTrainingFolder
        Description: Deletes the directory made to store the bad Data.
        Output: None
        On Failure: OSError

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        try:
            path = 'Training_Raw_files_validated/'
            if os.path.isdir(path + 'Bad_Raw/'):
                shutil.rmtree(path + 'Bad_Raw/')
                with open("Training_Logs/GeneralLog.txt", 'a+') as file:
                    self.logger.log(
                        file,
                        "BadRaw directory deleted before starting validation!!!"
                    )
        except OSError as s:
            with open("Training_Logs/GeneralLog.txt", 'a+') as file:
                self.logger.log(file,
                                "Error while Deleting Directory : %s" % s)
            raise OSError

    def moveBadFilesToArchiveBad(self):
        """
        Method Name: moveBadFilesToArchiveBad
        Description: Deletes the directory made to store the Bad Data
                     after moving the data into an archive folder. Bad
                     files are archived so they can be sent back to the
                     client for invalid-data issues.
        Output: None
        On Failure: OSError

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        now = datetime.now()
        date = now.date()
        time = now.strftime("%H%M%S")
        try:
            source = 'Training_Raw_files_validated/Bad_Raw/'
            if os.path.isdir(source):
                path = "TrainingArchiveBadData"
                if not os.path.isdir(path):
                    os.makedirs(path)
                # Archive folder is stamped with date + time of this run.
                dest = 'TrainingArchiveBadData/BadData_' + str(
                    date) + "_" + str(time)
                if not os.path.isdir(dest):
                    os.makedirs(dest)
                # Snapshot the destination once instead of re-listing it on
                # every iteration (the original was O(n^2)).
                existing = set(os.listdir(dest))
                for f in os.listdir(source):
                    if f not in existing:
                        shutil.move(source + f, dest)
                with open("Training_Logs/GeneralLog.txt", 'a+') as file:
                    self.logger.log(file, "Bad files moved to archive")
                    path = 'Training_Raw_files_validated/'
                    if os.path.isdir(path + 'Bad_Raw/'):
                        shutil.rmtree(path + 'Bad_Raw/')
                    self.logger.log(
                        file, "Bad Raw Data Folder Deleted successfully!!")
        except Exception as e:
            with open("Training_Logs/GeneralLog.txt", 'a+') as file:
                self.logger.log(
                    file,
                    "Error while moving bad files to archive:: %s" % e)
            raise e

    def validationFileNameRaw(self, regex, LengthOfDateStampInFile,
                              LengthOfTimeStampInFile):
        """
        Method Name: validationFileNameRaw
        Description: Validates the name of the training csv files as per
                     the name given in the schema. A regex pattern is used
                     for the validation. If the name format does not match,
                     the file is moved to the Bad Raw Data folder, else to
                     the Good Raw Data folder.
        Output: None
        On Failure: Exception

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        # Delete leftover directories in case the last run was unsuccessful
        # and the folders were not cleaned up, then create fresh ones.
        self.deleteExistingBadDataTrainingFolder()
        self.deleteExistingGoodDataTrainingFolder()
        self.createDirectoryForGoodBadRawData()
        onlyfiles = [f for f in listdir(self.Batch_Directory)]
        try:
            with open("Training_Logs/nameValidationLog.txt", 'a+') as f:
                for filename in onlyfiles:
                    # NOTE(review): files are listed from Batch_Directory
                    # but copied from the hard-coded "Training_Batch_Files/"
                    # folder — confirm these are always the same location.
                    if re.match(regex, filename):
                        splitAtDot = re.split('.csv', filename)
                        splitAtDot = re.split('_', splitAtDot[0])
                        if (len(splitAtDot[1]) == LengthOfDateStampInFile
                                and len(splitAtDot[2])
                                == LengthOfTimeStampInFile):
                            shutil.copy(
                                "Training_Batch_Files/" + filename,
                                "Training_Raw_files_validated/Good_Raw")
                            self.logger.log(
                                f,
                                "Valid File name!! File moved to GoodRaw Folder :: %s"
                                % filename)
                        else:
                            self._copyToBadRaw(f, filename)
                    else:
                        self._copyToBadRaw(f, filename)

        except Exception as e:
            with open("Training_Logs/nameValidationLog.txt", 'a+') as f:
                self.logger.log(
                    f, "Error occured while validating FileName %s" % e)
            raise e

    def _copyToBadRaw(self, log_file, filename):
        """Copy *filename* from the batch folder to Bad_Raw and log it."""
        shutil.copy("Training_Batch_Files/" + filename,
                    "Training_Raw_files_validated/Bad_Raw")
        self.logger.log(
            log_file,
            "Invalid File Name!! File moved to Bad Raw Folder :: %s" %
            filename)

    def validateColumnLength(self, NumberofColumns):
        """
        Method Name: validateColumnLength
        Description: Validates the number of columns in the csv files.
                     It should be the same as given in the schema file.
                     If not, the file is moved to the Bad Raw Data folder;
                     if it matches, the file stays in Good Raw Data.
        Output: None
        On Failure: Exception

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        try:
            # Single context-managed handle; the original leaked `f` on the
            # exception paths and skipped the trailing close().
            with open("Training_Logs/columnValidationLog.txt", 'a+') as f:
                self.logger.log(f, "Column Length Validation Started!!")
                for file in listdir('Training_Raw_files_validated/Good_Raw/'):
                    csv = pd.read_csv(
                        "Training_Raw_files_validated/Good_Raw/" + file,
                        sep=';')
                    if csv.shape[1] != NumberofColumns:
                        shutil.move(
                            "Training_Raw_files_validated/Good_Raw/" + file,
                            "Training_Raw_files_validated/Bad_Raw")
                        self.logger.log(
                            f,
                            "Invalid Column Length for the file!! File moved to Bad Raw Folder :: %s"
                            % file)
                self.logger.log(f, "Column Length Validation Completed!!")
        except OSError:
            with open("Training_Logs/columnValidationLog.txt", 'a+') as f:
                self.logger.log(
                    f, "Error Occured while moving the file :: %s" % OSError)
            raise OSError
        except Exception as e:
            with open("Training_Logs/columnValidationLog.txt", 'a+') as f:
                self.logger.log(f, "Error Occured:: %s" % e)
            raise e

    def validateMissingValuesInWholeColumn(self, rootProjPath):
        """
        Method Name: validateMissingValuesInWholeColumn
        Description: Validates if any column in the csv file has all values
                     missing. Such files are not suitable for processing
                     and are moved to the Bad Raw Data folder.
        Output: None
        On Failure: Exception

        Note: ``rootProjPath`` is unused but kept for caller compatibility.

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        try:
            with open("Training_Logs/missingValuesInColumn.txt", 'a+') as f:
                self.logger.log(f, "Missing Values Validation Started!!")
                for file in listdir('Training_Raw_files_validated/Good_Raw/'):
                    csv = pd.read_csv(
                        "Training_Raw_files_validated/Good_Raw/" + file)
                    count = 0
                    for columns in csv:
                        # count() excludes NaNs, so 0 means the whole
                        # column is missing (original compared len-count
                        # against len — same condition).
                        if csv[columns].count() == 0:
                            count += 1
                            shutil.move(
                                "Training_Raw_files_validated/Good_Raw/" +
                                file, "Training_Raw_files_validated/Bad_Raw")
                            self.logger.log(
                                f,
                                "Invalid Column Length for the file!! File moved to Bad Raw Folder :: %s"
                                % file)
                            break
                    if count == 0:
                        # Rewrite the file so it is stored with a header row.
                        csv.to_csv(
                            "Training_Raw_files_validated/Good_Raw/" + file,
                            index=None,
                            header=True)
        except OSError:
            with open("Training_Logs/missingValuesInColumn.txt", 'a+') as f:
                self.logger.log(
                    f, "Error Occured while moving the file :: %s" % OSError)
            raise OSError
        except Exception as e:
            with open("Training_Logs/missingValuesInColumn.txt", 'a+') as f:
                self.logger.log(f, "Error Occured:: %s" % e)
            raise e
Example #28
0
def preprocess_and_split(config_path, nrows=100000):
    """
    Load the raw training data, engineer and clean its features, and store
    the processed result at the path configured under ``split_data.train_path``.

    Parameters
    ----------
    config_path : str
        Path to the params/config file consumed by ``read_params``.
    nrows : int, optional
        Number of raw rows to read (default 100000, matching the previous
        hard-coded value).
    """
    config = read_params(config_path)
    train_data_path = config["split_data"]["train_path"]
    raw_train_data_path = config["load_data"]["raw_train_data_csv"]

    # Context manager guarantees the log is closed even if a step raises
    # (the original leaked the handle on any exception).
    with open('Training_log.txt', 'a+') as file_object:
        logger = App_Logger()
        logger.log(file_object, "Training Data load was successful")

        train_df = pd.read_csv(raw_train_data_path, nrows=nrows)
        logger.log(file_object, "Data reading successful")

        # 1. Extract features from the date column.
        train_df = date_process(train_df)
        logger.log(file_object,
                   "Datetime Processing in train data completed ")

        # 2. Find columns holding JSON payloads.
        train_json_columns = column_validator(train_df)
        logger.log(file_object, "Column_validator successful")

        # 2.1 Flatten the JSON columns and merge them with the dataset.
        # NOTE(review): if no JSON columns are found, ``target`` is never
        # assigned and the concat below raises NameError — confirm whether
        # that input is possible for this pipeline.
        if train_json_columns is not None:
            train_df = json_to_df(train_df, train_json_columns)
            target = train_df['transactionRevenue']
            logger.log(file_object, "Normalizing the json columns completed")

        # 3. Drop columns with >50% nulls or no contribution to the target.
        train_df = remove_nan_cols(train_df)
        logger.log(file_object, "50% NAN value columns are removed")
        # sessionId duplicates fullVisitorId+visitId; visitStartTime is
        # already extracted into visitHour; fullVisitorId is a long id with
        # little contribution to the target variable.
        train_df.drop('sessionId', axis=1, inplace=True)
        train_df.drop('visitStartTime', axis=1, inplace=True)
        train_df.drop('fullVisitorId', axis=1, inplace=True)
        drop_columns = ['visitId', 'weekday', 'day']
        train_df.drop(drop_columns, axis=1, inplace=True)
        logger.log(
            file_object,
            'Dropped columns which are not contributing to the transaction revenue'
        )

        # 4. Re-attach the target column, then impute null values with 0.
        train_df = pd.concat([train_df, target], axis=1)
        train_df = impute_na(train_df)
        logger.log(file_object, "Imputing NAN values with 0 is completed")

        # 5. Convert object dtypes to the desired numeric types.
        train_df = data_type_convert(train_df)
        logger.log(file_object, "Conversion of Datatype to int completed")

        # 6. Remove constant columns (zero standard deviation).
        train_df = remove_zero_std_cols(train_df)
        logger.log(file_object, "Zero standard deviation columns are removed")

        # 7. Gather categorical columns and label-encode them.
        label_cols = categorical_cols(train_df)
        logger.log(file_object,
                   "Gathering of label _cols in train data completed ")
        train_df = label_encoding(train_df, label_cols)
        logger.log(file_object, "Label_encoding in train data completed ")

        # 8. Impute the pageviews column with KNNImputer.
        from sklearn.impute import KNNImputer  # local import kept from original
        imputer = KNNImputer()
        train_df['pageviews'] = imputer.fit_transform(train_df[['pageviews']])
        logger.log(file_object, "Pageviews column imputed with KNNimputer")

        # Store the processed training data.
        train_df.to_csv(train_data_path, sep=",", index=False,
                        encoding="utf-8")
        logger.log(
            file_object,
            "Traning data is processed and stored as data/processed/train_processed.csv"
        )
Example #29
0
 def __init__(self):
      """Record where validated prediction files live and set up logging."""
      self.logger = App_Logger()
      # Folder holding prediction raw files that passed validation.
      self.goodDataPath = "Prediction_Raw_Files_Validated/Good_Raw"
Example #30
0
class prediction:
    """Runs the clustering + per-cluster model prediction pipeline."""

    def __init__(self, path):
        # Prediction log handle; stays open for the lifetime of the object.
        self.file_object = open("Prediction_Logs/Prediction_Log.txt", 'a+')
        self.log_writer = App_Logger()
        self.pred_data_val = Prediction_Data_Validation(path)

    def predictionFromModel(self):
        """
        Load the validated prediction data, preprocess it, assign each row
        to a KMeans cluster, and predict with the model trained for that
        cluster.

        Output: path of the csv file the predictions were written to.
        On Failure: Exception (logged, then re-raised).
        """
        try:
            # Delete the existing prediction file from the last run.
            self.pred_data_val.deletePredictionFile()
            self.log_writer.log(self.file_object, 'Start of Prediction')
            data_getter = data_loader_prediction.Data_Getter_Pred(
                self.file_object, self.log_writer)
            data = data_getter.get_data()

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)

            is_null_present, cols_with_missing_values = preprocessor.is_null_present(
                data)
            if is_null_present:
                data = preprocessor.impute_missing_values(data)

            data = preprocessor.logTransformation(data)

            # Scale the prediction data, keeping the original column names.
            data_scaled = pandas.DataFrame(
                preprocessor.standardScalingData(data), columns=data.columns)

            file_loader = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
            kmeans = file_loader.load_model('KMeans')

            clusters = kmeans.predict(data_scaled)
            data_scaled['clusters'] = clusters
            clusters = data_scaled['clusters'].unique()

            result = []  # predictions collected across all clusters
            for i in clusters:
                cluster_data = data_scaled[data_scaled['clusters'] == i]
                cluster_data = cluster_data.drop(['clusters'], axis=1)
                # Each cluster has its own trained model on disk.
                model_name = file_loader.find_correct_model_file(i)
                model = file_loader.load_model(model_name)
                result.extend(model.predict(cluster_data.values))
            result = pandas.DataFrame(result, columns=['Predictions'])
            path = "Prediction_Output_File/Predictions.csv"
            # Overwrites (does not append) the output file; reuse `path`
            # instead of duplicating the literal.
            result.to_csv(path, header=True)
            self.log_writer.log(self.file_object, 'End of Prediction')

        except Exception as ex:
            self.log_writer.log(
                self.file_object,
                'Error occured while running the prediction!! Error:: %s' % ex)
            raise ex

        return path