Example #1
    def __init__(self, run_id, data_path):
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'PredictModel', 'prediction')
        self.loadValidate = LoadValidate(self.run_id, self.data_path, 'prediction')
        self.preProcess = Preprocessor(self.run_id, self.data_path, 'prediction')
        self.fileOperation = FileOperation(self.run_id, self.data_path, 'prediction')
Example #2
class TrainModel:

    def __init__(self,run_id,data_path):
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'TrainModel', 'training')
        self.loadValidate = LoadValidate(self.run_id, self.data_path,'training')
        self.preProcess = Preprocessor(self.run_id, self.data_path,'training')
        self.modelTuner = ModelTuner(self.run_id, self.data_path, 'training')
        self.fileOperation = FileOperation(self.run_id, self.data_path, 'training')
        self.cluster = KMeansCluster(self.run_id, self.data_path)

    def training_model(self):
       
        try:
            self.logger.info('Start of Training')
            self.logger.info('Run_id:' + str(self.run_id))
            #Load, validations and transformation
            self.loadValidate.validate_trainset()
            #preprocessing activities
            self.X, self.y = self.preProcess.preprocess_trainset()
            columns = {"data_columns":[col for col in self.X.columns]}
            with open('apps/database/columns.json','w') as f:
                f.write(json.dumps(columns))
            #create clusters
            number_of_clusters = self.cluster.elbow_plot(self.X)
            # divide the data into clusters
            self.X = self.cluster.create_clusters(self.X, number_of_clusters)
            # attach the target labels back to the clustered data
            self.X['Labels'] = self.y
            # getting the unique clusters from our data set
            list_of_clusters = self.X['Cluster'].unique()
            # iterate over the clusters and find the best ML algorithm to fit each one
            for i in list_of_clusters:
                cluster_data = self.X[self.X['Cluster'] == i]  # filter the data for one cluster

                # prepare the feature and label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'], axis=1)
                cluster_label = cluster_data['Labels']

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(cluster_features, cluster_label, test_size=0.2, random_state=0)
                #getting the best model for each of the clusters
                best_model_name, best_model = self.modelTuner.get_best_model(x_train, y_train, x_test, y_test)

                # saving the best model for this cluster to the models directory
                save_model = self.fileOperation.save_model(best_model, best_model_name + str(i))


            self.logger.info('End of Training')
        except Exception:
            self.logger.exception('Unsuccessful End of Training')
            raise
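A minimal driver for the class above might look like this. This is a sketch: it assumes the supporting modules (Logger, LoadValidate, Preprocessor, ModelTuner, FileOperation, KMeansCluster) are importable and the apps/ directory layout exists; the run id format and data path are hypothetical.

from datetime import datetime

# hypothetical run id; any unique string works, it is only used for logging and tracking
run_id = 'train_' + datetime.now().strftime('%Y%m%d%H%M%S')
trainer = TrainModel(run_id, 'apps/data/training')  # assumed data location
trainer.training_model()  # validate -> preprocess -> cluster -> tune -> save one model per cluster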
class FileOperation:
    def __init__(self, run_id, data_path, mode):
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'FileOperation', mode)

    def save_model(self, model, file_name):

        try:
            self.logger.info('Start of Save Models')
            path = os.path.join(
                'apps/models/',
                file_name)  # create a separate directory for each cluster's model
            if os.path.isdir(path):
                # remove only this model's previous directory so that models
                # already saved for other clusters are kept
                shutil.rmtree(path)
            os.makedirs(path)
            with open(path + '/' + file_name + '.sav', 'wb') as f:
                pickle.dump(model, f)  # save the model to file
            self.logger.info('Model File ' + file_name + ' saved')
            self.logger.info('End of Save Models')
            return 'success'
        except Exception as e:
            self.logger.exception('Exception raised while Saving Models: %s' % e)
            raise Exception()

    def load_model(self, file_name):

        try:
            self.logger.info('Start of Load Model')
            with open('apps/models/' + file_name + '/' + file_name + '.sav',
                      'rb') as f:
                self.logger.info('Model File ' + file_name + ' loaded')
                self.logger.info('End of Load Model')
                return pickle.load(f)
        except Exception as e:
            self.logger.exception('Exception raised while Loading Model: %s' %
                                  e)
            raise Exception()

    def correct_model(self, cluster_number):

        try:
            self.logger.info('Start of finding correct model')
            self.cluster_number = cluster_number
            self.folder_name = 'apps/models'
            self.list_of_files = os.listdir(self.folder_name)
            for file in self.list_of_files:
                # pick the saved model whose name contains the cluster number
                if str(self.cluster_number) in file:
                    self.model_name = file
            self.model_name = self.model_name.split('.')[0]
            self.logger.info('End of finding correct model')
            return self.model_name
        except Exception as e:
            self.logger.exception(
                'Exception raised while finding correct model: %s' % e)
            raise Exception()
class LoadValidate:
    def __init__(self, run_id, data_path, mode):
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'LoadValidate', mode)
        self.dbOperation = DatabaseOperation(self.run_id, self.data_path, mode)

    def values_from_schema(self, schema_file):

        try:
            self.logger.info('Start of Reading values From Schema...')
            with open('apps/database/' + schema_file + '.json', 'r') as f:
                dic = json.load(f)
            column_names = dic['ColName']
            number_of_columns = dic['NumberofColumns']
            self.logger.info('End of Reading values From Schema...')
        except ValueError:
            self.logger.exception(
                'ValueError raised while Reading values From Schema')
            raise ValueError
        except KeyError:
            self.logger.exception(
                'KeyError raised while Reading values From Schema')
            raise KeyError
        except Exception as e:
            self.logger.exception(
                'Exception raised while Reading values From Schema: %s' % e)
            raise e
        return column_names, number_of_columns

    def validate_column_length(self, number_of_columns):

        try:
            self.logger.info('Start of Validating Column Length...')
            for file in listdir(self.data_path):
                csv = pd.read_csv(self.data_path + '/' + file)
                if csv.shape[1] == number_of_columns:
                    pass
                else:
                    shutil.move(self.data_path + '/' + file,
                                self.data_path + '_rejects')
                    self.logger.info("Invalid Columns Length :: %s" % file)

            self.logger.info('End of Validating Column Length...')
        except OSError:
            self.logger.exception(
                'OSError raised while Validating Column Length')
            raise OSError
        except Exception as e:
            self.logger.exception(
                'Exception raised while Validating Column Length: %s' % e)
            raise e

    def validate_missing_values(self):

        try:
            self.logger.info('Start of Validating Missing Values...')
            for file in listdir(self.data_path):
                csv = pd.read_csv(self.data_path + '/' + file)
                count = 0
                for columns in csv:
                    if (len(csv[columns]) - csv[columns].count()) == len(
                            csv[columns]):
                        count += 1
                        shutil.move(self.data_path + '/' + file,
                                    self.data_path + '_rejects')
                        self.logger.info("All Missing Values in Column :: %s" %
                                         file)
                        break

            self.logger.info('End of Validating Missing Values...')
        except OSError:
            self.logger.exception(
                'OSError raised while Validating Missing Values')
            raise OSError
        except Exception as e:
            self.logger.exception(
                'Exception raised while Validating Missing Values: %s' % e)
            raise e

    def replace_missing_values(self):

        try:
            self.logger.info('Start of Replacing Missing Values with NULL...')
            only_files = [f for f in listdir(self.data_path)]
            for file in only_files:
                csv = pd.read_csv(self.data_path + "/" + file)
                csv.fillna('NULL', inplace=True)
                csv.to_csv(self.data_path + "/" + file,
                           index=None,
                           header=True)
                self.logger.info('%s: File Transformed successfully!!' % file)
            self.logger.info('End of Replacing Missing Values with NULL...')
        except Exception as e:
            self.logger.exception(
                'Exception raised while Replacing Missing Values with NULL: %s'
                % e)
            raise e

    def archive_old_files(self):

        now = datetime.now()
        date = now.date()
        time = now.strftime("%H%M%S")
        try:
            # archive each source folder into a timestamped sub-directory of
            # data_path + '_archive'; the four folders only differ in suffix
            # and archive prefix, so handle them in one loop
            sources = [('_rejects', 'reject', 'Rejected'),
                       ('_validation', 'validation', 'Validated'),
                       ('_processed', 'processed', 'Processed'),
                       ('_results', 'results', 'Result')]
            for suffix, prefix, label in sources:
                self.logger.info('Start of Archiving Old %s Files...' % label)
                source = self.data_path + suffix + '/'
                if os.path.isdir(source):
                    path = self.data_path + '_archive'
                    if not os.path.isdir(path):
                        os.makedirs(path)
                    dest = path + '/' + prefix + '_' + str(date) + '_' + str(time)
                    for f in os.listdir(source):
                        if not os.path.isdir(dest):
                            os.makedirs(dest)
                        if f not in os.listdir(dest):
                            shutil.move(source + f, dest)
                self.logger.info('End of Archiving Old %s Files...' % label)
        except Exception as e:
            self.logger.exception(
                'Exception raised while Archiving Old Files: %s' % e)
            raise e

    def move_processed_files(self):

        try:
            self.logger.info('Start of Moving Processed Files...')
            for file in listdir(self.data_path):
                shutil.move(self.data_path + '/' + file,
                            self.data_path + '_processed')
                self.logger.info("Moved the already processed file %s" % file)

            self.logger.info('End of Moving Processed Files...')
        except Exception as e:
            self.logger.exception(
                'Exception raised while Moving Processed Files: %s' % e)
            raise e

    def validate_trainset(self):

        try:
            self.logger.info(
                'Start of Data Load, validation and transformation')
            # archive old  files
            self.archive_old_files()
            # extracting values from training schema
            column_names, number_of_columns = self.values_from_schema(
                'schema_train')
            # validating column length in the file
            self.validate_column_length(number_of_columns)
            # validating if any column has all values missing
            self.validate_missing_values()
            # replacing blanks in the csv file with "Null" values
            self.replace_missing_values()
            # create the database if needed, open a connection, and create the table with the columns given in the schema
            self.dbOperation.create_table('training', 'training_raw_data_t',
                                          column_names)
            # insert csv files in the table
            self.dbOperation.insert_data('training', 'training_raw_data_t')
            # export data in table to csv file
            self.dbOperation.export_csv('training', 'training_raw_data_t')
            # move processed files
            self.move_processed_files()
            self.logger.info('End of Data Load, validation and transformation')
        except Exception:
            self.logger.exception(
                'Unsuccessful End of Data Load, validation and transformation')
            raise

    def validate_predictset(self):

        try:
            self.logger.info(
                'Start of Data Load, validation and transformation')
            # archive old rejected files
            self.archive_old_files()
            # extracting values from schema
            column_names, number_of_columns = self.values_from_schema(
                'schema_predict')
            # validating column length in the file
            self.validate_column_length(number_of_columns)
            # validating if any column has all values missing
            self.validate_missing_values()
            # replacing blanks in the csv file with "Null" values
            self.replace_missing_values()
            # create the database if needed, open a connection, and create the table with the columns given in the schema
            self.dbOperation.create_table('prediction',
                                          'prediction_raw_data_t',
                                          column_names)
            # insert csv files in the table
            self.dbOperation.insert_data('prediction', 'prediction_raw_data_t')
            # export data in table to csv file
            self.dbOperation.export_csv('prediction', 'prediction_raw_data_t')
            # move processed files
            self.move_processed_files()
            self.logger.info('End of Data Load, validation and transformation')
        except Exception:
            self.logger.exception(
                'Unsuccessful End of Data Load, validation and transformation')
            raise
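For reference, values_from_schema above only reads the ColName and NumberofColumns keys, so a minimal apps/database/schema_train.json could have the shape below, written here as a Python literal. The column names are hypothetical, and ColName mapping names to database type strings is an assumption based on how dbOperation.create_table consumes it.

# Illustrative shape of apps/database/schema_train.json (hypothetical columns):
schema = {
    "NumberofColumns": 3,
    "ColName": {
        "CreditScore": "INTEGER",
        "Income": "INTEGER",
        "Approved": "INTEGER"
    }
}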
Example #7
class ModelTuner:
    """
    *****************************************************************************
    *
    * filename:       model_tuner.py
    * version:        1.0
    * author:
    * creation date:
    *
    *
    * description:    Class to tune and select best model
    *
    ****************************************************************************
    """

    def __init__(self, run_id, data_path, mode):
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'ModelTuner', mode)
        self.rfc = RandomForestClassifier()
        self.xgb = XGBClassifier(objective='binary:logistic')
        # self.knn = KNeighborsClassifier()

    def best_params_randomforest(self, train_x, train_y):
        """
        * method: best_params_randomforest
        * description: method to get the parameters for the Random Forest algorithm
                       that give the best accuracy, using hyperparameter tuning.
        * return: The model with the best parameters
        *
        *
        * Parameters
        *   train_x:
        *   train_y:
        """
        try:
            self.logger.info('Start of finding best params for randomforest algo...')
            # initializing with different combination of parameters
            self.param_grid = {
                "n_estimators": [2, 3, 4],
                "criterion": ['gini', 'entropy'],
                "max_depth": range(2, 4, 1),
                # 'auto' was removed in newer scikit-learn; 'sqrt' is the classifier equivalent
                "max_features": ['sqrt', 'log2']}

            # self.cv_method = RepeatedStratifiedKFold(n_splits=5,
            #                                          n_repeats=3,
            #                                          random_state=3)
            # Creating an object of the Grid Search class
            self.grid = GridSearchCV(estimator=self.rfc, param_grid=self.param_grid, cv=5)
            # finding the best parameters
            self.grid.fit(train_x, train_y)

            # extracting the best parameters
            self.criterion = self.grid.best_params_['criterion']
            self.max_depth = self.grid.best_params_['max_depth']
            self.max_features = self.grid.best_params_['max_features']
            self.n_estimators = self.grid.best_params_['n_estimators']

            # creating a new model with the best parameters
            self.rfc = RandomForestClassifier(n_estimators=self.n_estimators, criterion=self.criterion,
                                              max_depth=self.max_depth, max_features=self.max_features)
            # training the new model
            self.rfc.fit(train_x, train_y)
            self.logger.info('Random Forest best params: ' + str(self.grid.best_params_))
            self.logger.info('End of finding best params for randomforest algo...')

            return self.rfc
        except Exception as e:
            self.logger.exception('Exception raised while finding best params for randomforest algo:' + str(e))
            raise Exception()

    # def get_best_params_for_KNN(self, train_x, train_y):
    #     """
    #          Method Name: get_best_params_for_KNN
    #          Description: get the parameters for KNN Algorithm which give the best accuracy.
    #                                                          Use Hyper Parameter Tuning.
    #         Output: The model with the best parameters
    #         On Failure: Raise Exception
    #
    #
    #
    #                                     """
    #
    #     try:
    #         self.logger.info('Start of finding best params for KNN algo...')
    #         # initializing with different combination of parameters
    #         self.param_grid_knn = {
    #             'algorithm': ['ball_tree', 'kd_tree', 'brute'],
    #             'leaf_size': [10, 17, 24, 28, 30, 35],
    #              'n_neighbors': [2, 3],
    #             'p': [1, 2]
    #         }
    #         # self.cv_method = RepeatedStratifiedKFold(n_splits=5,
    #         #                                          n_repeats=3,
    #         #                                          random_state=999)
    #         # Creating an object of the Grid Search class
    #         self.grid = GridSearchCV(self.knn, self.param_grid_knn, verbose=3,
    #                                  cv=5)
    #         # finding the best parameters
    #         self.grid.fit(train_x, train_y)
    #
    #         # extracting the best parameters
    #         self.algorithm = self.grid.best_params_['algorithm']
    #         self.leaf_size = self.grid.best_params_['leaf_size']
    #         self.n_neighbors = self.grid.best_params_['n_neighbors']
    #         self.p = self.grid.best_params_['p']
    #
    #         # creating a new model with the best parameters
    #         self.knn = KNeighborsClassifier(algorithm=self.algorithm, leaf_size=self.leaf_size,
    #                                         n_neighbors=self.n_neighbors, p=self.p, n_jobs=-1)
    #         # training the new model
    #         self.knn.fit(train_x, train_y)
    #         self.logger.info('KNN best params: ' + str(self.grid.best_params_))
    #         self.logger.info('End of finding best params for knn algo...')
    #
    #         return self.knn
    #     except Exception as e:
    #         self.logger.exception('Exception raised while finding best params for knn algo:' + str(e))
    #         raise Exception()

    def best_params_xgboost(self, train_x, train_y):
        """
        * method: best_params_xgboost
        * description: method to get the parameters for the XGBoost algorithm
                       that give the best accuracy, using hyperparameter tuning.
        * return: The model with the best parameters
        *
        * Parameters
        *   train_x:
        *   train_y:
        """
        try:
            self.logger.info('Start of finding best params for XGBoost algo...')
            # initializing with different combination of parameters
            self.param_grid_xgboost = {
                'learning_rate': [0.5, 0.1, 0.01, 0.001],
                'max_depth': [3, 5, 10, 20],
                'n_estimators': [10, 50, 100, 200]
            }
            # Creating an object of the Grid Search class
            self.grid = GridSearchCV(XGBClassifier(objective='binary:logistic'), self.param_grid_xgboost, cv=5)
            # finding the best parameters
            self.grid.fit(train_x, train_y)

            # extracting the best parameters
            self.learning_rate = self.grid.best_params_['learning_rate']
            self.max_depth = self.grid.best_params_['max_depth']
            self.n_estimators = self.grid.best_params_['n_estimators']

            # creating a new model with the best parameters
            self.xgb = XGBClassifier(objective='binary:logistic', learning_rate=self.learning_rate,
                                     max_depth=self.max_depth, n_estimators=self.n_estimators)
            # training the new model
            self.xgb.fit(train_x, train_y)
            self.logger.info('XGBoost best params: ' + str(self.grid.best_params_))
            self.logger.info('End of finding best params for XGBoost algo...')
            return self.xgb
        except Exception as e:
            self.logger.exception('Exception raised while finding best params for XGBoost algo:' + str(e))
            raise Exception()

    def get_best_model(self, train_x, train_y, test_x, test_y):
        """
        * method: get_best_model
        * description: method to get best model
        * return: The name of the best model and the model object
        *
        *
        * Parameters
        *   train_x:
        *   train_y:
        *   test_x:
        *   test_y:
        """
        try:
            # self.logger.info('Start of finding best model...')
            # self.knn = self.get_best_params_for_KNN(train_x, train_y)
            # self.prediction_knn = self.knn.predict(test_x)  # Predictions using the Knn Model
            #
            # if len(test_y.unique()) == 1:  # if there is only one label in y, then roc_auc_score returns error. We
            #     # will use accuracy in that case
            #     self.knn_score = accuracy_score(test_y, self.prediction_knn)
            #     self.logger.info('Accuracy for knn:' + str(self.knn_score))
            # else:
            #     self.knn_score = roc_auc_score(test_y, self.prediction_knn)  # AUC for knn
            #     self.logger.info('AUC for knn:' + str(self.knn_score))

            self.xgb = self.best_params_xgboost(train_x, train_y)
            self.prediction_xgb = self.xgb.predict(test_x)  # prediction using the xgb  Algorithm

            if len(test_y.unique()) == 1:  # if there is only one label in y, then roc_auc_score returns error. We
                # will use accuracy in that case
                self.xgb_score = accuracy_score(test_y, self.prediction_xgb)
                self.logger.info('Accuracy for Xgboost:' + str(self.xgb_score))
            else:
                self.xgb_score = roc_auc_score(test_y, self.prediction_xgb)  # AUC for XGBoost
                self.logger.info('AUC for Xgboost:' + str(self.xgb_score))

            # create best model for Random Forest

            self.random_forest = self.best_params_randomforest(train_x, train_y)
            self.prediction_random_forest = self.random_forest.predict(
                test_x)  # prediction using the Random Forest Algorithm

            if len(test_y.unique()) == 1:  # if there is only one label in y, then roc_auc_score returns error. We
                # will use accuracy in that case
                self.random_forest_score = accuracy_score(test_y, self.prediction_random_forest)
                self.logger.info('Accuracy for Random Forest:' + str(self.random_forest_score))
            else:
                self.random_forest_score = roc_auc_score(test_y, self.prediction_random_forest)  # AUC for Random Forest
                self.logger.info('AUC for Random Forest:' + str(self.random_forest_score))

            # comparing the two models
            self.logger.info('End of finding best model...')
            if (self.random_forest_score < self.xgb_score):
                return 'XGB', self.xgb
            else:
                return 'RandomForest', self.random_forest

        except Exception as e:
            self.logger.exception('Exception raised while finding best model:' + str(e))
            raise Exception()
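A self-contained smoke test of get_best_model on synthetic data; a sketch, assuming the project's Logger is importable, with placeholder run id, data path, and mode arguments.

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# small synthetic binary-classification problem
features, labels = make_classification(n_samples=200, n_features=6, random_state=0)
X = pd.DataFrame(features, columns=['f%d' % i for i in range(6)])
y = pd.Series(labels)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

tuner = ModelTuner('demo_run', 'apps/data', 'training')  # placeholder arguments
best_name, best_model = tuner.get_best_model(x_train, y_train, x_test, y_test)
print(best_name)  # 'XGB' or 'RandomForest', whichever scored higher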
class FileOperation:
    """
	***************************************************************************
	*
	* filename:     file_operation.py
	* version:      1.0
	* author:       Abdullah Makhdoom
	* creation:     23-DEC-2020
	*
	* change_history:
	*
	who             when           version  change (include bug# if apply)
	* ----------      -----------    -------  ------------------------------
	*
	*
	* description:    Class for file operation
	*
	****************************************************************************
	"""
    def __init__(self, run_id, data_path, mode):
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'FileOperation', mode)

    def save_model(self, model, file_name):
        """
		* method: save_model
		* description: method to save the model file
		* return: File gets saved
		*
		* who             when           version  change (include bug# if apply)
		* ----------      -----------    -------  ------------------------------
		*
		* Parameters
		*   model:
		*   file_name:
		"""
        try:
            self.logger.info('Start of Save Models')
            path = os.path.join(
                'apps/models/',
                file_name)  # create a separate directory for each cluster's model
            if os.path.isdir(path):
                # remove only this model's previous directory so that models
                # already saved for other clusters are kept
                shutil.rmtree(path)
            os.makedirs(path)
            with open(path + '/' + file_name + '.sav', 'wb') as f:
                pickle.dump(model, f)

            self.logger.info('Model File ' + file_name + ' saved')
            self.logger.info('End of Save Models')
            return 'success'

        except Exception as e:
            self.logger.exception('Exception raised while Saving Models: %s' %
                                  e)
            raise Exception()

    def load_model(self, file_name):
        """
		* method:       load_model
		* description:  method to load the model file
		* return:       The loaded model object
		*
		* who           when            version         change
		----------      -----------     ---------       --------------------
		*
		*
		* Parameters
		*   file_name:
		"""
        try:
            self.logger.info('Start of Load Model')
            with open('apps/models/' + file_name + '/' + file_name + '.sav',
                      'rb') as f:
                self.logger.info('Model File ' + file_name + ' loaded')
                self.logger.info('End of Load Model')
                return pickle.load(f)
        except Exception as e:
            self.logger.exception('Exception raised while Loading Model: %s' %
                                  e)
            raise Exception()

    def correct_model(self, cluster_number):
        """
		* method:       correct_model
		* description:  method to find the saved model matching a cluster number
		* return:       The model file name
		*
		* Parameters
		*   cluster_number:
		"""
        try:
            self.logger.info('Start of finding correct model')
            self.cluster_number = cluster_number
            self.folder_name = 'apps/models'
            self.list_of_files = os.listdir(self.folder_name)
            for file in self.list_of_files:
                # pick the saved model whose name contains the cluster number
                if str(self.cluster_number) in file:
                    self.model_name = file
            self.model_name = self.model_name.split('.')[0]
            self.logger.info('End of finding correct model')
            return self.model_name
        except Exception as e:
            self.logger.exception(
                'Exception raised while finding correct model: %s' % e)
            raise Exception()
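A hypothetical round trip with the class above; LogisticRegression stands in for a tuned model, and apps/models must be writable.

from sklearn.linear_model import LogisticRegression

ops = FileOperation('demo_run', 'apps/data', 'training')  # placeholder arguments
ops.save_model(LogisticRegression(), 'XGBoost0')  # written to apps/models/XGBoost0/XGBoost0.sav
name = ops.correct_model(0)   # scans apps/models for a name containing '0' -> 'XGBoost0'
model = ops.load_model(name)  # unpickles and returns the saved estimator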
Example #9
class ModelTuner:
	"""
	**************************************************************************
	*
	* filename:       model_tuner.py
	* version:        1.0
	*
	* change history:
	*
	*
	* description:    Class to tune and select best model
	*
	**************************************************************************
	"""

	def __init__(self,run_id,data_path,mode):
		self.run_id = run_id
		self.data_path = data_path
		self.logger = Logger(self.run_id, 'ModelTuner', mode)
		self.rfc = RandomForestClassifier()
		self.xgb = XGBClassifier(objective='binary:logistic')

	def best_params_randomforest(self,train_x,train_y):
		"""
		* method: best_params_randomforest
		* description: method to get the parameters for the Random Forest algorithm that give the best accuracy, using hyperparameter tuning.
		*
		* return: The model with the best parameters
		*
		* Parameters
		*   train_x:
		*   train_y:
		"""
		try:
			self.logger.info('Start of finding best params for randomforest algo...')
			# initializing with different combination of parameters
			self.param_grid = {"n_estimators": [10, 50, 100, 130], "criterion": ['gini', 'entropy'],
							   "max_depth": range(2, 4, 1), "max_features": ['auto', 'log2']}

			#Creating an object of the Grid Search class
			self.grid = GridSearchCV(estimator=self.rfc, param_grid=self.param_grid, cv=5)
			#finding the best parameters
			self.grid.fit(train_x, train_y)

			#extracting the best parameters
			self.criterion = self.grid.best_params_['criterion']
			self.max_depth = self.grid.best_params_['max_depth']
			self.max_features = self.grid.best_params_['max_features']
			self.n_estimators = self.grid.best_params_['n_estimators']

			#creating a new model with the best parameters
			self.rfc = RandomForestClassifier(n_estimators=self.n_estimators, criterion=self.criterion,
											  max_depth=self.max_depth, max_features=self.max_features)
			# training the new model
			self.rfc.fit(train_x, train_y)
			self.logger.info('Random Forest best params: '+str(self.grid.best_params_))
			self.logger.info('End of finding best params for randomforest algo...')

			return self.rfc
		except Exception as e:
			self.logger.exception('Exception raised while finding best params for randomforest algo:' + str(e))
			raise Exception()

	def best_params_xgboost(self,train_x,train_y):
		"""
		* method: best_params_xgboost
		* description: method to get the parameters for the XGBoost algorithm that give the best accuracy, using hyperparameter tuning.
		*
		* return: The model with the best parameters
		*
		* Parameters
		*   train_x:
		*   train_y:
		"""
		try:
			self.logger.info('Start of finding best params for XGBoost algo...')
			# initializing with different combination of parameters
			self.param_grid_xgboost = {
				'learning_rate': [0.5, 0.1, 0.01, 0.001],
				'max_depth': [3, 5, 10, 20],
				'n_estimators': [10, 50, 100, 200]
			}
			# Creating an object of the Grid Search class
			self.grid= GridSearchCV(XGBClassifier(objective='binary:logistic'),self.param_grid_xgboost, cv=5)
			# finding the best parameters
			self.grid.fit(train_x, train_y)

			# extracting the best parameters
			self.learning_rate = self.grid.best_params_['learning_rate']
			self.max_depth = self.grid.best_params_['max_depth']
			self.n_estimators = self.grid.best_params_['n_estimators']

			# creating a new model with the best parameters
			self.xgb = XGBClassifier(objective='binary:logistic',learning_rate=self.learning_rate, max_depth=self.max_depth, n_estimators=self.n_estimators)
			# training the new model
			self.xgb.fit(train_x, train_y)
			self.logger.info('XGBoost best params: ' + str(self.grid.best_params_))
			self.logger.info('End of finding best params for XGBoost algo...')
			return self.xgb
		except Exception as e:
			self.logger.exception('Exception raised while finding best params for XGBoost algo:' + str(e))
			raise Exception()


	def get_best_model(self,train_x,train_y,test_x,test_y):
		"""
		* method: get_best_model
		* description: method to get best model
		* return: The name of the best model and the model object
		*
		* Parameters
		*   train_x:
		*   train_y:
		*   test_x:
		*   test_y:
		"""
		try:
			self.logger.info('Start of finding best model...')
			self.xgboost= self.best_params_xgboost(train_x,train_y)
			self.prediction_xgboost = self.xgboost.predict(test_x) # Predictions using the XGBoost Model

			if len(test_y.unique()) == 1:  # if there is only one label in y, then roc_auc_score returns error. We will use accuracy in that case
				self.xgboost_score = accuracy_score(test_y, self.prediction_xgboost)
				self.logger.info('Accuracy for XGBoost:' + str(self.xgboost_score))
			else:
				self.xgboost_score = roc_auc_score(test_y, self.prediction_xgboost)  # AUC for XGBoost
				self.logger.info('AUC for XGBoost:' + str(self.xgboost_score))

			# create best model for Random Forest
			self.random_forest=self.best_params_randomforest(train_x,train_y)
			self.prediction_random_forest=self.random_forest.predict(test_x) # prediction using the Random Forest Algorithm

			if len(test_y.unique()) == 1:  # if there is only one label in y, then roc_auc_score returns error. We will use accuracy in that case
				self.random_forest_score = accuracy_score(test_y, self.prediction_random_forest)
				self.logger.info('Accuracy for Random Forest:' + str(self.random_forest_score))
			else:
				self.random_forest_score = roc_auc_score(test_y, self.prediction_random_forest)  # AUC for Random Forest
				self.logger.info('AUC for Random Forest:' + str(self.random_forest_score))

			#comparing the two models
			self.logger.info('End of finding best model...')
			if(self.random_forest_score <  self.xgboost_score):
				return 'XGBoost',self.xgboost
			else:
				return 'RandomForest',self.random_forest

		except Exception as e:
			self.logger.exception('Exception raised while finding best model:' + str(e))
			raise Exception()
Example #10
class Preprocessor:
    """
    *****************************************************************************
    *
    * filename:       Preprocessor.py
    * version:        1.0
    * author:
    * creation date:
    *
    *
    *
    * description:    Class to pre-process training and predict dataset
    *
    ****************************************************************************
    """
    def __init__(self, run_id, data_path, mode):
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'Preprocessor', mode)

    def get_data(self):
        """
        * method: get_data
        * description: method to read datafile
        * return: A pandas DataFrame
        *
        *
        * Parameters
        *   none:
        """
        try:
            # reading the data file
            self.logger.info('Start of reading dataset...')
            self.data = pd.read_csv(self.data_path +
                                    '_validation/InputFile.csv')
            self.logger.info('End of reading dataset...')
            return self.data
        except Exception as e:
            self.logger.exception(
                'Exception raised while reading dataset: %s' % e)
            raise Exception()

    def save_encoded_data(self):
        """
            * method: save_encoded_data
            * description: method to save the encoded datafile
            * return: A pandas DataFrame
            *
            *
            * Parameters
            *   none:
            """
        try:
            # saving the encoded data file
            self.logger.info('Start of saving dataset...')
            self.data.to_csv(self.data_path + '_encode/encoded.csv')
            self.logger.info('End of saving dataset...')
            return self.data
        except Exception as e:
            self.logger.exception(
                'Exception raised while saving dataset: %s' % e)
            raise Exception()

    def drop_columns(self, data, columns):
        """
        * method: drop_columns
        * description: method to drop columns
        * return: A pandas DataFrame after removing the specified columns.
        *
        *
        * Parameters
        *   data:
        *   columns:
        """
        self.data = data
        self.columns = columns
        try:
            self.logger.info('Start of Dropping Columns...')
            self.useful_data = self.data.drop(
                labels=self.columns,
                axis=1)  # drop the labels specified in the columns
            self.logger.info('End of Dropping Columns...')
            return self.useful_data
        except Exception as e:
            self.logger.exception('Exception raised while Dropping Columns:' +
                                  str(e))
            raise Exception()

    def replace_invalid_values_with_null(self, data):
        """
          Method Name: is_null_present
          Description: This method replaces invalid values i.e. '?' with null, as discussed in EDA.

                             """
        # self.data = data
        try:
            self.logger.info('Start of replacing invalid values...')
            for column in data.columns:
                count = data[column][data[column] == '?'].count()
                if count != 0:
                    data[column] = data[column].replace('?', np.nan)
            self.logger.info('end of replacing invalid values...')
            return data
        except Exception as e:
            self.logger.exception(
                'Exception raised while replacing invalid values' + str(e))
            raise Exception()

    def is_null_present(self, data):
        """
        * method: is_null_present
        * description: method to check null values
        * return: Returns a Boolean Value. True if null values are present in the DataFrame, False if they are not present.
        *
        * Parameters
        *   data:
        """
        self.null_present = False
        try:
            self.logger.info('Start of finding missing values...')
            self.null_counts = data.isna().sum(
            )  # check for the count of null values per column
            for i in self.null_counts:
                if i > 0:
                    self.null_present = True
                    break

            if (self.null_present
                ):  # write the logs to see which columns have null values
                dataframe_with_null = pd.DataFrame()
                dataframe_with_null['columns'] = data.columns
                dataframe_with_null['missing values count'] = np.asarray(
                    data.isna().sum())
                dataframe_with_null.to_csv(
                    self.data_path + '_validation/' + 'null_values.csv'
                )  # storing the null column information to file
            self.logger.info('End of finding missing values...')
            return self.null_present
        except Exception as e:
            self.logger.exception(
                'Exception raised while finding missing values:' + str(e))
            raise Exception()

    def impute_missing_values(self, data):
        """
        * method: impute_missing_values
        * description: method to impute missing values
        * return: A pandas DataFrame with the missing values imputed
        *
        * Parameters
        *   data:
        """
        self.data = data
        try:
            self.logger.info('Start of imputing missing values...')
            imputer = KNNImputer(n_neighbors=3,
                                 weights='uniform',
                                 missing_values=np.nan)
            self.new_array = imputer.fit_transform(
                self.data)  # impute the missing values
            # convert the nd-array returned in the step above to a Data frame
            self.new_data = pd.DataFrame(data=self.new_array,
                                         columns=self.data.columns)
            self.logger.info('End of imputing missing values...')
            return self.new_data
        except Exception as e:
            self.logger.exception(
                'Exception raised while imputing missing values:' + str(e))
            raise Exception()

    def feature_encoding(self, data):
        """
        * method: feature_encoding
        * description: method to convert categorical features to numerical ones
        * return: A pandas DataFrame with the encoded columns
        *
        *
        * Parameters
        *   data:
        """
        try:
            self.logger.info('Start of feature encoding...')
            self.new_data = data.select_dtypes(include=['object']).copy()
            # Using the dummy encoding to encode the categorical columns to numerical ones
            for col in self.new_data.columns:
                self.new_data = pd.get_dummies(self.new_data,
                                               columns=[col],
                                               prefix=[col],
                                               drop_first=True)

            self.logger.info('End of feature encoding...')
            return self.new_data
        except Exception as e:
            self.logger.exception('Exception raised while feature encoding:' +
                                  str(e))
            raise Exception()

    def encode_categorical_values(self, data):
        """
        Method Name: encode_categorical_values
        Description: This method encodes all the categorical values in the training set.
        Output: A DataFrame which has all the categorical values encoded.
        On Failure: Raise Exception
        """
        try:
            self.logger.info('Start of encode Categorical Values ...')
            # We can map the categorical values like below:
            data['Gender'] = data['Gender'].map({'a': 0, 'b': 1})

            # these binary categorical columns share the same two values, 'f' and 't'

            data['PriorDefault'] = data['PriorDefault'].map({'f': 0, 't': 1})
            data['Employed'] = data['Employed'].map({'f': 0, 't': 1})
            data['DriversLicense'] = data['DriversLicense'].map({
                'f': 0,
                't': 1
            })

            self.logger.info('end of encode Categorical Values...')
            return data
        except Exception as e:
            self.logger.exception(
                'Exception raised while encoding categorical values:' +
                str(e))
            raise Exception()

    def feature_selection(self, data):
        """
        * method: feature_selection
        * description: method to perform feature selection on the dataset
        * return: A pandas DataFrame
        *
        *
        * Parameters
        *   none:
        """
        self.data = data
        try:
            self.logger.info('Start feature selection of dataset...')

            X = self.data.iloc[:, :-18]
            y = self.data['Approved']
            ordered_rank_features = SelectKBest(score_func=chi2, k='all')
            ordered_feature = ordered_rank_features.fit(X, y)

            data_scores = pd.DataFrame(ordered_feature.scores_,
                                       columns=["Score"])
            data_columns = pd.DataFrame(X.columns)

            features_rank = pd.concat([data_columns, data_scores], axis=1)

            features_rank.columns = ['Features', 'Score']
            features_rank.nlargest(10,
                                   'Score').to_csv(self.data_path +
                                                   '_encode/features_rank.csv')

            data1 = self.data[[
                'PriorDefault', 'YearsEmployed', 'CreditScore', 'Income',
                'Approved'
            ]]
            data1.to_csv(self.data_path + '_encode/feature_selection.csv')
            self.logger.info('End feature selection of dataset...')
            return data1
        except Exception as e:
            self.logger.exception(
                'Exception raised while feature selection of dataset: %s' % e)
            raise Exception()

    def feature_select(self, data):
        """
            * method: feature_select
            * description: method to select the chosen features from the dataset
            * return: A pandas DataFrame
            *
            *
            * Parameters
            *   none:
            """
        self.data = data
        try:
            self.logger.info('Start feature selection of dataset...')

            data1 = self.data[[
                'PriorDefault', 'YearsEmployed', 'CreditScore', 'Income'
            ]]
            self.logger.info('End feature selection of dataset...')
            return data1
        except Exception as e:
            self.logger.exception(
                'Exception raised while feature selection of dataset: %s' % e)
            raise Exception()

    def split_features_label(self, data, label_name):
        """
        * method: split_features_label
        * description: method to separate features and label
        * return: The feature matrix X and the label series y
        *
        * Parameters
        *   data:
        *   label_name:
        """
        self.data = data
        try:
            self.logger.info('Start of splitting features and label ...')
            self.X = self.data.drop(
                labels=label_name, axis=1
            )  # drop the columns specified and separate the feature columns
            self.y = self.data[label_name]  # Filter the Label columns
            self.logger.info('End of splitting features and label ...')
            return self.X, self.y
        except Exception as e:
            self.logger.exception(
                'Exception raised while splitting features and label:' +
                str(e))
            raise Exception()

    def final_predictset(self, data):
        """
        * method: final_predictset
        * description: method to build final predict set by adding additional encoded column with value as 0
        * return: A pandas DataFrame aligned to the training columns
        *
        * Parameters
        *   none:
        """
        try:
            self.logger.info('Start of building final predictset...')
            with open('apps/database/columns.json', 'r') as f:
                data_columns = json.load(f)['data_columns']
            df = pd.DataFrame(data=None, columns=data_columns)
            df_new = pd.concat([df, data], ignore_index=True, sort=False)
            data_new = df_new.fillna(0)
            self.logger.info('End of building final predictset...')
            return data_new
        except ValueError:
            self.logger.exception(
                'ValueError raised while building final predictset')
            raise ValueError
        except KeyError:
            self.logger.exception(
                'KeyError raised while building final predictset')
            raise KeyError
        except Exception as e:
            self.logger.exception(
                'Exception raised while building final predictset: %s' % e)
            raise e

    def preprocess_trainset(self):
        """
        * method: preprocess_trainset
        * description: method to pre-process training data
        * return: The feature matrix X and the label series y
        *
        *
        * Parameters
        *   none:
        """
        try:
            self.logger.info('Start of Preprocessing...')
            # get data into pandas data frame
            data = self.get_data()
            # drop unwanted columns
            data = self.drop_columns(data, ['ZipCode'])
            # Replacing '?' with nan
            data = self.replace_invalid_values_with_null(data)
            # handle Categorical Values
            data = self.encode_categorical_values(data)
            cat_df = self.feature_encoding(data)
            data = pd.concat([data, cat_df], axis=1)
            # drop categorical column
            data = self.drop_columns(data, [
                'Married', 'BankCustomer', 'Citizen', 'EducationLevel',
                'Ethnicity'
            ])
            # check if missing values are present in the data set
            is_null_present = self.is_null_present(data)
            # if missing values are there, replace them appropriately.
            if (is_null_present):
                data = self.impute_missing_values(
                    data)  # missing value imputation
            # feature engineering
            data1 = self.feature_selection(data)
            # create separate features and labels
            self.X, self.y = self.split_features_label(data1,
                                                       label_name='Approved')
            self.logger.info('End of Preprocessing...')
            return self.X, self.y
        except Exception:
            self.logger.exception('Unsuccessful End of Preprocessing...')
            raise

    def preprocess_predictset(self):
        """
        * method: preprocess_predictset
        * description: method to pre-process prediction data
        * return: A pandas DataFrame ready for prediction
        *
        *
        * Parameters
        *   none:
        """
        try:
            self.logger.info('Start of Preprocessing...')
            # get data into pandas data frame
            data = self.get_data()
            # drop unwanted columns
            data = self.drop_columns(data, ['ZipCode'])
            # Replacing '?' with nan
            data = self.replace_invalid_values_with_null(data)
            # handle Categorical Values
            data = self.encode_categorical_values(data)
            cat_df = self.feature_encoding(data)
            data = pd.concat([data, cat_df], axis=1)
            # drop categorical column
            data = self.drop_columns(data, [
                'Married', 'BankCustomer', 'Citizen', 'EducationLevel',
                'Ethnicity'
            ])
            # check if missing values are present in the data set
            is_null_present = self.is_null_present(data)
            # if missing values are there, replace them appropriately.
            if (is_null_present):
                data = self.impute_missing_values(
                    data)  # missing value imputation
            # feature engineering
            data1 = self.feature_select(data)
            data = self.final_predictset(data1)
            self.logger.info('End of Preprocessing...')
            return data
        except Exception:
            self.logger.exception('Unsuccessful End of Preprocessing...')
            raise

    def preprocess_predict(self, data):
        """
        * method: preprocess_predict
        * description: method to pre-process prediction data
        * return: A pandas DataFrame ready for prediction
        *
        *
        * Parameters
        *   none:
        """
        try:
            self.logger.info('Start of Preprocessing...')
            data = self.encode_categorical_values(data)
            cat_df = self.feature_encoding(data)
            data = pd.concat([data, cat_df], axis=1)
            # drop categorical column
            data = self.drop_columns(data, [
                'Married', 'BankCustomer', 'Citizen', 'EducationLevel',
                'Ethnicity'
            ])
            is_null_present = self.is_null_present(data)
            # if missing values are there, replace them appropriately.
            if (is_null_present):
                data = self.impute_missing_values(
                    data)  # missing value imputation

            data = self.final_predictset(data)
            self.logger.info('End of Preprocessing...')
            return data
        except Exception:
            self.logger.exception('Unsuccessful End of Preprocessing...')
            raise
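The Preprocessor above leans on three library mechanics that are easy to check in isolation; a standalone sketch with hypothetical column names and values.

import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer

# 1) KNN imputation: each NaN is filled from the k nearest complete rows
raw = pd.DataFrame({'YearsEmployed': [1.0, 2.0, np.nan, 4.0],
                    'CreditScore': [300.0, 320.0, 310.0, np.nan]})
imputer = KNNImputer(n_neighbors=2, weights='uniform', missing_values=np.nan)
filled = pd.DataFrame(imputer.fit_transform(raw), columns=raw.columns)

# 2) dummy encoding: drop_first=True keeps k-1 indicator columns per category
cats = pd.DataFrame({'Citizen': ['g', 'p', 's', 'g']})
encoded = pd.get_dummies(cats, columns=['Citizen'], prefix=['Citizen'], drop_first=True)

# 3) final_predictset alignment: pad prediction data out to the training columns
data_columns = ['PriorDefault', 'YearsEmployed', 'CreditScore', 'Income']  # as in columns.json
incoming = pd.DataFrame({'PriorDefault': [1], 'Income': [5000]})
aligned = pd.concat([pd.DataFrame(columns=data_columns), incoming],
                    ignore_index=True, sort=False).fillna(0)
print(aligned.columns.tolist())  # all four training-time columns, missing ones filled with 0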
Example #11
class FileOperation:
    """
    *****************************************************************************
    *
    * file_name:       FileOperation.py
    * version:        1.0
    * author:
    * creation date:
    *
    * change history:
    *
    *
    *
    * description:    Class for file operation
    *
    ****************************************************************************
    """

    def __init__(self, run_id, data_path, mode):
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'FileOperation', mode)

    def save_model(self, model, file_name):
        """
        * method: save_model
        * description: method to save the model file
        * return: File gets saved
        *
        *
        * Parameters
        *   model:
        *   file_name:
        """
        try:
            self.logger.info('Start of Save Models')
            path = os.path.join('apps/models/', file_name)  # create a separate directory for each cluster's model
            if os.path.isdir(path):
                # remove only this model's previous directory so that models
                # already saved for other clusters are kept
                shutil.rmtree(path)
            os.makedirs(path)
            with open(path + '/' + file_name + '.sav',
                      'wb') as f:
                pickle.dump(model, f)  # save the model to file
            self.logger.info('Model File ' + file_name + ' saved')
            self.logger.info('End of Save Models')
            return 'success'
        except Exception as e:
            self.logger.exception('Exception raised while Save Models: %s' % e)
            raise Exception()

    def load_model(self, file_name):
        """
        * method: load_model
        * description: method to load the model file
        * return: the de-serialized model object
        *
        *
        * Parameters
        *   file_name:
        """
        try:
            self.logger.info('Start of Load Model')
            with open('apps/models/' + file_name + '/' + file_name + '.sav', 'rb') as f:
                self.logger.info('Model File ' + file_name + ' loaded')
                self.logger.info('End of Load Model')
                return pickle.load(f)
        except Exception as e:
            self.logger.exception('Exception raised while Loading Model: %s' % e)
            raise Exception()
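
A minimal usage sketch for FileOperation, assuming the project's Logger is importable and the process can write under apps/models/; the run_id, data_path, and model are all illustrative.

# Hypothetical save/load round trip with a toy estimator.
import numpy as np
from sklearn.linear_model import LogisticRegression

file_op = FileOperation(run_id='demo_run', data_path='apps/data/demo', mode='training')
model = LogisticRegression().fit(np.array([[0.0], [1.0]]), np.array([0, 1]))
file_op.save_model(model, 'LogisticRegression0')  # -> apps/models/LogisticRegression0/LogisticRegression0.sav
restored = file_op.load_model('LogisticRegression0')
print(restored.predict(np.array([[0.2]])))

Note that save_model removes the whole apps/models tree whenever the target folder already exists, so models saved earlier for other clusters are lost and must be re-saved.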
Example #12
class TrainModel:
    """
    *****************************************************************************
    *
    * filename:       TrainModel.py
    * version:        1.0
    * author:
    * creation date:
    *
    *
    *
    *
    * description:    Class to train the model
    *
    ****************************************************************************
    """
    def __init__(self, run_id, data_path):
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'TrainModel', 'training')
        self.loadValidate = LoadValidate(self.run_id, self.data_path,
                                         'training')
        self.preProcess = Preprocessor(self.run_id, self.data_path, 'training')
        self.modelTuner = ModelTuner(self.run_id, self.data_path, 'training')
        self.fileOperation = FileOperation(self.run_id, self.data_path,
                                           'training')
        # self.cluster = KMeansCluster(self.run_id, self.data_path)

    def training_model(self):
        """
        * method: training_model
        * description: method to train the model
        * return: none
        *
        *
        * Parameters
        *   none:
        """
        try:
            self.logger.info('Start of Training')
            self.logger.info('Run_id:' + str(self.run_id))
            # Load, validations and transformation
            self.loadValidate.validate_trainset()
            # preprocessing activities
            self.X, self.y = self.preProcess.preprocess_trainset()
            columns = {"data_columns": [col for col in self.X.columns]}
            with open('apps/database/columns.json', 'w') as f:
                f.write(json.dumps(columns))

            # splitting the data into training and test sets
            x_train, x_test, y_train, y_test = train_test_split(
                self.X, self.y, test_size=0.2, random_state=0)
            # getting the best model on the full dataset (clustering is disabled in this variant)
            best_model_name, best_model = self.modelTuner.get_best_model(
                x_train, y_train, x_test, y_test)

            # saving the best model to the directory.
            save_model = self.fileOperation.save_model(
                best_model, best_model_name)

            self.logger.info('End of Training')
        except Exception:
            self.logger.exception('Unsuccessful End of Training')
            raise Exception
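
A hedged driver sketch for this TrainModel variant; run_id and data_path are placeholders, and the class assumes the LoadValidate, Preprocessor, ModelTuner, and FileOperation helpers shown elsewhere in this collection.

# Hypothetical training run; paths and run_id are illustrative.
trainer = TrainModel(run_id='run_0001', data_path='apps/data/training')
trainer.training_model()  # validates, preprocesses, tunes, and saves the best model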
Example #13
class KMeansCluster:
    """
    *****************************************************************************
    *
    * filename:       KMeansCluster.py
    * version:        1.0
    * author:         CODESTUDIO
    * creation date:  05-MAY-2020
    *
    * change history:
    *
    * who             when           version  change (include bug# if apply)
    * ----------      -----------    -------  ------------------------------
    * bcheekati       05-MAY-2020    1.0      initial creation
    *
    *
    * description:    Class to cluster the dataset
    *
    ****************************************************************************
    """
    def __init__(self, run_id, data_path):
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'KMeansCluster', 'training')
        self.fileOperation = FileOperation(self.run_id, self.data_path,
                                           'training')

    def elbow_plot(self, data):
        """
        * method: elbow_plot
        * description: method to save the plot used to decide the optimum number of clusters to a file.
        * return: A picture saved to the directory
        *
        * who             when           version  change (include bug# if apply)
        * ----------      -----------    -------  ------------------------------
        * bcheekati       05-MAY-2020    1.0      initial creation
        *
        * Parameters
        *   data:
        """
        wcss = []  # within-cluster sum of squares (WCSS) for each candidate k
        try:
            self.logger.info('Start of elbow plotting...')
            for i in range(1, 11):
                kmeans = KMeans(
                    n_clusters=i, init='k-means++',
                    random_state=0)  # initializing the KMeans object
                kmeans.fit(data)  # fitting the data to the KMeans Algorithm
                wcss.append(kmeans.inertia_)
            plt.plot(
                range(1, 11), wcss
            )  # creating the graph between WCSS and the number of clusters
            plt.title('The Elbow Method')
            plt.xlabel('Number of clusters')
            plt.ylabel('WCSS')
            #plt.show()
            plt.savefig('apps/models/kmeans_elbow.png'
                        )  # saving the elbow plot locally
            # finding the value of the optimum cluster programmatically
            self.kn = KneeLocator(range(1, 11),
                                  wcss,
                                  curve='convex',
                                  direction='decreasing')
            self.logger.info('The optimum number of clusters is: ' +
                             str(self.kn.knee))
            self.logger.info('End of elbow plotting...')
            return self.kn.knee

        except Exception as e:
            self.logger.exception('Exception raised while elbow plotting:' +
                                  str(e))
            raise Exception()

    def create_clusters(self, data, number_of_clusters):
        """
        * method: create_clusters
        * description: method to create clusters
        * return: A pandas DataFrame with an added cluster column
        *
        * who             when           version  change (include bug# if apply)
        * ----------      -----------    -------  ------------------------------
        * bcheekati       05-MAY-2020    1.0      initial creation
        *
        * Parameters
        *   data:
        *   number_of_clusters:
        """
        self.data = data
        try:
            self.logger.info('Start of Create clusters...')
            self.kmeans = KMeans(n_clusters=number_of_clusters,
                                 init='k-means++',
                                 random_state=0)
            self.y_kmeans = self.kmeans.fit_predict(
                data)  #  divide data into clusters
            self.saveModel = self.fileOperation.save_model(
                self.kmeans, 'KMeans')
            # saving the KMeans model to the directory
            self.data[
                'Cluster'] = self.y_kmeans  # create a new column in the dataset to store cluster assignments
            self.logger.info('successfully created ' + str(number_of_clusters) +
                             ' clusters.')
            self.logger.info('End of Create clusters...')
            return self.data
        except Exception as e:
            self.logger.exception('Exception raised while Creating clusters:' +
                                  str(e))
            raise Exception()
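
For readers new to kneed, a self-contained sketch of the elbow detection used in elbow_plot; the WCSS values below are synthetic and purely illustrative.

# Standalone elbow detection with kneed's KneeLocator.
from kneed import KneeLocator

wcss = [1000, 420, 180, 90, 60, 48, 40, 35, 31, 28]  # synthetic, decreasing curve
kn = KneeLocator(range(1, 11), wcss, curve='convex', direction='decreasing')
print(kn.knee)  # the detected elbow, i.e. the suggested number of clusters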
Example #14
class Preprocessor:
    """
    *****************************************************************************
    *
    * filename:       Preprocessor.py
    * version:        1.0
    * author:         CODESTUDIO
    * creation date:  05-MAY-2020
    *
    * change history:
    *
    * who             when           version  change (include bug# if apply)
    * ----------      -----------    -------  ------------------------------
    * bcheekati       05-MAY-2020    1.0      initial creation
    *
    *
    * description:    Class to pre-process the training and prediction datasets
    *
    ****************************************************************************
    """

    def __init__(self,run_id,data_path,mode):
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'Preprocessor', mode)

    def get_data(self):
        """
        * method: get_data
        * description: method to read datafile
        * return: A pandas DataFrame
        *
        * who             when           version  change (include bug# if apply)
        * ----------      -----------    -------  ------------------------------
        * bcheekati       05-MAY-2020    1.0      initial creation
        *
        * Parameters
        *   none:
        """
        try:
            # reading the data file
            self.logger.info('Start of reading dataset...')
            self.data= pd.read_csv(self.data_path+'_validation/InputFile.csv')
            self.logger.info('End of reading dataset...')
            return self.data
        except Exception as e:
            self.logger.exception('Exception raised while reading dataset: %s' % e)
            raise Exception()

    def drop_columns(self,data,columns):
        """
        * method: drop_columns
        * description: method to drop columns
        * return: A pandas DataFrame after removing the specified columns.
        *
        * who             when           version  change (include bug# if apply)
        * ----------      -----------    -------  ------------------------------
        * bcheekati       05-MAY-2020    1.0      initial creation
        *
        * Parameters
        *   data:
        *   columns:
        """
        self.data=data
        self.columns=columns
        try:
            self.logger.info('Start of Dropping Columns...')
            self.useful_data=self.data.drop(labels=self.columns, axis=1) # drop the labels specified in the columns
            self.logger.info('End of Dropping Columns...')
            return self.useful_data
        except Exception as e:
            self.logger.exception('Exception raised while Dropping Columns:'+str(e))
            raise Exception()

    def is_null_present(self,data):
        """
        * method: is_null_present
        * description: method to check null values
        * return: Returns a Boolean Value. True if null values are present in the DataFrame, False if they are not present.
        *
        * who             when           version  change (include bug# if apply)
        * ----------      -----------    -------  ------------------------------
        * bcheekati       05-MAY-2020    1.0      initial creation
        *
        * Parameters
        *   data:
        """
        self.null_present = False
        try:
            self.logger.info('Start of finding missing values...')
            self.null_counts=data.isna().sum() # check for the count of null values per column
            for i in self.null_counts:
                if i>0:
                    self.null_present=True
                    break
            if(self.null_present): # write the logs to see which columns have null values
                dataframe_with_null = pd.DataFrame()
                dataframe_with_null['columns'] = data.columns
                dataframe_with_null['missing values count'] = np.asarray(data.isna().sum())
                dataframe_with_null.to_csv(self.data_path+'_validation/'+'null_values.csv') # storing the null column information to file
            self.logger.info('End of finding missing values...')
            return self.null_present
        except Exception as e:
            self.logger.exception('Exception raised while finding missing values:'+str(e))
            raise Exception()

    def impute_missing_values(self, data):
        """
        * method: impute_missing_values
        * description: method to impute missing values
        * return: A pandas DataFrame with missing values imputed
        *
        * who             when           version  change (include bug# if apply)
        * ----------      -----------    -------  ------------------------------
        * bcheekati       05-MAY-2020    1.0      initial creation
        *
        * Parameters
        *   data:
        """
        self.data= data
        try:
            self.logger.info('Start of imputing missing values...')
            imputer=KNNImputer(n_neighbors=3, weights='uniform',missing_values=np.nan)
            self.new_array=imputer.fit_transform(self.data) # impute the missing values
            # convert the nd-array returned in the step above to a Data frame
            self.new_data=pd.DataFrame(data=self.new_array, columns=self.data.columns)
            self.logger.info('End of imputing missing values...')
            return self.new_data
        except Exception as e:
            self.logger.exception('Exception raised while imputing missing values:'+str(e))
            raise Exception()

    def feature_encoding(self, data):
        """
        * method: feature_encoding
        * description: method to encode categorical features
        * return: A pandas DataFrame of dummy-encoded categorical columns
        *
        * who             when           version  change (include bug# if apply)
        * ----------      -----------    -------  ------------------------------
        * bcheekati       05-MAY-2020    1.0      initial creation
        *
        * Parameters
        *   data:
        """
        try:
            self.logger.info('Start of feature encoding...')
            self.new_data = data.select_dtypes(include=['object']).copy()
            # Using the dummy encoding to encode the categorical columns to numerical ones
            for col in self.new_data.columns:
                self.new_data = pd.get_dummies(self.new_data, columns=[col], prefix=[col], drop_first=True)

            self.logger.info('End of feature encoding...')
            return self.new_data
        except Exception as e:
            self.logger.exception('Exception raised while feature encoding:' + str(e))
            raise Exception()


    def split_features_label(self, data, label_name):
        """
        * method: split_features_label
        * description: method to separate features and label
        * return: feature DataFrame X and label Series y
        *
        * who             when           version  change (include bug# if apply)
        * ----------      -----------    -------  ------------------------------
        * bcheekati       05-MAY-2020    1.0      initial creation
        *
        * Parameters
        *   data:
        *   label_name:
        """
        self.data =data
        try:
            self.logger.info('Start of splitting features and label ...')
            self.X=self.data.drop(labels=label_name,axis=1) # drop the columns specified and separate the feature columns
            self.y=self.data[label_name] # Filter the Label columns
            self.logger.info('End of splitting features and label ...')
            return self.X,self.y
        except Exception as e:
            self.logger.exception('Exception raised while splitting features and label:' + str(e))
            raise Exception()

    def final_predictset(self,data):
        """
        * method: final_predictset
        * description: method to build the final predict set by adding any missing encoded columns with value 0
        * return: A pandas DataFrame aligned to the training columns
        *
        * who             when           version  change (include bug# if apply)
        * ----------      -----------    -------  ------------------------------
        * bcheekati       05-MAY-2020    1.0      initial creation
        *
        * Parameters
        *   data:
        """
        try:
            self.logger.info('Start of building final predictset...')
            with open('apps/database/columns.json', 'r') as f:
                data_columns = json.load(f)['data_columns']
            df = pd.DataFrame(data=None, columns=data_columns)
            df_new = pd.concat([df, data], ignore_index=True,sort=False)
            data_new = df_new.fillna(0)
            self.logger.info('End of building final predictset...')
            return data_new
        except ValueError:
            self.logger.exception('ValueError raised while building final predictset')
            raise ValueError
        except KeyError:
            self.logger.exception('KeyError raised while building final predictset')
            raise KeyError
        except Exception as e:
            self.logger.exception('Exception raised while building final predictset: %s' % e)
            raise e


    def preprocess_trainset(self):
        """
        * method: preprocess_trainset
        * description: method to pre-process training data
        * return: features X and labels y
        *
        * who             when           version  change (include bug# if apply)
        * ----------      -----------    -------  ------------------------------
        * bcheekati       05-MAY-2020    1.0      initial creation
        *
        * Parameters
        *   none:
        """
        try:
            self.logger.info('Start of Preprocessing...')
            # get data into pandas data frame
            data=self.get_data()
            # drop unwanted columns
            data=self.drop_columns(data,['empid'])
            # handle label encoding
            cat_df = self.feature_encoding(data)
            data = pd.concat([data, cat_df], axis=1)
            # drop categorical column
            data = self.drop_columns(data, ['salary'])
            # check if missing values are present in the data set
            is_null_present = self.is_null_present(data)
            # if missing values are there, replace them appropriately.
            if (is_null_present):
                data = self.impute_missing_values(data)  # missing value imputation
            # create separate features and labels
            self.X, self.y = self.split_features_label(data, label_name='left')
            self.logger.info('End of Preprocessing...')
            return self.X, self.y
        except Exception:
            self.logger.exception('Unsuccessful End of Preprocessing...')
            raise Exception

    def preprocess_predictset(self):
        """
        * method: preprocess_predictset
        * description: method to pre-process prediction data
        * return: A pre-processed pandas DataFrame
        *
        * who             when           version  change (include bug# if apply)
        * ----------      -----------    -------  ------------------------------
        * bcheekati       05-MAY-2020    1.0      initial creation
        *
        * Parameters
        *   none:
        """
        try:
            self.logger.info('Start of Preprocessing...')
            # get data into pandas data frame
            data=self.get_data()
            # drop unwanted columns
            #data=self.drop_columns(data,['empid'])
            # handle label encoding
            cat_df = self.feature_encoding(data)
            data = pd.concat([data, cat_df], axis=1)
            # drop categorical column
            data = self.drop_columns(data, ['salary'])
            # check if missing values are present in the data set
            is_null_present = self.is_null_present(data)
            # if missing values are there, replace them appropriately.
            if (is_null_present):
                data = self.impute_missing_values(data)  # missing value imputation

            data = self.final_predictset(data)
            self.logger.info('End of Preprocessing...')
            return data
        except Exception:
            self.logger.exception('Unsuccessful End of Preprocessing...')
            raise Exception


    def preprocess_predict(self,data):
        """
        * method: preprocess_predict
        * description: method to pre-process prediction data
        * return: A pre-processed pandas DataFrame
        *
        * who             when           version  change (include bug# if apply)
        * ----------      -----------    -------  ------------------------------
        * bcheekati       05-MAY-2020    1.0      initial creation
        *
        * Parameters
        *   data:
        """
        try:
            self.logger.info('Start of Preprocessing...')
            cat_df = self.feature_encoding(data)
            data = pd.concat([data, cat_df], axis=1)
            # drop categorical column
            data = self.drop_columns(data, ['salary'])
            # check if missing values are present in the data set
            is_null_present = self.is_null_present(data)
            # if missing values are there, replace them appropriately.
            if (is_null_present):
                data = self.impute_missing_values(data)  # missing value imputation

            data = self.final_predictset(data)
            self.logger.info('End of Preprocessing...')
            return data
        except Exception:
            self.logger.exception('Unsuccessful End of Preprocessing...')
            raise Exception
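
The column alignment inside final_predictset is worth seeing in isolation: concatenating the incoming record against an empty frame built from the saved training columns adds any dummy column the record lacks, and fillna(0) then zeroes it. A toy sketch with hypothetical column names:

import pandas as pd

data_columns = ['satisfaction_level', 'salary_low', 'salary_medium']  # saved at training time (hypothetical)
record = pd.DataFrame([{'satisfaction_level': 0.4, 'salary_low': 1}])  # 'salary_medium' never appeared

template = pd.DataFrame(data=None, columns=data_columns)
aligned = pd.concat([template, record], ignore_index=True, sort=False).fillna(0)
print(aligned)  # salary_medium is present and filled with 0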
Example #15
class PredictModel:

    def __init__(self,run_id,data_path):
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'PredictModel', 'prediction')
        self.loadValidate = LoadValidate(self.run_id, self.data_path,'prediction')
        self.preProcess = Preprocessor(self.run_id, self.data_path,'prediction')
        self.fileOperation = FileOperation(self.run_id, self.data_path, 'prediction')

    def batch_predict_from_model(self):

        try:
            self.logger.info('Start of Prediction')
            self.logger.info('run_id:' + str(self.run_id))
            #validations and transformation
            self.loadValidate.validate_predictset()
            #preprocessing activities
            self.X = self.preProcess.preprocess_predictset()
            #load model
            kmeans = self.fileOperation.load_model('KMeans')
            #cluster selection
            clusters = kmeans.predict(self.X.drop(['empid'],axis=1))
            self.X['clusters'] = clusters
            clusters = self.X['clusters'].unique()
            y_predicted=[]
            for i in clusters:
                self.logger.info('clusters loop started')
                cluster_data = self.X[self.X['clusters'] == i]
                cluster_data_new = cluster_data.drop(['empid','clusters'], axis=1)
                model_name = self.fileOperation.correct_model(i)
                model = self.fileOperation.load_model(model_name)
                y_predicted = model.predict(cluster_data_new)
                result = pd.DataFrame({"EmpId": cluster_data['empid'],"Prediction": y_predicted})
                result.to_csv(self.data_path+'_results/'+'Predictions.csv', header=True, mode='a+',index=False)
            self.logger.info('End of Prediction')
        except Exception:
            self.logger.exception('Unsuccessful End of Prediction')
            raise Exception


    def single_predict_from_model(self,data):

        try:
            self.logger.info('Start of Prediction')
            self.logger.info('run_id:' + str(self.run_id))
            #preprocessing activities
            self.X = self.preProcess.preprocess_predict(data)
            #load model
            kmeans = self.fileOperation.load_model('KMeans')
            #cluster selection
            clusters = kmeans.predict(self.X.drop(['empid'],axis=1))
            self.X['clusters'] = clusters
            clusters = self.X['clusters'].unique()
            y_predicted=[]
            for i in clusters:
                self.logger.info('clusters loop started')
                cluster_data = self.X[self.X['clusters'] == i]
                cluster_data_new = cluster_data.drop(['empid','clusters'], axis=1)
                model_name = self.fileOperation.correct_model(i)
                model = self.fileOperation.load_model(model_name)
                self.logger.info('Shape of Data '+str(cluster_data_new.shape))
                # DataFrame.info() prints to stdout and returns None, so log the dtypes instead
                self.logger.info('Dtypes of Data ' + str(cluster_data_new.dtypes.to_dict()))
                y_predicted = model.predict(cluster_data_new)
                self.logger.info('Output : '+str(y_predicted))
                self.logger.info('End of Prediction')
                return int(y_predicted[0])
        except Exception:
            self.logger.exception('Unsuccessful End of Prediction')
            raise Exception
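
A hedged usage sketch covering both prediction paths; run_id, data_path, and the input record are placeholders, and the class assumes a KMeans model plus per-cluster models already saved under apps/models.

# Hypothetical prediction driver; values are illustrative.
import pandas as pd

predictor = PredictModel(run_id='run_0002', data_path='apps/data/prediction')
predictor.batch_predict_from_model()  # writes <data_path>_results/Predictions.csv

single = pd.DataFrame([{'empid': 101, 'satisfaction_level': 0.4}])  # hypothetical record
print(predictor.single_predict_from_model(single))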
class DatabaseOperation:
    def __init__(self, run_id, data_path, mode):
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'DatabaseOperation', mode)

    def database_connection(self, database_name):

        try:
            conn = sqlite3.connect('apps/database/' + database_name + '.db')
            self.logger.info("Opened %s database successfully" % database_name)
        except sqlite3.Error as e:
            self.logger.exception("Error while connecting to database: %s" % e)
            raise
        return conn

    def create_table(self, database_name, table_name, column_names):

        try:
            self.logger.info('Start of Creating Table...')
            conn = self.database_connection(database_name)

            if (database_name == 'prediction'):
                conn.execute("DROP TABLE IF EXISTS '" + table_name + "';")

            c = conn.cursor()
            c.execute(
                "SELECT count(name) FROM sqlite_master WHERE type = 'table' AND name = '"
                + table_name + "'")
            if c.fetchone()[0] == 1:
                conn.close()
                self.logger.info('Table already exists')
                self.logger.info("Closed %s database successfully" %
                                 database_name)
            else:
                for key in column_names.keys():
                    data_type = column_names[key]  # avoid shadowing the built-in 'type'

                    # the try block assumes the table exists and adds the column;
                    # if the ALTER fails, the except block creates the table instead
                    try:
                        conn.execute("ALTER TABLE " + table_name +
                                     " ADD COLUMN {column_name} {dataType}".
                                     format(column_name=key, dataType=data_type))
                        self.logger.info("ALTER TABLE " + table_name +
                                         " ADD COLUMN " + key)
                    except Exception:
                        conn.execute("CREATE TABLE " + table_name +
                                     " ({column_name} {dataType})".format(
                                         column_name=key, dataType=data_type))
                        self.logger.info("CREATE TABLE " + table_name +
                                         " with column " + key)
                conn.close()
            self.logger.info('End of Creating Table...')
        except Exception as e:
            self.logger.exception('Exception raised while Creating Table: %s' %
                                  e)
            raise e

    def insert_data(self, database_name, table_name):

        conn = self.database_connection(database_name)
        good_data_path = self.data_path
        bad_data_path = self.data_path + '_rejects'
        only_files = listdir(good_data_path)
        self.logger.info('Start of Inserting Data into Table...')
        for file in only_files:
            try:
                with open(good_data_path + '/' + file, "r") as f:
                    next(f)
                    reader = csv.reader(f, delimiter=",")
                    for line in enumerate(reader):
                        #self.logger.info(" %s: nu!!" % line[1])
                        to_db = ''
                        for list_ in (line[1]):
                            try:
                                to_db = to_db + ",'" + list_ + "'"
                            except Exception as e:
                                raise e
                        #self.logger.info(" %s: list_!!" % to_db.lstrip(','))
                        to_db = to_db.lstrip(',')
                        conn.execute("INSERT INTO " + table_name +
                                     " values ({values})".format(
                                         values=(to_db)))
                        conn.commit()

            except Exception as e:
                conn.rollback()
                self.logger.exception(
                    'Exception raised while Inserting Data into Table: %s ' %
                    e)
                shutil.move(good_data_path + '/' + file, bad_data_path)
        conn.close()
        self.logger.info('End of Inserting Data into Table...')

    def export_csv(self, database_name, table_name):

        self.file_from_db = self.data_path + str('_validation/')
        self.file_name = 'InputFile.csv'
        try:
            self.logger.info('Start of Exporting Data into CSV...')
            conn = self.database_connection(database_name)
            sqlSelect = "SELECT *  FROM " + table_name + ""
            cursor = conn.cursor()
            cursor.execute(sqlSelect)
            results = cursor.fetchall()
            # Get the headers of the csv file
            headers = [i[0] for i in cursor.description]
            # Make the CSV output directory
            if not os.path.isdir(self.file_from_db):
                os.makedirs(self.file_from_db)
            # Open the CSV file for writing and close it when done.
            with open(self.file_from_db + self.file_name, 'w', newline='') as csv_fh:
                csv_file = csv.writer(csv_fh,
                                      delimiter=',',
                                      lineterminator='\r\n',
                                      quoting=csv.QUOTE_ALL,
                                      escapechar='\\')
                # Add the headers and data to the CSV file.
                csv_file.writerow(headers)
                csv_file.writerows(results)
            self.logger.info('End of Exporting Data into CSV...')
        except Exception as e:
            self.logger.exception(
                'Exception raised while Exporting Data into CSV: %s ' % e)
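
The string-built INSERT above is fragile (a value containing a quote breaks it) and open to SQL injection; a hedged alternative sketch using sqlite3 placeholders, with a hypothetical table:

import sqlite3

conn = sqlite3.connect(':memory:')  # throwaway database for the sketch
conn.execute("CREATE TABLE training_raw_data_t (empid TEXT, salary TEXT)")
rows = [('101', 'low'), ('102', 'medium')]
conn.executemany("INSERT INTO training_raw_data_t VALUES (?, ?)", rows)  # the driver handles quoting
conn.commit()
print(conn.execute("SELECT count(*) FROM training_raw_data_t").fetchone()[0])
conn.close()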
Example #17
class PredictModel:
    """
	**************************************************************************
	*
	* filename:       PredictModel.py
	* version:        1.0
	* creation date:  05-MAY-2020
	*
	* change history:
	*
	*
	* description:    Class to predict the result
	*
	**************************************************************************
	"""
    def __init__(self, run_id, data_path):
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'PredictModel', 'prediction')
        self.loadValidate = LoadValidate(self.run_id, self.data_path,
                                         'prediction')
        self.preProcess = Preprocessor(self.run_id, self.data_path,
                                       'prediction')
        self.fileOperation = FileOperation(self.run_id, self.data_path,
                                           'prediction')

    def batch_predict_from_model(self):
        """
		* method: batch_predict_from_model
		* description: method to predict the results
		* return: none
		*
		* Parameters
		*   none:
		"""
        try:
            self.logger.info('Start of Prediction')
            self.logger.info('run_id:' + str(self.run_id))
            #validations and transformation
            self.loadValidate.validate_predictset()
            #preprocessing activities
            self.X = self.preProcess.preprocess_predictset()
            #load model
            kmeans = self.fileOperation.load_model('KMeans')
            #cluster selection
            clusters = kmeans.predict(self.X.drop(['empid'], axis=1))
            self.X['clusters'] = clusters
            clusters = self.X['clusters'].unique()
            y_predicted = []
            for i in clusters:
                self.logger.info('clusters loop started')
                cluster_data = self.X[self.X['clusters'] == i]
                cluster_data_new = cluster_data.drop(['empid', 'clusters'],
                                                     axis=1)
                model_name = self.fileOperation.correct_model(i)
                model = self.fileOperation.load_model(model_name)
                y_predicted = model.predict(cluster_data_new)
                result = pd.DataFrame({
                    "EmpId": cluster_data['empid'],
                    "Prediction": y_predicted
                })
                result.to_csv(self.data_path + '_results/' + 'Predictions.csv',
                              header=True,
                              mode='a+',
                              index=False)
            self.logger.info('End of Prediction')
        except Exception:
            self.logger.exception('Unsuccessful End of Prediction')
            raise Exception

    def single_predict_from_model(self, data):
        """
		* method: single_predict_from_model
		* description: method to predict the result for a single record
		* return: int prediction for the record
		*
		* Parameters
		*   none:
		"""
        try:
            self.logger.info('Start of Prediction')
            self.logger.info('run_id:' + str(self.run_id))
            #preprocessing activities
            self.X = self.preProcess.preprocess_predict(data)
            #load model
            kmeans = self.fileOperation.load_model('KMeans')
            #cluster selection
            clusters = kmeans.predict(self.X.drop(['empid'], axis=1))
            self.X['clusters'] = clusters
            clusters = self.X['clusters'].unique()
            y_predicted = []
            for i in clusters:
                self.logger.info('clusters loop started')
                cluster_data = self.X[self.X['clusters'] == i]
                cluster_data_new = cluster_data.drop(['empid', 'clusters'],
                                                     axis=1)
                model_name = self.fileOperation.correct_model(i)
                model = self.fileOperation.load_model(model_name)
                self.logger.info('Shape of Data ' +
                                 str(cluster_data_new.shape))
                # DataFrame.info() prints to stdout and returns None, so log the dtypes instead
                self.logger.info('Dtypes of Data ' +
                                 str(cluster_data_new.dtypes.to_dict()))
                y_predicted = model.predict(cluster_data_new)
                self.logger.info('Output : ' + str(y_predicted))
                self.logger.info('End of Prediction')
                return int(y_predicted[0])
        except Exception:
            self.logger.exception('Unsuccessful End of Prediction')
            raise Exception
class KMeansCluster:
    def __init__(self, run_id, data_path):
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'KMeansCluster', 'training')
        self.fileOperation = FileOperation(self.run_id, self.data_path,
                                           'training')

    def elbow_plot(self, data):

        wcss = []  # within-cluster sum of squares (WCSS) for each candidate k
        try:
            self.logger.info('Start of elbow plotting...')
            for i in range(1, 11):
                kmeans = KMeans(
                    n_clusters=i, init='k-means++',
                    random_state=0)  # initializing the KMeans object
                kmeans.fit(data)  # fitting the data to the KMeans Algorithm
                wcss.append(kmeans.inertia_)
            plt.plot(
                range(1, 11), wcss
            )  # creating the graph between WCSS and the number of clusters
            plt.title('The Elbow Method')
            plt.xlabel('Number of clusters')
            plt.ylabel('WCSS')
            #plt.show()
            plt.savefig('apps/models/kmeans_elbow.png'
                        )  # saving the elbow plot locally
            # finding the value of the optimum cluster programmatically
            self.kn = KneeLocator(range(1, 11),
                                  wcss,
                                  curve='convex',
                                  direction='decreasing')
            self.logger.info('The optimum number of clusters is: ' +
                             str(self.kn.knee))
            self.logger.info('End of elbow plotting...')
            return self.kn.knee

        except Exception as e:
            self.logger.exception('Exception raised while elbow plotting:' +
                                  str(e))
            raise Exception()

    def create_clusters(self, data, number_of_clusters):

        self.data = data
        try:
            self.logger.info('Start of Create clusters...')
            self.kmeans = KMeans(n_clusters=number_of_clusters,
                                 init='k-means++',
                                 random_state=0)
            self.y_kmeans = self.kmeans.fit_predict(
                data)  #  divide data into clusters
            self.saveModel = self.fileOperation.save_model(
                self.kmeans, 'KMeans')
            # saving the KMeans model to the directory
            self.data[
                'Cluster'] = self.y_kmeans  # create a new column in the dataset to store cluster assignments
            self.logger.info('successfully created ' + str(number_of_clusters) +
                             ' clusters.')
            self.logger.info('End of Create clusters...')
            return self.data
        except Exception as e:
            self.logger.exception('Exception raised while Creating clusters:' +
                                  str(e))
            raise Exception()
Example #19
    def __init__(self, run_id, data_path, mode):
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'ModelTuner', mode)
        self.rfc = RandomForestClassifier()
        self.xgb = XGBClassifier(objective='binary:logistic')

    def __init__(self, run_id, data_path):
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'KMeansCluster', 'training')
        self.fileOperation = FileOperation(self.run_id, self.data_path,
                                           'training')

    def __init__(self, run_id, data_path, mode):
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'FileOperation', mode)
class Preprocessor:
    
    def __init__(self,run_id,data_path,mode):
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'Preprocessor', mode)

    def get_data(self):
       
        try:
            # reading the data file
            self.logger.info('Start of reading dataset...')
            self.data= pd.read_csv(self.data_path+'_validation/InputFile.csv')
            self.logger.info('End of reading dataset...')
            return self.data
        except Exception as e:
            self.logger.exception('Exception raised while reading dataset: %s' % e)
            raise Exception()

    def drop_columns(self,data,columns):
        
        self.data=data
        self.columns=columns
        try:
            self.logger.info('Start of Dropping Columns...')
            self.useful_data=self.data.drop(labels=self.columns, axis=1) # drop the labels specified in the columns
            self.logger.info('End of Dropping Columns...')
            return self.useful_data
        except Exception as e:
            self.logger.exception('Exception raised while Dropping Columns:'+str(e))
            raise Exception()

    def is_null_present(self,data):
        
        self.null_present = False
        try:
            self.logger.info('Start of finding missing values...')
            self.null_counts=data.isna().sum() # check for the count of null values per column
            for i in self.null_counts:
                if i>0:
                    self.null_present=True
                    break
            if(self.null_present): # write the logs to see which columns have null values
                dataframe_with_null = pd.DataFrame()
                dataframe_with_null['columns'] = data.columns
                dataframe_with_null['missing values count'] = np.asarray(data.isna().sum())
                dataframe_with_null.to_csv(self.data_path+'_validation/'+'null_values.csv') # storing the null column information to file
            self.logger.info('End of finding missing values...')
            return self.null_present
        except Exception as e:
            self.logger.exception('Exception raised while finding missing values:'+str(e))
            raise Exception()

    def impute_missing_values(self, data):
       
        self.data= data
        try:
            self.logger.info('Start of imputing missing values...')
            imputer=KNNImputer(n_neighbors=3, weights='uniform',missing_values=np.nan)
            self.new_array=imputer.fit_transform(self.data) # impute the missing values
            # convert the nd-array returned in the step above to a Data frame
            self.new_data=pd.DataFrame(data=self.new_array, columns=self.data.columns)
            self.logger.info('End of imputing missing values...')
            return self.new_data
        except Exception as e:
            self.logger.exception('Exception raised while imputing missing values:'+str(e))
            raise Exception()

    def feature_encoding(self, data):
        
        try:
            self.logger.info('Start of feature encoding...')
            self.new_data = data.select_dtypes(include=['object']).copy()
            # Using the dummy encoding to encode the categorical columns to numerical ones
            for col in self.new_data.columns:
                self.new_data = pd.get_dummies(self.new_data, columns=[col], prefix=[col], drop_first=True)

            self.logger.info('End of feature encoding...')
            return self.new_data
        except Exception as e:
            self.logger.exception('Exception raised while feature encoding:' + str(e))
            raise Exception()


    def split_features_label(self, data, label_name):
      
        self.data =data
        try:
            self.logger.info('Start of splitting features and label ...')
            self.X=self.data.drop(labels=label_name,axis=1) # drop the columns specified and separate the feature columns
            self.y=self.data[label_name] # Filter the Label columns
            self.logger.info('End of splitting features and label ...')
            return self.X,self.y
        except Exception as e:
            self.logger.exception('Exception raised while splitting features and label:' + str(e))
            raise Exception()

    def final_predictset(self,data):
        
        try:
            self.logger.info('Start of building final predictset...')
            with open('apps/database/columns.json', 'r') as f:
                data_columns = json.load(f)['data_columns']
            df = pd.DataFrame(data=None, columns=data_columns)
            df_new = pd.concat([df, data], ignore_index=True,sort=False)
            data_new = df_new.fillna(0)
            self.logger.info('End of building final predictset...')
            return data_new
        except ValueError:
            self.logger.exception('ValueError raised while building final predictset')
            raise ValueError
        except KeyError:
            self.logger.exception('KeyError raised while building final predictset')
            raise KeyError
        except Exception as e:
            self.logger.exception('Exception raised while building final predictset: %s' % e)
            raise e


    def preprocess_trainset(self):
       
        try:
            self.logger.info('Start of Preprocessing...')
            # get data into pandas data frame
            data=self.get_data()
            # drop unwanted columns
            data=self.drop_columns(data,['empid'])
            # handle label encoding
            cat_df = self.feature_encoding(data)
            data = pd.concat([data, cat_df], axis=1)
            # drop categorical column
            data = self.drop_columns(data, ['salary'])
            # check if missing values are present in the data set
            is_null_present = self.is_null_present(data)
            # if missing values are there, replace them appropriately.
            if (is_null_present):
                data = self.impute_missing_values(data)  # missing value imputation
            # create separate features and labels
            self.X, self.y = self.split_features_label(data, label_name='left')
            self.logger.info('End of Preprocessing...')
            return self.X, self.y
        except Exception:
            self.logger.exception('Unsuccessful End of Preprocessing...')
            raise Exception

    def preprocess_predictset(self):
       
        try:
            self.logger.info('Start of Preprocessing...')
            # get data into pandas data frame
            data=self.get_data()
            # drop unwanted columns
            #data=self.drop_columns(data,['empid'])
            # handle label encoding
            cat_df = self.feature_encoding(data)
            data = pd.concat([data, cat_df], axis=1)
            # drop categorical column
            data = self.drop_columns(data, ['salary'])
            # check if missing values are present in the data set
            is_null_present = self.is_null_present(data)
            # if missing values are there, replace them appropriately.
            if (is_null_present):
                data = self.impute_missing_values(data)  # missing value imputation

            data = self.final_predictset(data)
            self.logger.info('End of Preprocessing...')
            return data
        except Exception:
            self.logger.exception('Unsuccessful End of Preprocessing...')
            raise Exception


    def preprocess_predict(self,data):
        
        try:
            self.logger.info('Start of Preprocessing...')
            cat_df = self.feature_encoding(data)
            data = pd.concat([data, cat_df], axis=1)
            # drop categorical column
            data = self.drop_columns(data, ['salary'])
            # check if missing values are present in the data set
            is_null_present = self.is_null_present(data)
            # if missing values are there, replace them appropriately.
            if (is_null_present):
                data = self.impute_missing_values(data)  # missing value imputation

            data = self.final_predictset(data)
            self.logger.info('End of Preprocessing...')
            return data
        except Exception:
            self.logger.exception('Unsuccessful End of Preprocessing...')
            raise Exception
    def __init__(self, run_id, data_path, mode):
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'Preprocessor', mode)
Example #24
class FileOperation:
    """
    **********************************************************************************
    *
    * file name : file_operation.py
    * version : 1.0
    * author : Moncy Kurien
    * creation date : 04-Jan-2021
    *
    *
    * change history:
    *
    *   who           when            version     change(include bug # if apply)
    *   ----------    -------         --------    -----------------------------
    *   Moncy Kurien  04-Jan-2021     1.0         Initial Creation
    *
    *   Description: Class for File operations
    *
    **********************************************************************************
    """
    def __init__(self, run_id, data_path, mode):
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'FileOperation', mode)

    def save_model(self, model, file_name):
        """
                **********************************************************************************
                *
                * method : save_model
                * parameters : model: - Type(Object) : model object reference
                *               file_name : Type(String) : Name of the file
                * description : Method to save the ML model file
                * return : none - File gets saved
                *
                * change history:
                *
                *   who           when            version     change(include bug # if apply)
                *   ----------    -------         --------    -----------------------------
                *   Moncy Kurien  04-Jan-2021     1.0         Initial Creation
                *
                *
                *
                **********************************************************************************
        """
        try:
            self.logger.info('Start of Save Models')
            path = os.path.join(
                'apps/models',
                file_name)  #create a separate directory for each cluster

            if os.path.isdir(path):  # remove previously saved models
                shutil.rmtree('apps/models')
            os.makedirs(path)

            with open(path + '/' + file_name + '.sav', 'wb') as f:
                pickle.dump(model, f)  #Save the model to the file

            self.logger.info('Model File ' + file_name + ' saved')
            self.logger.info('End of Save Models')
            return 'success'
        except Exception as e:
            self.logger.exception("Exception raised while Save Models: %s" % e)
            raise Exception()

    def load_model(self, file_name):
        """
                **********************************************************************************
                *
                * method : load_model
                * parameters :  file_name : Type(String) : Name of the file
                * description : Method to load the ML model from a file
                * return : returns de-serialized model object
                *
                * change history:
                *
                *   who           when            version     change(include bug # if apply)
                *   ----------    -------         --------    -----------------------------
                *   Moncy Kurien  04-Jan-2021     1.0         Initial Creation
                *
                *
                *
                **********************************************************************************
        """
        try:
            self.logger.info('Start of Load Model')
            with open('apps/models/' + file_name + '/' + file_name + '.sav',
                      'rb') as f:
                model = pickle.load(f)
                self.logger.info('Model File ' + file_name + ' loaded.')
                self.logger.info('End of Load Models')
                return model
        except Exception as e:
            self.logger.exception("Exception raised while Load Models: %s " %
                                  e)
            raise Exception()

    def correct_model(self, cluster_number):
        """
                **********************************************************************************
                *
                * method : correct_model
                * parameters :  cluster_number
                * description : Method to find the best model
                * return : the model file
                *
                * change history:
                *
                *   who           when            version     change(include bug # if apply)
                *   ----------    -------         --------    -----------------------------
                *   Moncy Kurien  04-Jan-2021     1.0         Initial Creation
                *
                *
                *
                **********************************************************************************
        """
        try:
            self.logger.info("Start of finding Correct Model.")
            self.cluster_number = cluster_number
            self.folder_name = 'apps/models'
            self.list_of_model_files = []
            self.list_of_files = os.listdir(self.folder_name)

            for self.file in self.list_of_files:
                # a plain membership test is clearer than the old str.index trick:
                # index raises ValueError when the substring is absent
                # (unlike str.find, it never returns -1)
                if str(self.cluster_number) in self.file:
                    self.model_name = self.file
            self.model_name = self.model_name.split('.')[0]
            self.logger.info('End of Correct Model.')
            return self.model_name
        except Exception as e:
            self.logger.exception(
                'Exception raised while finding Correct Model: %s ' % e)
            raise Exception()
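
A hedged sketch tying save_model and correct_model together; the cluster number is illustrative, and the directory layout assumes apps/models/<name>/<name>.sav as produced by save_model above.

# Hypothetical lookup of the model saved for cluster 2.
file_op = FileOperation(run_id='demo', data_path='apps/data/demo', mode='prediction')
model_name = file_op.correct_model(2)  # e.g. 'XGBoost2' if that folder exists
model = file_op.load_model(model_name)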