class KMeansCluster:
    """Cluster a dataset with KMeans and persist the fitted model.

    ``self.kn`` (the KneeLocator result) only exists after ``elbow_plot``
    has been called; ``create_clusters`` does not depend on it.
    """

    def __init__(self, run_id, data_path):
        """Keep the run identifiers and build the logger and file helper.

        Parameters:
            run_id: identifier passed through to Logger and FileOperation.
            data_path: path passed through to FileOperation.
        """
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'KMeansCluster', 'training')
        self.fileOperation = FileOperation(self.run_id, self.data_path,
                                           'training')

    def elbow_plot(self, data):
        """Fit KMeans for 1..10 clusters, save the elbow plot, return the knee.

        The within-cluster sum of squares (``inertia_``) of every fit is
        plotted against the cluster count and written to
        'apps/models/kmeans_elbow.png'. The plot is drawn on its own figure,
        which is closed afterwards, so repeated calls neither stack curves on
        pyplot's implicit current figure nor leave figures open.

        Parameters:
            data: samples accepted by ``KMeans.fit``.

        Returns:
            ``self.kn.knee`` as computed by KneeLocator.
            NOTE(review): presumably None when no knee is detected - confirm
            against the kneed documentation before relying on an int.

        Raises:
            Exception: on any failure; the original error is logged and
                chained as the cause.
        """
        wcss = []  # within-cluster sum of squares, one entry per cluster count
        try:
            self.logger.info('Start of elbow plotting...')
            cluster_counts = range(1, 11)
            for i in cluster_counts:
                kmeans = KMeans(n_clusters=i,
                                init='k-means++',
                                random_state=0)
                kmeans.fit(data)
                wcss.append(kmeans.inertia_)

            figure = plt.figure()  # dedicated figure: no state shared between calls
            try:
                plt.plot(cluster_counts, wcss)
                plt.title('The Elbow Method')
                plt.xlabel('Number of clusters')
                plt.ylabel('WCSS')
                plt.savefig('apps/models/kmeans_elbow.png')
            finally:
                plt.close(figure)

            # locate the optimum number of clusters programmatically
            self.kn = KneeLocator(cluster_counts,
                                  wcss,
                                  curve='convex',
                                  direction='decreasing')
            self.logger.info('The optimum number of clusters is: ' +
                             str(self.kn.knee))
            self.logger.info('End of elbow plotting...')
            return self.kn.knee

        except Exception as e:
            self.logger.exception('Exception raised while elbow plotting:' +
                                  str(e))
            raise Exception() from e

    def create_clusters(self, data, number_of_clusters):
        """Fit KMeans on ``data``, save the model and label every row.

        The fitted model is saved through ``FileOperation.save_model`` under
        the name 'KMeans'. ``data`` is modified in place: a 'Cluster' column
        holding the ``fit_predict`` result is added to it.

        Parameters:
            data: object accepted by ``KMeans.fit_predict`` that supports
                column assignment (``data['Cluster'] = ...``).
            number_of_clusters: value passed to KMeans as ``n_clusters``.

        Returns:
            The same ``data`` object, now carrying the 'Cluster' column.

        Raises:
            Exception: on any failure; the original error is logged and
                chained as the cause.
        """
        self.data = data
        try:
            self.logger.info('Start of Create clusters...')
            self.kmeans = KMeans(n_clusters=number_of_clusters,
                                 init='k-means++',
                                 random_state=0)
            self.y_kmeans = self.kmeans.fit_predict(data)
            # persist the fitted KMeans model
            self.saveModel = self.fileOperation.save_model(
                self.kmeans, 'KMeans')
            self.data['Cluster'] = self.y_kmeans
            # Report the count actually used; self.kn is only set by
            # elbow_plot and may not exist when this method is called alone.
            self.logger.info('succesfully created ' +
                             str(number_of_clusters) + 'clusters.')
            self.logger.info('End of Create clusters...')
            return self.data
        except Exception as e:
            self.logger.exception('Exception raised while Creating clusters:' +
                                  str(e))
            raise Exception() from e
 def __init__(self, run_id, data_path):
     # NOTE(review): this definition repeats the body of
     # KMeansCluster.__init__ and its one-space indent does not match any
     # enclosing class; it looks like a stray paste - confirm and remove.
     # Stores the run identifiers, then builds the logger and file helper
     # from them with the fixed 'training' mode argument.
     self.run_id = run_id
     self.data_path = data_path
     self.logger = Logger(self.run_id, 'KMeansCluster', 'training')
     self.fileOperation = FileOperation(self.run_id, self.data_path,
                                        'training')
Ejemplo n.º 3
0
class TrainModel:
    """
    *****************************************************************************
    *
    * filename:       TrainModel.py
    * version:        1.0
    * author:
    * creation date:
    *
    *
    *
    *
    * description:    Class to train a single model on the whole training set
    *
    ****************************************************************************
    """
    def __init__(self, run_id, data_path):
        """
        * method: __init__
        * description: stores the run identifiers and builds the helper
        *              objects (logger, validator, preprocessor, tuner and
        *              file helper), all in 'training' mode
        * return: none
        *
        * Parameters
        *   run_id:     identifier passed through to every helper
        *   data_path:  path passed through to every helper except the logger
        """
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'TrainModel', 'training')
        self.loadValidate = LoadValidate(self.run_id, self.data_path,
                                         'training')
        self.preProcess = Preprocessor(self.run_id, self.data_path, 'training')
        self.modelTuner = ModelTuner(self.run_id, self.data_path, 'training')
        self.fileOperation = FileOperation(self.run_id, self.data_path,
                                           'training')

    def training_model(self):
        """
        * method: training_model
        * description: validates and preprocesses the training set, writes the
        *              feature column names to 'apps/database/columns.json',
        *              splits the data 80/20, asks the tuner for the best
        *              model and saves it under the name the tuner returns
        * return: none
        *
        * raises: Exception on any failure; the original error is logged and
        *         chained as the cause
        *
        * Parameters
        *   none:
        """
        try:
            self.logger.info('Start of Training')
            self.logger.info('Run_id:' + str(self.run_id))
            # load, validate and transform the raw training files
            self.loadValidate.validate_trainset()
            # preprocessing activities
            self.X, self.y = self.preProcess.preprocess_trainset()

            # Record the feature columns. The file is closed before training
            # starts so its content is on disk and no handle is held while
            # the (potentially long) model search runs.
            columns = {"data_columns": list(self.X.columns)}
            with open('apps/database/columns.json', 'w') as f:
                f.write(json.dumps(columns))

            # split the data into training and test sets
            x_train, x_test, y_train, y_test = train_test_split(
                self.X, self.y, test_size=0.2, random_state=0)
            # let the tuner pick the best model for this data
            best_model_name, best_model = self.modelTuner.get_best_model(
                x_train, y_train, x_test, y_test)

            # save the best model to the directory
            self.fileOperation.save_model(best_model, best_model_name)

            self.logger.info('End of Training')
        except Exception as err:
            self.logger.exception('Unsuccessful End of Training')
            raise Exception from err
class TrainModel:
    """Train one model per KMeans cluster of the training set."""

    def __init__(self, run_id, data_path):
        """Store the run identifiers and build every helper in 'training' mode.

        Parameters:
            run_id: identifier handed to each helper.
            data_path: path handed to each helper except the logger.
        """
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'TrainModel', 'training')
        self.loadValidate = LoadValidate(
            self.run_id, self.data_path, 'training')
        self.preProcess = Preprocessor(
            self.run_id, self.data_path, 'training')
        self.modelTuner = ModelTuner(
            self.run_id, self.data_path, 'training')
        self.fileOperation = FileOperation(
            self.run_id, self.data_path, 'training')
        self.cluster = KMeansCluster(self.run_id, self.data_path)

    def training_model(self):
        """Run the full training pipeline, one best model per cluster.

        Steps: validate the training set, preprocess it, write the feature
        column names to 'apps/database/columns.json', choose the cluster
        count with the elbow plot, assign clusters, then for every cluster
        split 80/20, ask the tuner for the best model and save it under the
        tuner's model name suffixed with the cluster id.

        Raises:
            Exception: on any failure, after logging it.
        """
        try:
            self.logger.info('Start of Training')
            self.logger.info('Run_id:' + str(self.run_id))

            # load, validate and transform, then preprocess
            self.loadValidate.validate_trainset()
            self.X, self.y = self.preProcess.preprocess_trainset()

            # persist the feature column names
            column_names = list(self.X.columns)
            with open('apps/database/columns.json', 'w') as handle:
                handle.write(json.dumps({"data_columns": column_names}))

            # pick the cluster count, then label every row with its cluster
            optimal_count = self.cluster.elbow_plot(self.X)
            self.X = self.cluster.create_clusters(self.X, optimal_count)

            # keep the target next to the features so both filter together
            self.X['Labels'] = self.y

            # fit the best algorithm on each cluster independently
            for cluster_id in self.X['Cluster'].unique():
                members = self.X[self.X['Cluster'] == cluster_id]
                features = members.drop(['Labels', 'Cluster'], axis=1)
                target = members['Labels']

                x_train, x_test, y_train, y_test = train_test_split(
                    features, target, test_size=0.2, random_state=0)

                best_model_name, best_model = self.modelTuner.get_best_model(
                    x_train, y_train, x_test, y_test)

                # one saved model per cluster, named <model><cluster id>
                self.fileOperation.save_model(
                    best_model, best_model_name + str(cluster_id))

            self.logger.info('End of Training')
        except Exception:
            self.logger.exception('Unsuccessful End of Training')
            raise Exception
class PredictModel:
    """Predict with the saved KMeans model and the per-cluster models."""

    def __init__(self, run_id, data_path):
        """Store the run identifiers and build the helpers in 'prediction' mode.

        Parameters:
            run_id: identifier handed to each helper.
            data_path: path handed to each helper except the logger; also
                the prefix of the '<data_path>_results/' output directory.
        """
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'PredictModel', 'prediction')
        self.loadValidate = LoadValidate(self.run_id, self.data_path,
                                         'prediction')
        self.preProcess = Preprocessor(self.run_id, self.data_path,
                                       'prediction')
        self.fileOperation = FileOperation(self.run_id, self.data_path,
                                           'prediction')

    def batch_predict_from_model(self):
        """Predict every row of the prediction set and append to the CSV.

        Rows are assigned to clusters with the saved 'KMeans' model; each
        cluster is scored by the model that ``FileOperation.correct_model``
        names for it. Results (columns 'EmpId', 'Prediction') are appended to
        '<data_path>_results/Predictions.csv'. The header row is written only
        when the file is missing or empty, so per-cluster appends no longer
        repeat it inside the data.

        Raises:
            Exception: on any failure; the original error is logged and
                chained as the cause.
        """
        import os  # local import: the file's import block is not visible here

        try:
            self.logger.info('Start of Prediction')
            self.logger.info('run_id:' + str(self.run_id))
            # validations and transformation
            self.loadValidate.validate_predictset()
            # preprocessing activities
            self.X = self.preProcess.preprocess_predictset()
            # cluster selection with the saved KMeans model
            kmeans = self.fileOperation.load_model('KMeans')
            self.X['clusters'] = kmeans.predict(self.X.drop(['empid'], axis=1))

            output_path = self.data_path + '_results/' + 'Predictions.csv'
            write_header = (not os.path.exists(output_path)
                            or os.path.getsize(output_path) == 0)
            for i in self.X['clusters'].unique():
                self.logger.info('clusters loop started')
                cluster_data = self.X[self.X['clusters'] == i]
                cluster_data_new = cluster_data.drop(['empid', 'clusters'],
                                                     axis=1)
                model_name = self.fileOperation.correct_model(i)
                model = self.fileOperation.load_model(model_name)
                y_predicted = model.predict(cluster_data_new)
                result = pd.DataFrame({
                    "EmpId": cluster_data['empid'],
                    "Prediction": y_predicted
                })
                # append per cluster so finished clusters survive a later failure
                result.to_csv(output_path,
                              header=write_header,
                              mode='a',
                              index=False)
                write_header = False
            self.logger.info('End of Prediction')
        except Exception as err:
            self.logger.exception('Unsuccessful End of Prediction')
            raise Exception from err

    def single_predict_from_model(self, data):
        """Predict for the input and return the first prediction as an int.

        Parameters:
            data: raw input handed to ``Preprocessor.preprocess_predict``.

        Returns:
            ``int(y_predicted[0])`` for the first cluster encountered, or
            None when the preprocessed data yields no cluster at all.

        Raises:
            Exception: on any failure; the original error is logged and
                chained as the cause.
        """
        import io  # local import: the file's import block is not visible here

        try:
            self.logger.info('Start of Prediction')
            self.logger.info('run_id:' + str(self.run_id))
            # preprocessing activities
            self.X = self.preProcess.preprocess_predict(data)
            # cluster selection with the saved KMeans model
            kmeans = self.fileOperation.load_model('KMeans')
            self.X['clusters'] = kmeans.predict(self.X.drop(['empid'], axis=1))
            for i in self.X['clusters'].unique():
                self.logger.info('clusters loop started')
                cluster_data = self.X[self.X['clusters'] == i]
                cluster_data_new = cluster_data.drop(['empid', 'clusters'],
                                                     axis=1)
                model_name = self.fileOperation.correct_model(i)
                model = self.fileOperation.load_model(model_name)
                self.logger.info('Shape of Data ' +
                                 str(cluster_data_new.shape))
                # DataFrame.info() prints and returns None; capture its text
                # in a buffer so the log carries the summary, not 'None'.
                info_buffer = io.StringIO()
                cluster_data_new.info(buf=info_buffer)
                self.logger.info('Shape of Data ' + info_buffer.getvalue())
                y_predicted = model.predict(cluster_data_new)
                self.logger.info('Output : ' + str(y_predicted))
                self.logger.info('End of Prediction')
                # only the first prediction of the first cluster is returned
                return int(y_predicted[0])
        except Exception as err:
            self.logger.exception('Unsuccessful End of Prediction')
            raise Exception from err
Ejemplo n.º 6
0
class KMeansCluster:
    """
    *****************************************************************************
    *
    * filename:       KMeansCluster.py
    * version:        1.0
    * author:         CODESTUDIO
    * creation date:  05-MAY-2020
    *
    * change history:
    *
    * who             when           version  change (include bug# if apply)
    * ----------      -----------    -------  ------------------------------
    * bcheekati       05-MAY-2020    1.0      initial creation
    *
    *
    * description:    Class to cluster the dataset
    *
    ****************************************************************************
    """
    def __init__(self, run_id, data_path):
        """
        * method: __init__
        * description: stores the run identifiers and builds the logger and
        *              the file helper, both in 'training' mode
        * return: none
        *
        * Parameters
        *   run_id:     identifier passed to Logger and FileOperation
        *   data_path:  path passed to FileOperation
        """
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'KMeansCluster', 'training')
        self.fileOperation = FileOperation(self.run_id, self.data_path,
                                           'training')

    def elbow_plot(self, data):
        """
        * method: elbow_plot
        * description: fits KMeans for 1..10 clusters, saves the WCSS curve to
        *              'apps/models/kmeans_elbow.png' and locates the knee of
        *              that curve. The curve is drawn on a dedicated figure
        *              that is closed afterwards, so repeated calls do not
        *              stack curves or leave figures open.
        * return: self.kn.knee as computed by KneeLocator
        *         NOTE(review): presumably None when no knee is found -
        *         confirm against the kneed documentation
        * raises: Exception on any failure; the original error is logged and
        *         chained as the cause
        *
        * who             when           version  change (include bug# if apply)
        * ----------      -----------    -------  ------------------------------
        * bcheekati       05-MAY-2020    1.0      initial creation
        *
        * Parameters
        *   data:   samples accepted by KMeans.fit
        """
        wcss = []  # within-cluster sum of squares, one entry per cluster count
        try:
            self.logger.info('Start of elbow plotting...')
            candidates = range(1, 11)
            for i in candidates:
                kmeans = KMeans(n_clusters=i,
                                init='k-means++',
                                random_state=0)
                kmeans.fit(data)
                wcss.append(kmeans.inertia_)

            elbow_figure = plt.figure()  # isolate this plot from earlier ones
            try:
                plt.plot(candidates, wcss)  # WCSS against number of clusters
                plt.title('The Elbow Method')
                plt.xlabel('Number of clusters')
                plt.ylabel('WCSS')
                plt.savefig('apps/models/kmeans_elbow.png')
            finally:
                plt.close(elbow_figure)

            # finding the value of the optimum cluster programmatically
            self.kn = KneeLocator(candidates,
                                  wcss,
                                  curve='convex',
                                  direction='decreasing')
            self.logger.info('The optimum number of clusters is: ' +
                             str(self.kn.knee))
            self.logger.info('End of elbow plotting...')
            return self.kn.knee

        except Exception as e:
            self.logger.exception('Exception raised while elbow plotting:' +
                                  str(e))
            raise Exception() from e

    def create_clusters(self, data, number_of_clusters):
        """
        * method: create_clusters
        * description: fits KMeans with the requested number of clusters,
        *              saves the fitted model under the name 'KMeans' and adds
        *              a 'Cluster' column to data (data is modified in place)
        * return: the same data object, now carrying the 'Cluster' column
        * raises: Exception on any failure; the original error is logged and
        *         chained as the cause
        *
        * who             when           version  change (include bug# if apply)
        * ----------      -----------    -------  ------------------------------
        * bcheekati       05-MAY-2020    1.0      initial creation
        *
        * Parameters
        *   data:                object accepted by KMeans.fit_predict that
        *                        supports column assignment
        *   number_of_clusters:  value passed to KMeans as n_clusters
        """
        self.data = data
        try:
            self.logger.info('Start of Create clusters...')
            self.kmeans = KMeans(n_clusters=number_of_clusters,
                                 init='k-means++',
                                 random_state=0)
            self.y_kmeans = self.kmeans.fit_predict(
                data)  # divide data into clusters
            # saving the KMeans model to directory
            self.saveModel = self.fileOperation.save_model(
                self.kmeans, 'KMeans')
            # new column in the dataset storing the cluster of every row
            self.data['Cluster'] = self.y_kmeans
            # Log the count that was actually used: self.kn is only created
            # by elbow_plot and may be missing when this runs on its own.
            self.logger.info('succesfully created ' +
                             str(number_of_clusters) + 'clusters.')
            self.logger.info('End of Create clusters...')
            return self.data
        except Exception as e:
            self.logger.exception('Exception raised while Creating clusters:' +
                                  str(e))
            raise Exception() from e
Ejemplo n.º 7
0
class PredictModel:

    """
    *****************************************************************************
    *
    * filename:       PredictModel.py
    * version:        1.0
    * author:         CODESTUDIO
    * creation date:  05-MAY-2020
    *
    * change history:
    *
    * who             when           version  change (include bug# if apply)
    * ----------      -----------    -------  ------------------------------
    * bcheekati       05-MAY-2020    1.0      initial creation
    *
    *
    * description:    Class to predict the result
    *
    ****************************************************************************
    """

    def __init__(self, run_id, data_path):
        """
        * method: __init__
        * description: stores the run identifiers and builds the logger,
        *              validator, preprocessor and file helper, all in
        *              'prediction' mode
        * return: none
        *
        * Parameters
        *   run_id:     identifier passed to every helper
        *   data_path:  path passed to every helper except the logger; also
        *               the prefix of the '<data_path>_results/' directory
        """
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'PredictModel', 'prediction')
        self.loadValidate = LoadValidate(self.run_id, self.data_path, 'prediction')
        self.preProcess = Preprocessor(self.run_id, self.data_path, 'prediction')
        self.fileOperation = FileOperation(self.run_id, self.data_path, 'prediction')

    def batch_predict_from_model(self):
        """
        * method: batch_predict_from_model
        * description: assigns every row of the prediction set to a cluster
        *              with the saved 'KMeans' model, scores each cluster with
        *              the model FileOperation.correct_model names for it and
        *              appends 'EmpId'/'Prediction' rows to
        *              '<data_path>_results/Predictions.csv'. The header is
        *              written only when the file is missing or empty, so it
        *              is no longer repeated for every cluster.
        * return: none
        * raises: Exception on any failure; the original error is logged and
        *         chained as the cause
        *
        * who             when           version  change (include bug# if apply)
        * ----------      -----------    -------  ------------------------------
        * bcheekati       05-MAY-2020    1.0      initial creation
        *
        * Parameters
        *   none:
        """
        import os  # local import: the file's import block is not visible here

        try:
            self.logger.info('Start of Prediction')
            self.logger.info('run_id:' + str(self.run_id))
            # validations and transformation
            self.loadValidate.validate_predictset()
            # preprocessing activities
            self.X = self.preProcess.preprocess_predictset()
            # load model
            kmeans = self.fileOperation.load_model('KMeans')
            # cluster selection
            self.X['clusters'] = kmeans.predict(self.X.drop(['empid'], axis=1))
            cluster_ids = self.X['clusters'].unique()

            results_file = self.data_path + '_results/' + 'Predictions.csv'
            needs_header = (not os.path.exists(results_file)
                            or os.path.getsize(results_file) == 0)
            for i in cluster_ids:
                self.logger.info('clusters loop started')
                cluster_data = self.X[self.X['clusters'] == i]
                cluster_data_new = cluster_data.drop(['empid', 'clusters'], axis=1)
                model_name = self.fileOperation.correct_model(i)
                model = self.fileOperation.load_model(model_name)
                y_predicted = model.predict(cluster_data_new)
                result = pd.DataFrame({"EmpId": cluster_data['empid'],
                                       "Prediction": y_predicted})
                # append per cluster so finished clusters survive a later failure
                result.to_csv(results_file, header=needs_header, mode='a',
                              index=False)
                needs_header = False
            self.logger.info('End of Prediction')
        except Exception as err:
            self.logger.exception('Unsuccessful End of Prediction')
            raise Exception from err

    def single_predict_from_model(self, data):
        """
        * method: single_predict_from_model
        * description: preprocesses the input, assigns it to a cluster with
        *              the saved 'KMeans' model and scores it with the model
        *              FileOperation.correct_model names for that cluster
        * return: int(y_predicted[0]) for the first cluster encountered, or
        *         None when the preprocessed data yields no cluster
        * raises: Exception on any failure; the original error is logged and
        *         chained as the cause
        *
        * who             when           version  change (include bug# if apply)
        * ----------      -----------    -------  ------------------------------
        * bcheekati       05-MAY-2020    1.0      initial creation
        *
        * Parameters
        *   data:   raw input handed to Preprocessor.preprocess_predict
        """
        import io  # local import: the file's import block is not visible here

        try:
            self.logger.info('Start of Prediction')
            self.logger.info('run_id:' + str(self.run_id))
            # preprocessing activities
            self.X = self.preProcess.preprocess_predict(data)
            # load model
            kmeans = self.fileOperation.load_model('KMeans')
            # cluster selection
            self.X['clusters'] = kmeans.predict(self.X.drop(['empid'], axis=1))
            cluster_ids = self.X['clusters'].unique()
            for i in cluster_ids:
                self.logger.info('clusters loop started')
                cluster_data = self.X[self.X['clusters'] == i]
                cluster_data_new = cluster_data.drop(['empid', 'clusters'], axis=1)
                model_name = self.fileOperation.correct_model(i)
                model = self.fileOperation.load_model(model_name)
                self.logger.info('Shape of Data ' + str(cluster_data_new.shape))
                # DataFrame.info() prints and returns None; capture its text
                # in a buffer so the log carries the summary, not 'None'.
                summary = io.StringIO()
                cluster_data_new.info(buf=summary)
                self.logger.info('Shape of Data ' + summary.getvalue())
                y_predicted = model.predict(cluster_data_new)
                self.logger.info('Output : ' + str(y_predicted))
                self.logger.info('End of Prediction')
                # only the first prediction of the first cluster is returned
                return int(y_predicted[0])
        except Exception as err:
            self.logger.exception('Unsuccessful End of Prediction')
            raise Exception from err