class KMeansCluster:
    """Cluster a dataset with KMeans and persist the fitted clustering model."""

    def __init__(self, run_id, data_path):
        """Store the run identifiers and build the logger and file helper.

        Parameters:
            run_id: identifier of the current run, passed to the helpers.
            data_path: data location, passed to the file-operation helper.
        """
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'KMeansCluster', 'training')
        self.fileOperation = FileOperation(self.run_id, self.data_path,
                                           'training')

    def elbow_plot(self, data):
        """Save the elbow plot and return the knee of the WCSS curve.

        Fits KMeans for 1 to 10 clusters, plots the within-cluster sum of
        squares (WCSS) to 'apps/models/kmeans_elbow.png' and locates the
        knee with KneeLocator.

        Parameters:
            data: dataset handed to ``KMeans.fit``.

        Returns:
            ``self.kn.knee`` as reported by KneeLocator.

        Raises:
            Exception: when any step fails; the original error is attached
                as ``__cause__``.
        """
        wcss = []  # within-cluster sum of squares for each cluster count
        try:
            self.logger.info('Start of elbow plotting...')
            cluster_range = range(1, 11)
            for i in cluster_range:
                kmeans = KMeans(n_clusters=i,
                                init='k-means++',
                                random_state=0)
                kmeans.fit(data)
                wcss.append(kmeans.inertia_)
            # Draw on a dedicated figure and always close it, so repeated
            # calls neither overlay earlier curves nor leak open figures.
            figure = plt.figure()
            try:
                plt.plot(cluster_range, wcss)
                plt.title('The Elbow Method')
                plt.xlabel('Number of clusters')
                plt.ylabel('WCSS')
                plt.savefig('apps/models/kmeans_elbow.png')
            finally:
                plt.close(figure)
            # finding the value of the optimum cluster programmatically
            self.kn = KneeLocator(cluster_range,
                                  wcss,
                                  curve='convex',
                                  direction='decreasing')
            self.logger.info('The optimum number of clusters is: ' +
                             str(self.kn.knee))
            self.logger.info('End of elbow plotting...')
            return self.kn.knee
        except Exception as e:
            self.logger.exception('Exception raised while elbow plotting:' +
                                  str(e))
            raise Exception(
                'Exception raised while elbow plotting:' + str(e)) from e

    def create_clusters(self, data, number_of_clusters):
        """Fit KMeans, save the model and add a 'Cluster' column to ``data``.

        Parameters:
            data: dataset to cluster; it is modified in place (a 'Cluster'
                column is assigned) and also returned.
            number_of_clusters: value passed to KMeans as ``n_clusters``.

        Returns:
            The same ``data`` object with the 'Cluster' column set.

        Raises:
            Exception: when any step fails; the original error is attached
                as ``__cause__``.
        """
        self.data = data
        try:
            self.logger.info('Start of Create clusters...')
            self.kmeans = KMeans(n_clusters=number_of_clusters,
                                 init='k-means++',
                                 random_state=0)
            self.y_kmeans = self.kmeans.fit_predict(data)
            # saving the KMeans model to directory
            self.saveModel = self.fileOperation.save_model(
                self.kmeans, 'KMeans')
            self.data['Cluster'] = self.y_kmeans
            # Report the count actually used; self.kn only exists when
            # elbow_plot was called on this same instance.
            self.logger.info('succesfully created ' +
                             str(number_of_clusters) + ' clusters.')
            self.logger.info('End of Create clusters...')
            return self.data
        except Exception as e:
            self.logger.exception('Exception raised while Creating clusters:' +
                                  str(e))
            raise Exception(
                'Exception raised while Creating clusters:' + str(e)) from e
def __init__(self, run_id, data_path):
    """Record the run identifiers and create the logger and file helper.

    Parameters:
        run_id: identifier of the current run.
        data_path: data location handed to the file-operation helper.
    """
    self.run_id, self.data_path = run_id, data_path
    # The logger is created before the file helper, as in the original order.
    self.logger = Logger(self.run_id, 'KMeansCluster', 'training')
    self.fileOperation = FileOperation(
        self.run_id,
        self.data_path,
        'training',
    )
class TrainModel:
    """
    *****************************************************************************
    *
    * filename:       TrainModel.py
    * version:        1.0
    * author:
    * creation date:
    *
    * description:    Class to train the models
    *
    ****************************************************************************
    """

    def __init__(self, run_id, data_path):
        """
        * method: __init__
        * description: stores the run identifiers and builds the helper objects
        *
        * Parameters
        *   run_id: identifier of the current run
        *   data_path: data location passed to the helper objects
        """
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'TrainModel', 'training')
        self.loadValidate = LoadValidate(self.run_id, self.data_path,
                                         'training')
        self.preProcess = Preprocessor(self.run_id, self.data_path,
                                       'training')
        self.modelTuner = ModelTuner(self.run_id, self.data_path, 'training')
        self.fileOperation = FileOperation(self.run_id, self.data_path,
                                           'training')
        # self.cluster = KMeansCluster(self.run_id, self.data_path)

    def training_model(self):
        """
        * method: training_model
        * description: validates and preprocesses the training set, writes the
        *              feature column names to apps/database/columns.json,
        *              then selects and saves the best model
        * return: none
        * raises: Exception when any step fails; the original error is
        *         attached as __cause__
        *
        * Parameters
        *   none:
        """
        try:
            self.logger.info('Start of Training')
            self.logger.info('Run_id:' + str(self.run_id))
            # Load, validations and transformation
            self.loadValidate.validate_trainset()
            # preprocessing activities
            self.X, self.y = self.preProcess.preprocess_trainset()
            # persist the feature column names produced by preprocessing
            columns = {"data_columns": list(self.X.columns)}
            with open('apps/database/columns.json', 'w') as f:
                f.write(json.dumps(columns))
            # splitting the data into training and test set (20% held out)
            x_train, x_test, y_train, y_test = train_test_split(
                self.X, self.y, test_size=0.2, random_state=0)
            # getting the best model for the data
            best_model_name, best_model = self.modelTuner.get_best_model(
                x_train, y_train, x_test, y_test)
            # saving the best model to the directory
            self.fileOperation.save_model(best_model, best_model_name)
            self.logger.info('End of Training')
        except Exception as e:
            self.logger.exception('Unsuccessful End of Training')
            raise Exception('Unsuccessful End of Training') from e
class TrainModel:
    """Train one model per KMeans cluster of the training set."""

    def __init__(self, run_id, data_path):
        """Store the run identifiers and build the helper objects.

        Parameters:
            run_id: identifier of the current run.
            data_path: data location passed to the helper objects.
        """
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'TrainModel', 'training')
        self.loadValidate = LoadValidate(self.run_id, self.data_path,
                                         'training')
        self.preProcess = Preprocessor(self.run_id, self.data_path,
                                       'training')
        self.modelTuner = ModelTuner(self.run_id, self.data_path, 'training')
        self.fileOperation = FileOperation(self.run_id, self.data_path,
                                           'training')
        self.cluster = KMeansCluster(self.run_id, self.data_path)

    def training_model(self):
        """Validate, preprocess and cluster the data, then fit per-cluster models.

        Writes the feature column names to 'apps/database/columns.json',
        clusters the features, and for every cluster selects the best model
        and saves it under ``<best_model_name><cluster id>``.

        Raises:
            Exception: when any step fails; the original error is attached
                as ``__cause__``.
        """
        try:
            self.logger.info('Start of Training')
            self.logger.info('Run_id:' + str(self.run_id))
            # Load, validations and transformation
            self.loadValidate.validate_trainset()
            # preprocessing activities
            self.X, self.y = self.preProcess.preprocess_trainset()
            # The column list is written before the helper columns
            # ('Cluster', 'Labels') are added below.
            columns = {"data_columns": list(self.X.columns)}
            with open('apps/database/columns.json', 'w') as f:
                f.write(json.dumps(columns))
            # choose the number of clusters, then divide the data into clusters
            number_of_clusters = self.cluster.elbow_plot(self.X)
            self.X = self.cluster.create_clusters(self.X, number_of_clusters)
            # keep the target next to the features so both are filtered together
            self.X['Labels'] = self.y
            # getting the unique clusters from our data set
            list_of_clusters = self.X['Cluster'].unique()
            # look for the best ML algorithm to fit on each individual cluster
            for i in list_of_clusters:
                cluster_data = self.X[self.X['Cluster'] == i]
                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                                                     axis=1)
                cluster_label = cluster_data['Labels']
                # splitting this cluster into training and test set
                x_train, x_test, y_train, y_test = train_test_split(
                    cluster_features,
                    cluster_label,
                    test_size=0.2,
                    random_state=0)
                # getting the best model for this cluster
                best_model_name, best_model = self.modelTuner.get_best_model(
                    x_train, y_train, x_test, y_test)
                # saving the best model, suffixed with the cluster id
                self.fileOperation.save_model(best_model,
                                              best_model_name + str(i))
            self.logger.info('End of Training')
        except Exception as e:
            self.logger.exception('Unsuccessful End of Training')
            raise Exception('Unsuccessful End of Training') from e
class PredictModel:
    """Predict with the saved KMeans model and the per-cluster models."""

    def __init__(self, run_id, data_path):
        """Store the run identifiers and build the helper objects.

        Parameters:
            run_id: identifier of the current run.
            data_path: data location; also the prefix of the results folder.
        """
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'PredictModel', 'prediction')
        self.loadValidate = LoadValidate(self.run_id, self.data_path,
                                         'prediction')
        self.preProcess = Preprocessor(self.run_id, self.data_path,
                                       'prediction')
        self.fileOperation = FileOperation(self.run_id, self.data_path,
                                           'prediction')

    def batch_predict_from_model(self):
        """Predict the whole prediction set and append results to a CSV.

        Rows are assigned to clusters with the saved 'KMeans' model, each
        cluster is predicted with its own model, and the 'EmpId' /
        'Prediction' pairs are appended to
        ``<data_path>_results/Predictions.csv``. The header row is written
        only when that file is missing or empty.

        Raises:
            Exception: when any step fails; the original error is attached
                as ``__cause__``.
        """
        import os  # local import: only needed for the header check below

        try:
            self.logger.info('Start of Prediction')
            self.logger.info('run_id:' + str(self.run_id))
            # validations and transformation
            self.loadValidate.validate_predictset()
            # preprocessing activities
            self.X = self.preProcess.preprocess_predictset()
            # load model
            kmeans = self.fileOperation.load_model('KMeans')
            # cluster selection
            clusters = kmeans.predict(self.X.drop(['empid'], axis=1))
            self.X['clusters'] = clusters
            clusters = self.X['clusters'].unique()
            results_path = self.data_path + '_results/' + 'Predictions.csv'
            # Appending with header=True for every cluster would scatter
            # header rows through the file, so write it at most once.
            write_header = (not os.path.isfile(results_path)
                            or os.path.getsize(results_path) == 0)
            for i in clusters:
                self.logger.info('clusters loop started')
                cluster_data = self.X[self.X['clusters'] == i]
                cluster_data_new = cluster_data.drop(['empid', 'clusters'],
                                                     axis=1)
                model_name = self.fileOperation.correct_model(i)
                model = self.fileOperation.load_model(model_name)
                y_predicted = model.predict(cluster_data_new)
                result = pd.DataFrame({
                    "EmpId": cluster_data['empid'],
                    "Prediction": y_predicted
                })
                result.to_csv(results_path,
                              header=write_header,
                              mode='a+',
                              index=False)
                write_header = False
            self.logger.info('End of Prediction')
        except Exception as e:
            self.logger.exception('Unsuccessful End of Prediction')
            raise Exception('Unsuccessful End of Prediction') from e

    def single_predict_from_model(self, data):
        """Predict for ``data`` and return the first prediction as an int.

        Parameters:
            data: raw input handed to ``Preprocessor.preprocess_predict``.

        Returns:
            ``int(y_predicted[0])`` of the last cluster that was predicted.

        Raises:
            Exception: when any step fails; the original error is attached
                as ``__cause__``.
        """
        import io  # local import: buffer for DataFrame.info() output

        try:
            self.logger.info('Start of Prediction')
            self.logger.info('run_id:' + str(self.run_id))
            # preprocessing activities
            self.X = self.preProcess.preprocess_predict(data)
            # load model
            kmeans = self.fileOperation.load_model('KMeans')
            # cluster selection
            clusters = kmeans.predict(self.X.drop(['empid'], axis=1))
            self.X['clusters'] = clusters
            clusters = self.X['clusters'].unique()
            y_predicted = []
            for i in clusters:
                self.logger.info('clusters loop started')
                cluster_data = self.X[self.X['clusters'] == i]
                cluster_data_new = cluster_data.drop(['empid', 'clusters'],
                                                     axis=1)
                model_name = self.fileOperation.correct_model(i)
                model = self.fileOperation.load_model(model_name)
                self.logger.info('Shape of Data ' +
                                 str(cluster_data_new.shape))
                # DataFrame.info() prints and returns None; capture its text
                # in a buffer so the log gets the summary instead of 'None'.
                info_buffer = io.StringIO()
                cluster_data_new.info(buf=info_buffer)
                self.logger.info('Info of Data ' + info_buffer.getvalue())
                y_predicted = model.predict(cluster_data_new)
                self.logger.info('Output : ' + str(y_predicted))
            self.logger.info('End of Prediction')
            return int(y_predicted[0])
        except Exception as e:
            self.logger.exception('Unsuccessful End of Prediction')
            raise Exception('Unsuccessful End of Prediction') from e
class KMeansCluster:
    """
    *****************************************************************************
    *
    * filename:       KMeansCluster.py
    * version:        1.0
    * author:         CODESTUDIO
    * creation date:  05-MAY-2020
    *
    * change history:
    *
    * who             when           version  change (include bug# if apply)
    * ----------      -----------    -------  ------------------------------
    * bcheekati       05-MAY-2020    1.0      initial creation
    *
    *
    * description:    Class to cluster the dataset
    *
    ****************************************************************************
    """

    def __init__(self, run_id, data_path):
        """
        * method: __init__
        * description: stores the run identifiers and builds the logger and
        *              file-operation helper
        *
        * Parameters
        *   run_id: identifier of the current run
        *   data_path: data location passed to the file-operation helper
        """
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'KMeansCluster', 'training')
        self.fileOperation = FileOperation(self.run_id, self.data_path,
                                           'training')

    def elbow_plot(self, data):
        """
        * method: elbow_plot
        * description: method to save the elbow plot used to decide the
        *              optimum number of clusters, and to locate the knee of
        *              the WCSS curve
        * return: the knee reported by KneeLocator (self.kn.knee); the plot
        *         is saved to apps/models/kmeans_elbow.png
        * raises: Exception when any step fails; the original error is
        *         attached as __cause__
        *
        * who             when           version  change (include bug# if apply)
        * ----------      -----------    -------  ------------------------------
        * bcheekati       05-MAY-2020    1.0      initial creation
        *
        * Parameters
        *   data: dataset handed to KMeans.fit
        """
        wcss = []  # within-cluster sum of squares for each cluster count
        try:
            self.logger.info('Start of elbow plotting...')
            cluster_range = range(1, 11)
            for i in cluster_range:
                kmeans = KMeans(n_clusters=i,
                                init='k-means++',
                                random_state=0)
                kmeans.fit(data)
                wcss.append(kmeans.inertia_)
            # Draw on a dedicated figure and always close it, so repeated
            # calls neither overlay earlier curves nor leak open figures.
            figure = plt.figure()
            try:
                plt.plot(cluster_range, wcss)
                plt.title('The Elbow Method')
                plt.xlabel('Number of clusters')
                plt.ylabel('WCSS')
                plt.savefig('apps/models/kmeans_elbow.png')
            finally:
                plt.close(figure)
            # finding the value of the optimum cluster programmatically
            self.kn = KneeLocator(cluster_range,
                                  wcss,
                                  curve='convex',
                                  direction='decreasing')
            self.logger.info('The optimum number of clusters is: ' +
                             str(self.kn.knee))
            self.logger.info('End of elbow plotting...')
            return self.kn.knee
        except Exception as e:
            self.logger.exception('Exception raised while elbow plotting:' +
                                  str(e))
            raise Exception(
                'Exception raised while elbow plotting:' + str(e)) from e

    def create_clusters(self, data, number_of_clusters):
        """
        * method: create_clusters
        * description: method to create clusters; fits KMeans, saves the
        *              model as 'KMeans' and assigns a 'Cluster' column
        * return: the same data object with the 'Cluster' column set
        *         (data is modified in place)
        * raises: Exception when any step fails; the original error is
        *         attached as __cause__
        *
        * who             when           version  change (include bug# if apply)
        * ----------      -----------    -------  ------------------------------
        * bcheekati       05-MAY-2020    1.0      initial creation
        *
        * Parameters
        *   data: dataset to cluster
        *   number_of_clusters: value passed to KMeans as n_clusters
        """
        self.data = data
        try:
            self.logger.info('Start of Create clusters...')
            self.kmeans = KMeans(n_clusters=number_of_clusters,
                                 init='k-means++',
                                 random_state=0)
            self.y_kmeans = self.kmeans.fit_predict(data)
            # saving the KMeans model to directory
            self.saveModel = self.fileOperation.save_model(
                self.kmeans, 'KMeans')
            self.data['Cluster'] = self.y_kmeans
            # Report the count actually used; self.kn only exists when
            # elbow_plot was called on this same instance.
            self.logger.info('succesfully created ' +
                             str(number_of_clusters) + ' clusters.')
            self.logger.info('End of Create clusters...')
            return self.data
        except Exception as e:
            self.logger.exception('Exception raised while Creating clusters:' +
                                  str(e))
            raise Exception(
                'Exception raised while Creating clusters:' + str(e)) from e
class PredictModel:
    """
    *****************************************************************************
    *
    * filename:       PredictModel.py
    * version:        1.0
    * author:         CODESTUDIO
    * creation date:  05-MAY-2020
    *
    * change history:
    *
    * who             when           version  change (include bug# if apply)
    * ----------      -----------    -------  ------------------------------
    * bcheekati       05-MAY-2020    1.0      initial creation
    *
    *
    * description:    Class to predict the result
    *
    ****************************************************************************
    """

    def __init__(self, run_id, data_path):
        """
        * method: __init__
        * description: stores the run identifiers and builds the helper objects
        *
        * Parameters
        *   run_id: identifier of the current run
        *   data_path: data location; also the prefix of the results folder
        """
        self.run_id = run_id
        self.data_path = data_path
        self.logger = Logger(self.run_id, 'PredictModel', 'prediction')
        self.loadValidate = LoadValidate(self.run_id, self.data_path,
                                         'prediction')
        self.preProcess = Preprocessor(self.run_id, self.data_path,
                                       'prediction')
        self.fileOperation = FileOperation(self.run_id, self.data_path,
                                           'prediction')

    def batch_predict_from_model(self):
        """
        * method: batch_predict_from_model
        * description: method to predict the results for the whole prediction
        *              set; rows are assigned to clusters with the saved
        *              'KMeans' model, each cluster is predicted with its own
        *              model, and EmpId/Prediction pairs are appended to
        *              <data_path>_results/Predictions.csv (the header row is
        *              written only when that file is missing or empty)
        * return: none
        * raises: Exception when any step fails; the original error is
        *         attached as __cause__
        *
        * who             when           version  change (include bug# if apply)
        * ----------      -----------    -------  ------------------------------
        * bcheekati       05-MAY-2020    1.0      initial creation
        *
        * Parameters
        *   none:
        """
        import os  # local import: only needed for the header check below

        try:
            self.logger.info('Start of Prediction')
            self.logger.info('run_id:' + str(self.run_id))
            # validations and transformation
            self.loadValidate.validate_predictset()
            # preprocessing activities
            self.X = self.preProcess.preprocess_predictset()
            # load model
            kmeans = self.fileOperation.load_model('KMeans')
            # cluster selection
            clusters = kmeans.predict(self.X.drop(['empid'], axis=1))
            self.X['clusters'] = clusters
            clusters = self.X['clusters'].unique()
            results_path = self.data_path + '_results/' + 'Predictions.csv'
            # Appending with header=True for every cluster would scatter
            # header rows through the file, so write it at most once.
            write_header = (not os.path.isfile(results_path)
                            or os.path.getsize(results_path) == 0)
            for i in clusters:
                self.logger.info('clusters loop started')
                cluster_data = self.X[self.X['clusters'] == i]
                cluster_data_new = cluster_data.drop(['empid', 'clusters'],
                                                     axis=1)
                model_name = self.fileOperation.correct_model(i)
                model = self.fileOperation.load_model(model_name)
                y_predicted = model.predict(cluster_data_new)
                result = pd.DataFrame({
                    "EmpId": cluster_data['empid'],
                    "Prediction": y_predicted
                })
                result.to_csv(results_path,
                              header=write_header,
                              mode='a+',
                              index=False)
                write_header = False
            self.logger.info('End of Prediction')
        except Exception as e:
            self.logger.exception('Unsuccessful End of Prediction')
            raise Exception('Unsuccessful End of Prediction') from e

    def single_predict_from_model(self, data):
        """
        * method: single_predict_from_model
        * description: method to predict the result for the given input
        * return: int(y_predicted[0]) of the last cluster that was predicted
        * raises: Exception when any step fails; the original error is
        *         attached as __cause__
        *
        * who             when           version  change (include bug# if apply)
        * ----------      -----------    -------  ------------------------------
        * bcheekati       05-MAY-2020    1.0      initial creation
        *
        * Parameters
        *   data: raw input handed to Preprocessor.preprocess_predict
        """
        import io  # local import: buffer for DataFrame.info() output

        try:
            self.logger.info('Start of Prediction')
            self.logger.info('run_id:' + str(self.run_id))
            # preprocessing activities
            self.X = self.preProcess.preprocess_predict(data)
            # load model
            kmeans = self.fileOperation.load_model('KMeans')
            # cluster selection
            clusters = kmeans.predict(self.X.drop(['empid'], axis=1))
            self.X['clusters'] = clusters
            clusters = self.X['clusters'].unique()
            y_predicted = []
            for i in clusters:
                self.logger.info('clusters loop started')
                cluster_data = self.X[self.X['clusters'] == i]
                cluster_data_new = cluster_data.drop(['empid', 'clusters'],
                                                     axis=1)
                model_name = self.fileOperation.correct_model(i)
                model = self.fileOperation.load_model(model_name)
                self.logger.info('Shape of Data ' +
                                 str(cluster_data_new.shape))
                # DataFrame.info() prints and returns None; capture its text
                # in a buffer so the log gets the summary instead of 'None'.
                info_buffer = io.StringIO()
                cluster_data_new.info(buf=info_buffer)
                self.logger.info('Info of Data ' + info_buffer.getvalue())
                y_predicted = model.predict(cluster_data_new)
                self.logger.info('Output : ' + str(y_predicted))
            self.logger.info('End of Prediction')
            return int(y_predicted[0])
        except Exception as e:
            self.logger.exception('Unsuccessful End of Prediction')
            raise Exception('Unsuccessful End of Prediction') from e