class trainModel: def __init__(self): self.log_writer = logger.App_Logger() self.mongo = To_mongo_db('wafer') self.aws = Aws_Bucket_operation() # self.file_object = open("Training_Logs/ModelTrainingLog.txt", 'a+') def trainingModel(self): # Logging the start of Training self.log_writer.log('wafer_log', 'Start of Training') try: # Getting the data from the source # data_getter=data_loader.Data_Getter(self.file_object,self.log_writer) data = self.mongo.downlaod_all_from_mongo('wafer_good_data', 'temp_db') """doing the data preprocessing""" preprocessor = preprocessing.Preprocessor('wafer_log', self.log_writer) data = preprocessor.remove_columns( data, ['Wafer'] ) # remove the wafer column as it doesn't contribute to prediction. # create separate features and labels X, Y = preprocessor.separate_label_feature( data, label_column_name='Good/Bad') # check if missing values are present in the dataset # if missing values are there, replace them appropriately. X.replace(to_replace='NULL', value=np.nan, inplace=True) # consumes 4 sec to compute is_null_present = preprocessor.is_null_present(X) if (is_null_present): X = preprocessor.impute_missing_values( X) # missing value imputation # check further which columns do not contribute to predictions # if the standard deviation for a column is zero, it means that the column has constant values # and they are giving the same output both for good and bad sensors # prepare the list of such columns to drop cols_to_drop = preprocessor.get_columns_with_zero_std_deviation( X) # consumes a lot of time # drop the columns obtained above X = preprocessor.remove_columns(X, cols_to_drop) """ Applying the clustering approach""" kmeans = clustering.KMeansClustering( 'wafer_log', self.log_writer) # object initialization. number_of_clusters = kmeans.elbow_plot( X ) # using the elbow plot to find the number of optimum clusters # Divide the data into clusters X = kmeans.create_clusters(X, number_of_clusters) #create a new column in the dataset consisting of the corresponding cluster assignments. # X=pd.DataFrame.join(X,Y) X['Labels'] = Y.values # getting the unique clusters from our dataset list_of_clusters = X['Cluster'].unique() """parsing all the clusters and looking for the best ML algorithm to fit on individual cluster""" for index, i in enumerate(list_of_clusters): cluster_data = X[X['Cluster'] == i] # filter the data for one cluster # Prepare the feature and Label columns cluster_features = cluster_data.drop(['Labels', 'Cluster'], axis=1) cluster_label = cluster_data['Labels'] # splitting the data into training and test set for each cluster one by one x_train, x_test, y_train, y_test = train_test_split( cluster_features, cluster_label, test_size=1 / 3, random_state=355) model_finder = tuner.Model_Finder( 'wafer_log', self.log_writer) # object initialization #getting the best model for each of the clusters best_model_name, best_model = model_finder.get_best_model( x_train, y_train, x_test, y_test) #saving the best model to the directory. # file_op = file_methods.File_Operation('wafer_log',self.log_writer) # save_model=file_op.save_model(best_model,best_model_name+str(i)) print(best_model) best_model = pickle.dumps(best_model) self.aws.Upload_To_S3_obj(best_model, best_model_name + str(index) + '.sav', bucket_prefix='wafer-model') # logging the successful Training self.log_writer.log('wafer_log', 'Successful End of Training') # self.file_object.close() except Exception as err: # logging the unsuccessful Training self.log_writer.log('wafer_log', 'Unsuccessful End of Training') # self.file_object.close() print(str(err)) raise err
class KMeansClustering: """ This class shall be used to divide the data into clusters before training. Written By: Rajat Bisoi Version: 1.0 Revisions: None """ def __init__(self, db_name, logger_object): self.db_name = db_name self.logger_object = logger_object self.aws = Aws_Bucket_operation() def elbow_plot(self, data): """ Method Name: elbow_plot Description: This method saves the plot to decide the optimum number of clusters to the file. Output: A picture saved to the directory On Failure: Raise Exception Written By: Rajat Bisoi Version: 1.0 Revisions: None """ self.logger_object.log( self.db_name, 'Entered the elbow_plot method of the KMeansClustering class') wcss = [] # initializing an empty list try: for i in range(1, 11): kmeans = KMeans( n_clusters=i, init='k-means++', random_state=42) # initializing the KMeans object kmeans.fit(data) # fitting the data to the KMeans Algorithm wcss.append(kmeans.inertia_) plt.plot( range(1, 11), wcss ) # creating the graph between WCSS and the number of clusters plt.title('The Elbow Method') plt.xlabel('Number of clusters') plt.ylabel('WCSS') #plt.show() plt.savefig('preprocessing_data/K-Means_Elbow.PNG' ) # saving the elbow plot locally # finding the value of the optimum cluster programmatically self.kn = KneeLocator(range(1, 11), wcss, curve='convex', direction='decreasing') self.logger_object.log( self.db_name, 'The optimum number of clusters is: ' + str(self.kn.knee) + ' . Exited the elbow_plot method of the KMeansClustering class' ) return self.kn.knee except Exception as e: self.logger_object.log( self.db_name, 'Exception occured in elbow_plot method of the KMeansClustering class. Exception message: ' + str(e)) self.logger_object.log( self.db_name, 'Finding the number of clusters failed. Exited the elbow_plot method of the KMeansClustering class' ) raise Exception() def create_clusters(self, data, number_of_clusters): """ Method Name: create_clusters Description: Create a new dataframe consisting of the cluster information. Output: A datframe with cluster column On Failure: Raise Exception Written By: Rajat Bisoi Version: 1.0 Revisions: None """ self.logger_object.log( self.db_name, 'Entered the create_clusters method of the KMeansClustering class') self.data = data try: self.kmeans = KMeans(n_clusters=number_of_clusters, init='k-means++', random_state=42) #self.data = self.data[~self.data.isin([np.nan, np.inf, -np.inf]).any(1)] self.y_kmeans = self.kmeans.fit_predict( data) # divide data into clusters # self.file_op = file_methods.File_Operation(self.db_name, self.logger_object) # self.save_model = self.file_op.save_model(self.kmeans, 'KMeans') # saving the KMeans model to directory self.kmeans = pickle.dumps(self.kmeans) self.aws.Upload_To_S3_obj( object=self.kmeans, file_name='kmeans.sav', bucket_prefix='wafer-model' ) # passing 'Model' as the functions need three parameters self.data[ 'Cluster'] = self.y_kmeans # create a new column in dataset for storing the cluster information self.logger_object.log( self.db_name, 'succesfully created ' + str(self.kn.knee) + 'clusters. Exited the create_clusters method of the KMeansClustering class' ) return self.data except Exception as e: self.logger_object.log( self.db_name, 'Exception occured in create_clusters method of the KMeansClustering class. Exception message: ' + str(e)) self.logger_object.log( self.db_name, 'Fitting the data to clusters failed. Exited the create_clusters method of the KMeansClustering class' ) raise Exception()