def create_clusters(self, data, number_of_clusters):
    """
    Method Name: create_clusters
    Description: Fit KMeans with the requested number of clusters, save the
                 fitted model under the name 'KMeans' and store each row's
                 cluster label in a new 'Cluster' column of the data.
    Output: The same data object that was passed in, with a 'Cluster' column added
    On Failure: Raise Exception (the original error is attached as its cause)

    Written By: iNeuron Intelligence
    Version: 1.0
    Revisions: Log the requested cluster count instead of self.kn.knee;
               keep the original error message and cause when re-raising.
    """
    self.logger_object.log(
        self.file_object,
        'Entered the create_clusters method of the KMeansClustering class')
    self.data = data
    try:
        self.kmeans = KMeans(n_clusters=number_of_clusters,
                             init='k-means++',
                             random_state=42)
        # divide data into clusters
        self.y_kmeans = self.kmeans.fit_predict(data)

        # saving the KMeans model to directory
        self.file_op = file_methods.File_Operation(self.file_object,
                                                   self.logger_object)
        self.save_model = self.file_op.save_model(self.kmeans, 'KMeans')

        # create a new column in dataset for storing the cluster information
        self.data['Cluster'] = self.y_kmeans

        # Report the count that was actually used. self.kn only exists after
        # elbow_plot() has run, so reading it here could fail a good run.
        self.logger_object.log(
            self.file_object,
            'succesfully created ' + str(number_of_clusters) +
            'clusters. Exited the create_clusters method of the KMeansClustering class'
        )
        return self.data
    except Exception as e:
        self.logger_object.log(
            self.file_object,
            'Exception occured in create_clusters method of the KMeansClustering class. Exception message: '
            + str(e))
        self.logger_object.log(
            self.file_object,
            'Fitting the data to clusters failed. Exited the create_clusters method of the KMeansClustering class'
        )
        # Same exception type as before, but with the message and cause kept.
        raise Exception(str(e)) from e
def create_clusters(self, data, number_of_clusters):
    """Cluster the data with KMeans and attach the labels as a 'Cluster' column.

    The fitted model is saved under the name "KMeans" through
    file_methods.File_Operation.

    :param data: data to cluster; it is modified in place and returned
    :param number_of_clusters: number of clusters passed to KMeans
    :return: the same data object with an added 'Cluster' column
    :raises Exception: whatever the fitting or saving step raised, after logging it
    """
    self.logger_object.log(
        self.file_object,
        'Entered the create_clusters method of the KMeansClustering class')
    self.data = data
    try:
        self.kmeans = KMeans(n_clusters=number_of_clusters,
                             init='k-means++',
                             random_state=42)
        self.y_means = self.kmeans.fit_predict(data)
        self.file_op = file_methods.File_Operation(self.file_object,
                                                   self.logger_object)
        self.save_model = self.file_op.save_model(self.kmeans, "KMeans")
        self.data['Cluster'] = self.y_means
        # Log the count that was requested. self.kn only exists after
        # elbow_plot() has run, so reading it here could fail a good run.
        self.logger_object.log(
            self.file_object,
            'succesfully created ' + str(number_of_clusters) +
            'clusters. Exited the create_clusters method of the KMeansClustering class'
        )
        return self.data
    except Exception as e:
        self.logger_object.log(
            self.file_object,
            'Exception occured in create_clusters method of the KMeansClustering class. Exception message: '
            + str(e))
        self.logger_object.log(
            self.file_object,
            'Fitting the data to clusters failed. Exited the create_clusters method of the KMeansClustering class'
        )
        raise
def predictionFromModel(self):
    """Predict every wafer with its cluster's model and append the results to the CSV.

    Steps visible here: delete the previous prediction file, load and
    preprocess the data, assign clusters with the saved 'KMeans' model, then
    for each cluster load its model, predict that cluster's rows and append
    (Wafer, Prediction) pairs to the prediction file.

    Returns:
        tuple: (path of the prediction CSV, JSON records of the head of the
        last cluster's result frame; an empty frame when there were no rows).

    Raises:
        Exception: any error from the steps above, after it has been logged.
    """
    path = "Prediction_Output_File/Predictions.csv"
    # Defined up front so the return statement works even with no clusters.
    result = pandas.DataFrame(columns=['Wafer', 'Prediction'])
    try:
        # deletes the existing prediction file from last run!
        self.pred_data_val.deletePredictionFile()
        self.log_writer.log(self.file_object, 'Start of Prediction')
        data_getter = data_loader_prediction.Data_Getter_Pred(
            self.file_object, self.log_writer)
        data = data_getter.get_data()

        preprocessor = preprocessing.Preprocessor(self.file_object,
                                                  self.log_writer)
        is_null_present = preprocessor.is_null_present(data)
        if is_null_present:
            data = preprocessor.impute_missing_values(data)

        cols_to_drop = preprocessor.get_columns_with_zero_std_deviation(data)
        data = preprocessor.remove_columns(data, cols_to_drop)

        file_loader = file_methods.File_Operation(self.file_object,
                                                  self.log_writer)
        kmeans = file_loader.load_model('KMeans')

        # the 'Wafer' name column is not a feature, so drop it for clustering
        data['clusters'] = kmeans.predict(data.drop(['Wafer'], axis=1))
        data.to_csv('Prediction_Raw_Files_Validated/finalpredictiondata.csv',
                    index=False)

        for position, i in enumerate(data['clusters'].unique()):
            cluster_data = data[data['clusters'] == i]
            wafer_names = list(cluster_data['Wafer'])
            # Drop from cluster_data, not from data: the model must score only
            # this cluster's rows so each prediction matches its wafer name.
            cluster_features = cluster_data.drop(labels=['Wafer', 'clusters'],
                                                 axis=1)
            model_name = file_loader.find_correct_model_file(i)
            model = file_loader.load_model(model_name)
            predicted = list(model.predict(cluster_features))
            result = pandas.DataFrame(list(zip(wafer_names, predicted)),
                                      columns=['Wafer', 'Prediction'])
            # Appends to the prediction file. Only the first chunk writes the
            # header, so the CSV does not get a header row per cluster.
            result.to_csv(path, header=(position == 0), mode='a+')
        self.log_writer.log(self.file_object, 'End of Prediction')
    except Exception as ex:
        self.log_writer.log(
            self.file_object,
            'Error occured while running the prediction!! Error:: %s' % ex)
        raise ex
    return path, result.head().to_json(orient="records")
def predictionFromModel(self):
    """Predict the income class of every input row and write the predictions CSV.

    Steps visible here: delete the previous prediction file, load and
    preprocess the data, assign clusters with the saved 'KMeans' model,
    predict each cluster with its own model, map 0 to '<=50K' and anything
    else to '>50K', and write one 'Predictions' column.

    Predictions are written in input-row order. Previously they were grouped
    by cluster, so row i of the output did not belong to row i of the input.

    Returns:
        str: path of the prediction CSV.

    Raises:
        Exception: any error from the steps above, after it has been logged.
    """
    try:
        # deletes the existing prediction file from last run!
        self.pred_data_val.deletePredictionFile()
        self.log_writer.log(self.file_object, 'Start of Prediction')
        data_getter = data_loader_prediction.Data_Getter_Pred(
            self.file_object, self.log_writer)
        data = data_getter.get_data()

        preprocessor = preprocessing.Preprocessor(self.file_object,
                                                  self.log_writer)
        # remove the column as it doesn't contribute to prediction.
        data = preprocessor.remove_columns(data, ['education'])
        # remove unwanted spaces from the dataframe
        data = preprocessor.remove_unwanted_spaces(data)
        # replacing '?' with NaN values for imputation
        # (np.nan: the np.NaN alias was removed in NumPy 2.0)
        data.replace('?', np.nan, inplace=True)

        # check if missing values are present in the dataset
        is_null_present, cols_with_missing_values = preprocessor.is_null_present(data)
        # if missing values are there, replace them appropriately.
        if is_null_present:
            data = preprocessor.impute_missing_values(data, cols_with_missing_values)

        # Proceeding with more data pre-processing steps
        scaled_num_df = preprocessor.scale_numerical_columns(data)
        cat_df = preprocessor.encode_categorical_columns(data)
        X = pd.concat([scaled_num_df, cat_df], axis=1)

        file_loader = file_methods.File_Operation(self.file_object,
                                                  self.log_writer)
        kmeans = file_loader.load_model('KMeans')
        X['clusters'] = kmeans.predict(X)

        # Remember the row positions of each cluster so every prediction can
        # be written back to the slot of the row it belongs to.
        rows_by_cluster = {}
        for position, label in enumerate(X['clusters'].to_numpy()):
            rows_by_cluster.setdefault(label, []).append(position)

        predictions = [None] * len(X)
        for i, positions in rows_by_cluster.items():
            cluster_data = X.iloc[positions].drop(['clusters'], axis=1)
            model_name = file_loader.find_correct_model_file(i)
            model = file_loader.load_model(model_name)
            for position, res in zip(positions, model.predict(cluster_data)):
                predictions[position] = '<=50K' if res == 0 else '>50K'

        final = pd.DataFrame(predictions, columns=['Predictions'])
        path = "Prediction_Output_File/Predictions.csv"
        # appends result to prediction file
        final.to_csv(path, header=True, mode='a+')
        self.log_writer.log(self.file_object, 'End of Prediction')
    except Exception as ex:
        self.log_writer.log(
            self.file_object,
            'Error occured while running the prediction!! Error:: %s' % ex)
        raise ex
    return path
def predictionFromModel(self):
    """Predict the cover type of every input row and write the predictions CSV.

    Steps visible here: delete the previous prediction file, load and scale
    the data, assign clusters with the saved 'KMeans' model, predict each
    cluster with its own model, and translate class codes 0-6 into names.

    Predictions are written in input-row order. A class code outside 0-6
    leaves that row's prediction empty; before, it was skipped, which shifted
    every later row.

    Returns:
        str: path of the prediction CSV.

    Raises:
        Exception: any error from the steps above, after it has been logged.
    """
    cover_types = {
        0: "Lodgepole_Pine",
        1: "Spruce_Fir",
        2: "Douglas_fir",
        3: "Krummholz",
        4: "Ponderosa_Pine",
        5: "Aspen",
        6: "Cottonwood_Willow",
    }
    try:
        # deletes the existing prediction file from last run!
        self.pred_data_val.deletePredictionFile()
        self.log_writer.log(self.file_object, 'Start of Prediction')
        data_getter = data_loader_prediction.Data_Getter_Pred(
            self.file_object, self.log_writer)
        data = data_getter.get_data()

        preprocessor = preprocessing.Preprocessor(self.file_object,
                                                  self.log_writer)
        data = preprocessor.scaleData(data)

        file_loader = file_methods.File_Operation(self.file_object,
                                                  self.log_writer)
        kmeans = file_loader.load_model('KMeans')
        data['clusters'] = kmeans.predict(data)

        # Remember the row positions of each cluster so every prediction can
        # be written back to the slot of the row it belongs to.
        rows_by_cluster = {}
        for position, label in enumerate(data['clusters'].to_numpy()):
            rows_by_cluster.setdefault(label, []).append(position)

        predictions = [None] * len(data)
        for i, positions in rows_by_cluster.items():
            cluster_data = data.iloc[positions].drop(['clusters'], axis=1)
            model_name = file_loader.find_correct_model_file(i)
            model = file_loader.load_model(model_name)
            for position, val in zip(positions, model.predict(cluster_data)):
                predictions[position] = cover_types.get(val)

        result = pandas.DataFrame(predictions, columns=['Predictions'])
        path = "Prediction_Output_File/Predictions.csv"
        # appends result to prediction file
        result.to_csv(path, header=True, mode='a+')
        self.log_writer.log(self.file_object, 'End of Prediction')
    except Exception as ex:
        self.log_writer.log(
            self.file_object,
            'Error occured while running the prediction!! Error:: %s' % ex)
        raise ex
    return path
def trainingModel(self):
    """Run the training pipeline: load, preprocess, split, standardise, tune and save.

    The best model returned by tuner.Model_Finder is saved through
    file_methods.File_Operation under the name the finder reports.

    Raises:
        Exception: when any step fails. The original error is chained as the
            cause and its text becomes the message (previously a bare
            ``Exception`` with no information was raised).
    """
    # Logging start of the training
    self.loggerObj.logger_log("Start of TrainingModel")
    try:
        # Getting the data from the source
        data_getter = data_loader.Data_Getter(self.loggerObj)
        data = data_getter.get_data()

        # doing the data preprocessing
        preprocessor = preprocessing.Preprocessor(self.loggerObj)

        # replacing '?' values with np.nan as discussed in the EDA part
        data = preprocessor.replaceInvalidValuesWithNull(data)

        # Drop the columns which are having missing values more than 50% of total observations
        data = preprocessor.dropUnnecessaryColumns(data)

        # check if missing values are present in the dataset
        is_null_present = preprocessor.is_null_present(data)

        # if missing values are there, impute them appropriately.
        if is_null_present:
            data = preprocessor.impute_missing_values(data)

        # create separate features and labels
        X, y = preprocessor.separate_label_feature(
            data, label_column_name='classes')

        # Categorical encoding
        X, y = preprocessor.encodeCategoricalValues(X, y)

        # SMOTE balancing (preprocessor.handleImbalanceDataset) is disabled here.

        # splitting the data into training and test set
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=44)

        # Standardisation of X_train data
        X_train = preprocessor.data_standardisation(X_train)

        # applying same standardisation object on X_test data
        X_test = preprocessor.prediction_data_standardisation(X_test)

        model_finder = tuner.Model_Finder(self.loggerObj)

        # getting the best model
        best_model_name, best_model = model_finder.get_best_model(
            X_train, y_train, X_test, y_test)

        # saving the best model to the directory.
        file_op = file_methods.File_Operation(self.loggerObj)
        file_op.save_model(best_model, best_model_name)
    except Exception as e:
        # Logging the unsuccessful training
        self.loggerObj.logger_log("Unsuccessful end of training")
        # Same exception type as before, but with the message and cause kept.
        raise Exception(str(e)) from e
def predictionFromModel(self):
    """Predict every input row with its cluster's model and write the predictions CSV.

    Steps visible here: delete the previous prediction file, load the data,
    impute missing values, scale numerical columns, assign clusters with the
    saved 'KMeans' model and predict each cluster with its own model.

    Every row gets a prediction and 'Customer No.' is the zero-based input
    row number. Previously only the last cluster's predictions reached the
    output, numbered from 0 regardless of which rows they belonged to.

    Returns:
        tuple: (path of the prediction CSV, JSON records of the first rows).

    Raises:
        Exception: any error from the steps above, after it has been logged.
    """
    try:
        # deletes the existing file from last run!
        self.pred_data_val.deletePredictionFile()
        self.log_writer.log(self.file_object, 'Start of Prediction')
        data_getter = data_loader_prediction.Data_Getter(
            self.file_object, self.log_writer)
        data = data_getter.get_data()

        preprocessor = preprocessing.Preprocessor(self.file_object,
                                                  self.log_writer)

        # check if missing values are present in the dataset
        is_null_present, cols_with_missing_values = preprocessor.is_null_present(data)

        # if missing values are there, replace them appropriately.
        if is_null_present:
            data = preprocessor.impute_missing_values(
                data, cols_with_missing_values)

        # Proceeding with more data pre-processing steps
        X = preprocessor.scale_numerical_columns(data)

        file_loader = file_methods.File_Operation(self.file_object,
                                                  self.log_writer)
        kmeans = file_loader.load_model('KMeans')
        X['clusters'] = kmeans.predict(X)

        # Remember the row positions of each cluster so every prediction can
        # be written back to the slot of the row it belongs to.
        rows_by_cluster = {}
        for position, label in enumerate(X['clusters'].to_numpy()):
            rows_by_cluster.setdefault(label, []).append(position)

        predictions = [None] * len(X)
        for i, positions in rows_by_cluster.items():
            cluster_data = X.iloc[positions].drop(['clusters'], axis=1)
            model_name = file_loader.find_correct_model_file(i)
            model = file_loader.load_model(model_name)
            for position, value in zip(positions, model.predict(cluster_data)):
                predictions[position] = value

        final = pd.DataFrame(
            list(zip(range(len(predictions)), predictions)),
            columns=['Customer No.', 'Predictions'],
        )
        path = "Prediction_Output_File/Predictions.csv"
        # append result to prediction file
        final.to_csv(path, header=True, mode='a+')
        self.log_writer.log(self.file_object, 'End of Prediction')
    except Exception as ex:
        self.log_writer.log(
            self.file_object,
            'Error occured while running the prediction!! Error:: %s' % ex)
        raise ex
    return path, final.head().to_json(orient='records')
def predictionFromModel(self):
    """Predict every input row with the saved 'CatBoost' model and write the predictions CSV.

    Steps visible here: delete the previous prediction file, load the data,
    drop the 'Unnamed: 0' column if present, run the custom and standard
    preprocessing steps, predict, and map 0/1 to "no"/"yes".

    Returns:
        str: path of the prediction CSV.

    Raises:
        Exception: any error from the steps above, after it has been logged.
    """
    try:
        # deletes the existing prediction file from last run!
        self.pred_data_val.deletePredictionFile()
        self.log_writer.log(self.file_object, 'Start of Prediction')
        data_getter = data_loader_prediction.Data_Getter_Pred(
            self.file_object, self.log_writer)
        data = data_getter.get_data()

        # drop 'Unnamed: 0' if present; a membership test avoids changing the
        # columns while looping over them
        if 'Unnamed: 0' in data.columns:
            data.drop('Unnamed: 0', axis=1, inplace=True)

        # Dropping column after performing EDA
        preprocessor_cus = preprocess_cus.Preprocessor_cus(
            self.file_object, self.log_writer)
        data = preprocessor_cus.drop_column(data)

        preprocessor = preprocessing.Preprocessor(self.file_object,
                                                  self.log_writer)

        # replacing '?' values with np.nan as discussed in the EDA part
        data = preprocessor.replaceInvalidValuesWithNull(data)

        # get encoded values for categorical data
        data = preprocessor_cus.test_data_encode(data)

        is_null_present, _ = preprocessor.is_null_present(data)
        if is_null_present:
            data = preprocessor.impute_missing_values(data)

        file_loader = file_methods.File_Operation(self.file_object,
                                                  self.log_writer)
        model = file_loader.load_model('CatBoost')
        result = pandas.DataFrame(list(model.predict(data)),
                                  columns=['Predictions'])

        # Assign the replaced column back. An in-place replace on
        # result['Predictions'] is chained assignment and does not update the
        # frame under pandas Copy-on-Write.
        result['Predictions'] = result['Predictions'].replace({
            0: "no",
            1: "yes"
        })

        path = "Prediction_Output_File/Predictions.csv"
        # writes the prediction file (default write mode, not append)
        result.to_csv(path, header=True)
        self.log_writer.log(self.file_object, 'End of Prediction')
    except Exception as ex:
        self.log_writer.log(
            self.file_object,
            'Error occured while running the prediction!! Error:: %s' % ex)
        raise ex
    return path
def predictionFromModel(self):
    """Predict the ring class of every input row and write the predictions CSV.

    Steps visible here: delete the previous prediction file, load the data,
    impute missing values, assign clusters with the saved 'KMeans' model
    (using every column except 'Sex'), encode categorical columns, predict
    each cluster with its own model and map codes to labels
    (0 -> '1-8 Rings', 1 -> '11+ Rings', anything else -> '9-10 Rings').

    Predictions are written in input-row order. Previously they were grouped
    by cluster, so row i of the output did not belong to row i of the input.

    Returns:
        tuple: (path of the prediction CSV, the predictions DataFrame).

    Raises:
        Exception: any error from the steps above, after it has been logged.
    """
    try:
        # deletes the existing prediction file from last run!
        self.pred_data_val.deletePredictionFile()
        self.log_writer.log(self.file_object, 'Start of Prediction')
        data_getter = data_loader_prediction.Data_Getter_Pred(
            self.file_object, self.log_writer)
        data = data_getter.get_data()

        preprocessor = preprocessing.Preprocessor(self.file_object,
                                                  self.log_writer)
        # called with an empty column list, so nothing is requested for removal
        data = preprocessor.remove_columns(data, [])
        # replacing '?' with NaN values for imputation
        # (np.nan: the np.NaN alias was removed in NumPy 2.0)
        data.replace('?', np.nan, inplace=True)

        # check if missing values are present in the dataset
        is_null_present, cols_with_missing_values = preprocessor.is_null_present(data)
        # if missing values are there, replace them appropriately.
        if is_null_present:
            data = preprocessor.impute_missing_values(data, cols_with_missing_values)

        # clustering uses every column except 'Sex'
        df = data.drop(labels=['Sex'], axis=1)

        file_loader = file_methods.File_Operation(self.file_object,
                                                  self.log_writer)
        kmeans = file_loader.load_model('KMeans')
        data['clusters'] = kmeans.predict(df)

        # encode categorical data
        data = preprocessor.encode_categorical_columns(data)

        # Remember the row positions of each cluster so every prediction can
        # be written back to the slot of the row it belongs to.
        rows_by_cluster = {}
        for position, label in enumerate(data['clusters'].to_numpy()):
            rows_by_cluster.setdefault(label, []).append(position)

        predictions = [None] * len(data)
        for i, positions in rows_by_cluster.items():
            cluster_data = data.iloc[positions].drop(['clusters'], axis=1)
            model_name = file_loader.find_correct_model_file(i)
            model = file_loader.load_model(model_name)
            result = model.predict(np.array(cluster_data))
            for position, res in zip(positions, result):
                if res == 0:
                    predictions[position] = '1-8 Rings'
                elif res == 1:
                    predictions[position] = '11+ Rings'
                else:
                    predictions[position] = '9-10 Rings'

        final = pd.DataFrame(predictions, columns=['Predictions'])
        path = "Prediction_Output_File/Predictions.csv"
        # appends result to prediction file
        final.to_csv(path, header=True, mode='a+')
        self.log_writer.log(self.file_object, 'End of Prediction')
    except Exception as ex:
        self.log_writer.log(
            self.file_object,
            'Error occured while running the prediction!! Error:: %s' % ex)
        raise ex
    return path, final
def predictionFromModel(self):
    """Predict every input row, decode the labels and write the predictions CSV.

    Steps visible here: delete the previous prediction file, load the data,
    drop the listed measurement columns, replace invalid values, encode
    categorical values, impute missing values, assign clusters with the saved
    'KMeans' model, predict each cluster with its own model and decode the
    predictions with the encoder stored in 'EncoderPickle/enc.pickle'.

    Predictions are written in input-row order. Previously they were grouped
    by cluster, so row i of the output did not belong to row i of the input.

    Returns:
        str: path of the prediction CSV.

    Raises:
        Exception: any error from the steps above, after it has been logged.
    """
    try:
        # deletes the existing prediction file from last run!
        self.pred_data_val.deletePredictionFile()
        self.log_writer.log(self.file_object, 'Start of Prediction')
        data_getter = data_loader_prediction.Data_Getter_Pred(
            self.file_object, self.log_writer)
        data = data_getter.get_data()

        preprocessor = preprocessing.Preprocessor(self.file_object,
                                                  self.log_writer)
        data = preprocessor.dropUnnecessaryColumns(data, [
            'TSH_measured', 'T3_measured', 'TT4_measured', 'T4U_measured',
            'FTI_measured', 'TBG_measured', 'TBG', 'TSH'
        ])

        # replacing '?' values with np.nan as discussed in the EDA part
        data = preprocessor.replaceInvalidValuesWithNull(data)

        # get encoded values for categorical data
        data = preprocessor.encodeCategoricalValuesPrediction(data)

        is_null_present = preprocessor.is_null_present(data)
        if is_null_present:
            data = preprocessor.impute_missing_values(data)

        file_loader = file_methods.File_Operation(self.file_object,
                                                  self.log_writer)
        kmeans = file_loader.load_model('KMeans')
        data['clusters'] = kmeans.predict(data)

        # let's load the encoder pickle file to decode the values
        # NOTE: pickle.load can execute code from the file; this file must be
        # a trusted artefact.
        with open('EncoderPickle/enc.pickle', 'rb') as file:
            encoder = pickle.load(file)

        # Remember the row positions of each cluster so every prediction can
        # be written back to the slot of the row it belongs to.
        rows_by_cluster = {}
        for position, label in enumerate(data['clusters'].to_numpy()):
            rows_by_cluster.setdefault(label, []).append(position)

        predictions = [None] * len(data)
        for i, positions in rows_by_cluster.items():
            cluster_data = data.iloc[positions].drop(['clusters'], axis=1)
            model_name = file_loader.find_correct_model_file(i)
            model = file_loader.load_model(model_name)
            decoded = encoder.inverse_transform(model.predict(cluster_data))
            for position, val in zip(positions, decoded):
                predictions[position] = val

        result = pandas.DataFrame(predictions, columns=['Predictions'])
        path = "Prediction_Output_File/Predictions.csv"
        # writes the prediction file (default write mode, not append)
        result.to_csv(path, header=True)
        self.log_writer.log(self.file_object, 'End of Prediction')
    except Exception as ex:
        self.log_writer.log(
            self.file_object,
            'Error occured while running the prediction!! Error:: %s' % ex)
        raise ex
    return path
def predictionFromModel(self):
    """Predict every input row with its cluster's model and write the predictions CSV.

    Steps visible here: delete the previous prediction file, load the data,
    drop the listed columns, drop rows with missing values, encode
    categorical values, scale, assign clusters with the saved 'KMeans' model
    and predict each cluster with its own model.

    Predictions are written in the row order of the scaled frame (the input
    rows left after dropping incomplete ones). Previously they were grouped
    by cluster.

    Returns:
        str: path of the prediction CSV.

    Raises:
        Exception: any error from the steps above, after it has been logged.
    """
    try:
        # deletes the existing prediction file from last run!
        self.pred_data_val.deletePredictionFile()
        self.log_writer.log(self.file_object, 'Start of Prediction')
        data_getter = data_loader_prediction.Data_Getter_Pred(
            self.file_object, self.log_writer)
        data = data_getter.get_data()

        preprocessor = preprocessing.Preprocessor(self.file_object,
                                                  self.log_writer)
        data = preprocessor.dropUnnecessaryColumns(
            data, ["serial", "rate", "listed_in(type)", "listed_in(city)"])

        is_null_present, _ = preprocessor.is_null_present(data)
        if is_null_present:
            data = data.dropna(how='any')

        # get encoded values for categorical data
        data = preprocessor.encodeCategoricalValues(data)

        # scale the prediction data
        data_scaled = pandas.DataFrame(preprocessor.standardScalingData(data),
                                       columns=data.columns)

        file_loader = file_methods.File_Operation(self.file_object,
                                                  self.log_writer)
        kmeans = file_loader.load_model('KMeans')
        data_scaled['clusters'] = kmeans.predict(data_scaled)

        # Remember the row positions of each cluster so every prediction can
        # be written back to the slot of the row it belongs to.
        rows_by_cluster = {}
        for position, label in enumerate(data_scaled['clusters'].to_numpy()):
            rows_by_cluster.setdefault(label, []).append(position)

        predictions = [None] * len(data_scaled)
        for i, positions in rows_by_cluster.items():
            cluster_data = data_scaled.iloc[positions].drop(['clusters'],
                                                            axis=1)
            model_name = file_loader.find_correct_model_file(i)
            model = file_loader.load_model(model_name)
            for position, val in zip(positions,
                                     model.predict(cluster_data.values)):
                predictions[position] = val

        result = pandas.DataFrame(predictions, columns=['Predictions'])
        path = "Prediction_Output_File/Predictions.csv"
        # writes the prediction file (default write mode, not append)
        result.to_csv(path, header=True)
        self.log_writer.log(self.file_object, 'End of Prediction')
    except Exception as ex:
        self.log_writer.log(
            self.file_object,
            'Error occured while running the prediction!! Error:: %s' % ex)
        raise ex
    return path
def predictionFromModel(self):
    """Predict every input row with its cluster's model and write the predictions CSV.

    Steps visible here: delete the previous prediction file, load the data,
    drop the listed columns, encode categorical values, scale, assign
    clusters with the saved 'KMeans' model and predict each cluster with its
    own model.

    Predictions are written in input-row order. Previously they were grouped
    by cluster, so row i of the output did not belong to row i of the input.

    Returns:
        tuple: (path of the prediction CSV, JSON records of the first rows).

    Raises:
        Exception: any error from the steps above, after it has been logged.
    """
    try:
        # deletes the existing prediction file from last run!
        self.pred_data_val.deletePredictionFile()
        self.log_writer.log(self.file_object, 'Start of Prediction')
        data_getter = data_loader_prediction.Data_Getter_Pred(
            self.file_object, self.log_writer)
        data = data_getter.get_data()

        preprocessor = preprocessing.Preprocessor(self.file_object,
                                                  self.log_writer)
        data = preprocessor.dropUnnecessaryColumns(data, [
            'id', 'region', 'url', 'region_url', 'image_url', 'state', 'type',
            'dogs_allowed'
        ])

        # get encoded values for categorical data
        data = preprocessor.encodeCategoricalValuesPrediction(data)
        data_scaled = pandas.DataFrame(preprocessor.standardScalingData(data),
                                       columns=data.columns)

        file_loader = file_methods.File_Operation(self.file_object,
                                                  self.log_writer)
        kmeans = file_loader.load_model('KMeans')
        data_scaled['clusters'] = kmeans.predict(data_scaled)

        # Remember the row positions of each cluster so every prediction can
        # be written back to the slot of the row it belongs to.
        rows_by_cluster = {}
        for position, label in enumerate(data_scaled['clusters'].to_numpy()):
            rows_by_cluster.setdefault(label, []).append(position)

        predictions = [None] * len(data_scaled)
        for i, positions in rows_by_cluster.items():
            cluster_data = data_scaled.iloc[positions].drop(['clusters'],
                                                            axis=1)
            model_name = file_loader.find_correct_model_file(i)
            model = file_loader.load_model(model_name)
            for position, val in zip(positions,
                                     model.predict(cluster_data.values)):
                predictions[position] = val

        result = pandas.DataFrame(predictions, columns=['Prediction'])
        path = "Prediction_Output_File/Predictions.csv"
        # appends result to prediction file
        result.to_csv(path, header=True, mode='a+')
        self.log_writer.log(self.file_object, 'End of Prediction')
    except Exception as ex:
        self.log_writer.log(
            self.file_object,
            'Error occured while running the prediction!! Error:: %s' % ex)
        raise ex
    return path, result.head().to_json(orient="records")
def predictionFromModel(self, singlerecdata=None):
    """Predict for the stored batch data or, when given, for a single record.

    Args:
        singlerecdata: record handed to ``get_data_for_rec``; when ``None``
            the batch data from ``get_data`` is used instead.

    Returns:
        The path of the written predictions CSV in batch mode, otherwise the
        decoded prediction for the single record.

    Raises:
        Exception: any error from the pipeline, after it has been logged.
    """
    batch_mode = singlerecdata is None
    try:
        self.loggerObj.logger_log('Start of Prediction')
        getter = data_loader_prediction.Data_Getter_Pred(self.loggerObj)
        if batch_mode:
            frame = getter.get_data()
        else:
            frame = getter.get_data_for_rec(singlerecdata)

        prep = preprocessing.Preprocessor(self.loggerObj)
        # swap '?' placeholders for nulls (see EDA), then drop unused columns
        frame = prep.replaceInvalidValuesWithNull(frame)
        frame = prep.dropUnnecessaryColumnsForPrediction(frame)

        # impute only when nulls are actually present
        if prep.is_null_present(frame):
            frame = prep.impute_missing_values(frame)

        frame = prep.encodeCategoricalValuesPrediction(frame)
        frame = prep.prediction_data_standardisation(frame)

        loader = file_methods.File_Operation(self.loggerObj)
        model = loader.load_model(loader.find_model_file())

        # the pickled encoder turns predicted codes back into labels
        with open('EncoderPickle/enc.pickle', 'rb') as handle:
            encoder = pickle.load(handle)

        decoded = encoder.inverse_transform(model.predict(frame))

        if not batch_mode:
            self.loggerObj.logger_log('End of Prediction')
            return decoded

        result = pandas.DataFrame(list(decoded), columns=['Predictions'])
        path = "Prediction_Output_File/Predictions.csv"
        result.to_csv("Prediction_Output_File/Predictions.csv", header=True)
        self.loggerObj.logger_log('End of Prediction')
        return path
    except Exception as ex:
        self.loggerObj.logger_log(
            'Error occured while running the prediction!! Error:: %s' % ex)
        raise ex
def predictionFromModel(self):
    """Predict every wafer with its cluster's model and append the results to the CSV.

    Steps visible here: delete the previous prediction file, load and
    preprocess the data, assign clusters with the saved 'KMeans' model, then
    for each cluster load its model, predict that cluster's rows and append
    (Wafer, Prediction) pairs to the prediction file.

    Returns:
        tuple: (path of the prediction CSV, JSON records of the head of the
        last cluster's result frame; an empty frame when there were no rows).

    Raises:
        Exception: any error from the steps above, after it has been logged.
    """
    # path to save the dataframe as csv file
    path = "Prediction_Output_File/Predictions.csv"
    # Defined up front so the return statement works even with no clusters.
    result = pandas.DataFrame(columns=['Wafer', 'Prediction'])
    try:
        # deletes the existing prediction file from last run!
        self.pred_data_val.deletePredictionFile()
        self.log_writer.log(self.file_object, 'Start of Prediction')
        data_getter = data_loader_prediction.Data_Getter_Pred(
            self.file_object, self.log_writer)
        data = data_getter.get_data()

        preprocessor = preprocessing.Preprocessor(self.file_object,
                                                  self.log_writer)
        is_null_present = preprocessor.is_null_present(data)
        if is_null_present:
            data = preprocessor.impute_missing_values(data)

        cols_to_drop = preprocessor.get_columns_with_zero_std_deviation(data)
        data = preprocessor.remove_columns(data, cols_to_drop)

        file_loader = file_methods.File_Operation(self.file_object,
                                                  self.log_writer)
        kmeans = file_loader.load_model('KMeans')

        # the 'Wafer' name column is not a feature, so drop it for clustering
        data['clusters'] = kmeans.predict(data.drop(['Wafer'], axis=1))

        for position, i in enumerate(data['clusters'].unique()):
            # selecting all the records of a particular cluster type
            cluster_data = data[data['clusters'] == i]
            # getting all the wafer names
            wafer_names = list(cluster_data['Wafer'])
            # Drop from cluster_data, not from data: the model must score only
            # this cluster's rows so each prediction matches its wafer name.
            cluster_features = cluster_data.drop(['Wafer', 'clusters'], axis=1)
            # finding the model name for that cluster
            model_name = file_loader.find_correct_model_file(i)
            # loading the model using the model name
            model = file_loader.load_model(model_name)
            # these are the predicted values
            pred_values = list(model.predict(cluster_features))
            # creating a dataframe with wafer names and predictions
            result = pandas.DataFrame(list(zip(wafer_names, pred_values)),
                                      columns=['Wafer', 'Prediction'])
            # Appends to the prediction file. Only the first chunk writes the
            # header, so the CSV does not get a header row per cluster.
            result.to_csv(path, header=(position == 0), mode='a+')
        self.log_writer.log(self.file_object, 'End of Prediction')
    except Exception as ex:
        self.log_writer.log(
            self.file_object,
            'Error occured while running the prediction!! Error:: %s' % ex)
        raise ex
    return path, result.head().to_json(orient="records")
def predictionFromModel(self):
    """Predict every input row with its cluster's model and write the predictions CSV.

    Steps visible here: delete the previous prediction file, load the data,
    replace -1 with 1, keep the ten listed feature columns, assign clusters
    with the saved 'KMeans' model and predict each cluster with its own model.

    Predictions are written in input-row order. Previously they were grouped
    by cluster, so row i of the output did not belong to row i of the input.

    Returns:
        tuple: (path of the prediction CSV, JSON records of the first rows).

    Raises:
        Exception: any error from the steps above, after it has been logged.
    """
    try:
        # deletes the existing prediction file from last run!
        self.pred_data_val.deletePredictionFile()
        self.log_writer.log(self.file_object, 'Start of Prediction')
        data_getter = data_loader_prediction.Data_Getter_Pred(
            self.file_object, self.log_writer)
        data = data_getter.get_data()

        # Constructed as in the original; no preprocessor method is called here.
        preprocessor = preprocessing.Preprocessor(self.file_object,
                                                  self.log_writer)

        data.replace(-1, 1, inplace=True)
        # Explicit copy so the 'clusters' column is added to an independent
        # frame rather than to a slice of `data`.
        new_data = data[[
            'H18', 'F76', 'F46', 'G57', 'C13', 'A71', 'E115', 'F56', 'I59',
            'A91'
        ]].copy()

        file_loader = file_methods.File_Operation(self.file_object,
                                                  self.log_writer)
        kmeans = file_loader.load_model('KMeans')
        new_data['clusters'] = kmeans.predict(new_data)

        # Remember the row positions of each cluster so every prediction can
        # be written back to the slot of the row it belongs to.
        rows_by_cluster = {}
        for position, label in enumerate(new_data['clusters'].to_numpy()):
            rows_by_cluster.setdefault(label, []).append(position)

        predictions = [None] * len(new_data)
        for i, positions in rows_by_cluster.items():
            cluster_data = new_data.iloc[positions].drop(['clusters'], axis=1)
            model_name = file_loader.find_correct_model_file(i)
            model = file_loader.load_model(model_name)
            for position, val in zip(positions, model.predict(cluster_data)):
                predictions[position] = val

        result = pandas.DataFrame(predictions, columns=['Prediction'])
        path = "Prediction_Output_File/Predictions.csv"
        # appends result to prediction file
        result.to_csv(path, header=True, mode='a+')
        self.log_writer.log(self.file_object, 'End of Prediction')
    except Exception as ex:
        self.log_writer.log(
            self.file_object,
            'Error occured while running the prediction!! Error:: %s' % ex)
        raise ex
    return path, result.head().to_json(orient="records")
def predictionFromModel(self):
    """Predict every input row with its cluster's model and write the predictions CSV.

    Steps visible here: delete the previous prediction file, load the data,
    replace invalid values, impute missing values, assign clusters with the
    saved 'KMeans' model and predict each cluster with its own model.

    Predictions are written in input-row order. Previously they were grouped
    by cluster, so row i of the output did not belong to row i of the input.

    Returns:
        str: path of the prediction CSV.

    Raises:
        Exception: any error from the steps above, after it has been logged.
    """
    try:
        self.pred_data_val.deletePredictionFile()
        self.log_writer.log(self.file_object, "Start of Prediction!!")
        data_getter = Data_Getter_Pred(self.file_object, self.log_writer)
        data = data_getter.get_data()

        preprocessor = preprocessing.Preprocessor(self.file_object,
                                                  self.log_writer)
        data = preprocessor.replaceInvalidValuesWithNull(data)

        is_null_present, cols_with_missing_values = preprocessor.is_null_present(data)
        if is_null_present:
            data = preprocessor.impute_missing_values(
                data, cols_with_missing_values)

        file_loader = file_methods.File_Operation(self.file_object,
                                                  self.log_writer)
        kmeans = file_loader.load_model('KMeans')
        data['clusters'] = kmeans.predict(data)

        # Remember the row positions of each cluster so every prediction can
        # be written back to the slot of the row it belongs to.
        rows_by_cluster = {}
        for position, label in enumerate(data['clusters'].to_numpy()):
            rows_by_cluster.setdefault(label, []).append(position)

        predictions = [None] * len(data)
        for i, positions in rows_by_cluster.items():
            cluster_data = data.iloc[positions].drop(['clusters'], axis=1)
            model_name = file_loader.find_correct_model_file(i)
            model = file_loader.load_model(model_name)
            for position, val in zip(positions, model.predict(cluster_data)):
                predictions[position] = val

        results = pd.DataFrame(predictions, columns=['Predictions'])
        path = "Prediction_Output_File/Predictions.csv"
        results.to_csv(path, header=True)
        self.log_writer.log(self.file_object, 'End of Prediction')
    except Exception as e:
        self.log_writer.log(
            self.file_object,
            'Error occured while running the prediction!! Error:: %s' % e)
        raise e
    return path
def predictionFromModel(self):
    """Predict every input row with the saved 'XGBOOST' model and write the predictions CSV.

    Steps visible here: delete the previous prediction file, load the data,
    impute missing values, run the categorical encoder, create time features,
    predict, and write the predictions with the 'Item_Identifier' and
    'Outlet_Identifier' of each row.

    Returns:
        str: path of the prediction CSV.

    Raises:
        Exception: any error from the steps above, after it has been logged.
    """
    try:
        # deletes the existing prediction file from last run!
        self.pred_data_val.deletePredictionFile()
        self.log_writer.log(self.file_object, 'Start of Prediction')
        data_getter = data_loader_prediction.Data_Getter_Pred(
            self.file_object, self.log_writer)
        data = data_getter.get_data()

        preprocessor = preprocessing.Preprocessor(self.file_object,
                                                  self.log_writer)

        is_null_present, _ = preprocessor.is_null_present(data)
        if is_null_present:
            data = preprocessor.impute_missing_values(data)

        # encode the prediction data
        # NOTE(review): the encoded frame is never used below; the model is
        # fed `data`. Kept as in the original - confirm whether the encoded
        # frame should feed the later steps.
        data_scaled = preprocessor.encodeCategoricalValuesPrediction(data)

        # Time features
        data = preprocessor.create_timefeatures(data)

        file_loader = file_methods.File_Operation(self.file_object,
                                                  self.log_writer)
        model = file_loader.load_model('XGBOOST')

        # Fix: the original read `result.model.predict(data)`, which uses
        # `result` before it exists and always raised.
        result = pandas.DataFrame(model.predict(data),
                                  columns=['Predictions'])
        # Predictions are positional, so copy the identifiers by position
        # rather than by index label.
        result['Item_Identifier'] = data["Item_Identifier"].to_numpy()
        result["Outlet_Identifier"] = data["Outlet_Identifier"].to_numpy()

        path = "Prediction_Output_File/Predictions.csv"
        # writes the prediction file (default write mode, not append)
        result.to_csv(path, header=True)
        self.log_writer.log(self.file_object, 'End of Prediction')
    except Exception as ex:
        self.log_writer.log(
            self.file_object,
            'Error occured while running the prediction!! Error:: %s' % ex)
        raise ex
    return path
def predictionFromModel(self, input):
    """Predict with the saved 'bike_share_rf_model' for the given input.

    Args:
        input: features passed straight to the model's ``predict``.

    Returns:
        Whatever the loaded model's ``predict`` returns for ``input``.

    Raises:
        Exception: any error raised while loading the model or predicting,
            after it has been printed and logged.
    """
    try:
        self.log_writer.log(self.file_object, 'Start of Prediction')
        loader = file_methods.File_Operation(self.file_object,
                                             self.log_writer)
        print('file_loader ready ', loader)
        return loader.load_model('bike_share_rf_model').predict(input)
    except Exception as err:
        print('Got some errors')
        print('Error mesage ', err)
        self.log_writer.log(
            self.file_object,
            'Error occured while running the prediction!! Error:: %s' % err)
        raise err
def create_clusters(self, data, number_of_clusters):
    """
    Method Name: create_clusters
    Description: Fit KMeans with the requested number of clusters, save the
                 fitted model under the name 'KMeans' and store each row's
                 cluster label in a new 'Cluster' column of the data.
    Output: The same data object that was passed in, with a 'Cluster' column added
    On Failure: Raise Exception (the original error is attached as its cause)
    """
    self.logger_object.log(
        self.file_object,
        'Entered the create_clusters method of the KMeansClustering class')
    self.data = data
    try:
        self.kmeans = KMeans(n_clusters=number_of_clusters,
                             init='k-means++',
                             random_state=42)
        # divide data into clusters
        self.y_kmeans = self.kmeans.fit_predict(data)

        # saving the KMeans model to directory
        self.file_op = file_methods.File_Operation(self.file_object,
                                                   self.logger_object)
        self.save_model = self.file_op.save_model(self.kmeans, 'KMeans')

        # create a new column in dataset for storing the cluster information
        self.data['Cluster'] = self.y_kmeans

        # Report the count that was actually used. self.kn only exists after
        # elbow_plot() has run, so reading it here could fail a good run.
        self.logger_object.log(
            self.file_object,
            'succesfully created ' + str(number_of_clusters) +
            'clusters. Exited the create_clusters method of the KMeansClustering class'
        )
        return self.data
    except Exception as e:
        self.logger_object.log(
            self.file_object,
            'Exception occured in create_clusters method of the KMeansClustering class. Exception message: '
            + str(e))
        self.logger_object.log(
            self.file_object,
            'Fitting the data to clusters failed. Exited the create_clusters method of the KMeansClustering class'
        )
        # Same exception type as before, but with the message and cause kept.
        raise Exception(str(e)) from e
def create_clusters(self, data, number_of_clusters):
    """Fit KMeans on `data` and add the cluster assignment as a column.

    The fitted model is saved under the name 'KMeans' and the assignments
    are stored in a new 'Clusters' column of `data` (the frame passed in is
    modified, not copied).

    :param data: the dataset to cluster.
    :param number_of_clusters: value passed to KMeans as `n_clusters`.
    :return: `data` with the additional 'Clusters' column.
    :raises Exception: on any failure, carrying the original error message
        and chained to the original exception.
    """
    self.logger_object.log(
        self.file_object,
        'Entered the create_clusters method of KMeansClustering class.')
    self.data = data
    try:
        self.kmeans = KMeans(n_clusters=number_of_clusters,
                             init='k-means++',
                             random_state=42)
        # divide data into clusters
        self.y_means = self.kmeans.fit_predict(data)

        self.file_op = file_methods.File_Operation(self.file_object,
                                                   self.logger_object)
        # saving the model to directory
        self.save_model = self.file_op.save_model(self.kmeans, 'KMeans')

        # create a new column in data set to store the cluster information
        self.data['Clusters'] = self.y_means
        return self.data
    except Exception as e:
        self.logger_object.log(
            self.file_object,
            'Exception occured in create_clusters method of the KMeansClustering class. Exception message: '
            + str(e))
        self.logger_object.log(
            self.file_object,
            'Fitting the data to clusters failed. Exited the create_clusters method of the KMeansClustering class'
        )
        # Same exception type as before, but with the message and cause kept.
        raise Exception(str(e)) from e
def trainingModel(self):
    """Train and save one model per KMeans cluster (label column 'cardio').

    Loads the training data, drops the 'id' column, runs the outlier and
    data-processing preprocessing steps, clusters the features and, for each
    cluster, oversamples the data, splits it into train/test sets and saves
    the model returned by the model finder under `<model name><cluster>`.
    The log file object is closed when the method finishes, whether it
    succeeds or fails.

    Raises:
        Exception: when any step fails; it carries the original message and
            is chained to the original exception.
    """
    # Logging the start of Training
    self.log_writer.log(self.file_object, 'Start of Training')
    try:
        # Getting the data from the source
        data_getter = data_loader.Data_Getter(self.file_object,
                                              self.log_writer)
        data = data_getter.get_data()

        # Data preprocessing
        preprocessor = preprocessing.Preprocessor(self.file_object,
                                                  self.log_writer)
        data = preprocessor.dropUnnecessaryColumns(data, ['id'])
        data = preprocessor.dropOutliers(data)
        data = preprocessor.dataProcessor(data)

        # create separate features and labels
        X, Y = preprocessor.separate_label_feature(
            data, label_column_name='cardio')

        # Clustering: the value returned by elbow_plot is used as the
        # number of clusters.
        kmeans = clustering.KMeansClustering(self.file_object,
                                             self.log_writer)
        number_of_clusters = kmeans.elbow_plot(X)
        X = kmeans.create_clusters(X, number_of_clusters)

        # keep the labels next to the cluster assignments
        X['Labels'] = Y
        list_of_clusters = X['Cluster'].unique()

        # Look for the best model for each cluster in turn.
        for i in list_of_clusters:
            cluster_data = X[X['Cluster'] == i]
            cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                                                 axis=1)
            cluster_label = cluster_data['Labels']

            # Handle the imbalanced dataset. `fit_resample` replaces
            # `fit_sample`, which newer imbalanced-learn releases no longer
            # provide.
            rdsmple = RandomOverSampler()
            x_sampled, y_sampled = rdsmple.fit_resample(
                cluster_features, cluster_label)

            x_train, x_test, y_train, y_test = train_test_split(
                x_sampled, y_sampled, test_size=1 / 3, random_state=355)

            model_finder = tuner.Model_Finder(self.file_object,
                                              self.log_writer)
            best_model_name, best_model = model_finder.get_best_model(
                x_train, y_train, x_test, y_test)

            # saving the best model to the directory
            file_op = file_methods.File_Operation(self.file_object,
                                                  self.log_writer)
            file_op.save_model(best_model, best_model_name + str(i))

        # logging the successful Training
        self.log_writer.log(self.file_object, 'Successful End of Training')
    except Exception as e:
        # logging the unsuccessful Training
        self.log_writer.log(self.file_object, 'Unsuccessful End of Training')
        raise Exception(str(e)) from e
    finally:
        # Single close for both the success and the failure path.
        self.file_object.close()
def trainingModel(self):
    """Train and save one model per KMeans cluster (label column 'Output').

    Loads the training data, removes the 'Wafer' column, imputes missing
    values when any are reported, removes the columns reported as having
    zero standard deviation, clusters the features and, for each cluster,
    saves the model returned by the model finder under
    `<model name><cluster>`. The log file object is closed when the method
    finishes, whether it succeeds or fails.

    Raises:
        Exception: when any step fails; it carries the original message and
            is chained to the original exception.
    """
    # Logging the start of Training
    self.log_writer.log(self.file_object, 'Start of Training')
    try:
        # Getting the data from the source
        data_getter = data_loader.Data_Getter(self.file_object,
                                              self.log_writer)
        data = data_getter.get_data()

        # Data preprocessing
        preprocessor = preprocessing.Preprocessor(self.file_object,
                                                  self.log_writer)
        # remove the column that doesn't contribute to prediction
        data = preprocessor.remove_columns(data, ['Wafer'])

        # create separate features and labels
        X, Y = preprocessor.separate_label_feature(
            data, label_column_name='Output')

        # if missing values are there, replace them appropriately
        is_null_present = preprocessor.is_null_present(X)
        if is_null_present:
            X = preprocessor.impute_missing_values(X)

        # Columns with zero standard deviation hold a constant value and so
        # cannot help the prediction; drop them.
        cols_to_drop = preprocessor.get_columns_with_zero_std_deviation(X)
        X = preprocessor.remove_columns(X, cols_to_drop)

        # Clustering: the value returned by elbow_plot is used as the
        # number of clusters.
        kmeans = clustering.KMeansClustering(self.file_object,
                                             self.log_writer)
        number_of_clusters = kmeans.elbow_plot(X)
        X = kmeans.create_clusters(X, number_of_clusters)

        # keep the labels next to the cluster assignments
        X['Labels'] = Y
        list_of_clusters = X['Cluster'].unique()

        # Look for the best model for each cluster in turn.
        for i in list_of_clusters:
            cluster_data = X[X['Cluster'] == i]
            cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                                                 axis=1)
            cluster_label = cluster_data['Labels']

            x_train, x_test, y_train, y_test = train_test_split(
                cluster_features,
                cluster_label,
                test_size=1 / 3,
                random_state=355)

            model_finder = tuner.Model_Finder(self.file_object,
                                              self.log_writer)
            best_model_name, best_model = model_finder.get_best_model(
                x_train, y_train, x_test, y_test)

            # saving the best model to the directory
            file_op = file_methods.File_Operation(self.file_object,
                                                  self.log_writer)
            file_op.save_model(best_model, best_model_name + str(i))

        # logging the successful Training
        self.log_writer.log(self.file_object, 'Successful End of Training')
    except Exception as e:
        # logging the unsuccessful Training
        self.log_writer.log(self.file_object, 'Unsuccessful End of Training')
        raise Exception(str(e)) from e
    finally:
        # Single close for both the success and the failure path.
        self.file_object.close()
def trainingModel(self):
    """Train and save one model per KMeans cluster (label column 'class').

    Loads the training data, encodes the categorical values, separates the
    'class' column as label, rebalances the dataset, clusters the features
    and, for each cluster, scales the train/test split and saves the model
    returned by the model finder under `<model name><cluster>`. The log file
    object is closed when the method finishes, whether it succeeds or fails.

    Raises:
        Exception: when any step fails; it carries the original message and
            is chained to the original exception.
    """
    # Logging the start of Training
    self.log_writer.log(self.file_object, 'Start of Training')
    try:
        # Getting the data from the source
        data_getter = data_loader.Data_Getter(self.file_object,
                                              self.log_writer)
        data = data_getter.get_data()

        # Data preprocessing
        preprocessor = preprocessing.Preprocessor(self.file_object,
                                                  self.log_writer)
        data = preprocessor.enocdeCategoricalvalues(data)
        X = data.drop(['class'], axis=1)
        Y = data['class']
        X, Y = preprocessor.handleImbalanceDataset(X, Y)

        # Clustering: the value returned by elbow_plot is used as the
        # number of clusters.
        kmeans = clustering.KMeansClustering(self.file_object,
                                             self.log_writer)
        number_of_clusters = kmeans.elbow_plot(X)
        X = kmeans.create_clusters(X, number_of_clusters)

        # keep the labels next to the cluster assignments
        X['Labels'] = Y
        list_of_clusters = X['Cluster'].unique()

        # Look for the best model for each cluster in turn.
        for i in list_of_clusters:
            cluster_data = X[X['Cluster'] == i]
            cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                                                 axis=1)
            cluster_label = cluster_data['Labels']

            x_train, x_test, y_train, y_test = train_test_split(
                cluster_features,
                cluster_label,
                test_size=1 / 3,
                random_state=355)

            # NOTE(review): train and test are passed to scaleData in two
            # separate calls; confirm the helper applies the same scaling
            # parameters to both.
            x_train = preprocessor.scaleData(x_train)
            x_test = preprocessor.scaleData(x_test)

            model_finder = tuner.Model_Finder(self.file_object,
                                              self.log_writer)
            best_model_name, best_model = model_finder.get_best_model(
                x_train, y_train, x_test, y_test)

            # saving the best model to the directory
            file_op = file_methods.File_Operation(self.file_object,
                                                  self.log_writer)
            file_op.save_model(best_model, best_model_name + str(i))

        # logging the successful Training
        self.log_writer.log(self.file_object, 'Successful End of Training')
    except Exception as e:
        # logging the unsuccessful Training
        self.log_writer.log(self.file_object, 'Unsuccessful End of Training')
        raise Exception(str(e)) from e
    finally:
        # Single close for both the success and the failure path.
        self.file_object.close()
def trainingModel(self):
    """Train and save one model per KMeans cluster.

    The label column is 'default payment next month'. Loads the training
    data, imputes missing values when any are reported, clusters the
    features and, for each cluster, scales the numerical columns of the
    train/test split and saves the model returned by the model finder under
    `<model name><cluster>`. The log file object is closed when the method
    finishes, whether it succeeds or fails.

    Raises:
        Exception: when any step fails; it carries the original message and
            is chained to the original exception.
    """
    # Logging the start of Training
    self.log_writer.log(self.file_object, 'Start of Training')
    try:
        # Getting the data from the source
        data_getter = data_loader.Data_Getter(self.file_object,
                                              self.log_writer)
        data = data_getter.get_data()

        # Data preprocessing
        preprocessor = preprocessing.Preprocessor(self.file_object,
                                                  self.log_writer)

        # create separate features and labels
        X, Y = preprocessor.separate_label_feature(
            data, label_column_name='default payment next month')

        # if missing values are there, replace them appropriately
        is_null_present, cols_with_missing_values = (
            preprocessor.is_null_present(X))
        if is_null_present:
            X = preprocessor.impute_missing_values(X,
                                                   cols_with_missing_values)

        # Clustering: the value returned by elbow_plot is used as the
        # number of clusters.
        kmeans = clustering.KMeansClustering(self.file_object,
                                             self.log_writer)
        number_of_clusters = kmeans.elbow_plot(X)
        X = kmeans.create_clusters(X, number_of_clusters)

        # keep the labels next to the cluster assignments
        X['Labels'] = Y
        list_of_clusters = X['Cluster'].unique()

        # Look for the best model for each cluster in turn.
        for i in list_of_clusters:
            cluster_data = X[X['Cluster'] == i]
            cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                                                 axis=1)
            cluster_label = cluster_data['Labels']

            x_train, x_test, y_train, y_test = train_test_split(
                cluster_features,
                cluster_label,
                test_size=1 / 3,
                random_state=355)

            # NOTE(review): train and test are scaled in two separate
            # calls; confirm the helper applies the same scaling parameters
            # to both.
            train_x = preprocessor.scale_numerical_columns(x_train)
            test_x = preprocessor.scale_numerical_columns(x_test)

            model_finder = tuner.Model_Finder(self.file_object,
                                              self.log_writer)
            best_model_name, best_model = model_finder.get_best_model(
                train_x, y_train, test_x, y_test)

            # saving the best model to the directory
            file_op = file_methods.File_Operation(self.file_object,
                                                  self.log_writer)
            file_op.save_model(best_model, best_model_name + str(i))

        # logging the successful Training
        self.log_writer.log(self.file_object, 'Successful End of Training')
    except Exception as e:
        # logging the unsuccessful Training
        self.log_writer.log(self.file_object, 'Unsuccessful End of Training')
        raise Exception(str(e)) from e
    finally:
        # Single close for both the success and the failure path.
        self.file_object.close()
def predictionFromModel(self):
    """Predict 'Y'/'N' for every input row and append the result to a CSV.

    Deletes the previous prediction file, loads the prediction data, removes
    the listed columns, replaces '?' with NaN, imputes missing values when
    any are reported, encodes and scales the data, assigns each row to a
    cluster with the saved 'KMeans' model and predicts each cluster with the
    model file found for it. A model output of 0 becomes 'N', anything else
    'Y'. Predictions are written in the order of the input rows.

    Returns:
        The relative path of the CSV file that was written.

    Raises:
        Exception: any error raised along the way is logged and re-raised.
    """
    try:
        self.pred_data_val.deletePredictionFile()
        self.log_writer.log(self.file_object, 'Start of Prediction')
        data_getter = data_loader_prediction.Data_Getter_Pred(
            self.file_object, self.log_writer)
        data = data_getter.get_data()

        preprocessor = preprocessing.Preprocessor(self.file_object,
                                                  self.log_writer)
        data = preprocessor.remove_columns(data, [
            'policy_number', 'policy_bind_date', 'policy_state',
            'insured_zip', 'incident_location', 'incident_date',
            'incident_state', 'incident_city', 'insured_hobbies',
            'auto_make', 'auto_model', 'auto_year', 'age',
            'total_claim_amount'
        ])
        # `np.nan` is the spelling available in every NumPy release; the
        # `np.NaN` alias no longer exists in NumPy 2.
        data.replace('?', np.nan, inplace=True)

        is_null_present, cols_with_missing_values = (
            preprocessor.is_null_present(data))
        if is_null_present:
            data = preprocessor.impute_missing_values(
                data, cols_with_missing_values)
        data = preprocessor.encode_categorical_columns(data)
        data = preprocessor.scale_numerical_columns(data)

        file_loader = file_methods.File_Operation(self.file_object,
                                                  self.log_writer)
        kmeans = file_loader.load_model('KMeans')
        data['clusters'] = kmeans.predict(data)

        # One slot per input row. Filling by row position keeps the output
        # in input order instead of grouping it by cluster, which made the
        # written rows impossible to map back to the input.
        predictions = [None] * len(data)
        for i in data['clusters'].unique():
            in_cluster = data['clusters'] == i
            cluster_data = data[in_cluster].drop(['clusters'], axis=1)
            model_name = file_loader.find_correct_model_file(i)
            model = file_loader.load_model(model_name)
            result = model.predict(cluster_data)

            row_positions = [pos for pos, hit in enumerate(in_cluster) if hit]
            for pos, res in zip(row_positions, result):
                predictions[pos] = 'N' if res == 0 else 'Y'

        final = pd.DataFrame({'Predictions': predictions})
        path = "Prediction_Output_File/Predictions.csv"
        final.to_csv(path, header=True, mode='a+')
        self.log_writer.log(self.file_object, 'End of Prediction')
    except Exception as ex:
        self.log_writer.log(
            self.file_object,
            'Error occured while running the prediction!! Error:: %s' % ex)
        raise
    return path
def trainingModel(self):
    """Train and save one model per KMeans cluster (label 'VISIBILITY').

    Loads the training data, drops the listed columns, replaces invalid
    values with nulls, imputes missing values when any are reported,
    clusters the features and, for each cluster, scales the train/test split
    and saves the model returned by the model finder under
    `<model name><cluster>`. The log file object is closed when the method
    finishes, whether it succeeds or fails.

    Raises:
        Exception: when any step fails; it carries the original message and
            is chained to the original exception.
    """
    # Logging the start of Training
    self.log_writer.log(self.file_object, 'Start of Training')
    try:
        # Getting the data from the source
        data_getter = data_loader.Data_Getter(self.file_object,
                                              self.log_writer)
        data = data_getter.get_data()

        # Data preprocessing
        preprocessor = preprocessing.Preprocessor(self.file_object,
                                                  self.log_writer)
        # removing unwanted columns
        data = preprocessor.dropUnnecessaryColumns(data, [
            'DATE', 'Precip', 'WETBULBTEMPF', 'DewPointTempF',
            'StationPressure'
        ])
        # replacing invalid values with nulls so they can be imputed
        data = preprocessor.replaceInvalidValuesWithNull(data)

        # if missing values are there, replace them appropriately
        is_null_present, _ = preprocessor.is_null_present(data)
        if is_null_present:
            data = preprocessor.impute_missing_values(data)

        # create separate features and labels
        X, Y = preprocessor.separate_label_feature(
            data, label_column_name='VISIBILITY')

        # Clustering: the value returned by elbow_plot is used as the
        # number of clusters.
        kmeans = clustering.KMeansClustering(self.file_object,
                                             self.log_writer)
        number_of_clusters = kmeans.elbow_plot(X)
        X = kmeans.create_clusters(X, number_of_clusters)

        # keep the labels next to the cluster assignments
        X['Labels'] = Y
        list_of_clusters = X['Cluster'].unique()

        # Look for the best model for each cluster in turn.
        for i in list_of_clusters:
            cluster_data = X[X['Cluster'] == i]
            cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                                                 axis=1)
            cluster_label = cluster_data['Labels']

            x_train, x_test, y_train, y_test = train_test_split(
                cluster_features,
                cluster_label,
                test_size=1 / 3,
                random_state=36)

            # NOTE(review): train and test are scaled in two separate
            # calls; confirm the helper applies the same scaling parameters
            # to both.
            x_train_scaled = preprocessor.standardScalingData(x_train)
            x_test_scaled = preprocessor.standardScalingData(x_test)

            model_finder = tuner.Model_Finder(self.file_object,
                                              self.log_writer)
            best_model_name, best_model = model_finder.get_best_model(
                x_train_scaled, y_train, x_test_scaled, y_test)

            # saving the best model to the directory
            file_op = file_methods.File_Operation(self.file_object,
                                                  self.log_writer)
            file_op.save_model(best_model, best_model_name + str(i))

        # logging the successful Training
        self.log_writer.log(self.file_object, 'Successful End of Training')
    except Exception as e:
        # logging the unsuccessful Training
        self.log_writer.log(self.file_object, 'Unsuccessful End of Training')
        raise Exception(str(e)) from e
    finally:
        # Single close for both the success and the failure path.
        self.file_object.close()
def trainingModel(self):
    """Train and save one model per KMeans cluster.

    The label column is 'approx_cost(for two people)'. Preprocessing steps,
    in order: drop the listed columns, remove duplicates, drop rows with
    missing values when any are reported, convert the cost to a number and
    encode the categorical values. The features are then clustered and, for
    each cluster, the scaled train/test split is handed to the model finder
    and the returned model is saved under `<model name><cluster>`.

    The log file object is closed on both the success and the failure path;
    on failure the original exception is re-raised.
    """
    # Logging the start of Training
    self.log_writer.log(self.file_object, 'Start of Training')
    try:
        # Getting the data from the source
        frame = data_loader.Data_Getter(self.file_object,
                                        self.log_writer).get_data()

        prep = preprocessing.Preprocessor(self.file_object, self.log_writer)

        # removing unwanted columns
        unwanted = ["serial", "rate", "listed_in(type)", "listed_in(city)"]
        frame = prep.dropUnnecessaryColumns(frame, unwanted)

        # removing the duplicates
        frame = prep.removeDuplicates(frame)

        # no imputation here: rows with missing values are dropped instead
        has_nulls, _ = prep.is_null_present(frame)
        if has_nulls:
            frame = frame.dropna(how='any')

        # cost value to float
        frame = prep.convertCostToNumber(frame)

        # get encoded values for categorical data
        frame = prep.encodeCategoricalValues(frame)

        # create separate features and labels
        X, Y = prep.separate_label_feature(
            frame, label_column_name='approx_cost(for two people)')

        # Clustering: the value returned by elbow_plot is used as the
        # number of clusters.
        clusterer = clustering.KMeansClustering(self.file_object,
                                                self.log_writer)
        cluster_count = clusterer.elbow_plot(X)
        X = clusterer.create_clusters(X, cluster_count)

        # keep the labels next to the cluster assignments
        X['Labels'] = Y

        # Look for the best model for each cluster in turn.
        for cluster_id in X['Cluster'].unique():
            members = X[X['Cluster'] == cluster_id]
            features = members.drop(['Labels', 'Cluster'], axis=1)
            labels = members['Labels']

            x_train, x_test, y_train, y_test = train_test_split(
                features, labels, test_size=1 / 3, random_state=36)

            x_train_scaled = prep.standardScalingData(x_train)
            x_test_scaled = prep.standardScalingData(x_test)

            finder = tuner.Model_Finder(self.file_object, self.log_writer)
            best_model_name, best_model = finder.get_best_model(
                x_train_scaled, y_train, x_test_scaled, y_test)

            # saving the best model to the directory
            saver = file_methods.File_Operation(self.file_object,
                                                self.log_writer)
            saver.save_model(best_model, best_model_name + str(cluster_id))

        # logging the successful Training
        self.log_writer.log(self.file_object, 'Successful End of Training')
        self.file_object.close()
    except Exception as e:
        # logging the unsuccessful Training
        self.log_writer.log(self.file_object, 'Unsuccessful End of Training')
        self.file_object.close()
        raise e
def trainingModel(self):
    """Train and save a single model for the label column 'Survived'.

    Loads the training data, runs the binning / feature-combination /
    categorical-conversion preprocessing steps, drops the intermediate
    columns, imputes missing values when any are reported, scales a
    train/test split and saves the model returned by the model finder under
    its own name. No clustering is applied. The head of the data, the
    labels, the predictions and the accuracy are printed along the way. The
    log file object is closed when the method finishes, whether it succeeds
    or fails.

    Raises:
        Exception: when any step fails; it carries the original message and
            is chained to the original exception.
    """
    # Logging the start of Training
    self.log_writer.log(self.file_object, 'Start of Training')
    try:
        self.log_writer.log(self.file_object, 'Starting of Training')
        # Getting the data from the source
        data_getter = data_loader.Data_Getter(self.file_object,
                                              self.log_writer)
        data = data_getter.get_data()
        print(data.head())

        # Data preprocessing
        preprocessor = preprocessing.Preprocessor(self.file_object,
                                                  self.log_writer)
        data = preprocessor.binning(data)
        # removing unwanted columns
        data = preprocessor.dropUnnecessaryColumns(data, ['Ageband'])
        data = preprocessor.combiningfornewfeature(data)
        data = preprocessor.dropUnnecessaryColumns(
            data, ['Parch', 'Sibsp', 'FamilySize', 'Pid'])
        data = preprocessor.convertCategoricalfeatureIntonumeric(data)
        data = preprocessor.binningfare(data)
        data = preprocessor.dropUnnecessaryColumns(data, ['FareBand'])
        print(data.head())

        # if missing values are there, replace them appropriately
        is_null_present, _ = preprocessor.is_null_present(data)
        if is_null_present:
            data = preprocessor.impute_missing_values(data)

        # create separate features and labels
        X, Y = preprocessor.separate_label_feature(
            data, label_column_name='Survived')
        print(Y)

        # No encoding is done here because binning was used instead.
        x_train, x_test, y_train, y_test = train_test_split(
            X, Y, test_size=0.33, random_state=36)

        # NOTE(review): train and test are scaled in two separate calls;
        # confirm the helper applies the same scaling parameters to both.
        x_train_scaled = preprocessor.standardScalingData(x_train)
        x_test_scaled = preprocessor.standardScalingData(x_test)

        model_finder = tuner.Model_Finder(self.file_object, self.log_writer)
        best_model_name, best_model, prediction, acc = (
            model_finder.get_best_model(x_train_scaled, y_train,
                                        x_test_scaled, y_test))
        print("Predictions:")
        print(prediction)
        print("Accuracy:")
        print(acc)

        # saving the best model to the directory
        file_op = file_methods.File_Operation(self.file_object,
                                              self.log_writer)
        self.log_writer.log(self.file_object, 'Going to create directory')
        file_op.save_model(best_model, best_model_name)

        # logging the successful Training
        self.log_writer.log(self.file_object, 'Successful End of Training')
    except Exception as e:
        # logging the unsuccessful Training
        self.log_writer.log(self.file_object, 'Unsuccessful End of Training')
        raise Exception(str(e)) from e
    finally:
        # Single close for both the success and the failure path.
        self.file_object.close()
def predictionFromModel(self):
    """Predict per cluster and write predictions with their identifiers.

    Deletes the previous prediction file, loads the prediction data, imputes
    missing values when any are reported, removes the two identifier
    columns, scales the remaining data, assigns each row to a cluster with
    the saved 'KMeans' model and predicts each cluster with the model file
    found for it. The predictions are written in input-row order next to
    the 'Item_Identifier' and 'Outlet_Identifier' of the same row.

    Returns:
        The relative path of the CSV file that was written.

    Raises:
        Exception: any error raised along the way is logged and re-raised.
    """
    try:
        # Remove the prediction file left over from the last run.
        self.pred_data_val.deletePredictionFile()
        self.log_writer.log(self.file_object, 'Start of Prediction')
        data_getter = data_loader_prediction.Data_Getter_Pred(
            self.file_object, self.log_writer)
        data = data_getter.get_data()

        preprocessor = preprocessing.Preprocessor(self.file_object,
                                                  self.log_writer)
        is_null_present, _ = preprocessor.is_null_present(data)
        if is_null_present:
            data = preprocessor.impute_missing_values(data)

        cols_to_drop = ["Item_Identifier", "Outlet_Identifier"]
        data_useful = preprocessor.remove_columns(data, cols_to_drop)

        # scale the prediction data
        data_scaled = preprocessor.scale_numerical_columns(data_useful)

        file_loader = file_methods.File_Operation(self.file_object,
                                                  self.log_writer)
        kmeans = file_loader.load_model('KMeans')
        data_scaled['clusters'] = kmeans.predict(data_scaled)

        # One slot per input row. The original appended predictions cluster
        # by cluster and then paired them with identifiers in input order,
        # attaching predictions to the wrong items whenever the rows of a
        # cluster were not contiguous. Filling by row position keeps every
        # prediction on its own row.
        predictions = [None] * len(data_scaled)
        for i in data_scaled['clusters'].unique():
            in_cluster = data_scaled['clusters'] == i
            cluster_data = data_scaled[in_cluster].drop(['clusters'], axis=1)
            model_name = file_loader.find_correct_model_file(i)
            model = file_loader.load_model(model_name)

            row_positions = [pos for pos, hit in enumerate(in_cluster) if hit]
            cluster_result = model.predict(cluster_data.values)
            for pos, val in zip(row_positions, cluster_result):
                predictions[pos] = val

        result = pandas.DataFrame(predictions, columns=['Predictions'])
        # Positional assignment, consistent with the positional predictions.
        result['Item_Identifier'] = data["Item_Identifier"].values
        result["Outlet_Identifier"] = data["Outlet_Identifier"].values

        path = "Prediction_Output_File/Predictions.csv"
        result.to_csv(path, header=True)
        self.log_writer.log(self.file_object, 'End of Prediction')
    except Exception as ex:
        self.log_writer.log(
            self.file_object,
            'Error occured while running the prediction!! Error:: %s' % ex)
        raise
    return path
def trainingModel(self):
    """Train and save one model per KMeans cluster (label column 'A1').

    Loads the training data, separates the 'A1' label, keeps a fixed subset
    of ten feature columns, clusters that subset and, for each cluster,
    saves the model returned by the model finder under
    `<model name><cluster>`. The log file object is closed when the method
    finishes, whether it succeeds or fails.

    Raises:
        Exception: when any step fails; it carries the original message and
            is chained to the original exception.
    """
    # Logging the start of Training
    self.log_writer.log(self.file_object, 'Start of Training')
    try:
        # Getting the data from the source
        data_getter = data_loader.Data_Getter(self.file_object,
                                              self.log_writer)
        data = data_getter.get_data()

        # Data preprocessing
        preprocessor = preprocessing.Preprocessor(self.file_object,
                                                  self.log_writer)

        # create separate features and labels
        X, Y = preprocessor.separate_label_feature(data,
                                                   label_column_name='A1')

        # Explicit copy: columns are added to this frame below, which must
        # not depend on (or warn about) it being a slice of X.
        new_X = X[[
            'H18', 'F76', 'F46', 'G57', 'C13', 'A71', 'E115', 'F56', 'I59',
            'A91'
        ]].copy()

        # Clustering: the value returned by elbow_plot is used as the
        # number of clusters.
        kmeans = clustering.KMeansClustering(self.file_object,
                                             self.log_writer)
        number_of_clusters = kmeans.elbow_plot(new_X)

        # Use the frame returned by create_clusters. The original discarded
        # it and relied on new_X having been modified in place.
        new_X = kmeans.create_clusters(new_X, number_of_clusters)

        # keep the labels next to the cluster assignments
        new_X['Labels'] = Y
        list_of_clusters = new_X['Cluster'].unique()

        # Look for the best model for each cluster in turn.
        for i in list_of_clusters:
            cluster_data = new_X[new_X['Cluster'] == i]
            cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                                                 axis=1)
            cluster_label = cluster_data['Labels']

            x_train, x_test, y_train, y_test = train_test_split(
                cluster_features,
                cluster_label,
                test_size=1 / 3,
                random_state=355)

            model_finder = tuner.Model_Finder(self.file_object,
                                              self.log_writer)
            best_model_name, best_model = model_finder.get_best_model(
                x_train, y_train, x_test, y_test)

            # saving the best model to the directory
            file_op = file_methods.File_Operation(self.file_object,
                                                  self.log_writer)
            file_op.save_model(best_model, best_model_name + str(i))

        # logging the successful Training
        self.log_writer.log(self.file_object, 'Successful End of Training')
    except Exception as e:
        # logging the unsuccessful Training
        self.log_writer.log(self.file_object, 'Unsuccessful End of Training')
        raise Exception(str(e)) from e
    finally:
        # Single close for both the success and the failure path.
        self.file_object.close()