def trainingModel(self):
        # Logging the start of Training
        self.log_writer.log(self.file_object, 'Start of Training')
        try:
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.file_object,
                                                  self.log_writer)
            data = data_getter.get_data()
            """doing the data preprocessing"""

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)
            #data=preprocessor.remove_columns(data,['Wafer']) # remove the unnamed column as it doesn't contribute to prediction.

            data = preprocessor.enocdeCategoricalvalues(data)

            X = data.drop(['class'], axis=1)
            Y = data['class']

            X, Y = preprocessor.handleImbalanceDataset(X, Y)
            """ Applying the clustering approach"""

            kmeans = clustering.KMeansClustering(
                self.file_object, self.log_writer)  # object initialization.
            number_of_clusters = kmeans.elbow_plot(
                X
            )  # using the elbow plot to find the optimum number of clusters

            # Divide the data into clusters
            X = kmeans.create_clusters(X, number_of_clusters)

            # add the label column back to the clustered dataset
            X['Labels'] = Y

            # getting the unique clusters from our dataset
            list_of_clusters = X['Cluster'].unique()
            """iterating over the clusters and finding the best ML algorithm to fit each one"""

            for i in list_of_clusters:
                cluster_data = X[X['Cluster'] ==
                                 i]  # filter the data for one cluster

                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                                                     axis=1)
                cluster_label = cluster_data['Labels']

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(
                    cluster_features,
                    cluster_label,
                    test_size=1 / 3,
                    random_state=355)
                x_train = preprocessor.scaleData(x_train)
                x_test = preprocessor.scaleData(x_test)

                model_finder = tuner.Model_Finder(
                    self.file_object, self.log_writer)  # object initialization

                #getting the best model for each of the clusters
                best_model_name, best_model = model_finder.get_best_model(
                    x_train, y_train, x_test, y_test)

                #saving the best model to the directory.
                file_op = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
                save_model = file_op.save_model(best_model,
                                                best_model_name + str(i))

            # logging the successful Training
            self.log_writer.log(self.file_object, 'Successful End of Training')
            self.file_object.close()

        except Exception:
            # logging the unsuccessful Training
            self.log_writer.log(self.file_object,
                                'Unsuccessful End of Training')
            self.file_object.close()
            raise  # re-raise the original exception with its traceback
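Note: handleImbalanceDataset is not shown in this snippet. A minimal sketch of what it could do, assuming the imbalanced-learn package is used for oversampling (the implementation below is an illustration, not the project's actual code):

from imblearn.over_sampling import RandomOverSampler

def handleImbalanceDataset(self, X, Y):
    # duplicate minority-class rows at random until all classes are equally represented
    sampler = RandomOverSampler(random_state=42)
    X_resampled, Y_resampled = sampler.fit_resample(X, Y)
    return X_resampled, Y_resampled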
Example #2
    def trainingModel(self):
        # Logging the start of Training
        self.log_writer.log(self.file_object, 'Start of Training')
        try:
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.file_object,
                                                  self.log_writer)
            data = data_getter.get_data()
            """doing the data preprocessing"""

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)
            data = preprocessor.remove_columns(
                data, ['Wafer']
            )  # remove the Wafer column as it doesn't contribute to prediction.

            # create separate features and labels
            X, Y = preprocessor.separate_label_feature(
                data, label_column_name='Output')

            # check if missing values are present in the dataset
            is_null_present = preprocessor.is_null_present(X)

            # if missing values are there, replace them appropriately.
            if (is_null_present):
                X = preprocessor.impute_missing_values(
                    X)  # missing value imputation

            # check further which columns do not contribute to predictions:
            # if the standard deviation of a column is zero, the column is constant
            # and gives the same output for both good and bad sensors,
            # so prepare the list of such columns to drop
            cols_to_drop = preprocessor.get_columns_with_zero_std_deviation(X)

            # drop the columns obtained above
            X = preprocessor.remove_columns(X, cols_to_drop)
            """ Applying the clustering approach"""

            kmeans = clustering.KMeansClustering(
                self.file_object, self.log_writer)  # object initialization.
            number_of_clusters = kmeans.elbow_plot(
                X
            )  # using the elbow plot to find the optimum number of clusters

            # Divide the data into clusters
            X = kmeans.create_clusters(X, number_of_clusters)

            # add the label column back to the clustered dataset
            X['Labels'] = Y

            # getting the unique clusters from our dataset
            list_of_clusters = X['Cluster'].unique()
            """iterating over the clusters and finding the best ML algorithm to fit each one"""

            for i in list_of_clusters:
                cluster_data = X[X['Cluster'] ==
                                 i]  # filter the data for one cluster

                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                                                     axis=1)
                cluster_label = cluster_data['Labels']

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(
                    cluster_features,
                    cluster_label,
                    test_size=1 / 3,
                    random_state=355)

                model_finder = tuner.Model_Finder(
                    self.file_object, self.log_writer)  # object initialization

                #getting the best model for each of the clusters
                best_model_name, best_model = model_finder.get_best_model(
                    x_train, y_train, x_test, y_test)

                #saving the best model to the directory.
                file_op = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
                save_model = file_op.save_model(best_model,
                                                best_model_name + str(i))

            # logging the successful Training
            self.log_writer.log(self.file_object, 'Successful End of Training')
            self.file_object.close()

        except Exception:
            # logging the unsuccessful Training
            self.log_writer.log(self.file_object,
                                'Unsuccessful End of Training')
            self.file_object.close()
            raise
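The zero-standard-deviation check above relies on a helper that is not shown. A plausible one-line sketch, assuming the features are all numeric (as in the wafer sensor data):

import pandas as pd

def get_columns_with_zero_std_deviation(self, data: pd.DataFrame):
    # a column with zero standard deviation is constant and carries no predictive signal
    return [col for col in data.columns if data[col].std() == 0]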
Example #3
    def trainingModel(self):
        # Logging the start of Training
        self.log_writer.log(self.file_object, 'Start of Training')
        try:
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.file_object,
                                                  self.log_writer)
            data = data_getter.get_data()
            """doing the data preprocessing"""

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)
            #data=preprocessor.remove_columns(data,['Wafer']) # remove the unnamed column as it doesn't contribute to prediction.

            #removing unwanted columns as discussed in the EDA notebook
            data = preprocessor.dropUnnecessaryColumns(data, [
                'DATE', 'Precip', 'WETBULBTEMPF', 'DewPointTempF',
                'StationPressure'
            ])

            #replacing '?' values with np.nan as discussed in the EDA part

            data = preprocessor.replaceInvalidValuesWithNull(data)

            # check if missing values are present in the dataset
            is_null_present, cols_with_missing_values = preprocessor.is_null_present(
                data)

            # if missing values are there, replace them appropriately.
            if (is_null_present):
                data = preprocessor.impute_missing_values(
                    data)  # missing value imputation

            # get encoded values for categorical data

            #data = preprocessor.encodeCategoricalValues(data)

            # create separate features and labels
            X, Y = preprocessor.separate_label_feature(
                data, label_column_name='VISIBILITY')

            # drop the columns obtained above
            #X=preprocessor.remove_columns(X,cols_to_drop)
            """ Applying the clustering approach"""

            kmeans = clustering.KMeansClustering(
                self.file_object, self.log_writer)  # object initialization.
            number_of_clusters = kmeans.elbow_plot(
                X
            )  # using the elbow plot to find the optimum number of clusters

            # Divide the data into clusters
            X = kmeans.create_clusters(X, number_of_clusters)

            # add the label column back to the clustered dataset
            X['Labels'] = Y

            # getting the unique clusters from our dataset
            list_of_clusters = X['Cluster'].unique()
            """iterating over the clusters and finding the best ML algorithm to fit each one"""

            for i in list_of_clusters:
                cluster_data = X[X['Cluster'] ==
                                 i]  # filter the data for one cluster

                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                                                     axis=1)
                cluster_label = cluster_data['Labels']

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(
                    cluster_features,
                    cluster_label,
                    test_size=1 / 3,
                    random_state=36)

                x_train_scaled = preprocessor.standardScalingData(x_train)
                x_test_scaled = preprocessor.standardScalingData(x_test)

                model_finder = tuner.Model_Finder(
                    self.file_object, self.log_writer)  # object initialization

                #getting the best model for each of the clusters
                best_model_name, best_model = model_finder.get_best_model(
                    x_train_scaled, y_train, x_test_scaled, y_test)

                #saving the best model to the directory.
                file_op = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
                save_model = file_op.save_model(best_model,
                                                best_model_name + str(i))

            # logging the successful Training
            self.log_writer.log(self.file_object, 'Successful End of Training')
            self.file_object.close()

        except Exception:
            # logging the unsuccessful Training
            self.log_writer.log(self.file_object,
                                'Unsuccessful End of Training')
            self.file_object.close()
            raise
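standardScalingData is likely a thin wrapper around sklearn's StandardScaler; a minimal sketch under that assumption:

from sklearn.preprocessing import StandardScaler

def standardScalingData(self, X):
    # center each feature to zero mean and scale to unit variance;
    # note that fit_transform returns a NumPy array, not a DataFrame
    return StandardScaler().fit_transform(X)

Note that the snippet above fits a fresh scaler on the test split; reusing the scaler fitted on the training split would be the more conventional choice.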
Example #4
    def trainingModel(self):
        # Logging the start of Training
        self.log_writer.log(self.file_object, 'Start of Training')
        try:
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.file_object,
                                                  self.log_writer)
            data = data_getter.get_data()
            """doing the data preprocessing"""

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)

            #data.replace('?',np.NaN,inplace=True) # replacing '?' with NaN values for imputation

            # create separate features and labels
            X, Y = preprocessor.separate_label_feature(
                data, label_column_name='default payment next month')

            # check if missing values are present in the dataset
            is_null_present, cols_with_missing_values = preprocessor.is_null_present(
                X)

            # if missing values are there, replace them appropriately.
            if (is_null_present):
                X = preprocessor.impute_missing_values(
                    X, cols_with_missing_values)  # missing value imputation
            """ Applying the clustering approach"""

            kmeans = clustering.KMeansClustering(
                self.file_object, self.log_writer)  # object initialization.
            number_of_clusters = kmeans.elbow_plot(
                X
            )  # using the elbow plot to find the optimum number of clusters

            # Divide the data into clusters
            X = kmeans.create_clusters(X, number_of_clusters)

            # add the label column back to the clustered dataset
            X['Labels'] = Y

            # getting the unique clusters from our dataset
            list_of_clusters = X['Cluster'].unique()
            """iterating over the clusters and finding the best ML algorithm to fit each one"""

            for i in list_of_clusters:
                cluster_data = X[X['Cluster'] ==
                                 i]  # filter the data for one cluster

                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                                                     axis=1)
                cluster_label = cluster_data['Labels']

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(
                    cluster_features,
                    cluster_label,
                    test_size=1 / 3,
                    random_state=355)
                # Proceeding with more data pre-processing steps
                train_x = preprocessor.scale_numerical_columns(x_train)
                test_x = preprocessor.scale_numerical_columns(x_test)

                model_finder = tuner.Model_Finder(
                    self.file_object, self.log_writer)  # object initialization

                #getting the best model for each of the clusters
                best_model_name, best_model = model_finder.get_best_model(
                    train_x, y_train, test_x, y_test)

                #saving the best model to the directory.
                file_op = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
                save_model = file_op.save_model(best_model,
                                                best_model_name + str(i))

            # logging the successful Training
            self.log_writer.log(self.file_object, 'Successful End of Training')
            self.file_object.close()

        except Exception as e:
            # logging the unsuccessful Training
            self.log_writer.log(self.file_object,
                                'Unsuccessful End of Training')
            self.file_object.close()
            raise e
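scale_numerical_columns presumably standardizes only the numeric columns so that any encoded categorical columns are left untouched; a hedged sketch:

import pandas as pd
from sklearn.preprocessing import StandardScaler

def scale_numerical_columns(self, X: pd.DataFrame) -> pd.DataFrame:
    X = X.copy()
    num_cols = X.select_dtypes(include='number').columns
    X[num_cols] = StandardScaler().fit_transform(X[num_cols])
    return X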
Example #5
    def trainingModel(self):
        # Logging the start of Training
        self.log_writer.log('wafer_log', 'Start of Training')
        try:
            # Getting the data from the source
            # data_getter=data_loader.Data_Getter(self.file_object,self.log_writer)
            data = self.mongo.downlaod_all_from_mongo('wafer_good_data',
                                                      'temp_db')
            """doing the data preprocessing"""

            preprocessor = preprocessing.Preprocessor('wafer_log',
                                                      self.log_writer)
            data = preprocessor.remove_columns(
                data, ['Wafer']
            )  # remove the wafer column as it doesn't contribute to prediction.

            # create separate features and labels
            X, Y = preprocessor.separate_label_feature(
                data, label_column_name='Good/Bad')

            # check if missing values are present in the dataset

            # if missing values are there, replace them appropriately.
            X.replace(to_replace='NULL', value=np.nan,
                      inplace=True)  # takes about 4 seconds to compute
            is_null_present = preprocessor.is_null_present(X)
            if (is_null_present):
                X = preprocessor.impute_missing_values(
                    X)  # missing value imputation

            # check further which columns do not contribute to predictions:
            # if the standard deviation of a column is zero, the column is constant
            # and gives the same output for both good and bad sensors,
            # so prepare the list of such columns to drop
            cols_to_drop = preprocessor.get_columns_with_zero_std_deviation(
                X)  # consumes a lot of time
            # drop the columns obtained above
            X = preprocessor.remove_columns(X, cols_to_drop)
            """ Applying the clustering approach"""

            kmeans = clustering.KMeansClustering(
                'wafer_log', self.log_writer)  # object initialization.
            number_of_clusters = kmeans.elbow_plot(
                X
            )  # using the elbow plot to find the optimum number of clusters

            # Divide the data into clusters
            X = kmeans.create_clusters(X, number_of_clusters)

            # add the label column back to the clustered dataset

            # X=pd.DataFrame.join(X,Y)
            X['Labels'] = Y.values

            # getting the unique clusters from our dataset
            list_of_clusters = X['Cluster'].unique()
            """iterating over the clusters and finding the best ML algorithm to fit each one"""

            for index, i in enumerate(list_of_clusters):
                cluster_data = X[X['Cluster'] ==
                                 i]  # filter the data for one cluster
                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                                                     axis=1)
                cluster_label = cluster_data['Labels']

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(
                    cluster_features,
                    cluster_label,
                    test_size=1 / 3,
                    random_state=355)

                model_finder = tuner.Model_Finder(
                    'wafer_log', self.log_writer)  # object initialization

                #getting the best model for each of the clusters
                best_model_name, best_model = model_finder.get_best_model(
                    x_train, y_train, x_test, y_test)

                #saving the best model to the directory.
                # file_op = file_methods.File_Operation('wafer_log',self.log_writer)
                # save_model=file_op.save_model(best_model,best_model_name+str(i))
                print(best_model)
                best_model = pickle.dumps(best_model)
                self.aws.Upload_To_S3_obj(best_model,
                                          best_model_name + str(index) +
                                          '.sav',
                                          bucket_prefix='wafer-model')

            # logging the successful Training
            self.log_writer.log('wafer_log', 'Successful End of Training')
            # self.file_object.close()

        except Exception as err:
            # logging the unsuccessful Training
            self.log_writer.log('wafer_log', 'Unsuccessful End of Training')
            # self.file_object.close()
            print(str(err))
            raise err
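Upload_To_S3_obj is a custom AWS wrapper; a minimal boto3 equivalent for uploading a pickled model (the bucket name mirrors the bucket_prefix argument above, and credentials are assumed to come from the environment):

import pickle
import boto3

def upload_model_to_s3(model, key, bucket='wafer-model'):
    body = pickle.dumps(model)  # serialize the estimator in memory
    s3 = boto3.client('s3')
    s3.put_object(Bucket=bucket, Key=key, Body=body)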
Example #6
    def trainingModel(self):
        # Logging the start of Training
        self.log_writer.log(self.file_object, 'Start of Training')
        try:
            # Getting the data from the source
            data_getter=data_loader.Data_Getter(self.file_object,self.log_writer)
            data=data_getter.get_data()


            """ doing the data preprocessing. 
            All the pre processing steps are based on the EDA done previously
            """
            """
            1. Duplicate
            2. Remove columns: 	"serial","rate","listed_in(type)","listed_in(city)"
            3. Null removal
            4. Convert cost column to number
            5. Categorical to Numerical
            """

            preprocessor=preprocessing.Preprocessor(self.file_object,self.log_writer)


            #removing unwanted columns as discussed in the EDA notebook
            data = preprocessor.dropUnnecessaryColumns(data,["serial","rate","listed_in(type)","listed_in(city)"])

            # removing the duplicates
            data=preprocessor.removeDuplicates(data)


            # check if missing values are present in the dataset
            is_null_present,cols_with_missing_values=preprocessor.is_null_present(data)

            # if missing values are there, replace them appropriately.
            if(is_null_present):
                # instead of imputing, drop the rows with missing values to show an alternative approach
                data=data.dropna(how='any')

            # cost value to float
            data=preprocessor.convertCostToNumber(data)

            # get encoded values for categorical data

            data = preprocessor.encodeCategoricalValues(data)

            # create separate features and labels
            X, Y = preprocessor.separate_label_feature(data, label_column_name='approx_cost(for two people)')


            """ Applying the clustering approach"""

            kmeans=clustering.KMeansClustering(self.file_object,self.log_writer) # object initialization.
            number_of_clusters=kmeans.elbow_plot(X)  # using the elbow plot to find the optimum number of clusters

            # Divide the data into clusters
            X=kmeans.create_clusters(X,number_of_clusters)

            #add the label column back to the clustered dataset
            X['Labels']=Y

            # getting the unique clusters from our dataset
            list_of_clusters=X['Cluster'].unique()

            """iterating over the clusters and finding the best ML algorithm to fit each one"""

            for i in list_of_clusters:
                cluster_data=X[X['Cluster']==i] # filter the data for one cluster

                # Prepare the feature and Label columns
                cluster_features=cluster_data.drop(['Labels','Cluster'],axis=1)
                cluster_label= cluster_data['Labels']

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(cluster_features, cluster_label, test_size=1 / 3, random_state=36)

                x_train_scaled = preprocessor.standardScalingData(x_train)
                x_test_scaled = preprocessor.standardScalingData(x_test)

                model_finder=tuner.Model_Finder(self.file_object,self.log_writer) # object initialization

                #getting the best model for each of the clusters
                best_model_name,best_model=model_finder.get_best_model(x_train_scaled,y_train,x_test_scaled,y_test)

                #saving the best model to the directory.
                file_op = file_methods.File_Operation(self.file_object,self.log_writer)
                save_model=file_op.save_model(best_model,best_model_name+str(i))

            # logging the successful Training
            self.log_writer.log(self.file_object, 'Successful End of Training')
            self.file_object.close()

        except Exception as e:
            # logging the unsuccessful Training
            self.log_writer.log(self.file_object, 'Unsuccessful End of Training')
            self.file_object.close()
            raise e
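convertCostToNumber likely strips the thousands separator from the cost column and casts it to float; a sketch using the label column named later in this snippet:

def convertCostToNumber(self, data):
    col = 'approx_cost(for two people)'
    data[col] = (data[col].astype(str)
                          .str.replace(',', '', regex=False)
                          .astype(float))
    return data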
Example #7
    def trainingModel(self):
        # Logging the start of Training
        self.log_writer.log(self.file_object, 'Start of Training')
        try:
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.file_object,
                                                  self.log_writer)
            data = data_getter.get_data()
            """doing the data preprocessing"""

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)

            # create separate features and labels
            X, Y = preprocessor.separate_label_feature(data,
                                                       label_column_name='A1')

            new_X = X[[
                'H18', 'F76', 'F46', 'G57', 'C13', 'A71', 'E115', 'F56', 'I59',
                'A91'
            ]]
            """ Applying the clustering approach"""

            kmeans = clustering.KMeansClustering(
                self.file_object, self.log_writer)  # object initialization.
            number_of_clusters = kmeans.elbow_plot(
                new_X
            )  # using the elbow plot to find the optimum number of clusters

            # Divide the data into clusters
            new_X = kmeans.create_clusters(new_X, number_of_clusters)

            # add the label column back to the clustered dataset
            new_X['Labels'] = Y

            # getting the unique clusters from our dataset
            list_of_clusters = new_X['Cluster'].unique()
            """iterating over the clusters and finding the best ML algorithm to fit each one"""

            for i in list_of_clusters:
                cluster_data = new_X[new_X['Cluster'] ==
                                     i]  # filter the data for one cluster

                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                                                     axis=1)
                cluster_label = cluster_data['Labels']

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(
                    cluster_features,
                    cluster_label,
                    test_size=1 / 3,
                    random_state=355)

                model_finder = tuner.Model_Finder(
                    self.file_object, self.log_writer)  # object initialization

                #getting the best model for each of the clusters
                best_model_name, best_model = model_finder.get_best_model(
                    x_train, y_train, x_test, y_test)

                #saving the best model to the directory.
                file_op = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
                save_model = file_op.save_model(best_model,
                                                best_model_name + str(i))

            # logging the successful Training
            self.log_writer.log(self.file_object, 'Successful End of Training')
            self.file_object.close()

        except Exception:
            # logging the unsuccessful Training
            self.log_writer.log(self.file_object,
                                'Unsuccessful End of Training')
            self.file_object.close()
            raise
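Every snippet on this page calls elbow_plot to pick the cluster count. A common implementation in projects of this family computes the within-cluster sum of squares for k = 1..10 and detects the knee with the kneed package; the sketch below assumes that approach:

from sklearn.cluster import KMeans
from kneed import KneeLocator

def elbow_plot(self, data):
    wcss = []
    for k in range(1, 11):
        model = KMeans(n_clusters=k, init='k-means++', random_state=42)
        model.fit(data)
        wcss.append(model.inertia_)  # within-cluster sum of squares for k clusters
    # the WCSS curve is convex and decreasing; its knee is the optimum k
    kn = KneeLocator(range(1, 11), wcss, curve='convex', direction='decreasing')
    return kn.knee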
Example #8
    def trainingModel(self):
        # Logging the start of Training
        self.log_writer.log(self.file_object, "start of training")
        try:
            #get the data from the source
            data_getter = Data_Getter(self.file_object, self.log_writer)
            data = data_getter.get_data()
            """doing the data preprocessing"""
            preprocessor = Preprocessor(self.file_object, self.log_writer)

            # check if missing values are present in the dataset
            is_null_present, cols_with_missing_values = preprocessor.is_null_present(
                data)

            # if missing values are there, replace them appropriately.
            if (is_null_present):
                data = preprocessor.impute_missing_values(
                    data)  # missing value imputation

            # create separate features and labels
            X, Y = preprocessor.separate_label_feature(
                data, label_column_name='Concrete_compressive _strength')

            X = preprocessor.logTransformation(X)
            """ Applying the clustering approach"""

            kmeans = clustering.KMeansClustering(
                self.file_object, self.log_writer)  # object initialization.
            number_of_clusters = kmeans.elbow_plot(X)  # using the elbow plot to find the optimum number of clusters

            # Divide the data into clusters
            X = kmeans.create_clusters(X, number_of_clusters)
            # add the label column back to the clustered dataset
            print('inside train model')
            print(Y)
            X['Labels'] = Y
            print(type(X))

            # getting the unique clusters from our dataset
            list_of_clusters = X['Cluster'].unique()
            """iterating over the clusters and finding the best ML algorithm to fit each one"""
            for i in list_of_clusters:
                cluster_data = X[X['Cluster'] ==
                                 i]  # filter the data for one cluster
                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                                                     axis=1)
                cluster_label = cluster_data['Labels']

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(
                    cluster_features,
                    cluster_label,
                    test_size=1 / 3,
                    random_state=30)

                x_train_scaled = preprocessor.standardScalingData(x_train)
                x_test_scaled = preprocessor.standardScalingData(x_test)

                model_finder = tuner.Model_Finder(
                    self.file_object, self.log_writer)  # object initialization

                #getting the best model for each of the clusters
                best_model_name, best_model = model_finder.get_best_model(
                    x_train_scaled, y_train, x_test_scaled, y_test)

                #saving the best model to the directory.
                file_op = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
                save_model = file_op.save_model(best_model,
                                                best_model_name + str(i))

            # logging the successful Training
            self.log_writer.log(self.file_object, 'Successful End of Training')
            self.file_object.close()

        except Exception:
            # logging the unsuccessful Training
            self.log_writer.log(self.file_object,
                                'Unsuccessful End of Training')
            self.file_object.close()
            raise
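logTransformation presumably log-transforms the features to reduce skew before clustering; a one-line sketch using log1p so that zero values stay defined:

import numpy as np

def logTransformation(self, X):
    return X.apply(np.log1p)  # elementwise log(1 + x) on every column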
Example #9
    def trainingModel(self):
        # Logging the start of Training
        self.log_db_writer.log(self.log_database, self.log_collection,
                               "Start of Training")
        print("training started")
        try:
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.log_database,
                                                  self.log_collection,
                                                  self.execution_id)
            data = data_getter.get_data()

            if len(data) == 0:
                self.log_db_writer.log(self.log_database, self.log_collection,
                                       "No record found to train model")
                print("No previous file available")
                return 0
            """doing the data preprocessing"""

            preprocessor = preprocessing.Preprocessor(self.log_database,
                                                      self.log_collection,
                                                      self.execution_id)

            # check if missing values are present in the dataset
            is_null_present, cols_with_missing_values = preprocessor.is_null_present(
                data)

            # if missing values are there, replace them appropriately.
            if (is_null_present):
                data = preprocessor.impute_missing_values(
                    data)  # missing value imputation

            # get encoded values for categorical data

            #data = preprocessor.encodeCategoricalValues(data)

            # create separate features and labels
            X, Y = preprocessor.separate_label_feature(
                data, label_column_name='Concrete_compressive _strength')
            # drop the columns obtained above
            #X=preprocessor.remove_columns(X,cols_to_drop)

            X = preprocessor.logTransformation(X)
            print(X)
            """ Applying the clustering approach"""

            kmeans = clustering.KMeansClustering(
                self.execution_id)  # object initialization.
            number_of_clusters = kmeans.elbow_plot(
                X
            )  # using the elbow plot to find the optimum number of clusters

            # Divide the data into clusters
            X = kmeans.create_clusters(X, number_of_clusters)
            print("cluster shape details")
            print(X)

            # add the label column back to the clustered dataset
            X['Labels'] = Y

            # getting the unique clusters from our dataset
            list_of_clusters = X['Cluster'].unique()
            """iterating over the clusters and finding the best ML algorithm to fit each one"""

            for i in list_of_clusters:
                cluster_data = X[X['Cluster'] ==
                                 i]  # filter the data for one cluster

                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                                                     axis=1)
                cluster_label = cluster_data['Labels']

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(
                    cluster_features,
                    cluster_label,
                    test_size=1 / 3,
                    random_state=36)

                x_train_scaled = preprocessor.standardScalingData(x_train)
                x_test_scaled = preprocessor.standardScalingData(x_test)

                model_finder = tuner.Model_Finder(
                    self.log_database, self.log_collection,
                    self.execution_id)  # object initialization

                #getting the best model for each of the clusters
                best_model_name, best_model = model_finder.get_best_model(
                    x_train_scaled, y_train, x_test_scaled, y_test)

                model_metrics = model_finder.get_model_metrics(
                    best_model_name + str(i))

                #saving the best model to the directory.
                file_op = file_methods.File_Operation(self.log_database,
                                                      self.log_collection,
                                                      self.execution_id)
                print(best_model_name + str(i))
                save_model = file_op.save_model(best_model,
                                                best_model_name + str(i))

            # logging the successful Training
            self.log_db_writer.log(self.log_database, self.log_collection,
                                   'Successful End of Training')
            #self.log_database.close()

        except Exception:
            # logging the unsuccessful Training
            self.log_db_writer.log(self.log_database, self.log_collection,
                                   'Unsuccessful End of Training')
            #self.log_database.close()
            raise


#trainModelObj = trainModel(1111)  # object initialization
#trainModelObj.trainingModel()  # training the model for the files in the table
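This variant logs to a database through log_db_writer.log(database, collection, message) instead of a file object. A minimal pymongo-backed equivalent (connection details are placeholders):

from datetime import datetime
from pymongo import MongoClient

class DbLogger:
    def __init__(self, uri='mongodb://localhost:27017'):
        self.client = MongoClient(uri)

    def log(self, database, collection, message):
        # one document per log line, timestamped for ordering
        self.client[database][collection].insert_one(
            {'timestamp': datetime.utcnow(), 'message': message})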
Example #10
    def trainingModel(self):
        # Logging the start of Training
        self.log_writer.log(self.file_object, 'Start of Training')
        try:
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.file_object,
                                                  self.log_writer)
            data = data_getter.get_data()
            """doing the data preprocessing"""

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)

            #removing unwanted columns as discussed in the EDA notebook
            data = preprocessor.dropUnnecessaryColumns(data, ['id'])

            #removing outliers from columns like height, weight, ap_hi, ap_lo
            data = preprocessor.dropOutliers(data)

            #processing the gender and age columns and adding a new BMI column as discussed in the EDA part
            data = preprocessor.dataProcessor(data)

            # create separate features and labels
            X, Y = preprocessor.separate_label_feature(
                data, label_column_name='cardio')
            """ Applying the clustering approach"""

            kmeans = clustering.KMeansClustering(
                self.file_object, self.log_writer)  # object initialization.
            number_of_clusters = kmeans.elbow_plot(
                X
            )  # using the elbow plot to find the optimum number of clusters

            # Divide the data into clusters
            X = kmeans.create_clusters(X, number_of_clusters)

            # add the label column back to the clustered dataset
            X['Labels'] = Y

            # getting the unique clusters from our dataset
            list_of_clusters = X['Cluster'].unique()
            """iterating over the clusters and finding the best ML algorithm to fit each one"""

            for i in list_of_clusters:
                cluster_data = X[X['Cluster'] ==
                                 i]  # filter the data for one cluster

                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                                                     axis=1)
                cluster_label = cluster_data['Labels']

                # to handle the imbalanced dataset
                rdsmple = RandomOverSampler()
                x_sampled, y_sampled = rdsmple.fit_resample(
                    cluster_features,
                    cluster_label)  # fit_resample replaces the deprecated fit_sample

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(
                    x_sampled, y_sampled, test_size=1 / 3, random_state=355)

                model_finder = tuner.Model_Finder(
                    self.file_object, self.log_writer)  # object initialization

                #getting the best model for each of the clusters
                best_model_name, best_model = model_finder.get_best_model(
                    x_train, y_train, x_test, y_test)

                #saving the best model to the directory.
                file_op = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
                save_model = file_op.save_model(best_model,
                                                best_model_name + str(i))

            # logging the successful Training
            self.log_writer.log(self.file_object, 'Successful End of Training')
            self.file_object.close()

        except Exception:
            # logging the unsuccessful Training
            self.log_writer.log(self.file_object,
                                'Unsuccessful End of Training')
            self.file_object.close()
            raise
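dropOutliers is not shown; a standard IQR-based filter over the columns named in the comment above (height, weight, ap_hi, ap_lo) could look like this:

def dropOutliers(self, data):
    for col in ['height', 'weight', 'ap_hi', 'ap_lo']:
        q1, q3 = data[col].quantile(0.25), data[col].quantile(0.75)
        iqr = q3 - q1
        # keep rows within 1.5 * IQR of the quartiles
        data = data[(data[col] >= q1 - 1.5 * iqr) &
                    (data[col] <= q3 + 1.5 * iqr)]
    return data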
Example #11
    def trainingModel(self):
        # Logging the start of Training
        self.log_writer.log(self.file_object, 'Start of Training')
        try:
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.file_object,
                                                  self.log_writer)
            data = data_getter.get_data()
            """doing the data preprocessing"""

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)

            #select only the data which came from SPC
            spc_data = preprocessor.spc_data(data)

            # check if missing values are present in the dataset
            is_null_present, cols_with_missing_values = preprocessor.is_null_present(
                spc_data)

            #drop the column which has all null values

            # if missing values are there, replace them appropriately.
            if (is_null_present):
                spc_data = preprocessor.impute_missing_values(
                    spc_data)  # missing value imputation

            # make a list of unnecessary columns
            unnecessaryColumns = [
                'Batch No', 'COMP frst R', 'COMP set', 'Cycle time',
                'Description', 'Filter Dust', 'MICOMP wtr', 'New Sand',
                'Old Sand', 'Premix time', 'Readymix time', 'SPC set moist',
                'Time of storage', 'Tot. Weight', 'Water Type', 'comp frst',
                'cost centre', 'moist act', 'temp act'
            ]

            # drop unnecessary columns
            spc_data = preprocessor.dropUnnecessaryColumns(
                spc_data, unnecessaryColumns)

            # separate good and bad batches; we keep only the data from good batches.
            spc_good_data = preprocessor.separateGoodBatchData(spc_data)

            # create a new column named TotalWater by adding water qty and spc fine water
            spc_good_data = preprocessor.createTotalWaterColumn(spc_good_data)

            #dropping batch type, water qty and spc fine water
            todropcolumns = ['Water Qty.', 'SPC fine water', 'BatchType']
            spc_good_data = preprocessor.dropUnnecessaryColumns(
                spc_good_data, todropcolumns)

            # get encoded values for categorical data

            #data = preprocessor.encodeCategoricalValues(data)

            # create separate features and labels
            X, Y = preprocessor.separate_label_feature(
                spc_good_data, label_column_name='TotalWater')
            # drop the columns obtained above
            #X=preprocessor.remove_columns(X,cols_to_drop)

            X = preprocessor.logTransformation(X)
            """ Applying the clustering approach"""

            kmeans = clustering.KMeansClustering(
                self.file_object, self.log_writer)  # object initialization.
            number_of_clusters = kmeans.elbow_plot(
                X
            )  # using the elbow plot to find the optimum number of clusters

            # Divide the data into clusters
            X = kmeans.create_clusters(X, number_of_clusters)

            # add the label column back to the clustered dataset
            X['Labels'] = Y

            # getting the unique clusters from our dataset
            list_of_clusters = X['Cluster'].unique()
            """iterating over the clusters and finding the best ML algorithm to fit each one"""

            for i in list_of_clusters:
                cluster_data = X[X['Cluster'] ==
                                 i]  # filter the data for one cluster

                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                                                     axis=1)
                cluster_label = cluster_data['Labels']

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(
                    cluster_features,
                    cluster_label,
                    test_size=1 / 3,
                    random_state=36)

                x_train_scaled = preprocessor.standardScalingData(x_train)
                x_test_scaled = preprocessor.standardScalingData(x_test)

                model_finder = tuner.Model_Finder(
                    self.file_object, self.log_writer)  # object initialization

                #getting the best model for each of the clusters
                best_model_name, best_model = model_finder.get_best_model(
                    x_train_scaled, y_train, x_test_scaled, y_test)

                #saving the best model to the directory.
                file_op = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
                save_model = file_op.save_model(best_model,
                                                best_model_name + str(i))

            # logging the successful Training
            self.log_writer.log(self.file_object, 'Successful End of Training')
            self.file_object.close()

        except Exception:
            # logging the unsuccessful Training
            self.log_writer.log(self.file_object,
                                'Unsuccessful End of Training')
            self.file_object.close()
            raise
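createTotalWaterColumn sums the two water columns that are dropped immediately afterwards; a one-line pandas sketch using the column names from the snippet:

def createTotalWaterColumn(self, data):
    data['TotalWater'] = data['Water Qty.'] + data['SPC fine water']
    return data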
Example #12
    def trainingModel(self):
        self.log_writer.log(self.file_object, 'Start of Training')
        try:
            data_getter = data_loader.Data_Getter(self.file_object,
                                                  self.log_writer)
            data = data_getter.get_data()

            #Preprocessing
            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)
            data = preprocessor.replaceInvalidValuesWithNull(data)

            is_null_present, cols_with_missing_values = preprocessor.is_null_present(
                data)

            if (is_null_present):
                data = preprocessor.impute_missing_values(
                    data, cols_with_missing_values)

            X, Y = preprocessor.separate_label_feature(
                data, label_column_name='Result')

            kmeans = clustering.KMeansClustering(
                self.file_object, self.log_writer)  # object initialization.
            number_of_clusters = kmeans.elbow_plot(
                X
            )  # using the elbow plot to find the optimum number of clusters

            # Divide the data into clusters
            X = kmeans.create_clusters(X, number_of_clusters)

            # add the label column back to the clustered dataset
            X['Labels'] = Y

            # getting the unique clusters from our dataset
            list_of_clusters = X['Cluster'].unique()
            """iterating over the clusters and finding the best ML algorithm to fit each one"""

            for i in list_of_clusters:
                cluster_data = X[X['Cluster'] ==
                                 i]  # filter the data for one cluster

                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                                                     axis=1)
                cluster_label = cluster_data['Labels']

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(
                    cluster_features,
                    cluster_label,
                    test_size=1 / 3,
                    random_state=36)

                model_finder = tuner.Model_Finder(
                    self.file_object, self.log_writer)  # object initialization

                # getting the best model for each of the clusters
                best_model_name, best_model = model_finder.get_best_model(
                    x_train, y_train, x_test, y_test)

                # saving the best model to the directory.
                file_op = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
                save_model = file_op.save_model(best_model,
                                                best_model_name + str(i))

            # logging the successful Training
            self.log_writer.log(self.file_object, 'Successful End of Training')
            self.file_object.close()

        except Exception:
            self.log_writer.log(self.file_object,
                                'Unsuccessful End of Training')
            self.file_object.close()
            raise
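impute_missing_values in this family of projects is commonly backed by sklearn's KNNImputer; a hedged sketch (the hyperparameters are illustrative):

import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer

def impute_missing_values(self, data, cols_with_missing_values=None):
    imputer = KNNImputer(n_neighbors=3, weights='uniform',
                         missing_values=np.nan)
    imputed = imputer.fit_transform(data)  # returns a NumPy array
    return pd.DataFrame(imputed, columns=data.columns)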
Example #13
    def trainingModel(self):
        self.log_writer.log(self.file_object, 'Start of Training')
        try:
            data_getter = data_loader.Data_Getter(self.file_object,
                                                  self.log_writer)
            data = data_getter.get_data()

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)
            data = preprocessor.remove_columns(
                data, ['Wafer']
            )  # remove the Wafer column as it doesn't contribute to prediction.

            X, Y = preprocessor.separate_label_feature(
                data, label_column_name='Output')

            is_null_present = preprocessor.is_null_present(X)

            if (is_null_present):
                X = preprocessor.impute_missing_values(X)

            cols_to_drop = preprocessor.get_columns_with_zero_std_deviation(X)

            X = preprocessor.remove_columns(X, cols_to_drop)

            kmeans = clustering.KMeansClustering(
                self.file_object, self.log_writer)  # object initialization.
            number_of_clusters = kmeans.elbow_plot(
                X
            )  # using the elbow plot to find the optimum number of clusters

            X = kmeans.create_clusters(X, number_of_clusters)

            X['Labels'] = Y

            list_of_clusters = X['Cluster'].unique()

            for i in list_of_clusters:
                cluster_data = X[X['Cluster'] == i]

                cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                                                     axis=1)
                cluster_label = cluster_data['Labels']

                x_train, x_test, y_train, y_test = train_test_split(
                    cluster_features,
                    cluster_label,
                    test_size=1 / 3,
                    random_state=355)

                model_finder = tuner.Model_Finder(
                    self.file_object, self.log_writer)  # object initialization

                best_model_name, best_model = model_finder.get_best_model(
                    x_train, y_train, x_test, y_test)

                file_op = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
                save_model = file_op.save_model(best_model,
                                                best_model_name + str(i))

            self.log_writer.log(self.file_object, 'Successful End of Training')
            self.file_object.close()

        except Exception:
            self.log_writer.log(self.file_object,
                                'Unsuccessful End of Training')
            self.file_object.close()
            raise
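is_null_present returns a single flag in this variant, while other snippets above also unpack the affected column names; a sketch of the single-flag form:

def is_null_present(self, data):
    # True if any cell in the frame is NaN
    return bool(data.isna().values.any())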
Example #14
    def trainingModel(self):
        # Logging the start of Training
        self.log_writer.log(self.file_object, 'Start of Training')
        try:
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.file_object,
                                                  self.log_writer)
            data = data_getter.get_data()
            """doing the data preprocessing"""

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)

            #removing unwanted columns as discussed in the EDA notebook
            print('Dropping unnecessary columns')
            data = preprocessor.dropUnnecessaryColumns(data, ['veil-type'])
            print('Operation Done!!')

            #replacing '?' values with np.nan as discussed in the EDA part
            print('Replace Invalid Values with NULL')
            data = preprocessor.replaceInvalidValuesWithNull(data)
            print('Operation Done!!')

            # check if missing values are present in the dataset
            print('Getting columns for NULL values')
            is_null_present, cols_with_missing_values = preprocessor.is_null_present(
                data)
            print('Operation Done!!')

            # if missing values are there, replace them appropriately.
            if (is_null_present):
                print('Imputing Missing values!!')
                data = preprocessor.impute_missing_values(
                    data, cols_with_missing_values)  # missing value imputation
                print('Operation Done')

            # get encoded values for categorical data
            data = preprocessor.encodeCategoricalValues(data)

            data.to_csv('tmp.csv', index=False)
            # create separate features and labels
            X, Y = preprocessor.separate_label_feature(
                data, label_column_name='class')

            # drop the columns obtained above
            #X=preprocessor.remove_columns(X,cols_to_drop)
            """ Applying the clustering approach"""
            kmeans = clustering.KMeansClustering(
                self.file_object, self.log_writer)  # object initialization.
            number_of_clusters = kmeans.elbow_plot(
                X
            )  # using the elbow plot to find the optimum number of clusters

            # Divide the data into clusters
            X = kmeans.create_clusters(X, number_of_clusters)

            # add the label column back to the clustered dataset
            X['Labels'] = Y

            # getting the unique clusters from our dataset
            list_of_clusters = X['Cluster'].unique()
            """iterating over the clusters and finding the best ML algorithm to fit each one"""

            for i in list_of_clusters:
                cluster_data = X[X['Cluster'] ==
                                 i]  # filter the data for one cluster
                print(cluster_data.shape)

                if 'Labels' in cluster_data:
                    print('Labels Column Found')
                if 'Cluster' in cluster_data:
                    print('Cluster Column Found')
                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                                                     axis=1)
                cluster_label = cluster_data['Labels']
                print(cluster_label)
                print('Cluster Label and Features Created')
                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(
                    cluster_features,
                    cluster_label,
                    test_size=1 / 3,
                    stratify=cluster_label,
                    random_state=365)
                print('Train Test Split Done!')
                model_finder = tuner.Model_Finder(
                    self.file_object, self.log_writer)  # object initialization

                print('Finding best model for cluster: ', i)
                #getting the best model for each of the clusters
                best_model_name, best_model = model_finder.get_best_model(
                    x_train, y_train, x_test, y_test, i)

                #saving the best model to the directory.
                file_op = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
                save_model = file_op.save_model(best_model,
                                                best_model_name + str(i))
                self.performance_list.extend(model_finder.perf_data)

            # logging the successful Training
            print(self.performance_list)
            print(type(self.performance_list))
            print('Inserting Performance Metrics to MongoDB')
            for dict_l in self.performance_list:
                self.dbObj.insertOneRecord('mushroomClassifierDB',
                                           'performance_metrics', dict_l)
            self.log_writer.log(self.file_object, 'Successful End of Training')
            print('Successfully end training')

            # Triggering Email
            msg = MIMEMultipart()
            msg['Subject'] = 'MushroomTypeClassifier - Model Train | ' + str(
                datetime.now())
            body = 'Model Training Done Successfully. Please find the models in models/ directory... <br><br> Thanks and Regards, <br> Rahul Garg'
            msg.attach(MIMEText(body, 'html'))
            to_addr = ['*****@*****.**']
            self.emailObj.trigger_mail(to_addr, [], msg)

        except Exception as e:
            # logging the unsuccessful Training
            self.log_writer.log(self.file_object,
                                'Unsuccessful End of Training: ' + str(e))
            raise  # re-raise so the caller sees the original traceback
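
The dbObj.insertOneRecord calls above persist one metrics dict per trained
model, but the database helper itself is not included in this listing. A
minimal sketch of such a helper built on pymongo follows; the class name and
connection URI are assumptions.

# Hypothetical sketch of the insertOneRecord helper used above,
# built on pymongo; the class name and URI are placeholders.
from pymongo import MongoClient


class MongoDBOperation:
    def __init__(self, uri='mongodb://localhost:27017'):
        self.client = MongoClient(uri)

    def insertOneRecord(self, db_name, collection_name, record):
        # insert_one also mutates `record` in place by adding an _id key
        self.client[db_name][collection_name].insert_one(record)

With a helper like this, the loop above writes one document per entry in
self.performance_list.
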
Example #15
    def trainingModel(self):
        # Logging the start of Training
        self.log_writer.log(self.file_object, 'Start of Training')
        try:
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.file_object,
                                                  self.log_writer)
            data = data_getter.get_data()
            """doing the data preprocessing"""

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)
            data = preprocessor.remove_columns(
                data, []
            )  # no columns need removal for this dataset; kept for pipeline consistency
            data.replace(
                '?', np.nan,
                inplace=True)  # replacing '?' with NaN values for imputation

            # check if missing values are present in the dataset
            is_null_present, cols_with_missing_values = preprocessor.is_null_present(
                data)

            # if missing values are there, replace them appropriately.
            if is_null_present:
                data = preprocessor.impute_missing_values(
                    data, cols_with_missing_values)  # missing value imputation

            # the target column has many labels and is highly imbalanced,
            # so we group them using the preprocessor method below
            data = preprocessor.grouping_values_of_target(data)

            # split the data into two frames: one to cluster on, and one to
            # re-attach once clustering is done
            X, Y = preprocessor.separate_data_frame(
                data, label_column_name=['Rings', 'Sex'])
            """ Applying the clustering approach"""

            kmeans = clustering.KMeansClustering(
                self.file_object, self.log_writer)  # object initialization.
            number_of_clusters = kmeans.elbow_plot(
                X
            )  # using the elbow plot to find the optimum number of clusters

            # Divide the data into clusters
            X = kmeans.create_clusters(X, number_of_clusters)

            # re-attach the held-out label columns to the clustered feature frame
            X = pd.concat([X, Y], axis=1, sort=False)

            # encode categorical data
            X = preprocessor.encode_categorical_columns(X)

            # getting the unique clusters from our dataset
            list_of_clusters = X['Cluster'].unique()
            """parsing all the clusters and looking for the best ML algorithm to fit on individual cluster"""
            df = pd.DataFrame(
                columns=['Cluster_No', 'Best_Model_Name', 'Roc_Auc_score'])

            for i in list_of_clusters:
                cluster_data = X[X['Cluster'] ==
                                 i]  # filter the data for one cluster

                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Rings', 'Cluster'],
                                                     axis=1)
                cluster_label = cluster_data['Rings']

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(
                    cluster_features,
                    cluster_label,
                    test_size=1 / 3,
                    random_state=100)

                model_finder = tuner.Model_Finder(
                    self.file_object, self.log_writer)  # object initialization

                #getting the best model for each of the clusters
                best_model_name, best_model, Roc_Auc_score = model_finder.get_best_model(
                    x_train, y_train, x_test, y_test)

                #saving the best model to the directory.
                file_op = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
                save_model = file_op.save_model(best_model,
                                                best_model_name + str(i))
                # DataFrame.append was removed in pandas 2.0; build the row
                # as a one-row frame and concatenate instead
                df = pd.concat([
                    df,
                    pd.DataFrame([{
                        'Cluster_No': i,
                        'Best_Model_Name': best_model_name + str(i),
                        'Roc_Auc_score': Roc_Auc_score
                    }])
                ], ignore_index=True)

            # logging the successful Training
            self.log_writer.log(self.file_object, 'Successful End of Training')
            self.file_object.close()
            return df

        except Exception as e:
            # logging the unsuccessful Training
            self.log_writer.log(self.file_object,
                                'Unsuccessful End of Training')
            self.file_object.close()
            raise  # re-raise so the caller sees the original traceback
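
Since every example saves its per-cluster model under best_model_name + str(i),
the prediction side has to look models up by cluster id. A minimal sketch of
that lookup follows, assuming the models were pickled under a models/
directory; the directory layout and the '.sav' suffix are assumptions, since
File_Operation is not shown in this listing.

# Hypothetical counterpart to File_Operation.save_model: load the
# pickled model whose folder name ends with the given cluster id.
import os
import pickle


def load_model_for_cluster(models_dir, cluster_id):
    for name in os.listdir(models_dir):
        if name.endswith(str(cluster_id)):
            path = os.path.join(models_dir, name, name + '.sav')
            with open(path, 'rb') as f:
                return pickle.load(f)
    raise FileNotFoundError('No saved model for cluster %s' % cluster_id)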