Example no. 1
    def trainingModel(self):
        # Logging start of the training
        self.loggerObj.logger_log("Start of TrainingModel")
        try:
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.loggerObj)
            data = data_getter.get_data()
            """doing the data preprocessing"""

            preprocessor = preprocessing.Preprocessor(self.loggerObj)

            # replacing '?' values with np.nan as discussed in the EDA part
            data = preprocessor.replaceInvalidValuesWithNull(data)

            # Drop the columns which are having missing values more than 50% of total observations
            data = preprocessor.dropUnnecessaryColumns(data)

            # check if missing values are present in the dataset
            is_null_present = preprocessor.is_null_present(data)

            # if missing values are there, impute them appropriately.
            if (is_null_present):
                data = preprocessor.impute_missing_values(
                    data)  # missing value imputation

            # create separate features and labels
            X, y = preprocessor.separate_label_feature(
                data, label_column_name='classes')

            # Categorical encoding
            X, y = preprocessor.encodeCategoricalValues(X, y)

            # Handling imbalance dataset using SMOTE
            #X, y = preprocessor.handleImbalanceDataset(X, y)

            # splitting the data into training and test set
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.3, random_state=44)

            # Standardisation of X_train data
            X_train = preprocessor.data_standardisation(X_train)

            # applying same standardisation object on X_test data
            X_test = preprocessor.prediction_data_standardisation(X_test)

            model_finder = tuner.Model_Finder(
                self.loggerObj)  # object initialization

            # getting the best model for each of the clusters
            best_model_name, best_model = model_finder.get_best_model(
                X_train, y_train, X_test, y_test)

            # saving the best model to the directory.
            file_op = file_methods.File_Operation(self.loggerObj)
            save_model = file_op.save_model(best_model, best_model_name)

        except Exception:
            # Logging the unsuccessful training
            self.loggerObj.logger_log("Unsuccessful end of training")
            raise
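
The two standardisation calls above follow the fit-on-train, transform-on-test pattern, which keeps test-set statistics out of training. A minimal sketch of how such a Preprocessor pair is typically implemented, assuming a single shared StandardScaler (hypothetical helper, not this project's actual code):

import pandas as pd
from sklearn.preprocessing import StandardScaler


class ScalerSketch:
    def __init__(self):
        # one scaler instance shared between fitting and reuse
        self.scaler = StandardScaler()

    def data_standardisation(self, X_train):
        # fit on the training data only
        scaled = self.scaler.fit_transform(X_train)
        return pd.DataFrame(scaled, columns=X_train.columns, index=X_train.index)

    def prediction_data_standardisation(self, X_test):
        # reuse the scaler already fitted on X_train
        scaled = self.scaler.transform(X_test)
        return pd.DataFrame(scaled, columns=X_test.columns, index=X_test.index)
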
Example no. 2
    def train_model(self):
        self.log_writer.log(self.file_object, 'Start of Training')
        try:
            data_getter = data_loader.Data_Getter(self.file_object,
                                                  self.log_writer)
            main_data, additional_data = data_getter.get_data()

            preprocessor = data_preprocessing.PreProcessor(
                self.file_object, self.log_writer)
            is_null_present = preprocessor.is_null_present(main_data)
            if is_null_present:
                main_data = preprocessor.impute_missing_values(main_data)
            main_data = preprocessor.map_ip_to_country(main_data,
                                                       additional_data)
            main_data = preprocessor.difference_signup_and_purchase(main_data)
            main_data = preprocessor.encoding_browser(main_data)
            main_data = preprocessor.encoding_source(main_data)
            main_data = preprocessor.encoding_sex(main_data)
            main_data = preprocessor.count_frequency_encoding_country(
                main_data)
            main_data = preprocessor.remove_unwanted_cols(main_data)
            x, y = preprocessor.separate_label_feature(main_data, 'class')

            x_train, x_test, y_train, y_test = train_test_split(x,
                                                                y,
                                                                test_size=0.3)

            #x_train,y_train = preprocessor.over_sampling_smote(x_train,y_train)

            model_finder = tuner.Model_Finder(self.file_object,
                                              self.log_writer)
            best_model_name, best_model = model_finder.get_best_model(
                x_train, y_train, x_test, y_test)

            file_op = file_methods.File_Operation(self.file_object,
                                                  self.log_writer)
            save_model = file_op.save_model(best_model, best_model_name)

            self.log_writer.log(self.file_object,
                                'Successful End of Training')
            self.file_object.close()

        except Exception as e:
            self.log_writer.log(self.file_object,
                                'Unsuccessful End of Training')
            self.file_object.close()
            raise e
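
The count_frequency_encoding_country step above is a common trick for high-cardinality categoricals such as country. A minimal sketch, assuming the method simply maps each category to its occurrence count:

import pandas as pd


def count_frequency_encoding(df, column='country'):
    # map every category to how often it occurs in the data
    freq = df[column].value_counts().to_dict()
    df[column] = df[column].map(freq)
    return df


df = pd.DataFrame({'country': ['US', 'IN', 'US', 'DE', 'US']})
print(count_frequency_encoding(df))  # US -> 3, IN -> 1, DE -> 1
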
Example no. 3
    def trainingModel(self):
        # Logging the start of Training
        self.log_writer.log(self.file_object, 'Start of Training')
        try:
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.file_object,
                                                  self.log_writer)
            data = data_getter.get_data()
            print(data.head())
            """doing the data preprocessing as dicussed in EDA"""

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)
            #data=preprocessor.remove_columns(data,['Wafer']) # remove the unnamed column as it doesn't contribute to prediction.
            data = preprocessor.binning(data)
            #removing unwanted columns as discussed in the EDA part in ipynb file
            data = preprocessor.dropUnnecessaryColumns(data, ['Ageband'])
            #print(data.isnull().sum())
            data = preprocessor.combiningfornewfeature(data)
            data = preprocessor.dropUnnecessaryColumns(
                data, ['Parch', 'Sibsp', 'FamilySize', 'Pid'])

            data = preprocessor.convertCategoricalfeatureIntonumeric(data)

            data = preprocessor.binningfare(data)
            data = preprocessor.dropUnnecessaryColumns(data, ['FareBand'])
            print(data.head())
            #print(data.isnull().sum())

            # check if missing values are present in the dataset
            is_null_present, cols_with_missing_values = preprocessor.is_null_present(
                data)

            # if missing values are there, replace them appropriately.
            if (is_null_present):
                data = preprocessor.impute_missing_values(
                    data)  # missing value imputation

            # create separate features and labels
            X, Y = preprocessor.separate_label_feature(
                data, label_column_name='Survived')
            print(Y)
            # We do not need to encode any values, as we opted for binning in this case.
            # All data is fine and ready for scaling/modeling.
            """parsing all the clusters and looking for the best ML algorithm to fit on individual cluster"""

            #for i in list_of_clusters:

            # splitting the data into training and test set for each cluster one by one
            x_train, x_test, y_train, y_test = train_test_split(
                X, Y, test_size=0.33, random_state=36)
            x_train_scaled = preprocessor.standardScalingData(x_train)
            x_test_scaled = preprocessor.standardScalingData(x_test)

            model_finder = tuner.Model_Finder(
                self.file_object, self.log_writer)  # object initialization
            #getting the best model.
            best_model_name, best_model, prediction, acc = model_finder.get_best_model(
                x_train_scaled, y_train, x_test_scaled, y_test)
            #saving the best model to the directory.
            print("Predictions:")
            print(prediction)
            print("Accuracy:")
            print(acc)

            file_op = file_methods.File_Operation(self.file_object,
                                                  self.log_writer)
            self.log_writer.log(self.file_object, 'Going to create directory')

            #save_model=file_op.save_model(best_model,best_model_name+str(i))
            save_model = file_op.save_model(best_model, best_model_name)

            # logging the successful Training
            self.log_writer.log(self.file_object, 'Successful End of Training')
            self.file_object.close()

        except Exception:
            # logging the unsuccessful Training
            self.log_writer.log(self.file_object,
                                'Unsuccessful End of Training')
            self.file_object.close()
            raise
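
The binning calls above turn continuous columns into ordinal bands, which is why this example needs no further categorical encoding. A hedged sketch of equal-width binning with pandas, assuming that is roughly what binning and the Ageband column do here (column names are illustrative):

import pandas as pd


def binning(df, column='Age', bins=4):
    # cut the continuous column into equal-width bands and keep the
    # band index as an ordinal feature
    df['Ageband'] = pd.cut(df[column], bins=bins, labels=False)
    return df


df = pd.DataFrame({'Age': [4, 22, 38, 59, 80]})
print(binning(df))  # band index 0..3 instead of raw ages
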
Example no. 4
    def trainingModel(self):
        # Logging the start of Training
        self.log_writer.log(self.file_object, "start of training")
        try:
            #get the data from the source
            data_getter = Data_Getter(self.file_object, self.log_writer)
            data = data_getter.get_data()
            """doing the data preprocessing"""
            preprocessor = Preprocessor(self.file_object, self.log_writer)

            # check if missing values are present in the dataset
            is_null_present, cols_with_missing_values = preprocessor.is_null_present(
                data)

            # if missing values are there, replace them appropriately.
            if (is_null_present):
                data = preprocessor.impute_missing_values(
                    data)  # missing value imputation

            # create separate features and labels
            X, Y = preprocessor.separate_label_feature(
                data, label_column_name='Concrete_compressive _strength')

            X = preprocessor.logTransformation(X)
            """ Applying the clustering approach"""

            kmeans = clustering.KMeansClustering(
                self.file_object, self.log_writer)  # object initialization.
            number_of_clusters = kmeans.elbow_plot(X)

            #  using the elbow plot to find the number of optimum clusters

            # Divide the data into clusters
            X = kmeans.create_clusters(X, number_of_clusters)
            #create a new column in the dataset consisting of the corresponding cluster assignments.
            print('inside train model')
            print(Y)
            X['Labels'] = Y
            print(type(X))

            # getting the unique clusters from our dataset
            list_of_clusters = X['Cluster'].unique()
            """parsing all the clusters and looking for the best ML algorithm to fit on individual cluster"""
            for i in list_of_clusters:
                cluster_data = X[X['Cluster'] ==
                                 i]  # filter the data for one cluster
                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                                                     axis=1)
                cluster_label = cluster_data['Labels']

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(
                    cluster_features,
                    cluster_label,
                    test_size=1 / 3,
                    random_state=30)

                x_train_scaled = preprocessor.standardScalingData(x_train)
                x_test_scaled = preprocessor.standardScalingData(x_test)

                model_finder = tuner.Model_Finder(
                    self.file_object, self.log_writer)  # object initialization

                #getting the best model for each of the clusters
                best_model_name, best_model = model_finder.get_best_model(
                    x_train_scaled, y_train, x_test_scaled, y_test)

                #saving the best model to the directory.
                file_op = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
                save_model = file_op.save_model(best_model,
                                                best_model_name + str(i))

            # logging the successful Training
            self.log_writer.log(self.file_object, 'Successful End of Training')
            self.file_object.close()

        except Exception:
            # logging the unsuccessful Training
            self.log_writer.log(self.file_object,
                                'Unsuccessful End of Training')
            self.file_object.close()
            raise
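
The elbow_plot/create_clusters pair above drives the per-cluster training loop used in several of these examples. A minimal sketch of one common implementation, assuming scikit-learn's KMeans with the elbow located automatically by the kneed package (a sketch, not the project's code):

from kneed import KneeLocator
from sklearn.cluster import KMeans


def elbow_plot(X, max_k=10):
    # within-cluster sum of squares for k = 1..max_k; KneeLocator then
    # picks the elbow automatically instead of eyeballing a plot
    wcss = [KMeans(n_clusters=k, n_init=10, random_state=42).fit(X).inertia_
            for k in range(1, max_k + 1)]
    return KneeLocator(range(1, max_k + 1), wcss,
                       curve='convex', direction='decreasing').knee


def create_clusters(X, number_of_clusters):
    # fit the final model and append the assignments as a 'Cluster' column
    kmeans = KMeans(n_clusters=number_of_clusters, n_init=10, random_state=42)
    X = X.copy()
    X['Cluster'] = kmeans.fit_predict(X)
    return X
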
Example no. 5
    def trainingModel(self):
        # Logging the start of Training
        self.log_writer.log(self.file_object, 'Start of Training')
        try:
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.file_object,
                                                  self.log_writer)
            data = data_getter.get_data()
            """doing the data preprocessing"""

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)

            # create separate features and labels
            X, Y = preprocessor.separate_label_feature(data,
                                                       label_column_name='A1')

            new_X = X[[
                'H18', 'F76', 'F46', 'G57', 'C13', 'A71', 'E115', 'F56', 'I59',
                'A91'
            ]]
            """ Applying the clustering approach"""

            kmeans = clustering.KMeansClustering(
                self.file_object, self.log_writer)  # object initialization.
            number_of_clusters = kmeans.elbow_plot(
                new_X
            )  #  using the elbow plot to find the number of optimum clusters

            # Divide the data into clusters
            new_X = kmeans.create_clusters(new_X, number_of_clusters)

            #create a new column in the dataset consisting of the corresponding cluster assignments.
            new_X['Labels'] = Y

            # getting the unique clusters from our dataset
            list_of_clusters = new_X['Cluster'].unique()
            """parsing all the clusters and looking for the best ML algorithm to fit on individual cluster"""

            for i in list_of_clusters:
                cluster_data = new_X[new_X['Cluster'] ==
                                     i]  # filter the data for one cluster

                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                                                     axis=1)
                cluster_label = cluster_data['Labels']

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(
                    cluster_features,
                    cluster_label,
                    test_size=1 / 3,
                    random_state=355)

                model_finder = tuner.Model_Finder(
                    self.file_object, self.log_writer)  # object initialization

                #getting the best model for each of the clusters
                best_model_name, best_model = model_finder.get_best_model(
                    x_train, y_train, x_test, y_test)

                #saving the best model to the directory.
                file_op = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
                save_model = file_op.save_model(best_model,
                                                best_model_name + str(i))

            # logging the successful Training
            self.log_writer.log(self.file_object, 'Successful End of Training')
            self.file_object.close()

        except Exception:
            # logging the unsuccessful Training
            self.log_writer.log(self.file_object,
                                'Unsuccessful End of Training')
            self.file_object.close()
            raise
Example no. 6
    def trainingModel(self):
        # Logging the start of Training
        self.log_writer.log(self.file_object, 'Start of Training')
        try:
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.file_object,
                                                  self.log_writer)
            data = data_getter.get_data()
            """doing the data preprocessing"""

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)

            #removing unwanted columns as discussed in the EDA part in ipynb file
            print('Dropping Unnecessary Columns')
            data = preprocessor.dropUnnecessaryColumns(data, ['veil-type'])
            print('Operation Done!!')

            #replacing '?' values with np.nan as discussed in the EDA part
            print('Replace Invalid Values with NULL')
            data = preprocessor.replaceInvalidValuesWithNull(data)
            print('Operation Done!!')

            # check if missing values are present in the dataset
            print('Getting columns for NULL values')
            is_null_present, cols_with_missing_values = preprocessor.is_null_present(
                data)
            print('Operation Done!!')

            # if missing values are there, replace them appropriately.
            if (is_null_present):
                print('Imputing Missing values!!')
                data = preprocessor.impute_missing_values(
                    data, cols_with_missing_values)  # missing value imputation
                print('Operation Done')

            # get encoded values for categorical data
            data = preprocessor.encodeCategoricalValues(data)

            data.to_csv('tmp.csv', index=False)
            # create separate features and labels
            X, Y = preprocessor.separate_label_feature(
                data, label_column_name='class')

            # drop the columns obtained above
            #X=preprocessor.remove_columns(X,cols_to_drop)
            """ Applying the clustering approach"""
            kmeans = clustering.KMeansClustering(
                self.file_object, self.log_writer)  # object initialization.
            number_of_clusters = kmeans.elbow_plot(
                X
            )  #  using the elbow plot to find the number of optimum clusters

            # Divide the data into clusters
            X = kmeans.create_clusters(X, number_of_clusters)

            #create a new column in the dataset consisting of the corresponding cluster assignments.
            X['Labels'] = Y

            # getting the unique clusters from our dataset
            list_of_clusters = X['Cluster'].unique()
            """parsing all the clusters and looking for the best ML algorithm to fit on individual cluster"""

            for i in list_of_clusters:
                cluster_data = X[X['Cluster'] ==
                                 i]  # filter the data for one cluster
                print(cluster_data.shape)

                if 'Labels' in cluster_data:
                    print('Labels Column Found')
                if 'Cluster' in cluster_data:
                    print('Cluster Column Found')
                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                                                     axis=1)
                cluster_label = cluster_data['Labels']
                print(cluster_label)
                print('Cluster Label and Features Created')
                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(
                    cluster_features,
                    cluster_label,
                    test_size=1 / 3,
                    stratify=cluster_label,
                    random_state=365)
                print('Train Test Split Done!')
                model_finder = tuner.Model_Finder(
                    self.file_object, self.log_writer)  # object initialization

                print('Finding best model for cluster: ', i)
                #getting the best model for each of the clusters
                best_model_name, best_model = model_finder.get_best_model(
                    x_train, y_train, x_test, y_test, i)

                #saving the best model to the directory.
                file_op = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
                save_model = file_op.save_model(best_model,
                                                best_model_name + str(i))
                self.performance_list.extend(model_finder.perf_data)

            # logging the successful Training
            print(self.performance_list)
            print(type(self.performance_list))
            print('Inserting Performance Metrics to MongoDB')
            for dict_l in self.performance_list:
                self.dbObj.insertOneRecord('mushroomClassifierDB',
                                           'performance_metrics', dict_l)
            self.log_writer.log(self.file_object, 'Successful End of Training')
            print('Successfully end training')

            # Triggering Email
            msg = MIMEMultipart()
            msg['Subject'] = 'MushroomTypeClassifier - Model Train | ' + str(
                datetime.now())
            body = 'Model Training Done Successfully. Please find the models in models/ directory... <br><br> Thanks and Regards, <br> Rahul Garg'
            msg.attach(MIMEText(body, 'html'))
            to_addr = ['*****@*****.**']
            self.emailObj.trigger_mail(to_addr, [], msg)

        except Exception as e:
            # logging the unsuccessful Training
            self.log_writer.log(self.file_object,
                                'Unsuccessful End of Training: ' + str(e))
            raise
Example no. 7
    def trainingModel(self):
        # Logging the start of Training
        self.log_writer.log(self.file_object, 'Start of Training')
        try:
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.file_object,
                                                  self.log_writer)
            data = data_getter.get_data()
            """doing the data preprocessing"""

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)
            #data=preprocessor.remove_columns(data,['Wafer']) # remove the unnamed column as it doesn't contribute to prediction.

            data = preprocessor.enocdeCategoricalvalues(data)

            X = data.drop(['class'], axis=1)
            Y = data['class']

            X, Y = preprocessor.handleImbalanceDataset(X, Y)
            """ Applying the clustering approach"""

            kmeans = clustering.KMeansClustering(
                self.file_object, self.log_writer)  # object initialization.
            number_of_clusters = kmeans.elbow_plot(
                X
            )  #  using the elbow plot to find the number of optimum clusters

            # Divide the data into clusters
            X = kmeans.create_clusters(X, number_of_clusters)

            #create a new column in the dataset consisting of the corresponding cluster assignments.
            X['Labels'] = Y

            # getting the unique clusters from our dataset
            list_of_clusters = X['Cluster'].unique()
            """parsing all the clusters and looking for the best ML algorithm to fit on individual cluster"""

            for i in list_of_clusters:
                cluster_data = X[X['Cluster'] ==
                                 i]  # filter the data for one cluster

                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                                                     axis=1)
                cluster_label = cluster_data['Labels']

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(
                    cluster_features,
                    cluster_label,
                    test_size=1 / 3,
                    random_state=355)
                x_train = preprocessor.scaleData(x_train)
                x_test = preprocessor.scaleData(x_test)

                model_finder = tuner.Model_Finder(
                    self.file_object, self.log_writer)  # object initialization

                #getting the best model for each of the clusters
                best_model_name, best_model = model_finder.get_best_model(
                    x_train, y_train, x_test, y_test)

                #saving the best model to the directory.
                file_op = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
                save_model = file_op.save_model(best_model,
                                                best_model_name + str(i))

            # logging the successful Training
            self.log_writer.log(self.file_object, 'Successful End of Training')
            self.file_object.close()

        except Exception:
            # logging the unsuccessful Training
            self.log_writer.log(self.file_object,
                                'Unsuccessful End of Training')
            self.file_object.close()
            raise
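
Note that handleImbalanceDataset above runs before the train/test split, so synthetic samples can leak into the test set; the commented-out call in Example no. 2, which oversamples x_train only, is the safer order. A minimal sketch assuming the method wraps SMOTE from imbalanced-learn:

from imblearn.over_sampling import SMOTE


def handle_imbalance(x_train, y_train):
    # oversample the minority class with synthetic neighbours; best applied
    # to the training split only, so the test set stays untouched
    return SMOTE(random_state=42).fit_resample(x_train, y_train)
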
Example no. 8
    def trainingModel(self):
        # Logging the start of Training
        self.log_writer.log(self.file_object, 'Start of Training')
        try:
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.file_object,
                                                  self.log_writer)
            data = data_getter.get_data()
            """doing the data preprocessing"""

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)
            #data=preprocessor.remove_columns(data,['Wafer']) # remove the unnamed column as it doesn't contribute to prediction.

            #removing unwanted columns as discussed in the EDA part in ipynb file
            data = preprocessor.dropUnnecessaryColumns(data, [
                'DATE', 'Precip', 'WETBULBTEMPF', 'DewPointTempF',
                'StationPressure'
            ])

            #replacing '?' values with np.nan as discussed in the EDA part

            data = preprocessor.replaceInvalidValuesWithNull(data)

            # check if missing values are present in the dataset
            is_null_present, cols_with_missing_values = preprocessor.is_null_present(
                data)

            # if missing values are there, replace them appropriately.
            if (is_null_present):
                data = preprocessor.impute_missing_values(
                    data)  # missing value imputation

            # get encoded values for categorical data

            #data = preprocessor.encodeCategoricalValues(data)

            # create separate features and labels
            X, Y = preprocessor.separate_label_feature(
                data, label_column_name='VISIBILITY')

            # drop the columns obtained above
            #X=preprocessor.remove_columns(X,cols_to_drop)
            """ Applying the clustering approach"""

            kmeans = clustering.KMeansClustering(
                self.file_object, self.log_writer)  # object initialization.
            number_of_clusters = kmeans.elbow_plot(
                X
            )  #  using the elbow plot to find the number of optimum clusters

            # Divide the data into clusters
            X = kmeans.create_clusters(X, number_of_clusters)

            #create a new column in the dataset consisting of the corresponding cluster assignments.
            X['Labels'] = Y

            # getting the unique clusters from our dataset
            list_of_clusters = X['Cluster'].unique()
            """parsing all the clusters and looking for the best ML algorithm to fit on individual cluster"""

            for i in list_of_clusters:
                cluster_data = X[X['Cluster'] ==
                                 i]  # filter the data for one cluster

                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                                                     axis=1)
                cluster_label = cluster_data['Labels']

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(
                    cluster_features,
                    cluster_label,
                    test_size=1 / 3,
                    random_state=36)

                x_train_scaled = preprocessor.standardScalingData(x_train)
                x_test_scaled = preprocessor.standardScalingData(x_test)

                model_finder = tuner.Model_Finder(
                    self.file_object, self.log_writer)  # object initialization

                #getting the best model for each of the clusters
                best_model_name, best_model = model_finder.get_best_model(
                    x_train_scaled, y_train, x_test_scaled, y_test)

                #saving the best model to the directory.
                file_op = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
                save_model = file_op.save_model(best_model,
                                                best_model_name + str(i))

            # logging the successful Training
            self.log_writer.log(self.file_object, 'Successful End of Training')
            self.file_object.close()

        except Exception:
            # logging the unsuccessful Training
            self.log_writer.log(self.file_object,
                                'Unsuccessful End of Training')
            self.file_object.close()
            raise
Example no. 9
    def get_stacked(self, x, y, base_algo, algo_list):
        """
            Method Name: get_stacked
            x : Feature Columns
            y : Target Column
            base_algo : Meta-algorithm used to blend the predictions of the base algorithms
            algo_list: Base algorithms used in stacking
        """
        self.logger_object.log(
            self.file_object,
            'Entered the get_stacked method of Stacking class')
        # splitting the data, holding out 50% of it as a validation set
        train, val_train, test, val_test = train_test_split(x,
                                                            y,
                                                            test_size=0.5)

        # splitting the remaining data into training and test sets
        x_train, x_test, y_train, y_test = train_test_split(train,
                                                            test,
                                                            test_size=0.2)
        model_finder = tuner.Model_Finder(
            self.file_object, self.logger_object)  # object initialization
        # Here we are creating the input data for the Meta-Algo
        predict = []
        fitted_models = []
        # Fitting base models with train data
        for i, algo in enumerate(algo_list):
            if algo == "rfr":
                z = model_finder.get_best_params_for_random_forest(
                    x_train, y_train)
            elif algo == "lgbm":
                z = model_finder.get_best_params_for_lgbm(
                    x_train, y_train)
            elif algo == "svc":
                z = model_finder.get_best_params_for_svc(
                    x_train, y_train)

            fitted_models.append(z)
            filename = 'base_model_' + str(i) + '.sav'  # Saving Base models
            with open(filename, 'wb') as f:
                pickle.dump(z, f)
            # predictions of the base models on the validation data
            predict.append(z.predict(val_train))
        # Blending all predictions column-wise to feed the meta-algo
        predict_val = np.column_stack(predict)

        # Here we are creating the input to check accuracy of the trained Meta-Algo
        base_test_predictions = []
        for model in fitted_models:
            base_test_predictions.append(model.predict(x_test))
        predict_test = np.column_stack(base_test_predictions)

        # Fitting the meta-algorithm on the blended base-model predictions
        model = base_algo.fit(predict_val, val_test)

        # Uncomment the 3 lines below to apply cross-validation
        # cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
        # scores = cross_val_score(model, x, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
        # print(scores)
        print('Test Score of meta algorithm is',
              model.score(predict_test, y_test))
        print('Train Score of meta algorithm is',
              model.score(predict_val, val_test))

        # Saving the stacked model
        filename = 'stacking_model.sav'
        with open(filename, 'wb') as f:
            pickle.dump(model, f)

        return model
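
For reference, the core of get_stacked fits in a few lines: train the base models, column-stack their held-out predictions, and fit a meta-learner on that matrix. A self-contained sketch with scikit-learn on toy data (estimator choices are illustrative):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

X, y = make_classification(n_samples=500, random_state=0)
x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.5,
                                                  random_state=0)

base_models = [RandomForestClassifier(random_state=0), SVC(random_state=0)]
for m in base_models:
    m.fit(x_train, y_train)

# column-stack the base predictions on held-out data as meta features
meta_features = np.column_stack([m.predict(x_val) for m in base_models])
meta_model = LogisticRegression().fit(meta_features, y_val)
print('meta train score:', meta_model.score(meta_features, y_val))
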
Example no. 10
    def trainingModel(self):
        # Logging the start of Training
        self.log_writer.log(self.file_object, 'Start of Training')
        try:
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.file_object,
                                                  self.log_writer)
            data = data_getter.get_data()
            """doing the data preprocessing"""

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)

            #select only data which came from SPC
            spc_data = preprocessor.spc_data(data)

            # check if missing values are present in the dataset
            is_null_present, cols_with_missing_values = preprocessor.is_null_present(
                spc_data)

            #drop the column which has all null values

            # if missing values are there, replace them appropriately.
            if (is_null_present):
                spc_data = preprocessor.impute_missing_values(
                    spc_data)  # missing value imputation

            # make a list of unnecessary columns
            unnecessaryColumns = [
                'Batch No', 'COMP frst R', 'COMP set', 'Cycle time',
                'Description', 'Filter Dust', 'MICOMP wtr', 'New Sand',
                'Old Sand', 'Premix time', 'Readymix time', 'SPC set moist',
                'Time of storage', 'Tot. Weight', 'Water Type', 'comp frst',
                'cost centre', 'moist act', 'temp act'
            ]

            # drop unnecessary columns
            spc_data = preprocessor.dropUnnecessaryColumns(
                spc_data, unnecessaryColumns)

            # separate good and bad batches; we keep only the data from good batches.
            spc_good_data = preprocessor.separateGoodBatchData(spc_data)

            # create a new column named TotalWater which adds Water Qty. and SPC fine water
            spc_good_data = preprocessor.createTotalWaterColumn(spc_good_data)

            #dropping batch type, water qty and spc fine water
            todropcolumns = ['Water Qty.', 'SPC fine water', 'BatchType']
            spc_good_data = preprocessor.dropUnnecessaryColumns(
                spc_good_data, todropcolumns)

            # get encoded values for categorical data

            #data = preprocessor.encodeCategoricalValues(data)

            # create separate features and labels
            X, Y = preprocessor.separate_label_feature(
                spc_good_data, label_column_name='TotalWater')
            # drop the columns obtained above
            #X=preprocessor.remove_columns(X,cols_to_drop)

            X = preprocessor.logTransformation(X)
            """ Applying the clustering approach"""

            kmeans = clustering.KMeansClustering(
                self.file_object, self.log_writer)  # object initialization.
            number_of_clusters = kmeans.elbow_plot(
                X
            )  #  using the elbow plot to find the number of optimum clusters

            # Divide the data into clusters
            X = kmeans.create_clusters(X, number_of_clusters)

            #create a new column in the dataset consisting of the corresponding cluster assignments.
            X['Labels'] = Y

            # getting the unique clusters from our dataset
            list_of_clusters = X['Cluster'].unique()
            """parsing all the clusters and looking for the best ML algorithm to fit on individual cluster"""

            for i in list_of_clusters:
                cluster_data = X[X['Cluster'] ==
                                 i]  # filter the data for one cluster

                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                                                     axis=1)
                cluster_label = cluster_data['Labels']

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(
                    cluster_features,
                    cluster_label,
                    test_size=1 / 3,
                    random_state=36)

                x_train_scaled = preprocessor.standardScalingData(x_train)
                x_test_scaled = preprocessor.standardScalingData(x_test)

                model_finder = tuner.Model_Finder(
                    self.file_object, self.log_writer)  # object initialization

                #getting the best model for each of the clusters
                best_model_name, best_model = model_finder.get_best_model(
                    x_train_scaled, y_train, x_test_scaled, y_test)

                #saving the best model to the directory.
                file_op = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
                save_model = file_op.save_model(best_model,
                                                best_model_name + str(i))

            # logging the successful Training
            self.log_writer.log(self.file_object, 'Successful End of Training')
            self.file_object.close()

        except Exception:
            # logging the unsuccessful Training
            self.log_writer.log(self.file_object,
                                'Unsuccessful End of Training')
            self.file_object.close()
            raise
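
logTransformation above tames right-skewed measurements before clustering. A short sketch, assuming it applies an elementwise log; log1p is the usual choice since it maps zero to zero:

import numpy as np
import pandas as pd


def log_transformation(X):
    # compress right-skewed feature distributions elementwise
    return X.apply(np.log1p)


X = pd.DataFrame({'feature_1': [0.0, 120.5], 'feature_2': [980.0, 1040.0]})
print(log_transformation(X))
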
Example no. 11
    def trainingModel(self):
        self.log_writer.log(self.file_object, 'Start of Training')
        try:
            data_getter = data_loader.Data_Getter(self.file_object,
                                                  self.log_writer)
            data = data_getter.get_data()

            #Preprocessing
            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)
            data = preprocessor.replaceInvalidValuesWithNull(data)

            is_null_present, cols_with_missing_values = preprocessor.is_null_present(
                data)

            if (is_null_present):
                data = preprocessor.impute_missing_values(
                    data, cols_with_missing_values)

            X, Y = preprocessor.separate_label_feature(
                data, label_column_name='Result')

            kmeans = clustering.KMeansClustering(
                self.file_object, self.log_writer)  # object initialization.
            number_of_clusters = kmeans.elbow_plot(
                X
            )  # using the elbow plot to find the number of optimum clusters

            # Divide the data into clusters
            X = kmeans.create_clusters(X, number_of_clusters)

            # create a new column in the dataset consisting of the corresponding cluster assignments.
            X['Labels'] = Y

            # getting the unique clusters from our dataset
            list_of_clusters = X['Cluster'].unique()
            """parsing all the clusters and looking for the best ML algorithm to fit on individual cluster"""

            for i in list_of_clusters:
                cluster_data = X[X['Cluster'] ==
                                 i]  # filter the data for one cluster

                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                                                     axis=1)
                cluster_label = cluster_data['Labels']

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(
                    cluster_features,
                    cluster_label,
                    test_size=1 / 3,
                    random_state=36)

                model_finder = tuner.Model_Finder(
                    self.file_object, self.log_writer)  # object initialization

                # getting the best model for each of the clusters
                best_model_name, best_model = model_finder.get_best_model(
                    x_train, y_train, x_test, y_test)

                # saving the best model to the directory.
                file_op = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
                save_model = file_op.save_model(best_model,
                                                best_model_name + str(i))

            # logging the successful Training
            self.log_writer.log(self.file_object, 'Successful End of Training')
            self.file_object.close()

        except Exception:
            self.log_writer.log(self.file_object,
                                'Unsuccessful End of Training')
            self.file_object.close()
            raise
Example no. 12
    def trainingModel(self):
        # Logging the start of Training
        self.log_writer.log(self.file_object, 'Start of Training')
        try:
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.file_object,
                                                  self.log_writer)
            data = data_getter.get_data()
            data.drop('weather_description', inplace=True, axis=1)
            """doing the data preprocessing"""

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)

            is_null_present, cols_with_missing_values = preprocessor.is_null_present(
                data)
            if (is_null_present):
                data = preprocessor.impute_missing_values(data)

            # data  = preprocessor.logTransformation(data)

            # encode the prediction data
            data_encoded = preprocessor.encodeCategoricalValues(data)
            ###Time features

            data = preprocessor.create_timefeatures(data_encoded)

            print(data_encoded)
            # create separate features and labels
            X, Y = preprocessor.separate_label_feature(
                data, label_column_name="traffic_volume")
            # drop the columns obtained above

            # splitting the data into training and test set for each cluster one by one
            x_train, x_test, y_train, y_test = train_test_split(
                X, Y, test_size=1 / 3, random_state=36)

            model_finder = tuner.Model_Finder(
                self.file_object, self.log_writer)  # object initialization

            #getting the best model for each of the clusters
            best_model_name, best_model = model_finder.get_best_model(
                x_train, y_train, x_test, y_test)

            #saving the best model to the directory.
            file_op = file_methods.File_Operation(self.file_object,
                                                  self.log_writer)
            save_model = file_op.save_model(best_model, best_model_name)

            # logging the successful Training
            self.log_writer.log(self.file_object, 'Successful End of Training')
            self.file_object.close()

        except Exception:
            # logging the unsuccessful Training
            self.log_writer.log(self.file_object,
                                'Unsuccessful End of Training')
            self.file_object.close()
            raise
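
create_timefeatures above derives calendar features from the timestamp before the traffic_volume regression. A minimal sketch with pandas, assuming a date_time column (the column name is an assumption):

import pandas as pd


def create_timefeatures(df, column='date_time'):
    # derive simple calendar features from the timestamp column
    df[column] = pd.to_datetime(df[column])
    df['hour'] = df[column].dt.hour
    df['dayofweek'] = df[column].dt.dayofweek
    df['month'] = df[column].dt.month
    return df.drop(columns=[column])


df = pd.DataFrame({'date_time': ['2016-10-02 09:00:00'],
                   'traffic_volume': [5545]})
print(create_timefeatures(df))
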
Example no. 13
    def trainingModel(self):
        self.log_writer.log(self.file_object, 'Start of Training')
        try:
            data_getter = data_loader.Data_Getter(self.file_object,
                                                  self.log_writer)
            data = data_getter.get_data()

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)
            data = preprocessor.remove_columns(
                data, ['Wafer']
            )  # remove the unnamed column as it doesn't contribute to prediction.

            X, Y = preprocessor.separate_label_feature(
                data, label_column_name='Output')

            is_null_present = preprocessor.is_null_present(X)

            if (is_null_present):
                X = preprocessor.impute_missing_values(X)

            cols_to_drop = preprocessor.get_columns_with_zero_std_deviation(X)

            X = preprocessor.remove_columns(X, cols_to_drop)

            kmeans = clustering.KMeansClustering(
                self.file_object, self.log_writer)  # object initialization.
            number_of_clusters = kmeans.elbow_plot(
                X
            )  #  using the elbow plot to find the number of optimum clusters

            X = kmeans.create_clusters(X, number_of_clusters)

            X['Labels'] = Y

            list_of_clusters = X['Cluster'].unique()

            for i in list_of_clusters:
                cluster_data = X[X['Cluster'] == i]

                cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                                                     axis=1)
                cluster_label = cluster_data['Labels']

                x_train, x_test, y_train, y_test = train_test_split(
                    cluster_features,
                    cluster_label,
                    test_size=1 / 3,
                    random_state=355)

                model_finder = tuner.Model_Finder(
                    self.file_object, self.log_writer)  # object initialization

                best_model_name, best_model = model_finder.get_best_model(
                    x_train, y_train, x_test, y_test)

                file_op = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
                save_model = file_op.save_model(best_model,
                                                best_model_name + str(i))

            self.log_writer.log(self.file_object, 'Successful End of Training')
            self.file_object.close()

        except Exception:
            self.log_writer.log(self.file_object,
                                'Unsuccessful End of Training')
            self.file_object.close()
            raise
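
get_columns_with_zero_std_deviation above flags constant columns: a zero standard deviation means the column takes one value everywhere and cannot separate good from bad wafers. A one-line sketch of that check:

def get_columns_with_zero_std_deviation(X):
    # constant columns carry no signal, so they can be dropped
    return [col for col in X.columns if X[col].std() == 0]
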
Example no. 14
    def trainingModel(self):
        # Logging the start of Training
        self.log_writer.log(self.file_object, 'Start of Training')
        try:
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.file_object,
                                                  self.log_writer)
            data = data_getter.get_data()
            """doing the data preprocessing"""

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)
            #data.replace('?',np.NaN,inplace=True) # replacing '?' with NaN values for imputation

            # create separate features and labels
            X, Y = preprocessor.separate_label_feature(data,
                                                       label_column_name='y')

            # Dropping column after performing EDA
            preprocessor_cus = preprocess_cus.Preprocessor_cus(
                self.file_object, self.log_writer)
            X = preprocessor_cus.drop_column(X)
            Y.replace({
                "no": 0,
                "yes": 1
            }, inplace=True)  # Encoding Y (Prediction label)

            # Response Encoding process
            # cat_cols2 = preprocessor_cus.categorical_column(X)
            # X = preprocessor_cus.ResponseEncoder(cat_cols2, X, Y)
            X = preprocessor_cus.test_data_encode(X)  # Using Predefined Values

            print("Shape of the dataset after encoding: ", X.shape)

            # check if missing values are present in the dataset
            is_null_present, cols_with_missing_values = preprocessor.is_null_present(
                X)

            # if missing values are there, replace them appropriately.
            if (is_null_present):
                X = preprocessor.impute_missing_values(
                    X)  # missing value imputation

            features, label = preprocessor.handleImbalanceDataset(X, Y)
            # splitting the data into training and test set for each cluster one by one
            x_train, x_test, y_train, y_test = train_test_split(
                features, label, test_size=1 / 5, random_state=42)

            model_finder = tuner.Model_Finder(
                self.file_object, self.log_writer)  # object initialization

            #getting the best model for each of the clusters
            best_model_name, best_model = model_finder.get_best_model(
                x_train, y_train, x_test, y_test)

            #saving the best model to the directory.
            file_op = file_methods.File_Operation(self.file_object,
                                                  self.log_writer)
            save_model = file_op.save_model(best_model, best_model_name)

            # logging the successful Training
            self.log_writer.log(self.file_object, 'Successful End of Training')
            self.file_object.close()

        except Exception as e:
            # logging the unsuccessful Training
            self.log_writer.log(self.file_object,
                                'Unsuccessful End of Training')
            self.file_object.close()
            raise
Example no. 15
    def trainingModel(self):
        # Logging the start of Training
        self.log_writer.log('wafer_log', 'Start of Training')
        try:
            # Getting the data from the source
            # data_getter=data_loader.Data_Getter(self.file_object,self.log_writer)
            data = self.mongo.downlaod_all_from_mongo('wafer_good_data',
                                                      'temp_db')
            """doing the data preprocessing"""

            preprocessor = preprocessing.Preprocessor('wafer_log',
                                                      self.log_writer)
            data = preprocessor.remove_columns(
                data, ['Wafer']
            )  # remove the wafer column as it doesn't contribute to prediction.

            # create separate features and labels
            X, Y = preprocessor.separate_label_feature(
                data, label_column_name='Good/Bad')

            # check if missing values are present in the dataset

            # if missing values are there, replace them appropriately.
            X.replace(to_replace='NULL', value=np.nan,
                      inplace=True)  # consumes  4 sec to compute
            is_null_present = preprocessor.is_null_present(X)
            if (is_null_present):
                X = preprocessor.impute_missing_values(
                    X)  # missing value imputation

            # check further which columns do not contribute to predictions
            # if the standard deviation for a column is zero, it means that the column has constant values
            # and they are giving the same output both for good and bad sensors
            # prepare the list of such columns to drop
            cols_to_drop = preprocessor.get_columns_with_zero_std_deviation(
                X)  # consumes a lot of time
            # drop the columns obtained above
            X = preprocessor.remove_columns(X, cols_to_drop)
            """ Applying the clustering approach"""

            kmeans = clustering.KMeansClustering(
                'wafer_log', self.log_writer)  # object initialization.
            number_of_clusters = kmeans.elbow_plot(
                X
            )  #  using the elbow plot to find the number of optimum clusters

            # Divide the data into clusters
            X = kmeans.create_clusters(X, number_of_clusters)

            #create a new column in the dataset consisting of the corresponding cluster assignments.

            # X=pd.DataFrame.join(X,Y)
            X['Labels'] = Y.values

            # getting the unique clusters from our dataset
            list_of_clusters = X['Cluster'].unique()
            """parsing all the clusters and looking for the best ML algorithm to fit on individual cluster"""

            for index, i in enumerate(list_of_clusters):
                cluster_data = X[X['Cluster'] ==
                                 i]  # filter the data for one cluster
                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                                                     axis=1)
                cluster_label = cluster_data['Labels']

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(
                    cluster_features,
                    cluster_label,
                    test_size=1 / 3,
                    random_state=355)

                model_finder = tuner.Model_Finder(
                    'wafer_log', self.log_writer)  # object initialization

                #getting the best model for each of the clusters
                best_model_name, best_model = model_finder.get_best_model(
                    x_train, y_train, x_test, y_test)

                #saving the best model to the directory.
                # file_op = file_methods.File_Operation('wafer_log',self.log_writer)
                # save_model=file_op.save_model(best_model,best_model_name+str(i))
                print(best_model)
                best_model = pickle.dumps(best_model)
                self.aws.Upload_To_S3_obj(best_model,
                                          best_model_name + str(index) +
                                          '.sav',
                                          bucket_prefix='wafer-model')

            # logging the successful Training
            self.log_writer.log('wafer_log', 'Successful End of Training')
            # self.file_object.close()

        except Exception as err:
            # logging the unsuccessful Training
            self.log_writer.log('wafer_log', 'Unsuccessful End of Training')
            # self.file_object.close()
            print(str(err))
            raise err
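
This variant skips file_methods.File_Operation and ships the pickled model straight to S3. A minimal sketch of the same pattern with boto3 (bucket and key names are illustrative; Upload_To_S3_obj's real signature may differ):

import pickle

import boto3


def upload_model_to_s3(model, bucket_name, key):
    # serialize in memory and push the bytes straight to S3, so no
    # temporary .sav file is needed on disk
    s3 = boto3.client('s3')
    s3.put_object(Bucket=bucket_name, Key=key, Body=pickle.dumps(model))
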
Example no. 16
    def trainingModel(self):
        # Logging the start of Training
        self.log_writer.log(self.file_object, 'Start of Training')
        try:
            # Getting the data from the source
            data_getter=data_loader.Data_Getter(self.file_object,self.log_writer)
            data=data_getter.get_data()


            """ doing the data preprocessing. 
            All the pre processing steps are based on the EDA done previously
            """
            """
            1. Duplicate
            2. Remove columns: 	"serial","rate","listed_in(type)","listed_in(city)"
            3. Null removal
            4. Convert cost column to number
            5. Categorical to Numerical
            """

            preprocessor=preprocessing.Preprocessor(self.file_object,self.log_writer)


            # removing unwanted columns as discussed in the EDA part of the notebook
            data = preprocessor.dropUnnecessaryColumns(data, ["serial", "rate", "listed_in(type)", "listed_in(city)"])

            # removing the duplicates
            data = preprocessor.removeDuplicates(data)

            # check if missing values are present in the dataset
            is_null_present, cols_with_missing_values = preprocessor.is_null_present(data)

            # if missing values are there, handle them appropriately.
            if is_null_present:
                # here we won't do any imputation; just to show one more way, we drop the rows with missing values
                data = data.dropna(how='any')

            # convert the cost column to float
            data = preprocessor.convertCostToNumber(data)

            # get encoded values for categorical data

            data = preprocessor.encodeCategoricalValues(data)

            # create separate features and labels
            X, Y = preprocessor.separate_label_feature(data, label_column_name='approx_cost(for two people)')


            """ Applying the clustering approach"""

            kmeans = clustering.KMeansClustering(self.file_object, self.log_writer)  # object initialization
            number_of_clusters = kmeans.elbow_plot(X)  # use the elbow plot to find the optimal number of clusters

            # Divide the data into clusters
            X = kmeans.create_clusters(X, number_of_clusters)

            # create a new column in the dataset consisting of the corresponding cluster assignments
            X['Labels'] = Y

            # getting the unique clusters from our dataset
            list_of_clusters = X['Cluster'].unique()

            """parsing all the clusters and looking for the best ML algorithm to fit on individual cluster"""

            for i in list_of_clusters:
                cluster_data=X[X['Cluster']==i] # filter the data for one cluster

                # Prepare the feature and Label columns
                cluster_features=cluster_data.drop(['Labels','Cluster'],axis=1)
                cluster_label= cluster_data['Labels']

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(cluster_features, cluster_label, test_size=1 / 3, random_state=36)

                x_train_scaled = preprocessor.standardScalingData(x_train)
                x_test_scaled = preprocessor.standardScalingData(x_test)

                model_finder=tuner.Model_Finder(self.file_object,self.log_writer) # object initialization

                #getting the best model for each of the clusters
                best_model_name,best_model=model_finder.get_best_model(x_train_scaled,y_train,x_test_scaled,y_test)

                #saving the best model to the directory.
                file_op = file_methods.File_Operation(self.file_object,self.log_writer)
                save_model=file_op.save_model(best_model,best_model_name+str(i))

            # logging the successful Training
            self.log_writer.log(self.file_object, 'Successful End of Training')
            self.file_object.close()

        except Exception as e:
            # logging the unsuccessful Training
            self.log_writer.log(self.file_object, 'Unsuccessful End of Training')
            self.file_object.close()
            raise e
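All of these examples delegate cluster selection to a `KMeansClustering` wrapper: `elbow_plot` picks the number of clusters and `create_clusters` appends a 'Cluster' column to the features. The wrapper itself is not shown; a minimal sketch, under the assumption that it uses scikit-learn plus the kneed package to locate the elbow programmatically, could be:

import pandas as pd
from kneed import KneeLocator
from sklearn.cluster import KMeans


class KMeansClustering:
    """Hypothetical sketch of the clustering wrapper used across these examples."""

    def elbow_plot(self, data, max_clusters=10):
        # within-cluster sum of squares (inertia) for k = 1..max_clusters
        wcss = [KMeans(n_clusters=k, random_state=42, n_init=10).fit(data).inertia_
                for k in range(1, max_clusters + 1)]
        # find the "elbow" of the inertia curve programmatically
        knee = KneeLocator(range(1, max_clusters + 1), wcss,
                           curve='convex', direction='decreasing')
        return knee.knee

    def create_clusters(self, data, number_of_clusters):
        # fit the final model and record each row's cluster assignment
        kmeans = KMeans(n_clusters=number_of_clusters, random_state=42, n_init=10)
        data['Cluster'] = kmeans.fit_predict(data)
        return data

A production version would also persist the fitted KMeans model, since the same cluster assignments are needed again at prediction time.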
    def trainingModel(self):
        # Logging the start of Training
        self.log_db_writer.log(self.log_database, self.log_collection,
                               "Start of Training")
        print("training started")
        try:
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.log_database,
                                                  self.log_collection,
                                                  self.execution_id)
            data = data_getter.get_data()

            if len(data) == 0:
                self.log_db_writer.log(self.log_database, self.log_collection,
                                       "No record found to train model")
                print("No previous file available")
                return 0
            """doing the data preprocessing"""

            preprocessor = preprocessing.Preprocessor(self.log_database,
                                                      self.log_collection,
                                                      self.execution_id)

            # check if missing values are present in the dataset
            is_null_present, cols_with_missing_values = preprocessor.is_null_present(
                data)

            # if missing values are there, replace them appropriately.
            if (is_null_present):
                data = preprocessor.impute_missing_values(
                    data)  # missing value imputation

            # get encoded values for categorical data

            #data = preprocessor.encodeCategoricalValues(data)

            # create separate features and labels
            X, Y = preprocessor.separate_label_feature(
                data, label_column_name='Concrete_compressive _strength')
            # drop the columns obtained above
            #X=preprocessor.remove_columns(X,cols_to_drop)

            X = preprocessor.logTransformation(X)
            print(X)
            """ Applying the clustering approach"""

            kmeans = clustering.KMeansClustering(
                self.execution_id)  # object initialization
            number_of_clusters = kmeans.elbow_plot(X)  # use the elbow plot to find the optimal number of clusters

            # Divide the data into clusters
            X = kmeans.create_clusters(X, number_of_clusters)
            print("cluster shape details")
            print(X)

            #create a new column in the dataset consisting of the corresponding cluster assignments.
            X['Labels'] = Y

            # getting the unique clusters from our dataset
            list_of_clusters = X['Cluster'].unique()
            """parsing all the clusters and looking for the best ML algorithm to fit on individual cluster"""

            for i in list_of_clusters:
                cluster_data = X[X['Cluster'] ==
                                 i]  # filter the data for one cluster

                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                                                     axis=1)
                cluster_label = cluster_data['Labels']

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(
                    cluster_features,
                    cluster_label,
                    test_size=1 / 3,
                    random_state=36)

                x_train_scaled = preprocessor.standardScalingData(x_train)
                x_test_scaled = preprocessor.standardScalingData(x_test)

                model_finder = tuner.Model_Finder(
                    self.log_database, self.log_collection,
                    self.execution_id)  # object initialization

                #getting the best model for each of the clusters
                best_model_name, best_model = model_finder.get_best_model(
                    x_train_scaled, y_train, x_test_scaled, y_test)

                model_metrics = model_finder.get_model_metrics(
                    best_model_name + str(i))

                #saving the best model to the directory.
                file_op = file_methods.File_Operation(self.log_database,
                                                      self.log_collection,
                                                      self.execution_id)
                print(best_model_name + str(i))
                save_model = file_op.save_model(best_model,
                                                best_model_name + str(i))

            # logging the successful Training
            self.log_db_writer.log(self.log_database, self.log_collection,
                                   'Successful End of Training')

        except Exception:
            # logging the unsuccessful Training
            self.log_db_writer.log(self.log_database, self.log_collection,
                                   'Unsuccessful End of Training')
            raise  # re-raise the original exception instead of a new bare Exception


#trainModelObj = trainModel(1111)  # object initialization
#trainModelObj.trainingModel()  # training the model for the files in the table
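The concrete-strength example above runs a `logTransformation` step before clustering. Its implementation is not shown; a plausible sketch, assuming it simply applies a log transform to every feature column, is:

import numpy as np


def logTransformation(X):
    # log1p maps 0 to 0 and compresses large values, which suits skewed,
    # non-negative features such as mixture quantities
    for column in X.columns:
        X[column] = np.log1p(X[column])
    return X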
    def trainingModel(self):
        # Logging the start of Training
        self.log_writer.log(self.file_object, 'Start of Training')
        try:
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.file_object,
                                                  self.log_writer)
            data = data_getter.get_data()
            """doing the data preprocessing"""

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)

            #data.replace('?',np.NaN,inplace=True) # replacing '?' with NaN values for imputation

            # create separate features and labels
            X, Y = preprocessor.separate_label_feature(
                data, label_column_name='default payment next month')

            # check if missing values are present in the dataset
            is_null_present, cols_with_missing_values = preprocessor.is_null_present(
                X)

            # if missing values are there, replace them appropriately.
            if (is_null_present):
                X = preprocessor.impute_missing_values(
                    X, cols_with_missing_values)  # missing value imputation
            """ Applying the clustering approach"""

            kmeans = clustering.KMeansClustering(
                self.file_object, self.log_writer)  # object initialization
            number_of_clusters = kmeans.elbow_plot(X)  # use the elbow plot to find the optimal number of clusters

            # Divide the data into clusters
            X = kmeans.create_clusters(X, number_of_clusters)

            #create a new column in the dataset consisting of the corresponding cluster assignments.
            X['Labels'] = Y

            # getting the unique clusters from our dataset
            list_of_clusters = X['Cluster'].unique()
            """parsing all the clusters and looking for the best ML algorithm to fit on individual cluster"""

            for i in list_of_clusters:
                cluster_data = X[X['Cluster'] ==
                                 i]  # filter the data for one cluster

                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                                                     axis=1)
                cluster_label = cluster_data['Labels']

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(
                    cluster_features,
                    cluster_label,
                    test_size=1 / 3,
                    random_state=355)
                # Proceeding with more data pre-processing steps
                train_x = preprocessor.scale_numerical_columns(x_train)
                test_x = preprocessor.scale_numerical_columns(x_test)

                model_finder = tuner.Model_Finder(
                    self.file_object, self.log_writer)  # object initialization

                #getting the best model for each of the clusters
                best_model_name, best_model = model_finder.get_best_model(
                    train_x, y_train, test_x, y_test)

                #saving the best model to the directory.
                file_op = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
                save_model = file_op.save_model(best_model,
                                                best_model_name + str(i))

            # logging the successful Training
            self.log_writer.log(self.file_object, 'Successful End of Training')
            self.file_object.close()

        except Exception as e:
            # logging the unsuccessful Training
            self.log_writer.log(self.file_object,
                                'Unsuccessful End of Training')
            self.file_object.close()
            raise e  # re-raise the caught exception instead of a new bare Exception
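Here scaling is delegated to `preprocessor.scale_numerical_columns`, called separately on the train and test splits. A minimal sketch with scikit-learn's StandardScaler, assuming the method standardises every column of the frame it receives:

import pandas as pd
from sklearn.preprocessing import StandardScaler


def scale_numerical_columns(data):
    # hypothetical sketch: rescale each column to zero mean and unit variance
    scaler = StandardScaler()
    scaled = scaler.fit_transform(data)
    return pd.DataFrame(scaled, columns=data.columns, index=data.index)

Note that calling this on x_test re-fits the scaler on test statistics; the safer pattern is to fit once on x_train and reuse the fitted scaler to transform x_test.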
    def trainingModel(self):
        # Logging the start of Training
        self.log_writer.log(self.file_object, 'Start of Training')
        try:
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.file_object,
                                                  self.log_writer)
            data = data_getter.get_data()
            """doing the data preprocessing"""

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)

            #removing unwanted columns as discussed in the EDA part in ipynb file
            data = preprocessor.dropUnnecessaryColumns(data, ['id'])

            #removing outliers from columns like height, weight, ap_hi, ap_lo
            data = preprocessor.dropOutliers(data)

            #processing gender and age columns and add new column BMI as discussed in the EDA part
            data = preprocessor.dataProcessor(data)

            # create separate features and labels
            X, Y = preprocessor.separate_label_feature(
                data, label_column_name='cardio')
            """ Applying the clustering approach"""

            kmeans = clustering.KMeansClustering(
                self.file_object, self.log_writer)  # object initialization
            number_of_clusters = kmeans.elbow_plot(X)  # use the elbow plot to find the optimal number of clusters

            # Divide the data into clusters
            X = kmeans.create_clusters(X, number_of_clusters)

            #create a new column in the dataset consisting of the corresponding cluster assignments.
            X['Labels'] = Y

            # getting the unique clusters from our dataset
            list_of_clusters = X['Cluster'].unique()
            """parsing all the clusters and looking for the best ML algorithm to fit on individual cluster"""

            for i in list_of_clusters:
                cluster_data = X[X['Cluster'] ==
                                 i]  # filter the data for one cluster

                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                                                     axis=1)
                cluster_label = cluster_data['Labels']

                # To handle the imbalanced dataset.
                # Note: fit_sample was removed in recent imbalanced-learn releases; use fit_resample
                sampler = RandomOverSampler()
                x_sampled, y_sampled = sampler.fit_resample(
                    cluster_features, cluster_label)

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(
                    x_sampled, y_sampled, test_size=1 / 3, random_state=355)

                model_finder = tuner.Model_Finder(
                    self.file_object, self.log_writer)  # object initialization

                #getting the best model for each of the clusters
                best_model_name, best_model = model_finder.get_best_model(
                    x_train, y_train, x_test, y_test)

                #saving the best model to the directory.
                file_op = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
                save_model = file_op.save_model(best_model,
                                                best_model_name + str(i))

            # logging the successful Training
            self.log_writer.log(self.file_object, 'Successful End of Training')
            self.file_object.close()

        except Exception:
            # logging the unsuccessful Training
            self.log_writer.log(self.file_object,
                                'Unsuccessful End of Training')
            self.file_object.close()
            raise  # re-raise the original exception instead of a new bare Exception
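RandomOverSampler comes from the imbalanced-learn package, and the `fit_sample` name used in older tutorials was removed in recent releases in favour of `fit_resample`. A self-contained usage example on synthetic data:

from collections import Counter

from imblearn.over_sampling import RandomOverSampler
from sklearn.datasets import make_classification

# build a deliberately imbalanced toy dataset (roughly 9:1)
X, y = make_classification(n_samples=1000, weights=[0.9, 0.1], random_state=0)
print(Counter(y))            # e.g. Counter({0: 897, 1: 103})

sampler = RandomOverSampler(random_state=0)
X_resampled, y_resampled = sampler.fit_resample(X, y)
print(Counter(y_resampled))  # minority class duplicated until both classes are equal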
Example n. 20
    def trainingModel(self):
        # Logging the start of Training
        self.log_writer.log(self.file_object, 'Start of Training')
        try:
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.file_object,
                                                  self.log_writer)
            data = data_getter.get_data()
            """doing the data preprocessing"""

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)
            data = preprocessor.remove_columns(
                data, ['Wafer'])  # remove the 'Wafer' identifier column as it doesn't contribute to prediction

            # create separate features and labels
            X, Y = preprocessor.separate_label_feature(
                data, label_column_name='Output')

            # check if missing values are present in the dataset
            is_null_present = preprocessor.is_null_present(X)

            # if missing values are there, replace them appropriately.
            if (is_null_present):
                X = preprocessor.impute_missing_values(
                    X)  # missing value imputation

            # check further which columns do not contribute to predictions
            # if the standard deviation for a column is zero, it means that the column has constant values
            # and they are giving the same output both for good and bad sensors
            # prepare the list of such columns to drop
            cols_to_drop = preprocessor.get_columns_with_zero_std_deviation(X)

            # drop the columns obtained above
            X = preprocessor.remove_columns(X, cols_to_drop)
            """ Applying the clustering approach"""

            kmeans = clustering.KMeansClustering(
                self.file_object, self.log_writer)  # object initialization
            number_of_clusters = kmeans.elbow_plot(X)  # use the elbow plot to find the optimal number of clusters

            # Divide the data into clusters
            X = kmeans.create_clusters(X, number_of_clusters)

            #create a new column in the dataset consisting of the corresponding cluster assignments.
            X['Labels'] = Y

            # getting the unique clusters from our dataset
            list_of_clusters = X['Cluster'].unique()
            """parsing all the clusters and looking for the best ML algorithm to fit on individual cluster"""

            for i in list_of_clusters:
                cluster_data = X[X['Cluster'] ==
                                 i]  # filter the data for one cluster

                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                                                     axis=1)
                cluster_label = cluster_data['Labels']

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(
                    cluster_features,
                    cluster_label,
                    test_size=1 / 3,
                    random_state=355)

                model_finder = tuner.Model_Finder(
                    self.file_object, self.log_writer)  # object initialization

                #getting the best model for each of the clusters
                best_model_name, best_model = model_finder.get_best_model(
                    x_train, y_train, x_test, y_test)

                #saving the best model to the directory.
                file_op = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
                save_model = file_op.save_model(best_model,
                                                best_model_name + str(i))

            # logging the successful Training
            self.log_writer.log(self.file_object, 'Successful End of Training')
            self.file_object.close()

        except Exception:
            # logging the unsuccessful Training
            self.log_writer.log(self.file_object,
                                'Unsuccessful End of Training')
            self.file_object.close()
            raise  # re-raise the original exception instead of a new bare Exception
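The wafer example drops constant columns through `get_columns_with_zero_std_deviation`. Its body is not included here; a minimal sketch of that check:

import pandas as pd


def get_columns_with_zero_std_deviation(data):
    # a column with zero standard deviation is constant across all rows,
    # so it cannot help distinguish good sensors from bad ones
    return [column for column in data.columns if data[column].std() == 0]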
Example n. 21
    def trainingModel(self):
        # Logging the start of Training
        self.log_writer.log(self.file_object, 'Start of Training')
        try:
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.file_object,
                                                  self.log_writer)
            data = data_getter.get_data()
            """doing the data preprocessing"""

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)
            data = preprocessor.remove_columns(
                data, [])  # no columns to remove for this dataset; the call is kept for pipeline consistency
            data.replace(
                '?', np.nan,
                inplace=True)  # replacing '?' with NaN values for imputation

            # check if missing values are present in the dataset
            is_null_present, cols_with_missing_values = preprocessor.is_null_present(
                data)

            # if missing values are there, replace them appropriately.
            if (is_null_present):
                data = preprocessor.impute_missing_values(
                    data, cols_with_missing_values)  # missing value imputation

            # the target column has many labels and is highly imbalanced, so we group them using the preprocessor method below
            data = preprocessor.grouping_values_of_target(data)

            # create two separate data frames: one to perform clustering on, and the other to attach back after clustering
            X, Y = preprocessor.separate_data_frame(
                data, label_column_name=['Rings', 'Sex'])
            """ Applying the clustering approach"""

            kmeans = clustering.KMeansClustering(
                self.file_object, self.log_writer)  # object initialization
            number_of_clusters = kmeans.elbow_plot(X)  # use the elbow plot to find the optimal number of clusters

            # Divide the data into clusters
            X = kmeans.create_clusters(X, number_of_clusters)

            # attach the label columns back to the clustered data
            X = pd.concat([X, Y], axis=1, sort=False)

            # encode categorical data
            X = preprocessor.encode_categorical_columns(X)

            # getting the unique clusters from our dataset
            list_of_clusters = X['Cluster'].unique()
            """parsing all the clusters and looking for the best ML algorithm to fit on individual cluster"""
            df = pd.DataFrame(
                columns=['Cluster_No', 'Best_Model_Name', 'Roc_Auc_score'])

            for i in list_of_clusters:
                cluster_data = X[X['Cluster'] ==
                                 i]  # filter the data for one cluster

                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Rings', 'Cluster'],
                                                     axis=1)
                cluster_label = cluster_data['Rings']

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(
                    cluster_features,
                    cluster_label,
                    test_size=1 / 3,
                    random_state=100)

                model_finder = tuner.Model_Finder(
                    self.file_object, self.log_writer)  # object initialization

                #getting the best model for each of the clusters
                best_model_name, best_model, Roc_Auc_score = model_finder.get_best_model(
                    x_train, y_train, x_test, y_test)

                #saving the best model to the directory.
                file_op = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
                save_model = file_op.save_model(best_model,
                                                best_model_name + str(i))
                # DataFrame.append was removed in pandas 2.0; use pd.concat instead
                df = pd.concat([df, pd.DataFrame([{
                    'Cluster_No': i,
                    'Best_Model_Name': best_model_name + str(i),
                    'Roc_Auc_score': Roc_Auc_score
                }])], ignore_index=True)

            # logging the successful Training
            self.log_writer.log(self.file_object, 'Successful End of Training')
            self.file_object.close()
            return df

        except Exception as e:
            # logging the unsuccessful Training
            self.log_writer.log(self.file_object,
                                'Unsuccessful End of Training')
            self.file_object.close()
            raise e  # re-raise the caught exception instead of a new bare Exception
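Most of these examples persist the winning model through `file_methods.File_Operation.save_model`, keyed by model name plus cluster number. The class is not reproduced here; a minimal sketch, assuming one sub-directory per saved model, could be:

import os
import pickle


class File_Operation:
    """Hypothetical sketch of the model-saving helper used throughout these examples."""

    def __init__(self, model_directory='models/'):
        self.model_directory = model_directory

    def save_model(self, model, filename):
        # one sub-directory per model, overwriting any previous version
        path = os.path.join(self.model_directory, filename)
        os.makedirs(path, exist_ok=True)
        with open(os.path.join(path, filename + '.sav'), 'wb') as f:
            pickle.dump(model, f)
        return 'success'

At prediction time the matching load step would unpickle the file whose name contains the cluster number assigned to the incoming record.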