Code example #1
class prediction:
    def __init__(self):
        self.log_writer = App_Logger()
        self.file_object = open('./Prediction_log/prediction.txt', 'a+')

    def preprocessing_prediction(self, data):
        """                       Method Name: predict
                                  Description: this method will perform the preprocessing of test data
                                  Output:None
                                  Written by: Harsh Kumar
                                  Version: 1.0
                                  Revisions: None """
        try:
            # featurizing the test data point
            get_features = post_training()
            data1 = get_features.load_vectorizer(data)

            self.log_writer.log(self.file_object,
                                'Successfully fetched the data')

            return data1
        except Exception as e:
            self.log_writer.log(
                self.file_object,
                'Error %s occurred while preprocessing the data' % e)

    def predict_values(self, test_data, data, sig_clf, clf):
        """                       Method Name: predict_values
                                  Description: This method performs the prediction on the test data.
                                  Output: Returns feature importance, predicted probabilities and predicted class.
                                  Written by: Harsh Kumar
                                  Version: 1.0
                                  Revisions: None """
        self.path = './Save_Load/vectorizer.txt'

        try:

            with open(self.path, encoding='utf8') as f:
                path_1 = f.readline()
                print(path_1)
                if path_1[18:24] == 'OneHot':
                    try:

                        predicted_cls = sig_clf.predict(data.iloc[0])

                        # class probabilities rounded to 4 decimal places
                        predicted_prob = np.round(
                            sig_clf.predict_proba(data.iloc[0]), 4)
                        predicted_probs = [
                            (i + 1, v) for i, v in enumerate(predicted_prob[0])
                        ]

                        no_features = 100
                        indices = np.argsort(
                            -1 * abs(clf.coef_))[predicted_cls -
                                                 1][:, :no_features]

                        fea_imp = feature_imp(self.log_writer,
                                              self.file_object)
                        important_features = fea_imp.get_feature_imp(
                            indices[0], test_data['TEXT'].iloc[0],
                            test_data['Gene'].iloc[0].lower(),
                            test_data['Variation'].iloc[0].lower(),
                            no_features, predicted_cls[0])

                        return important_features, predicted_probs, predicted_cls

                    except Exception as e:
                        self.log_writer.log(
                            self.file_object,
                            "Error %s in computing the feature importance of one Hot Encoded features"
                            % e)

                elif path_1[18:23] == 'Tfidf':
                    try:

                        predicted_cls = sig_clf.predict(
                            data.reshape(1, data.shape[1]))

                        predicted_prob = np.round(
                            sig_clf.predict_proba(
                                data.reshape(1, data.shape[1])), 4)
                        predicted_probs = [
                            (i + 1, v) for i, v in enumerate(predicted_prob[0])
                        ]

                        no_features = 100

                        indices = np.argsort(
                            -1 * abs(clf.coef_))[predicted_cls -
                                                 1][:, :no_features]

                        fea_imp = feature_imp(self.log_writer,
                                              self.file_object)
                        important_features = fea_imp.get_feature_imp(
                            indices[0], test_data['TEXT'].iloc[0],
                            test_data['Gene'].iloc[0].lower(),
                            test_data['Variation'].iloc[0].lower(),
                            no_features, predicted_cls[0])

                        self.log_writer.log(
                            self.file_object,
                            'Successfully returned the values')
                        f = open('./Prediction_log/prediction.txt', 'a+')
                        self.log_writer.log(
                            f,
                            'Successfully returned important_features,predicted_prob,predicted_cls values'
                        )
                        f.close()

                        return important_features, predicted_probs, predicted_cls

                    except Exception as e:
                        self.log_writer.log(
                            self.file_object,
                            'Error %s in computing the feature importance for tfidf features'
                            % e)

        except Exception as e:
            self.log_writer.log(
                self.file_object,
                'Error %s in computing the feature importance' % e)
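
The path_1[18:24] and path_1[18:23] checks above rely on the vectorizer paths that FindBestModel writes to ./Save_Load/vectorizer.txt (see code example #8). The sketch below is not project code; it only spells out that assumption and shows a more robust, hypothetical alternative.

path_1 = './CountVectorizer/OneHotGene.pkl\n'          # example line read with f.readline()
assert len('./CountVectorizer/') == 18                 # so index 18 starts the vectorizer name
assert path_1[18:24] == 'OneHot'                       # taken for the OneHot branch
assert './CountVectorizer/TfidfGene.pkl'[18:23] == 'Tfidf'   # taken for the Tfidf branch

# A hypothetical, slice-free variant of the same dispatch:
vectorizer_name = path_1.strip().rsplit('/', 1)[-1]    # 'OneHotGene.pkl'
is_onehot = vectorizer_name.startswith('OneHot')
print(is_onehot)                                       # True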
Code example #2
    def __init__(self):
        self.log_writer = App_Logger()
        self.file_object = open('./Prediction_log/prediction.txt', 'a+')
Code example #3
    def __init__(self):
        self.path = 'Training_Database/'
        self.logger = App_Logger()
Code example #4
class Db_operation:
    """
              This class shall be used for handling all the SQL operations.

              Written By: Harsh Kumar
              Version: 1.0
              Revisions: None

              """
    def __init__(self):
        self.path = 'Training_Database/'
        self.logger = App_Logger()

    def database_connection(self, DatabaseName):
        """
                                Method Name: database_connection
                                Description: This method creates the database with the given name and if Database already exists then opens the connection to the DB.
                                Output: Connection to the DB
                                On Failure: Raise ConnectionError

                                 Written By: Harsh Kumar
                                Version: 1.0
                                Revisions: None

                """
        try:
            conn = sqlite3.connect(self.path + DatabaseName + '.db')
            file = open('Training_Logs/DataBaseConnectionLog.txt', 'a+')
            self.logger.log(file,
                            "Opened %s database successfully" % DatabaseName)
            file.close()
        except Exception:
            file = open('Training_Logs/DataBaseConnectionLog.txt', 'a+')
            self.logger.log(file,
                            'Error while opening %s database' % DatabaseName)
            file.close()
            raise ConnectionError
        return conn

    def createTrainTextTable(self, DatabaseName, column_names):
        """
                                        Method Name: createTrainTextTable
                                        Description: This method creates a table for text data in the given database which will be used to insert training data.
                                        Output: None
                                        On Failure: Raise Exception

                                         Written By: Harsh Kumar
                                        Version: 1.0
                                        Revisions: None

                """
        try:
            file = open('Training_Logs/DbTableCreateLog.txt', 'a+')
            self.logger.log(file, 'Entered into createTrainTextTable method')
            file.close()
            conn = self.database_connection(DatabaseName)
            c = conn.cursor()
            c.execute(
                "SELECT count(name) FROM sqlite_master WHERE type='table' AND name='Train_Text_Table'"
            )
            if c.fetchone()[0] == 1:
                conn.close()
                file = open('Training_Logs/DbTableCreateLog.txt', 'a+')
                self.logger.log(file, 'Table already exists!!')
                file.close()
            else:
                for key in column_names.keys():
                    dtype = column_names[key]
                    try:
                        # if the table already exists, just add the column to it;
                        # otherwise the ALTER fails and the except block creates the table

                        c.execute(
                            'ALTER TABLE Train_Text_Table ADD COLUMN {column_name} {dataType}'
                            .format(column_name=key, dataType=dtype))
                    except sqlite3.OperationalError:
                        c.execute(
                            'CREATE TABLE Train_Text_Table({column_name} {dataType})'
                            .format(column_name=key, dataType=dtype))

                conn.close()
                file = open("Training_Logs/DbTableCreateLog.txt", 'a+')
                self.logger.log(file, 'Table created successfully')
                file.close()

                file = open("Training_Logs/DatabaseConnectionLog.txt", 'a+')
                self.logger.log(
                    file, 'Closed %s database successfully' % DatabaseName)
                file.close()

        except Exception as e:
            file = open('Training_Logs/DbTableCreateLog.txt', 'a+')
            self.logger.log(file, 'Error while creating the table %s' % e)
            file.close()
            file = open('Training_Logs/DatabaseConnectionLog.txt', 'a+')
            self.logger.log(file,
                            'Closed %s database successfully' % DatabaseName)
            file.close()
            raise e

    def createPredTextTable(self, DatabaseName, column_names):
        """
                                        Method Name: createPredTextTable
                                        Description: This method creates a table for prediction text data in the given database which will be used to insert Prediction data.
                                        Output: None
                                        On Failure: Raise Exception

                                         Written By: Harsh Kumar
                                        Version: 1.0
                                        Revisions: None

                """
        try:
            file = open('Training_Logs/DbTableCreateLog.txt', 'a+')
            self.logger.log(file, 'Entered into createPredTextTable method')
            file.close()
            conn = self.database_connection(DatabaseName)
            c = conn.cursor()
            c.execute(
                "SELECT count(name) FROM sqlite_master WHERE type='table' AND name='Pred_Text_Table'"
            )
            if c.fetchone()[0] == 1:
                conn.close()
                file = open('Training_Logs/DbTableCreateLog.txt', 'a+')
                self.logger.log(file, 'Table already exists!!')
                file.close()
            else:
                for key in column_names.keys():
                    dtype = column_names[key]
                    try:
                        # if the table already exists, just add the column to it;
                        # otherwise the ALTER fails and the except block creates the table

                        c.execute(
                            'ALTER TABLE Pred_Text_Table ADD COLUMN {column_name} {dataType}'
                            .format(column_name=key, dataType=dtype))
                    except sqlite3.OperationalError:
                        c.execute(
                            'CREATE TABLE Pred_Text_Table({column_name} {dataType})'
                            .format(column_name=key, dataType=dtype))

                conn.close()
                file = open("Training_Logs/DbTableCreateLog.txt", 'a+')
                self.logger.log(file, 'Table created successfully')
                file.close()

                file = open("Training_Logs/DatabaseConnectionLog.txt", 'a+')
                self.logger.log(
                    file, 'Closed %s database successfully' % DatabaseName)
                file.close()

        except Exception as e:
            file = open('Training_Logs/DbTableCreateLog.txt', 'a+')
            self.logger.log(file, 'Error while creating the table %s' % e)
            file.close()
            file = open('Training_Logs/DatabaseConnectionLog.txt', 'a+')
            self.logger.log(file,
                            'Closed %s database successfully' % DatabaseName)
            file.close()
            raise e

    def createTrainGeneVarTable(self, DatabaseName, column_names):
        """
                                                Method Name: createTrainGeneVarTable
                                                Description: This method creates a table for gene and variation features in the given database which will be used to insert training data.
                                                Output: None
                                                On Failure: Raise Exception

                                                 Written By: Harsh Kumar
                                                Version: 1.0
                                                Revisions: None

                        """
        try:
            file = open('Training_Logs/DbTableCreateLog.txt', 'a+')
            self.logger.log(file,
                            'Entered in createTrainGeneVarTable function')
            conn = self.database_connection(DatabaseName)
            c = conn.cursor()
            c.execute(
                "SELECT COUNT(name) from sqlite_master where type='table' AND name='TrainGeneVarTable'"
            )
            if c.fetchone()[0] == 1:
                conn.close()
                file = open('Training_Logs/DbTableCreateLog.txt', 'a+')

                self.logger.log(file,
                                'TrainGeneVarTable table already exists!!!')

            else:
                for key in column_names.keys():
                    dtype = column_names[key]
                    try:

                        c.execute(
                            'ALTER TABLE TrainGeneVarTable ADD COLUMN "{column_name}" {dataType}'
                            .format(column_name=key, dataType=dtype))

                    except sqlite3.OperationalError:
                        c.execute(
                            'CREATE TABLE TrainGeneVarTable({column_name} {datatype})'
                            .format(column_name=key, datatype=dtype))

                self.logger.log(file,
                                'TrainGeneVarTable created successfully!!')
                file.close()
                conn.close()
        except Exception as e:
            file = open('Training_Logs/DbTableCreateLog.txt', 'a+')
            self.logger.log(file, 'Error while creating the table %s' % e)
            file.close()
            file = open('Training_Logs/DatabaseConnectionLog.txt', 'a+')
            self.logger.log(file,
                            'Closed %s database successfully' % DatabaseName)
            file.close()
            raise e

    def createPredGeneVarTable(self, DatabaseName, column_names):
        """
                                                Method Name: createPredGeneVarTable
                                                Description: This method creates a table for gene and variation features in the given database which will be used to insert Prediction data.
                                                Output: None
                                                On Failure: Raise Exception

                                                 Written By: Harsh Kumar
                                                Version: 1.0
                                                Revisions: None

                        """
        try:

            file = open('Training_Logs/DbTableCreateLog.txt', 'a+')
            self.logger.log(file, 'Entered in createPredGeneVarTable function')
            conn = self.database_connection(DatabaseName)
            c = conn.cursor()
            c.execute(
                "SELECT COUNT(name) from sqlite_master where type='table' AND name='PredGeneVarTable'"
            )
            if c.fetchone()[0] == 1:
                conn.close()
                file = open('Training_Logs/DbTableCreateLog.txt', 'a+')
                self.logger.log(file, 'PredGeneVarTable table already exists')

            else:
                for key in column_names.keys():
                    dtype = column_names[key]
                    if key == 'Class':
                        continue
                    else:
                        try:
                            c.execute(
                                'ALTER TABLE PredGeneVarTable ADD COLUMN "{column_name}" {dataType}'
                                .format(column_name=key, dataType=dtype))

                        except sqlite3.OperationalError:
                            c.execute(
                                'CREATE TABLE PredGeneVarTable({column_name} {datatype})'
                                .format(column_name=key, datatype=dtype))
                self.logger.log(
                    file, 'PredGeneVarTable table created Successfully!!')
                conn.close()
        except Exception as e:
            file = open('Training_Logs/DbTableCreateLog.txt', 'a+')
            self.logger.log(file, 'Error while creating the table %s' % e)
            file.close()
            file = open('Training_Logs/DatabaseConnectionLog.txt', 'a+')
            self.logger.log(file,
                            'Closed %s database successfully' % DatabaseName)
            file.close()
            raise e

    def insertDataIntoTextTable(self, DatabaseName, name):
        """
                                                   Method Name: insertDataIntoTextTable
                                                   Description: This method inserts the text data into the
                                                                above created table.
                                                   Output: None
                                                   On Failure: Raise Exception

                                                    Written By: Harsh Kumar
                                                   Version: 1.0
                                                   Revisions: None

                            """
        file = open('Training_Logs/DbInsertLog.txt', 'a+')
        if name == 'train':
            self.logger.log(
                file,
                'Entered into insertDataIntoTrainTable for inserting data')
            conn = self.database_connection(DatabaseName)
            c = conn.cursor()
            for i in c.execute('SELECT COUNT(*) FROM Train_Text_Table'):
                if i[0] >= 1:
                    self.logger.log(file, 'Values already inserted')
                    file.close()
                else:
                    sql_insert = ''' INSERT INTO Train_Text_Table(ID,TEXT) VALUES (?,?)'''
                    with open('RawTrainData/training_text',
                              encoding='utf8') as f:
                        for line in f.readlines():
                            if line == 'ID,Text\n':
                                continue
                            else:
                                try:
                                    line = line.split('||')
                                    c.execute(sql_insert,
                                              (int(line[0]), line[1]))
                                    conn.commit()
                                except Exception as e:
                                    self.logger.log(
                                        file,
                                        "Error while inserting into table: %s "
                                        % e)
                                    file.close()

                    self.logger.log(file, 'Data inserted successfully')
        elif name == 'Pred':
            self.logger.log(
                file,
                'Entered into insertDataIntoPredTable for inserting data')
            conn = self.database_connection(DatabaseName)
            c = conn.cursor()
            for i in c.execute('SELECT COUNT(*) FROM Pred_Text_Table'):
                if i[0] >= 1:
                    self.logger.log(file, 'Values are already inserted')
                    file.close()
                else:
                    sql_insert = ''' INSERT INTO Pred_Text_Table(ID,TEXT) VALUES (?,?)'''
                    with open('RawPredictionData/test_text',
                              encoding='utf8') as f:
                        for line in f.readlines():
                            if line == 'ID,Text\n':
                                continue
                            else:
                                try:
                                    line = line.split('||')
                                    c.execute(sql_insert,
                                              (int(line[0]), line[1]))
                                    conn.commit()
                                except Exception as e:
                                    self.logger.log(
                                        file,
                                        "Error while inserting into table: %s "
                                        % e)
                                    file.close()
                    self.logger.log(file, 'Data inserted successfully')
                    file.close()

    def insertDataIntoGenVarTable(self, DatabaseName, name):
        """""
                                                   Method Name: insertDataIntoGenVarTable
                                                   Description: This method inserts the data files  into the
                                                                above created table.
                                                   Output: None
                                                   On Failure: Raise Exception

                                                    Written By: Harsh Kumar
                                                   Version: 1.0
                                                   Revisions: None

                            """
        file = open('Training_Logs/DbInsertLog.txt', 'a+')
        if name == 'train':
            self.logger.log(
                file,
                'Entered into TrainGeneVarTable table for inserting data')
            conn = self.database_connection(DatabaseName)
            c = conn.cursor()
            for i in c.execute('SELECT COUNT(*) FROM TrainGeneVarTable'):
                if i[0] >= 1:
                    self.logger.log(file, 'Values are already inserted')
                    file.close()
                else:
                    sql_insert = '''INSERT INTO TrainGeneVarTable(ID,Gene,Variation,Class) VALUES (?,?,?,?)'''
                    with open('RawTrainData/training_variants',
                              encoding='utf8') as f:
                        for line in f.readlines():
                            if line == 'ID,Gene,Variation,Class\n':
                                continue
                            else:
                                try:
                                    c.execute(sql_insert,
                                              (int(line.split(',')[0]),
                                               line.split(',')[1],
                                               line.split(',')[2],
                                               int(line.split(',')[3][0])))
                                    conn.commit()
                                except Exception as e:
                                    self.logger.log(
                                        file,
                                        "Error while inserting into table: %s "
                                        % e)
                                    file.close()
                    self.logger.log(
                        file,
                        'Data Entered into TrainGeneVarTable successfully !!')
                    file.close()

        elif name == 'Pred':
            self.logger.log(
                file, 'Entered into PredGeneVarTable table for inserting data')
            conn = self.database_connection(DatabaseName)
            c = conn.cursor()
            for i in c.execute('SELECT COUNT(*) FROM PredGeneVarTable'):
                if i[0] >= 1:
                    self.logger.log(file, 'Values are already inserted')
                    file.close()
                else:
                    sql_insert = '''INSERT INTO PredGeneVarTable(ID,Gene,Variation) VALUES (?,?,?)'''
                    with open('RawPredictionData/test_variants',
                              encoding='utf8') as f:
                        for line in f.readlines():
                            if line == 'ID,Gene,Variation\n':
                                continue
                            else:
                                try:
                                    c.execute(sql_insert,
                                              (int(line.split(',')[0]),
                                               line.split(',')[1],
                                               line.split(',')[2]))
                                    conn.commit()
                                except Exception as e:
                                    self.logger.log(
                                        file,
                                        "Error while inserting into table: %s "
                                        % e)
                                    file.close()
                    self.logger.log(
                        file,
                        'Data Entered into PredGeneVarTable successfully !!')
                    file.close()
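
The create*Table methods above all rely on the same ALTER-or-CREATE trick: try to add each column and, when the first ALTER fails because the table does not exist yet, create the table with that column. A self-contained sketch against an in-memory SQLite database is shown below; the table and column names are illustrative, not the project's real schema.

import sqlite3

column_names = {'ID': 'INTEGER', 'Gene': 'TEXT', 'Variation': 'TEXT'}   # assumed example schema

conn = sqlite3.connect(':memory:')
c = conn.cursor()
for key, dtype in column_names.items():
    try:
        # table already exists: just add the column
        c.execute('ALTER TABLE Demo_Table ADD COLUMN "{col}" {dt}'.format(col=key, dt=dtype))
    except sqlite3.OperationalError:
        # first column: the ALTER fails, so create the table with it
        c.execute('CREATE TABLE Demo_Table("{col}" {dt})'.format(col=key, dt=dtype))

c.execute("SELECT count(name) FROM sqlite_master WHERE type='table' AND name='Demo_Table'")
print(c.fetchone()[0])   # 1
conn.close()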
Code example #5
    def __init__(self):
        self.log_writer = App_Logger()
        self.file_object = open('Training_Logs/Training_Main_Log.txt', 'a+')
Code example #6
class training:
    def __init__(self):
        self.log_writer = App_Logger()
        self.file_object = open('Training_Logs/Training_Main_Log.txt', 'a+')

    def training_model(self):
        """                          Method Name: training
                                     Description: this method will get all the training operations required.
                                     This is the Entry point for Machine Learning Model
                                     Output: None
                                     Written by: Harsh Kumar
                                     Version: 1.0
                                     Revisions: None """
        try:
            self.log_writer.log(self.file_object, 'start of training')

            #calling the Pre_Training method for all table creation, table insertion and unzipping all the files
            self.log_writer.log(
                self.file_object,
                'Entering the Pre_Training Method for all the unzipping and database operations'
            )
            pre_training = Pretrain(self.log_writer, self.file_object)
            pre_training.Pre_training()
            self.log_writer.log(
                self.file_object,
                'All Pre_training task completed successfully')

            #getting the data into pandas dataframe format from database
            self.log_writer.log(
                self.file_object,
                'Entering into get_data method for retrieving data into dataFrame'
            )
            get_data = data_getter(self.log_writer, self.file_object)
            self.train_text, self.train_Gene_Var, self.Pred_text, self.Pred_Gene_Var = get_data.get_data(
            )
            self.log_writer.log(self.file_object,
                                'Got the data into Pandas Dataframe')

            # Merging the text data and the GeneVar data.
            self.log_writer.log(
                self.file_object,
                'Entering into merge_data method for merging Text table and GeneVarTable'
            )
            get_data = data_getter(self.log_writer, self.file_object)
            self.data = get_data.merge_data(self.train_Gene_Var,
                                            self.train_text)
            self.log_writer.log(self.file_object,
                                'Merged Two pandas dataframe successfully')

            # Drop the null rows
            self.log_writer.log(self.file_object,
                                'Entering into drop_null method')
            pre = preprocess(self.log_writer, self.file_object)
            self.data = pre.drop_null(self.data)
            self.log_writer.log(
                self.file_object,
                'Dropped all the null-value rows successfully')

            # Removing all the Stop words and special characters from the TEXT feature
            self.log_writer.log(self.file_object,
                                'Entering into text_preprocessing method')
            pre = preprocess(self.log_writer, self.file_object)
            self.data = pre.text_preprocessing(self.data)
            self.log_writer.log(
                self.file_object,
                'Preprocessing of the TEXT feature completed successfully')

            # Splitting data for Training ,Cross validation and Test
            self.log_writer.log(
                self.file_object,
                'Entering into splitData method for splitting data into train,Cv,Test'
            )
            pre = preprocess(self.log_writer, self.file_object)
            train_df, test_df, cv_df, y_train, y_test, y_cv = pre.splitData(
                self.data)
            self.log_writer.log(self.file_object,
                                'Split the data successfully!!')

            # Generating the One Hot Encoding features
            self.log_writer.log(
                self.file_object,
                'Entering into OneHot method for generating the One Hot encoded features and saving the respective objects'
            )
            pre = preprocess(self.log_writer, self.file_object)
            train_x_onehotCoding, test_x_onehotCoding, cv_x_onehotCoding = pre.OneHot(
                train_df, cv_df, test_df)
            self.log_writer.log(
                self.file_object,
                'Generated One Hot Encoded Features for train, test and cv successfully!!'
            )

            #Generating the Tf-idf Encoding features
            self.log_writer.log(
                self.file_object,
                'Entering into the TfIdf method for generating the Tf-idf encoded features and saving the respective objects'
            )
            pre = preprocess(self.log_writer, self.file_object)
            train_x_Tfidf, test_x_Tfidf, cv_x_Tfidf = pre.TfIdf(
                train_df, cv_df, test_df)

            #finding the best model
            self.log_writer.log(
                self.file_object,
                'Entering into the FindBestModel method for finding the best model'
            )
            tune = Tunner()
            self.best_model, self.feature_imp_model = tune.FindBestModel(
                train_x_onehotCoding, test_x_onehotCoding, cv_x_onehotCoding,
                train_x_Tfidf, test_x_Tfidf, cv_x_Tfidf, y_train, y_test, y_cv)
            self.log_writer.log(
                self.file_object,
                'Found the best Model and the original model for feature importance '
            )

            # saving the model
            self.log_writer.log(
                self.file_object,
                'Entering the saving method for saving the model')
            save_model = post_training()
            save_model.save(self.best_model, self.feature_imp_model)
            self.log_writer.log(
                self.file_object,
                'Best Model and the model for feature importance saved successfully!!'
            )

        except Exception as e:
            self.log_writer.log(
                self.file_object,
                'Error %s occurred while training the model' % e)
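
A hypothetical entry point for the training class above; the module layout is an assumption and the project's actual driver script may differ.

# Hypothetical driver (assumes the training class and its dependencies are importable).
if __name__ == '__main__':
    trainer = training()
    trainer.training_model()   # runs pre-training, featurization, model tuning and saving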
Code example #7
    def __init__(self):
        self.log_writer = App_Logger()
        self.file_object = open('Training_Logs/Model_Tunning_Logs.txt', 'a+')
        self.path = './Save_Load/vectorizer.txt'
Code example #8
class Tunner:
    def __init__(self):
        self.log_writer = App_Logger()
        self.file_object = open('Training_Logs/Model_Tunning_Logs.txt', 'a+')
        self.path = './Save_Load/vectorizer.txt'

    def FindBestModel(self, train_x_onehotCoding, test_x_onehotCoding,
                      cv_x_onehotCoding, train_x_Tfidf, test_x_Tfidf,
                      cv_x_Tfidf, y_train, y_test, y_cv):
        """                          Method Name: FindBestModel
                                     Description: This method finds the model that gives the lowest log loss.
                                     Return: The calibrated best model and the underlying classifier used for feature importance
                                     Written by: Harsh Kumar
                                     Version: 1.0
                                     Revisions: None """

        try:
            loss = []
            # training Logistic regression with balanced class and One Hot Encoded features
            try:
                self.alpha_1 = [pow(10, x) for x in range(-3, 3)]
                cv_log_error_array = []
                for i in self.alpha_1:
                    print('for alpha =', i)
                    clf = SGDClassifier(class_weight='balanced',
                                        alpha=i,
                                        penalty='l2',
                                        loss='log',
                                        random_state=123)
                    clf.fit(train_x_onehotCoding, y_train)
                    sig_clf = CalibratedClassifierCV(clf, method='sigmoid')
                    sig_clf.fit(train_x_onehotCoding, y_train)
                    sig_clf_probs = sig_clf.predict_proba(cv_x_onehotCoding)
                    cv_log_error_array.append(
                        log_loss(y_cv,
                                 sig_clf_probs,
                                 labels=clf.classes_,
                                 eps=1e-15))
                    # to avoid rounding errors while multiplying probabilities we use log-probability estimates
                    print("Log Loss :", log_loss(y_cv, sig_clf_probs))

                self.best_alpha_1 = np.argmin(cv_log_error_array)
                clf = SGDClassifier(class_weight='balanced',
                                    alpha=self.alpha_1[self.best_alpha_1],
                                    penalty='l2',
                                    loss='log',
                                    random_state=123)
                clf.fit(train_x_onehotCoding, y_train)
                sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
                sig_clf.fit(train_x_onehotCoding, y_train)

                predict_y = sig_clf.predict_proba(test_x_onehotCoding)
                self.log_writer.log(
                    self.file_object,
                    'For Logistic Reg. with balanced class and OneHot features the best alpha value is:{}, The Test loss is {}'
                    .format(
                        self.alpha_1[self.best_alpha_1],
                        log_loss(y_test,
                                 predict_y,
                                 labels=clf.classes_,
                                 eps=1e-15)))

                self.loss_lr_OneHot_bal = log_loss(y_test,
                                                   predict_y,
                                                   labels=clf.classes_,
                                                   eps=1e-15)
                loss.append(self.loss_lr_OneHot_bal)
                self.log_writer.log(
                    self.file_object,
                    'Logistic Reg. with balanced class and OneHot features trained successfully!!'
                )

            except Exception as e:
                self.log_writer.log(
                    self.file_object,
                    'Error %s occurred while finding the log loss of Logistic Reg with OneHot features and class balanced'
                    % e)

            # training Logistic regression with class balanced and Tf-idf features
            try:
                self.alpha_2 = [10**x for x in range(-3, 3)]
                cv_log_error_array = []
                for i in self.alpha_2:
                    print("for alpha =", i)
                    clf = SGDClassifier(class_weight='balanced',
                                        alpha=i,
                                        penalty='l2',
                                        loss='log',
                                        random_state=42)
                    clf.fit(train_x_Tfidf, y_train)
                    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
                    sig_clf.fit(train_x_Tfidf, y_train)
                    sig_clf_probs = sig_clf.predict_proba(cv_x_Tfidf)
                    cv_log_error_array.append(
                        log_loss(y_cv,
                                 sig_clf_probs,
                                 labels=clf.classes_,
                                 eps=1e-15))
                    # to avoid rounding errors while multiplying probabilities we use log-probability estimates
                    print("Log Loss :", log_loss(y_cv, sig_clf_probs))

                self.best_alpha_2 = np.argmin(cv_log_error_array)
                clf = SGDClassifier(class_weight='balanced',
                                    alpha=self.alpha_2[self.best_alpha_2],
                                    penalty='l2',
                                    loss='log',
                                    random_state=42)
                clf.fit(train_x_Tfidf, y_train)
                sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
                sig_clf.fit(train_x_Tfidf, y_train)

                predict_y = sig_clf.predict_proba(test_x_Tfidf)
                self.log_writer.log(
                    self.file_object,
                    'For Logistic Reg with balanced class and Tfidf features the best alpha value is:{}, The Test loss is {}'
                    .format(
                        self.alpha_2[self.best_alpha_2],
                        log_loss(y_test,
                                 predict_y,
                                 labels=clf.classes_,
                                 eps=1e-15)))
                self.loss_lr_Tfidf_bal = log_loss(y_test,
                                                  predict_y,
                                                  labels=clf.classes_,
                                                  eps=1e-15)
                loss.append(self.loss_lr_Tfidf_bal)
                self.log_writer.log(
                    self.file_object,
                    'Logistic Reg with balanced class and Tfidf features trained successfully'
                )

            except Exception as e:
                self.log_writer.log(
                    self.file_object,
                    'Error %s occurred while training the Logistic Reg with balanced class and Tfidf features model '
                    % e)

            #Logistic Regression with Tfidf features and without class balancing
            try:
                self.alpha_3 = [10**x for x in range(-3, 3)]
                cv_log_error_array = []
                for i in self.alpha_3:
                    print("for alpha =", i)
                    clf = SGDClassifier(alpha=i,
                                        penalty='l2',
                                        loss='log',
                                        random_state=42)
                    clf.fit(train_x_Tfidf, y_train)
                    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
                    sig_clf.fit(train_x_Tfidf, y_train)
                    sig_clf_probs = sig_clf.predict_proba(cv_x_Tfidf)
                    cv_log_error_array.append(
                        log_loss(y_cv,
                                 sig_clf_probs,
                                 labels=clf.classes_,
                                 eps=1e-15))
                    # to avoid rounding errors while multiplying probabilities we use log-probability estimates
                    print("Log Loss :", log_loss(y_cv, sig_clf_probs))

                self.best_alpha_3 = np.argmin(cv_log_error_array)
                clf = SGDClassifier(alpha=self.alpha_3[self.best_alpha_3],
                                    penalty='l2',
                                    loss='log',
                                    random_state=42)
                clf.fit(train_x_Tfidf, y_train)
                sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
                sig_clf.fit(train_x_Tfidf, y_train)

                predict_y = sig_clf.predict_proba(test_x_Tfidf)

                self.log_writer.log(
                    self.file_object,
                    'For Logistic Reg without balanced class and Tfidf features the best alpha value is:{}, The Test loss is {}'
                    .format(
                        self.alpha_3[self.best_alpha_3],
                        log_loss(y_test,
                                 predict_y,
                                 labels=clf.classes_,
                                 eps=1e-15)))
                self.loss_lr_Tfidf_Notbal = log_loss(y_test,
                                                     predict_y,
                                                     labels=clf.classes_,
                                                     eps=1e-15)
                loss.append(self.loss_lr_Tfidf_Notbal)
                self.log_writer.log(
                    self.file_object,
                    'Logistic Reg without balanced class and Tfidf features trained successfully'
                )

            except Exception as e:
                self.log_writer.log(
                    self.file_object,
                    'Error %s occurred while training the Logistic Reg without balanced class and Tfidf features model '
                    % e)

            #finding the best model
            least_loss_indices = np.argmin(loss)
            least_loss = loss[least_loss_indices]
            if least_loss == self.loss_lr_OneHot_bal:
                #finally training the model with best parameters
                # retrain with the same random_state used in the alpha search above
                clf = SGDClassifier(class_weight='balanced',
                                    alpha=self.alpha_1[self.best_alpha_1],
                                    penalty='l2',
                                    loss='log',
                                    random_state=123)
                clf.fit(train_x_onehotCoding, y_train)
                sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
                sig_clf.fit(train_x_onehotCoding, y_train)
                self.log_writer.log(
                    self.file_object,
                    'Best Model chosen is Logistic Reg. with balanced class and OneHot features with Log loss {}'
                    .format(least_loss))

                path_1 = './CountVectorizer/OneHotGene.pkl'
                path_2 = './CountVectorizer/OneHotVariation.pkl'
                path_3 = './CountVectorizer/OneHotText.pkl'

                f = open(self.path, 'a+')
                f.write(path_1 + "\n")
                f.write(path_2 + "\n")
                f.write(path_3 + "\n")
                f.close()

                return sig_clf, clf

            elif least_loss == self.loss_lr_Tfidf_bal:
                # finally training the model with best parameters
                clf = SGDClassifier(class_weight='balanced',
                                    alpha=self.alpha_2[self.best_alpha_2],
                                    penalty='l2',
                                    loss='log',
                                    random_state=42)
                clf.fit(train_x_Tfidf, y_train)
                sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
                sig_clf.fit(train_x_Tfidf, y_train)
                self.log_writer.log(
                    self.file_object,
                    'Best Model chosen is Logistic Reg with balanced class and Tfidf features with Log loss {}'
                    .format(least_loss))
                path_1 = './CountVectorizer/TfidfGene.pkl'
                path_2 = './CountVectorizer/TfidfVariation.pkl'
                path_3 = './CountVectorizer/TfidfText.pkl'

                f = open(self.path, 'a+')
                f.write(path_1 + "\n")
                f.write(path_2 + "\n")
                f.write(path_3 + "\n")
                f.close()

                return sig_clf, clf

            else:
                clf = SGDClassifier(alpha=self.alpha_3[self.best_alpha_3],
                                    penalty='l2',
                                    loss='log',
                                    random_state=42)
                clf.fit(train_x_Tfidf, y_train)
                sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
                sig_clf.fit(train_x_Tfidf, y_train)
                self.log_writer.log(
                    self.file_object,
                    'Best Model chosen is Logistic Reg without balanced class and Tfidf features with Log loss {}'
                    .format(least_loss))
                path_1 = './CountVectorizer/TfidfGene.pkl'
                path_2 = './CountVectorizer/TfidfVariation.pkl'
                path_3 = './CountVectorizer/TfidfText.pkl'

                f = open(self.path, 'a+')
                f.write(path_1 + "\n")
                f.write(path_2 + "\n")
                f.write(path_3 + "\n")
                f.close()

                return sig_clf, clf
        except Exception as e:
            self.log_writer.log(
                self.file_object,
                'Error %s occurred while finding the best model' % e)
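
FindBestModel repeats one pattern per feature set: fit an SGDClassifier for each alpha, wrap it in CalibratedClassifierCV, and keep the alpha with the lowest cross-validation log loss. The self-contained sketch below reproduces that pattern on synthetic data; the dataset, the alpha grid and loss='log_loss' (the newer scikit-learn spelling of the 'log' loss used above) are illustrative assumptions, not project code.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss

# synthetic stand-in for the featurized train/CV matrices used above
X, y = make_classification(n_samples=600, n_features=50, n_informative=10,
                           n_classes=3, random_state=0)
X_train, X_cv, y_train, y_cv = train_test_split(X, y, test_size=0.3, random_state=0)

alphas = [10 ** x for x in range(-3, 3)]
cv_log_error = []
for a in alphas:
    clf = SGDClassifier(class_weight='balanced', alpha=a, penalty='l2',
                        loss='log_loss', random_state=123)   # loss='log' in older scikit-learn
    sig_clf = CalibratedClassifierCV(clf, method='sigmoid')  # calibrated probabilities
    sig_clf.fit(X_train, y_train)
    cv_log_error.append(log_loss(y_cv, sig_clf.predict_proba(X_cv)))

best_alpha = alphas[int(np.argmin(cv_log_error))]
print('best alpha:', best_alpha, 'cv log loss:', min(cv_log_error))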
Code example #9
class post_training:
    def __init__(self):
        self.log_writer = App_Logger()
        self.file_object = open('./Prediction_log/prediction.txt', 'a+')
        self.model_directory = './Model/'

    def save(self, calibrated_model, original_model):
        """                          Method Name: save
                                     Description: this method will save the best model that has been trained.
                                     Return :None
                                     Written by: Harsh Kumar
                                     Version: 1.0
                                     Revisions: None """
        self.best_model = calibrated_model
        self.feature_imp_model = original_model
        try:
            f = open(self.model_directory + 'best_model' + '.pkl', 'wb')
            pickle.dump(self.best_model, f)
            f.close()
        except Exception as e:
            self.log_writer.log(
                self.file_object,
                'Error %s occurred while saving the best model' % e)
        try:
            f = open(self.model_directory + 'feature_imp_model' + '.pkl', 'wb')
            pickle.dump(self.feature_imp_model, f)
            f.close()
        except Exception as e:
            self.log_writer.log(
                self.file_object,
                'Error %s occurred while saving the original model which gives the feature importance'
                % e)

    def load_model(self):
        """                          Method Name: load_model
                                     Description: this method will load the best model that has been trained.
                                     Return :best model
                                     Written by: Harsh Kumar
                                     Version: 1.0
                                     Revisions: None """
        self.path = './Model/'
        try:
            self.model = pickle.load(
                open(self.path + 'best_model' + '.pkl', 'rb'))

        except Exception as e:
            self.log_writer.log(
                self.file_object,
                'Error %s occurred while loading the best model' % e)

        try:
            self.feature_imp_model = pickle.load(
                open(self.path + 'feature_imp_model' + '.pkl', 'rb'))
            return self.model, self.feature_imp_model
        except Exception as e:
            self.log_writer.log(
                self.file_object,
                'Error %s occurred while loading the original model for feature importance'
                % e)

    def load_vectorizer(self, data):
        """                                 Method Name: load_vectorizer
                                            Description: this method will load the vectorizer required for featurizing the original features
                                            Return: the featurized test data point
                                            Written by: Harsh Kumar
                                            Version: 1.0
                                            Revisions: None """
        self.path = './Save_Load/vectorizer.txt'
        try:
            with open(self.path, encoding='utf8') as f:
                path_1 = f.readline()
                path_2 = f.readline()
                path_3 = f.readline()
            if path_1[18:24] == 'OneHot':
                try:
                    # featurizing the OneHot encoding for the test data point

                    gene_vectorizer = CountVectorizer(
                        vocabulary=pickle.load(open(path_1[:-1], 'rb')))
                    variation_vectorizer = CountVectorizer(
                        vocabulary=pickle.load(open(path_2[:-1], 'rb')))
                    text_vectorizer = CountVectorizer(
                        min_df=3,
                        vocabulary=pickle.load(open(path_3[:-1], 'rb')))

                    test_gene_feature_onehotCoding = gene_vectorizer.fit_transform(
                        data['Gene']).toarray()
                    test_variation_feature_onehotCoding = variation_vectorizer.fit_transform(
                        data['Variation']).toarray()
                    test_text_feature_onehotCoding = text_vectorizer.fit_transform(
                        data['TEXT']).toarray()

                    #normalize the TEXT feature
                    test_text_feature_onehotCoding = normalize(
                        test_text_feature_onehotCoding, axis=0)

                    # merging test_gene_feature_onehotCoding and test_variation_feature_onehotCoding
                    test_gene_var_onehotCoding = np.hstack(
                        (test_gene_feature_onehotCoding,
                         test_variation_feature_onehotCoding))

                    # Finally merging and getting final test_x_onehotCoding
                    test_x_onehotCoding = np.hstack(
                        (test_gene_var_onehotCoding,
                         test_text_feature_onehotCoding))

                    self.log_writer.log(self.file_object,
                                        'Successfully featurized ')
                    return test_x_onehotCoding
                except Exception as e:
                    self.log_writer.log(
                        self.file_object,
                        'Error %s occurred while featurizing OneHotEncoding for the test data point'
                        % e)

            elif path_1[18:23] == 'Tfidf':
                try:

                    # featurizing the Tfidf coding for test data point
                    gene_tfidf_vectorizer = TfidfVectorizer(
                        vocabulary=pickle.load(open(path_1[:-1], 'rb')))
                    variation_tfidf_vectorizer = TfidfVectorizer(
                        vocabulary=pickle.load(open(path_2[:-1], 'rb')))
                    text_tfidf_vectorizer = TfidfVectorizer(
                        vocabulary=pickle.load(open(path_3[:-1], 'rb')))

                    test_gene_feature_Tfidf = gene_tfidf_vectorizer.fit_transform(
                        data['Gene'].values).toarray()
                    test_variation_feature_Tfidf = variation_tfidf_vectorizer.fit_transform(
                        data['Variation'].values).toarray()
                    test_text_feature_Tfidf = text_tfidf_vectorizer.fit_transform(
                        data['TEXT'].values).toarray()

                    # normalizing
                    test_text_feature_Tfidf = normalize(
                        test_text_feature_Tfidf, axis=0)

                    # merging test_gene_feature_Tfidf and test_variation_feature_Tfidf
                    test_gene_var_Tfidf = np.hstack(
                        (test_gene_feature_Tfidf,
                         test_variation_feature_Tfidf))

                    # Finally merging and getting test_x_Tfidf
                    test_x_Tfidf = np.hstack(
                        (test_gene_var_Tfidf, test_text_feature_Tfidf))

                    self.log_writer.log(self.file_object,
                                        'Successfully featurized ')
                    return test_x_Tfidf

                except Exception as e:
                    self.log_writer.log(
                        self.file_object,
                        'Error %s occurred while featurizing Tfidf for the test data point'
                        % e)

        except Exception as e:
            self.log_writer.log(
                self.file_object,
                'Error %s Occurred while preprocessing the test data point' %
                e)
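
load_vectorizer rebuilds the training-time vectorizers from their pickled vocabularies so that a test point is featurized with exactly the training columns. A minimal sketch of that round trip is shown below; the file name and corpus are illustrative, and transform is used instead of the fit_transform call above (with a fixed vocabulary both give the same result).

import pickle
from sklearn.feature_extraction.text import CountVectorizer

# training time: fit a vectorizer and persist only its vocabulary
train_corpus = ['BRCA1 truncating mutation', 'EGFR amplification']
cv = CountVectorizer()
cv.fit(train_corpus)
with open('demo_vocab.pkl', 'wb') as f:                 # illustrative path
    pickle.dump(cv.vocabulary_, f)

# prediction time: rebuild the vectorizer from the saved vocabulary
with open('demo_vocab.pkl', 'rb') as f:
    restored = CountVectorizer(vocabulary=pickle.load(f))
test_features = restored.transform(['EGFR truncating mutation']).toarray()
print(test_features.shape)   # (1, number of training vocabulary terms)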