Ejemplo n.º 1
0
    def train_model_cv(self,
                       train_file,
                       normalize,
                       is_bool_value,
                       is_percentage,
                       cv=10,
                       save_model=False):
        """Train a LassoCV linear regression using k-fold cross validation.

        Args:
            train_file: path to the training data file.
            normalize: forwarded to feature extraction.
            is_bool_value: forwarded to feature extraction.
            is_percentage: forwarded to feature extraction.
            cv: number of cross-validation folds (default 10).
            save_model: when True, pickle the fitted model and the
                standard scaler to their dump filenames.
        """
        self.logger.info("Training Model")
        features_array, label_array, feature_names = self.get_features_array_label_array_from_file(
            train_file,
            normalize=normalize,
            is_bool_value=is_bool_value,
            is_percentage=is_percentage)
        # NOTE: other estimators (LinearRegression, fixed-alpha Lasso) were
        # tried here; LassoCV selects alpha itself via cross validation.
        self.model = linear_model.LassoCV(cv=cv,
                                          normalize=False,
                                          verbose=True,
                                          max_iter=10000)
        print("Model Settings:", self.model)
        self.model.fit(features_array, label_array)

        if save_model:  # idiomatic truth test instead of "== True"
            Pickle_Helper.save_model_to_pickle(self.model,
                                               self.dump_model_fname)
            Pickle_Helper.save_model_to_pickle(self.standard_scaler,
                                               self.dump_standard_scaler_fname)

        self.print_linear_regression_formular(feature_names)
Ejemplo n.º 2
0
    def load_model_if_exists(self,
                             dump_model_dir=config.PREROCESS_PICKLES_DIR):
        """Derive the pickle path for every preprocessing artifact and,
        unless replace_exists is set, restore whichever pickles exist.

        Missing pickles load as None (Pickle_Helper behavior); the dump
        directory is created on first use.
        """
        self.dump_model_dir = dump_model_dir
        if not os.path.exists(dump_model_dir):
            os.makedirs(dump_model_dir)

        self.generate_model_name()

        # Map each path attribute to its filename template.
        path_templates = (
            ("dump_standard_scaler_fname", "{}_standard_scaler.pickle"),
            ("dump_one_hot_encode_fname", "{}_onehot_encoder.pickle"),
            ("dump_dictionary_fname", "{}_dictionary.pickle"),
            ("dump_counter_vec_fname", "{}_countvec.pickle"),
            ("dump_vocab_processor_fname", "{}_embedding.pickle"),
        )
        for attr, template in path_templates:
            setattr(self, attr,
                    os.path.join(dump_model_dir,
                                 template.format(self.model_name)))

        if self.replace_exists is not True:
            to_restore = (
                ("standard_scaler", self.dump_standard_scaler_fname),
                ("one_hot_encoder", self.dump_one_hot_encode_fname),
                ("dictionary", self.dump_dictionary_fname),
                ("counter_vector", self.dump_counter_vec_fname),
                ("vocab_processor", self.dump_vocab_processor_fname),
            )
            for attr, fname in to_restore:
                setattr(self, attr,
                        Pickle_Helper.load_model_from_pickle(fname))
Ejemplo n.º 3
0
    def load_data_if_exists(self,
                             dump_model_dir=config.PREROCESS_PICKLES_DIR):
        """Build pickle paths for the train/test splits and the kfold object
        and load them when replace_exists is off.

        Creates dump_model_dir if needed. Missing pickles load as None.
        """
        self.dump_model_dir = dump_model_dir
        if not os.path.exists(dump_model_dir):
            os.makedirs(dump_model_dir)

        self.generate_model_name()

        self.logger.info("Load data model {}".format(self.model_name))

        self.dump_X_train_fname = os.path.join(dump_model_dir, "{}_X_train.pickle".format(self.model_name))
        self.dump_y_train_fname = os.path.join(dump_model_dir, "{}_y_train.pickle".format(self.model_name))
        self.dump_X_test_fname = os.path.join(dump_model_dir, "{}_X_test.pickle".format(self.model_name))
        self.dump_y_test_fname = os.path.join(dump_model_dir, "{}_y_test.pickle".format(self.model_name))
        self.dump_kfold_fname = os.path.join(dump_model_dir, "{}_kfold.pickle".format(self.model_name))

        # One guard replaces the two duplicated "== False" checks; "is False"
        # matches the convention used by the sibling loaders.
        if self.replace_exists is False:
            self.X_train = Pickle_Helper.load_model_from_pickle(self.dump_X_train_fname)
            self.y_train = Pickle_Helper.load_model_from_pickle(self.dump_y_train_fname)
            self.X_test = Pickle_Helper.load_model_from_pickle(self.dump_X_test_fname)
            self.y_test = Pickle_Helper.load_model_from_pickle(self.dump_y_test_fname)
            self.kfold = Pickle_Helper.load_model_from_pickle(self.dump_kfold_fname)
Ejemplo n.º 4
0
    def load_model(self, train_file, is_bool_value, standardize):
        """Restore the model, dictionary and scaler from their pickles;
        if the model pickle is absent, train a fresh model from train_file."""
        self.logger.info("Load Model")
        if self.model is None:
            restore = Pickle_Helper.load_model_from_pickle
            self.model = restore(self.dump_model_fname)
            self.dictionary = restore(self.dump_dictionary_fname)
            self.standard_scaler = restore(self.dump_standard_scaler_fname)

        if self.model is None:
            # No pickle was found: fall back to training from scratch.
            self.train_model(train_file, is_bool_value, standardize)
    def load_model(self):
        """Restore model, dictionary and label encoder from pickles; train
        on the Washington topic data set when no model pickle exists."""
        self.logger.info("Load Model")
        if self.model is None:
            restore = Pickle_Helper.load_model_from_pickle
            self.model = restore(self.dump_model_fname)
            self.dictionary = restore(self.dump_dictionary_fname)
            self.label_encoder = restore(self.dump_label_encoder_fname)

        if self.model is None:
            # No pickle was found: fall back to training from scratch.
            self.train_model(config.WASHINGTON_TOPIC_DATA)
Ejemplo n.º 6
0
    def load_feature_bin_vector_model(self):
        """Load (or build and persist) the CountVectorizer over the custom
        feature list, and record the resulting embedding dimension.

        When replace_exists is False, an existing pickle is reused;
        otherwise, or when no pickle exists, the vectorizer is fitted from
        self.custom_feature_list and stored.
        """
        self.dump_catbinvector_fname = os.path.join(
            self.dump_model_dir,
            "{}_embd_feavector.pickle".format(self.model_name))
        if self.replace_exists is False:
            self.custom_feature_vector = Pickle_Helper.load_model_from_pickle(
                self.dump_catbinvector_fname)

        if self.custom_feature_vector is None:
            start = datetime.datetime.now()

            feature = self.custom_feature_list
            self.custom_feature_vector = CountVectorizer(
                binary=self.custom_feature_binary)
            self.custom_feature_vector.fit(feature)
            # BUG FIX: take the end timestamp AFTER fitting; previously both
            # timestamps were captured up front, so the duration logged
            # below was always ~0 seconds.
            end = datetime.datetime.now()

            self.logger.info("It takes {}s to load {} features.".format(
                (end - start).total_seconds(),
                len(self.custom_feature_vector.vocabulary_)))
            self.embedding_vector_dimension = len(self.custom_feature_list)
            self.logger.info(
                "The actual embedding_vector_dimension is {}".format(
                    self.embedding_vector_dimension))
            self.store_feature_bin_vector()
Ejemplo n.º 7
0
    def store_data(self, replace_exists=False):
        """Pickle the train/test splits and the kfold object.

        Each object is written only when it is not None and either its
        pickle file does not exist yet or replace_exists is True. Replaces
        five copy-pasted exists/None/save stanzas with one loop.
        """
        artifacts = (
            (self.X_train, self.dump_X_train_fname),
            (self.y_train, self.dump_y_train_fname),
            (self.X_test, self.dump_X_test_fname),
            (self.y_test, self.dump_y_test_fname),
            (self.kfold, self.dump_kfold_fname),
        )
        for obj, fname in artifacts:
            if not os.path.exists(fname) or replace_exists is True:
                if obj is not None:
                    Pickle_Helper.save_model_to_pickle(obj, fname)
Ejemplo n.º 8
0
    def load_model(self, train_file, test_size, cv, normalize, is_bool_value,
                   is_percentage):
        """Restore model and scaler from pickles; otherwise train — with
        cross validation when test_size is None, or a plain split."""
        self.logger.info("Load Model")
        if self.model is None:
            self.model = Pickle_Helper.load_model_from_pickle(
                self.dump_model_fname)
            self.standard_scaler = Pickle_Helper.load_model_from_pickle(
                self.dump_standard_scaler_fname)

        if self.model is not None:
            return
        if test_size is None:
            # No hold-out fraction requested: pick alpha via cross validation.
            self.train_model_cv(train_file, normalize, is_bool_value,
                                is_percentage, cv)
        else:
            self.train_model(train_file, normalize, is_bool_value,
                             is_percentage)
Ejemplo n.º 9
0
    def load_init_model(self):
        """Create the model directory, derive the model name, and restore
        the tokenizer / embedding-matrix pickles unless replace_exists."""
        if not os.path.exists(self.dump_model_dir):
            os.makedirs(self.dump_model_dir)

        self.generate_model_name()

        self.logger.info("Load Model {}".format(self.model_name))

        model_dir = self.dump_model_dir
        self.dump_tokenizer_fname = os.path.join(
            model_dir, "{}_tokenizer.pickle".format(self.model_name))
        self.dump_embmatrix_fname = os.path.join(
            model_dir, "{}_embmatrix.pickle".format(self.model_name))

        if self.replace_exists is False:
            for attr, fname in (("tokenizer", self.dump_tokenizer_fname),
                                ("embedding_matrix",
                                 self.dump_embmatrix_fname)):
                setattr(self, attr,
                        Pickle_Helper.load_model_from_pickle(fname))
Ejemplo n.º 10
0
    def train_model(self,
                    train_file,
                    normalize,
                    is_bool_value,
                    is_percentage,
                    test_size=0.10,
                    alpha=0.1,
                    save_model=False):
        """Train a fixed-alpha Lasso regression on a random train/test split
        and report R^2 and MSE on the hold-out set.

        Args:
            train_file: path to the training data file.
            normalize: forwarded to feature extraction.
            is_bool_value: forwarded to feature extraction.
            is_percentage: forwarded to feature extraction.
            test_size: fraction of samples held out for evaluation.
            alpha: Lasso regularization strength (e.g. the best alpha found
                by train_model_cv).
            save_model: when True, pickle the fitted model and scaler.
        """
        self.logger.info("Training Model")
        features_array, label_array, feature_names = self.get_features_array_label_array_from_file(
            train_file,
            normalize=normalize,
            is_bool_value=is_bool_value,
            is_percentage=is_percentage)

        X_train, X_test, y_train, y_test = train_test_split(
            features_array, label_array, test_size=test_size)

        # NOTE: LinearRegression and LassoCV were tried here as well; this
        # variant uses a fixed alpha (e.g. learnt from cross validation).
        self.model = linear_model.Lasso(alpha=alpha)
        print("Model Settings:", self.model)
        self.logger.info("Training Model")
        self.model.fit(X_train, y_train)
        score = self.model.score(X_test, y_test)
        print("R score", score)

        y_predict = self.model.predict(X_test)
        # sklearn convention is (y_true, y_pred); MSE itself is symmetric,
        # so the reported value is unchanged.
        regression_model_mse = mean_squared_error(y_test, y_predict)
        print("alpha", self.model.alpha)
        print("mse", regression_model_mse)

        if save_model:  # idiomatic truth test instead of "== True"
            Pickle_Helper.save_model_to_pickle(self.model,
                                               self.dump_model_fname)
            Pickle_Helper.save_model_to_pickle(self.standard_scaler,
                                               self.dump_standard_scaler_fname)

        self.print_linear_regression_formular(feature_names)
Ejemplo n.º 11
0
    def load_label_model(self, dump_model_dir=config.PREROCESS_PICKLES_DIR):
        """Load the pickled label encoder for this data set, creating the
        pickle directory first when it does not exist."""
        self.dump_model_dir = dump_model_dir
        if not os.path.exists(dump_model_dir):
            os.makedirs(dump_model_dir)

        fname = os.path.join(
            dump_model_dir, "{}_label.pickle".format(self.data_lable_name))
        self.dump_label_encoder_fname = fname
        self.label_encoder = Pickle_Helper.load_model_from_pickle(fname)
        print("load label", fname)
Ejemplo n.º 12
0
 def get_X_y_featurenames_from_pickle(self, filename,
                                      feature_columns:list=None,
                                      label_colnames:list=None,
                                      drop_colnames:list=None):
     """Unpickle a dataframe from *filename* and delegate the split into
     features, labels and feature names to
     get_X_y_featurenames_from_dateframe."""
     frame = Pickle_Helper.load_model_from_pickle(pickle_fname=filename)
     return self.get_X_y_featurenames_from_dateframe(
         frame,
         feature_columns=feature_columns,
         label_colnames=label_colnames,
         drop_colnames=drop_colnames)
Ejemplo n.º 13
0
    def train_model(self, train_file, is_bool_value=False, standardize=False):
        """Fit a 10-fold LassoCV regression on features extracted from
        train_file, pickle all artifacts, and print the learnt formula."""
        self.logger.info("Get Features")
        features_array, label_array = self.get_features_array_label_array_from_file(
            train_file,
            is_training=True,
            is_bool_value=is_bool_value,
            standardize=standardize)

        # LassoCV selects its own alpha via 10-fold cross validation.
        self.model = linear_model.LassoCV(cv=10,
                                          normalize=False,
                                          verbose=True,
                                          n_jobs=2)
        print(self.model)
        self.logger.info("Training Model")
        self.model.fit(features_array, label_array)

        # Persist everything needed to reload and predict later.
        for obj, fname in ((self.model, self.dump_model_fname),
                           (self.dictionary, self.dump_dictionary_fname),
                           (self.standard_scaler,
                            self.dump_standard_scaler_fname)):
            Pickle_Helper.save_model_to_pickle(obj, fname)
        self.print_linear_regression_formular()
Ejemplo n.º 14
0
    def load_model_if_exists(self,
                             classifier_name="general",
                             preprocess_name="general",
                             dump_model_dir=config.PREROCESS_PICKLES_DIR):
        """Compose the model pickle path from the classifier and preprocess
        names and load it (self.model becomes None when no pickle exists)."""
        self.logger.info("Load Model")

        self.dump_model_dir = dump_model_dir
        if not os.path.exists(dump_model_dir):
            os.makedirs(dump_model_dir)

        self.model_name = "{}_{}".format(classifier_name, preprocess_name)
        model_path = os.path.join(dump_model_dir,
                                  "{}.pickle".format(self.model_name))
        self.dump_model_fname = model_path
        self.model = Pickle_Helper.load_model_from_pickle(model_path)
Ejemplo n.º 15
0
    def train_model(self, train_file):
        """Train a class-balanced multinomial logistic regression on
        train_file and pickle the model, dictionary and label encoder."""
        self.logger.info("Training Model")
        features_array, label_array = self.get_features_array_label_array_from_file(
            train_file, is_training=True)
        classifier = LogisticRegression(solver='lbfgs',
                                        multi_class='multinomial',
                                        class_weight='balanced')
        classifier.fit(features_array, label_array)
        self.model = classifier

        for obj, fname in ((self.model, self.dump_model_fname),
                           (self.dictionary, self.dump_dictionary_fname),
                           (self.label_encoder,
                            self.dump_label_encoder_fname)):
            Pickle_Helper.save_model_to_pickle(obj, fname)
Ejemplo n.º 16
0
 def store_lable_model(self, replace_exists=False):
     """Pickle the label encoder when present, unless its file already
     exists and replace_exists is False."""
     missing = not os.path.exists(self.dump_label_encoder_fname)
     if (missing or replace_exists is True) \
             and self.label_encoder is not None:
         Pickle_Helper.save_model_to_pickle(self.label_encoder,
                                            self.dump_label_encoder_fname)
Ejemplo n.º 17
0
 def store_model(self, replace_exists=False):
     """Pickle every preprocessing artifact (scaler, one-hot encoder,
     dictionary, count vectorizer, label encoder, vocab processor).

     Each object is written only when it is not None and either its pickle
     file does not exist yet or replace_exists is True. Replaces six
     copy-pasted exists/None/save stanzas with one loop.
     """
     artifacts = (
         (self.standard_scaler, self.dump_standard_scaler_fname),
         (self.one_hot_encoder, self.dump_one_hot_encode_fname),
         (self.dictionary, self.dump_dictionary_fname),
         (self.counter_vector, self.dump_counter_vec_fname),
         (self.label_encoder, self.dump_label_encoder_fname),
         (self.vocab_processor, self.dump_vocab_processor_fname),
     )
     for obj, fname in artifacts:
         if not os.path.exists(fname) or replace_exists is True:
             if obj is not None:
                 Pickle_Helper.save_model_to_pickle(obj, fname)
Ejemplo n.º 18
0
 def store_tokenzier(self, replace_exists=False):
     """Pickle the fitted tokenizer when present, unless its file already
     exists and replace_exists is False."""
     missing = not os.path.exists(self.dump_tokenizer_fname)
     if (missing or replace_exists is True) and self.tokenizer is not None:
         Pickle_Helper.save_model_to_pickle(self.tokenizer,
                                            self.dump_tokenizer_fname)
Ejemplo n.º 19
0
 def store_embedding_matrix(self, replace_exists=False):
     """Pickle the embedding matrix when present, unless its file already
     exists and replace_exists is False."""
     missing = not os.path.exists(self.dump_embmatrix_fname)
     if (missing or replace_exists is True) \
             and self.embedding_matrix is not None:
         Pickle_Helper.save_model_to_pickle(self.embedding_matrix,
                                            self.dump_embmatrix_fname)
Ejemplo n.º 20
0
 def store_feature_bin_vector(self):
     """Pickle the custom-feature CountVectorizer when present, honoring
     the instance's replace_exists flag."""
     already_there = os.path.exists(self.dump_catbinvector_fname)
     if (not already_there or self.replace_exists is True) \
             and self.custom_feature_vector is not None:
         Pickle_Helper.save_model_to_pickle(self.custom_feature_vector,
                                            self.dump_catbinvector_fname)
Ejemplo n.º 21
0
 def store_model_if_not_exits(self, replace_exists=False):
     """Pickle the trained model unless its file already exists and
     replace_exists is False.

     Adds the "is not None" guard every sibling store_* helper has; the
     original would happily pickle a None model.
     """
     if not os.path.exists(self.dump_model_fname) or replace_exists is True:
         if self.model is not None:
             Pickle_Helper.save_model_to_pickle(self.model,
                                                self.dump_model_fname)
Ejemplo n.º 22
0
 def get_df_from_pickle(self, filename):
     """Load a pickled object from *filename* and wrap it in a pandas
     DataFrame."""
     payload = Pickle_Helper.load_model_from_pickle(filename)
     return pd.DataFrame(payload)