コード例 #1
0
    def __init__(self,
                 data_name="data",
                 feature_name="f1.f2",
                 target_name="t",
                 num_crossvalidation=10,
                 random_state=312018,
                 test_split=None,
                 replace_exists=False,
                 logger=None):
        """Initialize the data-preprocessing helper and load cached data if present.

        :param data_name: short identifier for the dataset.
        :param feature_name: dot-separated feature column name(s).
        :param target_name: target column name.
        :param num_crossvalidation: number of cross-validation folds.
        :param random_state: seed for reproducible splits.
        :param test_split: train/test split fraction; None means CV only.
        :param replace_exists: if True, regenerate artifacts even when cached.
        :param logger: optional logger; a file logger is created when omitted.
        """
        self.logger = logger or File_Logger_Helper.get_logger(logger_fname="data_preprocessing")
        self.model_name = ""
        self.data_name = data_name
        self.feature_name = feature_name
        self.target_name = target_name
        self.num_crossvalidation = num_crossvalidation
        self.random_state = random_state
        self.test_split = test_split
        self.replace_exists = replace_exists
        # Split containers, populated later (presumably by load_data_if_exists()).
        # NOTE(review): X_train was never initialized while y_train/X_test/y_test
        # were — added for consistency; confirm load_data_if_exists() assigns it.
        self.X_train = None
        self.y_train = None
        self.X_test = None
        self.y_test = None
        self.kfold = None
        self.kod = 0

        self.load_data_if_exists()
コード例 #2
0
 def __init__(self,
              dump_model_dir,
              filter_stopword=True,
              use_stemm=False,
              label_dict=None,
              logger=None):
     """Initialize the bag-of-words logistic-regression classifier and load
     any previously saved model artifacts.

     :param dump_model_dir: directory for the model/dictionary/label-encoder
         pickles (created if missing).
     :param filter_stopword: drop English stopwords when True.
     :param use_stemm: stem tokens with the Porter stemmer when True.
     :param label_dict: optional mapping of labels.
     :param logger: optional logger; a file logger is created when omitted.
     """
     self.logger = logger or File_Logger_Helper.get_logger(
         logger_fname="bow_logistic_regression")
     self.classifier_name = "logistic_regression_bow"
     self.use_stem = use_stemm
     self.stemmer = PorterStemmer()
     self.filter_stopword = filter_stopword
     # NOTE(review): attribute name "stopwrods" is a typo, kept so existing
     # callers that read it keep working.
     self.stopwrods = stopwords.words('english')
     self.label_dict = label_dict
     # exist_ok avoids the check-then-create race of exists()+makedirs().
     os.makedirs(dump_model_dir, exist_ok=True)
     self.dump_model_fname = os.path.join(
         dump_model_dir, "bow_logistic_regression_model.pickle")
     self.dump_dictionary_fname = os.path.join(
         dump_model_dir, "bow_logistic_regression_dictionary.pickle")
     self.dump_label_encoder_fname = os.path.join(
         dump_model_dir, "bow_logistic_regression_label_encoder.pickle")
     # Filled in by load_model() when saved artifacts exist.
     self.model = None
     self.dictionary = None
     self.label_encoder = None
     self.load_model()
コード例 #3
0
 def __init__(self,
              dump_model_dir,
              train_file=None,
              model_name="gen",
              test_size=None,
              cv=10,
              normalize=False,
              is_bool_value=False,
              is_percentage=False,
              logger=None):
     """Initialize the lasso/linear-regression wrapper and load any saved model.

     :param dump_model_dir: directory for the model/scaler pickles (created
         if missing).
     :param train_file: optional training-data path; also used to derive a
         model name when model_name is None.
     :param model_name: prefix for dump-file names; defaults to "gen".
     :param test_size: train/test split fraction; None selects LassoCV with
         cv folds instead of a single split.
     :param cv: number of cross-validation folds for LassoCV.
     :param normalize: forwarded to load_model.
     :param is_bool_value: forwarded to load_model.
     :param is_percentage: forwarded to load_model.
     :param logger: optional logger; a file logger is created when omitted.
     """
     self.logger = logger or File_Logger_Helper.get_logger(
         logger_fname="linear_regression")
     self.classifier_name = "linear_regression"
     # exist_ok avoids the check-then-create race of exists()+makedirs().
     os.makedirs(dump_model_dir, exist_ok=True)
     # Identity comparison with None (PEP 8) instead of "== None".
     if test_size is None:
         general_model_name = "lassocv{}_regression".format(cv)
     else:
         general_model_name = "lasso_regression_split{}".format(test_size)
     self.dump_model_fname = os.path.join(
         dump_model_dir,
         "{}_{}_model.pickle".format(model_name, general_model_name))
     self.dump_standard_scaler_fname = os.path.join(
         dump_model_dir,
         "{}_{}_standard_scaler.pickle".format(model_name,
                                               general_model_name))
     # NOTE(review): with the default model_name="gen" this branch is dead;
     # it only fires when a caller explicitly passes model_name=None.
     if model_name is None and train_file is not None:
         model_name = re.sub(r"\.\w+$", "", basename(train_file))
     self.model_name = "{}_{}".format(model_name, general_model_name)
     self.out_coef_fname = os.path.join(
         config.ML_OUTPUT_DIR, "{}_coeficient.csv".format(self.model_name))
     self.model = None
     self.standard_scaler = None
     self.load_model(train_file, test_size, cv, normalize, is_bool_value,
                     is_percentage)
コード例 #4
0
 def __init__(self,
              embedding_fname=os.path.join(config.WORD_EMBEDDING_DIR,
                                           'glove.6B.100d.txt'),
              embedding_name="token_vector",
              num_words: int = 10000,
              embedding_vector_dimension: int = 100,
              max_text_len: int = 100,
              data_name="data",
              feature_name="f1.f2",
              target_name="t",
              replace_exists=False,
              logger=None):
     """Hold word-embedding configuration and restore any cached artifacts.

     :param embedding_fname: path to the pre-trained embedding text file.
     :param embedding_name: label identifying this embedding configuration.
     :param num_words: vocabulary size kept by the tokenizer.
     :param embedding_vector_dimension: dimensionality of each word vector.
     :param max_text_len: padded/truncated sequence length.
     :param data_name: dataset identifier used in artifact naming.
     :param feature_name: feature identifier used in artifact naming.
     :param target_name: target identifier used in artifact naming.
     :param replace_exists: rebuild artifacts even when cached copies exist.
     :param logger: optional logger; a file logger is created when omitted.
     """
     self.logger = logger if logger else File_Logger_Helper.get_logger(
         logger_fname="word_embedding")

     # Identity of the data this embedding is built for.
     self.data_name = data_name
     self.feature_name = feature_name
     self.target_name = target_name

     # Embedding source and sizing.
     self.embedding_fname = embedding_fname
     self.embedding_name = embedding_name
     self.num_words = num_words
     self.embedding_vector_dimension = embedding_vector_dimension
     self.max_text_len = max_text_len

     self.replace_exists = replace_exists

     # Filled in lazily by load_init_model().
     self.tokenizer = None
     self.embedding_matrix = None
     self.dump_model_dir = config.PREROCESS_PICKLES_DIR

     self.load_init_model()
コード例 #5
0
 def __init__(self,
              dump_model_dir,
              train_file=None,
              is_bool_value=False,
              standardize=False,
              model_name="gen",
              logger=None):
     """Initialize the LassoCV linear-regression wrapper and load any saved model.

     :param dump_model_dir: directory for the model/dictionary/scaler pickles
         (created if missing).
     :param train_file: optional training-data path forwarded to load_model.
     :param is_bool_value: forwarded to load_model.
     :param standardize: forwarded to load_model.
     :param model_name: prefix for dump-file names; defaults to "gen".
     :param logger: optional logger; a file logger is created when omitted.
     """
     self.logger = logger or File_Logger_Helper.get_logger(
         logger_fname="linear_regression")
     self.classifier_name = "linear_regression"
     # exist_ok avoids the check-then-create race of exists()+makedirs().
     os.makedirs(dump_model_dir, exist_ok=True)
     general_model_name = "lassocv10_linear_regression"
     self.dump_model_fname = os.path.join(
         dump_model_dir,
         "{}_{}_model.pickle".format(model_name, general_model_name))
     self.dump_dictionary_fname = os.path.join(
         dump_model_dir,
         "{}_{}_dictionary.pickle".format(model_name, general_model_name))
     self.dump_standard_scaler_fname = os.path.join(
         dump_model_dir,
         "{}_{}_standard_scaler.pickle".format(model_name,
                                               general_model_name))
     self.out_coef_fname = os.path.join(
         config.ML_OUTPUT_DIR,
         "{}_{}_coeficient.csv".format(model_name, general_model_name))
     self.model_name = "{}_{}".format(model_name, general_model_name)
     # Filled in by load_model() when saved artifacts exist.
     self.model = None
     self.dictionary = None
     self.standard_scaler = None
     self.load_model(train_file, is_bool_value, standardize)
コード例 #6
0
    def __init__(self,
                 unique_name="feature_preprocess",
                 is_sparse: bool = True,
                 standardize: bool = False,
                 one_hot_encode: bool = False,
                 convert_bool: bool = False,
                 convert_row_percentage: bool = False,
                 normalize_text: bool = True,
                 use_stem: bool = False,
                 min_tokens: int = None,
                 bag_of_word: bool = False,
                 max_token_number: int = None,
                 counter_ngram: int = None,
                 embedding: bool = False,
                 sentence_size_percentage: float = 1,
                 min_word_freq: int = 1,
                 data_name="data",
                 feature_name="f1.f2",
                 target_name="t",
                 replace_exists=False,
                 logger=None):
        """Configure the feature-preprocessing pipeline and load cached models.

        :param unique_name: identifier used when naming saved artifacts.
        :param is_sparse: keep feature matrices sparse when True.
        :param standardize: apply standard scaling to numeric features.
        :param one_hot_encode: one-hot encode categorical features.
        :param convert_bool: convert values to booleans.
        :param convert_row_percentage: convert row values to percentages.
        :param normalize_text: apply text normalization.
        :param use_stem: stem tokens with the Snowball stemmer.
        :param min_tokens: minimum token count per document, or None.
        :param bag_of_word: build bag-of-words features.
        :param max_token_number: cap on vocabulary size, or None.
        :param counter_ngram: n-gram size for count features, or None.
        :param embedding: build embedding features.
        :param sentence_size_percentage: fraction of sentence length kept.
        :param min_word_freq: minimum corpus frequency for a word to be kept.
        :param data_name: dataset identifier (used in label-model naming).
        :param feature_name: feature identifier (used in label-model naming).
        :param target_name: target identifier (used in label-model naming).
        :param replace_exists: rebuild artifacts even when cached copies exist.
        :param logger: optional logger; a file logger is created when omitted.
        """
        self.logger = logger or File_Logger_Helper.get_logger(
            logger_fname="preprocessing")

        self.stemmer = SnowballStemmer('english')

        self.unique_name = unique_name
        self.is_sparse = is_sparse
        self.standardize = standardize
        self.one_hot_encode = one_hot_encode
        self.convert_bool = convert_bool
        self.convert_row_percentage = convert_row_percentage
        self.normalize_text = normalize_text
        self.use_stem = use_stem
        self.min_tokens: int = min_tokens
        self.bag_of_word = bag_of_word
        self.max_token_number = max_token_number
        self.counter_ngram = counter_ngram
        self.embedding = embedding
        self.sentence_size_percentage: float = sentence_size_percentage
        # Bug fix: the original read "self.min_word_freq = 1," which stored
        # the tuple (1,) and silently ignored the min_word_freq parameter.
        self.min_word_freq = min_word_freq

        # Fixme: we should use dictionary to save the features to vector
        self.standard_scaler = None
        self.one_hot_encoder = None
        self.dictionary = None
        self.counter_vector = None
        self.vocab_processor = None
        self.label_encoder = None

        self.replace_exists = replace_exists

        self.load_model_if_exists()

        # TODO: FIX ME in the future...........
        # self.data_lable_name = "fp_{}_{}_{}".format(data_name, feature_name, target_name)
        # NOTE(review): the format string below has no placeholders, so the
        # data/feature/target arguments are ignored (kept as-is per the TODO).
        self.data_lable_name = "fp_yes_no".format(data_name, feature_name,
                                                  target_name)
        self.load_label_model()
コード例 #7
0
    def __init__(self, logger=None):
        """Set up the trainer with a feature preprocessor and empty model slots.

        :param logger: optional logger; a "training" file logger is created
            when none is supplied.
        """
        self.logger = logger if logger else File_Logger_Helper.get_logger(
            logger_fname="training")

        # Shared feature-engineering helper used before training.
        self.preprocessor = Feature_Processing()

        # No model loaded yet; set during training/loading.
        self.model = None
        self.model_name = None
コード例 #8
0
 def __init__(self,
              y_gold: list,
              y_pred: list,
              X_gold: pd.Series = None,
              is_multi_class=False,
              logger=None):
     """Store gold/predicted labels and derive the list of class names.

     The gold and predicted labels must carry the original (un-encoded)
     label values when passed in.

     :param y_gold: gold labels; in the multi-class case this is expected to
         be 2-D (class names are taken from its column count) — TODO confirm.
     :param y_pred: predicted labels.
     :param X_gold: optional input rows aligned with y_gold.
     :param is_multi_class: flag selecting the multi-label/one-hot layout.
     :param logger: optional logger; a file logger is created when omitted.
     """
     self.X_gold = X_gold
     self.y_gold = y_gold
     self.y_pred = y_pred
     self.is_multi_class = is_multi_class
     # Truthiness instead of "is True" (PEP 8): the original only matched the
     # literal True, silently mis-routing truthy flags like 1.
     if is_multi_class:
         self.class_names = list(range(self.y_gold.shape[1]))
     else:
         # NOTE(review): set iteration order is unspecified, so class_names
         # order may vary between runs.
         self.class_names = list(set(self.y_gold + self.y_pred))
     self.logger = logger or File_Logger_Helper.get_logger(
         logger_fname="evaluate.log")
コード例 #9
0
    def __init__(self,
                 setting_file_list,
                 X,
                 y,
                 multi_class=None,
                 data_name="data",
                 text_feature_name_list=("f1", "f2"),
                 other_feature_name_list=("f1", "f2"),
                 target_name_list=("t1", "t2"),
                 num_crossvalidation=10,
                 random_state=312018,
                 test_split=None,
                 evaluate_dir=os.path.join(config.EVALUATE_DATA_DIR,
                                           "crossv10"),
                 predict_dir=os.path.join(config.EVALUATE_DATA_DIR,
                                          "crossv10"),
                 logger=None):
        """Set up a cross-validation experiment over a list of model settings.

        :param setting_file_list: model-configuration files to evaluate.
        :param X: feature data.
        :param y: target data.
        :param multi_class: multi-class mode flag/identifier, or None.
        :param data_name: dataset identifier.
        :param text_feature_name_list: names of text feature columns, or None.
        :param other_feature_name_list: names of non-text feature columns,
            or None.
        :param target_name_list: names of target columns.
        :param num_crossvalidation: number of cross-validation folds.
        :param random_state: seed for reproducible splits.
        :param test_split: train/test split fraction; None means CV only.
        :param evaluate_dir: directory for evaluation CSV output.
        :param predict_dir: directory for prediction output.
        :param logger: optional logger; a file logger is created when omitted.
        """

        self.logger = logger or File_Logger_Helper.get_logger(
            logger_fname="exp")
        self.X = X
        # Bug fix: the original assigned self.y = y twice in a row.
        self.y = y
        self.multi_class = multi_class
        self.data_name = data_name
        self.text_feature_name_list = text_feature_name_list
        self.other_feature_name_list = other_feature_name_list
        # Combined feature list; += accepts the tuple defaults as well as lists.
        self.feature_name = []
        if text_feature_name_list is not None:
            self.feature_name += text_feature_name_list
        if other_feature_name_list is not None:
            self.feature_name += other_feature_name_list
        self.target_name_list = target_name_list
        self.num_crossvalidation = num_crossvalidation
        self.random_state = random_state
        self.test_split = test_split
        # self.X_unlable = X_unlable
        self.evaluate_dir = evaluate_dir
        self.predict_dir = predict_dir
        self.mid = 0

        # Timestamped so repeated runs never clobber earlier results.
        self.cv_evaluate_fname = os.path.join(
            evaluate_dir, "crossv10_{}.csv".format(time.time()))
        self.setting_file_list = setting_file_list
コード例 #10
0
    def __init__(self, logger=None):
        """Accumulate per-fold evaluation metrics across a cross-validation run.

        :param logger: optional logger; a "cv.log" file logger is created
            when none is supplied.
        """
        self.logger = logger if logger else File_Logger_Helper.get_logger(
            logger_fname="cv.log")

        # Per-fold bookkeeping: model ids plus raw metric values.
        self.mid_list = []
        self.roc_auc_list = []
        self.accuracy_list = []

        # One list per (averaging scheme, metric) pair, e.g. macro_f1_list.
        for scheme in ("micro", "macro", "weighted"):
            for metric in ("precision", "recall", "f1"):
                setattr(self, "{}_{}_list".format(scheme, metric), [])

        # Best F1 seen so far per averaging scheme, with the model id that
        # achieved it (None until the first fold is recorded).
        for scheme in ("macro", "micro", "weighted"):
            setattr(self, "best_{}_f1".format(scheme), None)
            setattr(self, "best_{}_f1_mid".format(scheme), None)
コード例 #11
0
    def __init__(self,
                 classifier_name="cnn",
                 num_words=10000,
                 max_text_len=1600,
                 embedding_vector_dimension=100,
                 glove_fname=os.path.join(config.WORD_EMBEDDING_DIR,
                                          'glove.6B.100d.txt'),
                 data_name="data",
                 feature_name="f1.f2",
                 target_name="t",
                 num_class=1,
                 kernel_initializer='glorot_uniform',
                 num_lstm_layer=5,
                 drop_perc=0.1,
                 learning_rate=1e-3,
                 weight_decate_rate=0.7,
                 l2_constraint=0,
                 batch_size=100,
                 epochs=10,
                 logger=None):
        """Configure the CNN text classifier and its embedding front end,
        then restore a previously saved model if one exists.

        :param classifier_name: label used when naming/loading saved models.
        :param num_words: vocabulary size for the tokenizer/embedding.
        :param max_text_len: padded sequence length (stored as num_steps).
        :param embedding_vector_dimension: requested word-vector size; it is
            overridden by the loaded GloVe file's dimension when they differ.
        :param glove_fname: path to a GloVe text file, or None to use a plain
            token-vector embedding instead.
        :param data_name: dataset identifier baked into the embedding name.
        :param feature_name: feature identifier baked into the embedding name.
        :param target_name: target identifier baked into the embedding name.
        :param num_class: number of output classes.
        :param kernel_initializer: weight-initializer name.
        :param num_lstm_layer: number of LSTM layers.
        :param drop_perc: dropout fraction.
        :param learning_rate: optimizer learning rate.
        :param weight_decate_rate: decay rate (parameter name typo kept for
            backward compatibility; presumably "decay rate" — TODO confirm).
        :param l2_constraint: L2/max-norm constraint value (0 disables).
        :param batch_size: training batch size.
        :param epochs: training epoch count.
        :param logger: optional logger; a file logger is created when omitted.
        """

        self.logger = logger or File_Logger_Helper.get_logger(
            logger_fname="CNN.log")
        self.feature_preprocessing = Feature_Processing()
        self.classifier_name = classifier_name
        self.num_class = num_class
        self.kernel_initializer = kernel_initializer
        self.num_words = num_words
        self.num_steps = max_text_len
        self.num_lstm_layer = num_lstm_layer
        self.drop_perc = drop_perc
        self.learning_rate = learning_rate
        self.weight_decate_rate = weight_decate_rate
        # NOTE(review): weight_decay is hard-coded to 1e-4 regardless of the
        # weight_decate_rate argument above — confirm which one the training
        # code actually reads.
        self.weight_decay = 1e-4
        self.l2_constraint = l2_constraint
        self.batch_size = batch_size
        self.epochs = epochs
        self.model = None

        # Initial the embedding layer.
        if glove_fname is not None:

            self.embedding_helper = Word_Embedding(embedding_fname=glove_fname)
            # A dimension mismatch is only logged; the helper's value wins below.
            if embedding_vector_dimension != self.embedding_helper.embedding_vector_dimension:
                self.logger.error(
                    "Error, the embedding vector dimension should be {} instead of {}. Fix embedding_vector_dimension to {}"
                    .format(
                        self.embedding_helper.embedding_vector_dimension,
                        embedding_vector_dimension,
                        self.embedding_helper.embedding_vector_dimension,
                    ))

            self.embedding_vector_dimension = self.embedding_helper.embedding_vector_dimension
            # Embedding name encodes the GloVe file name plus data identifiers.
            self.embedding_name = "{}_{}_{}_{}".format(
                re.sub(r"\.txt", "_", os.path.basename(glove_fname)),
                data_name, feature_name, target_name)
        else:
            # If the embedding is not specified, we would use the plain token vector.
            self.embedding_helper = Word_Embedding()
            self.embedding_vector_dimension = embedding_vector_dimension
            self.embedding_name = "{}_{}_{}_{}".format("token_vector",
                                                       data_name, feature_name,
                                                       target_name)

        # NOTE(review): this passes the caller's embedding_vector_dimension,
        # not the possibly-corrected self.embedding_vector_dimension — confirm
        # this is intentional.
        preprocess_name = self.embedding_helper.generate_model_name(
            embedding_name=self.embedding_name,
            num_words=num_words,
            embedding_vector_dimension=embedding_vector_dimension,
            max_text_len=max_text_len)

        self.load_model_if_exists(classifier_name=classifier_name,
                                  preprocess_name=preprocess_name)