Example 1
class LoadPolicy(object):
    def __init__(self, exp_dir, iter):
        model_dir = exp_dir + '/models'
        parser = argparse.ArgumentParser()
        params = json.loads(open(exp_dir + '/config.json').read())
        for key, val in params.items():
            parser.add_argument("-" + key, default=val)
        self.args = parser.parse_args()
        self.policy = Policy4Toyota(self.args)
        self.policy.load_weights(model_dir, iter)
        self.preprocessor = Preprocessor((self.args.obs_dim, ),
                                         self.args.obs_preprocess_type,
                                         self.args.reward_preprocess_type,
                                         self.args.obs_scale,
                                         self.args.reward_scale,
                                         self.args.reward_shift,
                                         gamma=self.args.gamma)
        # self.preprocessor.load_params(load_dir)

    @tf.function
    def run(self, obs):
        processed_obs = self.preprocessor.tf_process_obses(obs)
        action, logp = self.policy.compute_action(processed_obs[np.newaxis, :])
        return action[0]

    @tf.function
    def values(self, obs):
        processed_obs = self.preprocessor.tf_process_obses(obs)
        obj_v = self.policy.compute_obj_v(processed_obs)
        con_v = self.policy.compute_con_v(processed_obs)
        return obj_v, con_v
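A minimal usage sketch for this wrapper, assuming an experiment directory laid out as the constructor expects (a config.json file plus a models/ folder); the path, iteration number and observation below are placeholders:

# Hypothetical usage; the experiment path and iteration number are placeholders.
loaded = LoadPolicy('./results/experiment-01', 100000)     # parses config.json and restores weights
obs = np.zeros((loaded.args.obs_dim,), dtype=np.float32)   # dummy observation of the expected shape
action = loaded.run(obs)                                   # preprocess, batch and query the policy
obj_v, con_v = loaded.values(obs)                          # objective and constraint value estimates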
Example 2
        def _data_loader_fn():
            feats_preps = [
                Preprocessor(vocab, add_bos=False, add_eos=False)
                for vocab in feats_vocabs
            ]
            labels_prep = Preprocessor(labels_vocab,
                                       add_bos=False,
                                       add_eos=False)
            feats_readers = [TextFileReader(path) for path in feats_paths]
            labels_reader = TextFileReader(labels_path)

            feats_gen = [
                SentenceGenerator(reader,
                                  vocab,
                                  args.batch_size,
                                  max_length=args.max_len,
                                  preprocessor=prep,
                                  allow_residual=True) for reader, vocab, prep
                in zip(feats_readers, feats_vocabs, feats_preps)
            ]
            labels_gen = SentenceGenerator(
                labels_reader,
                labels_vocab,
                args.batch_size,
                max_length=args.max_len,
                preprocessor=labels_prep,
                allow_residual=True,
            )

            return feats_gen + [labels_gen]
Example 3
def main():
    a = input()
    b = output()
    prep1 = Preprocessor()
    i = 1
    c = weights()
    f = feature_set()
    while 1:
        try:
            with open('%d.txt' % i):
                inputdataset = a.input_from_file('%d.txt' % i)
                filter1 = prep1.to_lower_case(inputdataset)
                filter2 = prep1.stop_word_eliminate(filter1)
                filter3 = prep1.stem_word(filter2)
                b.write_to_file('out%i.txt' % i, filter3)
                f.all_features += filter3 + ' '
                i += 1
        except IOError:
            break

    f.get_tot_files(i-1)
    f.update_unique_features(f.all_features)

    for each_word in f.unique_features.split():
        c.ret_tot_freq().append(f.all_features.count(each_word))

    #j = 0
    #for each_word in f.unique_features.split():
    #    stdout.write(each_word + ' ' + str(c.ret_tot_freq()[j]) + '\n')
    #    j += 1

    c.update_term_freq_matrix(f.ret_tot_files())
    c.update_inverse_document_freq(f.ret_tot_files(),f.unique_features)
Example 4
    def __training_setup(self, input_data):
        """ Method to initialize all the sub models/objects used as part of the classifier model"""
        logger.info("Setting up model for classifier")
        # Get Data if provided

        self.preprocessor = Preprocessor(input_data)
        self.x_train, self.x_test, self.y_train, self.y_test = self.preprocessor.get_data()

        logger.info("Setting up Vectorizer")
        # Vectorizer
        if self.vectorizer_type == 'tfidf':
            self.vectorizer = TfidfLocVectorizer(max_feat=self.max_feat, maxdf=0.8,
                                                 mindf=15, n_gram_range=(1, 3))

        elif self.vectorizer_type == 'spacy':
            import spacy
            from utils.spacy_vectorizer import SpacyVectorTransformer
            nlp = spacy.load("en_core_web_md")
            self.vectorizer = SpacyVectorTransformer(nlp=nlp)
        else:
            raise ValueError("incorrect vectorizer_type, please use tfidf or spacy")
        # Balance the data
        if self.use_data_under_balancer:
            logger.info("Setting up Naive Balance the data")

            self.data_under_balancer = RandomUnderSampler(sampling_strategy=
                                                          {l: min(70, number - 1) for l, number in
                                                           self.y_test.value_counts().items()})

        logger.info("Run dimension reduction algorithm")
        self.dimension_reduction = TruncatedLocSVD(self.optimum_n_components, total_variance=0.8)

        logger.info("Setting up Classifier")
        # Classifier
        if self.classifier_type == 'xgb':
            self.classifier = XGBClassifier(colsample_bytree=0.7, learning_rate=0.05, max_depth=5,
                                            min_child_weight=11, n_estimators=1000, n_jobs=4,
                                            objective='multi:softprob',  # 'binary:multiclass' is not a valid XGBoost objective
                                            random_state=RANDOM_STATE, subsample=0.8)
        elif self.classifier_type == 'lgbm':
            params = {'num_leaves': 5,
                      'objective': 'multiclass',
                      'num_class': len(np.unique(self.y_train)),
                      'learning_rate': 0.01,
                      'max_depth': 5,
                      'random_state': RANDOM_STATE
                      }
            self.classifier = lgb.LGBMClassifier(**params)

        else:
            self.classifier = LogisticRegression(multi_class="multinomial",
                                                 class_weight='balanced',
                                                 solver='newton-cg',
                                                 max_iter=100)

        # MLFlow Config
        logger.info("Setting up MLFlow Config")
        mlflow.set_experiment('classifier-model')
Example 5
    def generate_STM(self):    

        preprocessor = Preprocessor()
        for sentence in self.__sentenceList:
            preprocessed_words = preprocessor.preprocess_sentence(sentence)
            sentence_weight = []
            for feature in self.tot_weight_dict().keys():
                if feature in preprocessed_words:
                    sentence_weight.append(self.__tot_weight_dict[feature])
                else:
                    sentence_weight.append(0)
                
            self.__sentenceWeight_dict[sentence] = sentence_weight
Example 6
class LoadPolicy(object):
    def __init__(self, exp_dir, iter):
        model_dir = exp_dir + '/models'
        parser = argparse.ArgumentParser()
        params = json.loads(open(exp_dir + '/config.json').read())
        for key, val in params.items():
            parser.add_argument("-" + key, default=val)
        self.args = parser.parse_args()
        env = CrossroadEnd2end(
            training_task=self.args.env_kwargs_training_task,
            num_future_data=self.args.env_kwargs_num_future_data)
        self.policy = Policy4Toyota(self.args)
        self.policy.load_weights(model_dir, iter)
        self.preprocessor = Preprocessor((self.args.obs_dim, ),
                                         self.args.obs_preprocess_type,
                                         self.args.reward_preprocess_type,
                                         self.args.obs_scale,
                                         self.args.reward_scale,
                                         self.args.reward_shift,
                                         gamma=self.args.gamma)
        # self.preprocessor.load_params(load_dir)
        init_obs = env.reset()
        self.run(init_obs)
        self.obj_value(init_obs)

    @tf.function
    def run(self, obs):
        processed_obs = self.preprocessor.np_process_obses(obs)
        action, _ = self.policy.compute_action(processed_obs[np.newaxis, :])
        return action[0]

    @tf.function
    def obj_value(self, obs):
        processed_obs = self.preprocessor.np_process_obses(obs)
        value = self.policy.compute_obj_v(processed_obs[np.newaxis, :])
        return value

    @tf.function
    def run_batch(self, obses):
        processed_obses = self.preprocessor.np_process_obses(obses)
        actions, _ = self.policy.compute_action(processed_obses)
        return actions

    @tf.function
    def obj_value_batch(self, obses):
        processed_obses = self.preprocessor.np_process_obses(obses)
        values = self.policy.compute_obj_v(processed_obses)
        return values
Example 7
class CharRNN(object):
    def __init__(self, weights, encoding, rnn_type, depth, hidden_size, softmax_temp=0.9, output_lim=144):
        self.preprocessor = Preprocessor()
        self.model = char_rnn.Model(weights, encoding, rnn_type, depth, hidden_size)
        self.output_lim = output_lim
        self.temperature = softmax_temp
        self.buffer = '\n'

    def set_input(self, text):
        if len(text) > 0:
            data = self.preprocessor.process_text(text, newline=False)
            if len(data) == 0 or data[-1] == '\n':
                data = '\n'
            for c in data:
                self.buffer = str(self.model.forward(c, self.temperature))
        else:
            self.buffer = self.buffer[-1]

    def get_output(self):
        for i in range(self.output_lim - 1):
            c = self.buffer[-1]
            self.buffer += str(self.model.forward(c, self.temperature))
            if self.buffer[-1] == '\n':
                break
        return self.buffer
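A hedged usage sketch for this class; the weights/encoding paths and the network hyperparameters below are placeholders:

# Hypothetical usage; file paths and hyperparameters are assumptions.
rnn = CharRNN('weights.npz', 'encoding.json', rnn_type='lstm', depth=2, hidden_size=256)
rnn.set_input('hello world')   # feed the seed text through the model character by character
print(rnn.get_output())        # sample until a newline or output_lim characters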
Example 8
 def __init__(self, exp_dir, iter):
     model_dir = exp_dir + '/models'
     parser = argparse.ArgumentParser()
     params = json.loads(open(exp_dir + '/config.json').read())
     for key, val in params.items():
         parser.add_argument("-" + key, default=val)
     self.args = parser.parse_args()
     self.policy = Policy4Toyota(self.args)
     self.policy.load_weights(model_dir, iter)
     self.preprocessor = Preprocessor((self.args.obs_dim, ),
                                      self.args.obs_preprocess_type,
                                      self.args.reward_preprocess_type,
                                      self.args.obs_scale,
                                      self.args.reward_scale,
                                      self.args.reward_shift,
                                      gamma=self.args.gamma)
Example 9
 def __init__(self, exp_dir, iter):
     model_dir = exp_dir + '/models'
     parser = argparse.ArgumentParser()
     params = json.loads(open(exp_dir + '/config.json').read())
     for key, val in params.items():
         parser.add_argument("-" + key, default=val)
     self.args = parser.parse_args()
     env = CrossroadEnd2end(training_task=self.args.env_kwargs_training_task,
                            num_future_data=self.args.env_kwargs_num_future_data)
     self.policy = Policy4Toyota(self.args)
     self.policy.load_weights(model_dir, iter)
     self.preprocessor = Preprocessor((self.args.obs_dim,), self.args.obs_preprocess_type, self.args.reward_preprocess_type,
                                      self.args.obs_scale, self.args.reward_scale, self.args.reward_shift,
                                      gamma=self.args.gamma)
     # self.preprocessor.load_params(load_dir)
     init_obs = env.reset()
     self.run_batch(init_obs[np.newaxis, :])
     self.obj_value_batch(init_obs[np.newaxis, :])
Example 10
    def run(self, model, log_dir):
        test_dataset = DATASETS[self._config["dataset"]["name"]](self._config["dataset"], 'test', loop=False)

        preprocessor = Preprocessor(self._config["preprocessor"])

        from evaluation import EVALUATORS
        evaluator = EVALUATORS[self._config["evaluator"]]

        gt_filenames = []
        prediction_filenames = []

        for img_filename, label_filename in test_dataset:
            prediction_filename = self.trainLabelToEvalFilename(label_filename, self._config["eval_dir"])

            # only run prediction if prediction image does not exist yet
            if not os.path.exists(prediction_filename):

                prediction_dir = os.path.dirname(prediction_filename)
                if not os.path.exists(prediction_dir):
                    os.makedirs(prediction_dir)

                img = imread(img_filename)

                assert img is not None

                # feed the image through the network
                x = preprocessor.process(img)
                y_pred = model.predict(np.asarray([x])).squeeze()
                y_label_pred = np.argmax(y_pred, axis=2)
                y_label_pred = np.asarray(y_label_pred, dtype=np.uint8)

                y_label_pred = resize(y_label_pred, (img.shape[1], img.shape[0]), interpolation=INTER_NEAREST)

                # store it in the eval folder
                imwrite(prediction_filename, y_label_pred)

            gt_filenames.append(label_filename)
            prediction_filenames.append(prediction_filename)

        evaluator.run(prediction_filenames, gt_filenames)
Example 11
 def summarize(self,input_path):
     dataset_preprocessor = Preprocessor()
     dataset_FeatureReducer = FeatureReducer()
     dataset_WeightsHandler = WeightsHandler()
     
     files = [f for f in os.listdir(input_path) if os.path.isfile(input_path + f)]
     
     preprocessed_list = dataset_preprocessor.preprocess(files, input_path)
     sentencelist = dataset_preprocessor.extract_sentences(files, input_path)
     
     dataset_WeightsHandler.set_preprocessed_list(preprocessed_list)
     dataset_WeightsHandler.set_sentence_list(sentencelist)
     dataset_WeightsHandler.update_totfreq_dict()
     dataset_WeightsHandler.replace_totfreq_dict(dataset_FeatureReducer.reduceFeatures(dataset_WeightsHandler.tot_freq_dict()))
     dataset_WeightsHandler.generate_inv_doc_freq_dict(preprocessed_list)      
     dataset_WeightsHandler.generate_tot_weight_dict()    
     dataset_WeightsHandler.generate_STM()
     
     vector_dict = dataset_WeightsHandler.sentence_weight_dict() #vector_dict[sentence]=vector
     
     dataset_FeatureReducer.remove_features_with_zero_weight(vector_dict)
     sentencelist_without_stopwords = dataset_preprocessor.remove_stop_words_from_sentencelist(sentencelist) 
     
     VectorSineRelationExtractor = SineRelationExtractor()
     sine_matrix = VectorSineRelationExtractor.extract_sine_similarity(vector_dict)
     
     synonym_assigner = SynonymAssigner()
     synonym_dict = synonym_assigner.assign_synonyms(sentencelist_without_stopwords)
 
     SentenceDissimilarityScorer = DissimilarityScorer()
     dissimilarity_matrix = SentenceDissimilarityScorer.assign_dissimilarity_score(synonym_dict, sentencelist_without_stopwords)
     final_score_matrix = SentenceDissimilarityScorer.multiply_sine(dissimilarity_matrix, sine_matrix)
     
     SentenceRanker = NodeRanker()
     scorelist_of_sentences= SentenceRanker.calculate_score_of_each_sentence(final_score_matrix)
     ranked_indices = SentenceRanker.rank_nodes(scorelist_of_sentences)
     
     for each_index in ranked_indices:
         print(sentencelist[each_index])
Example 12
    def _data_loader_fn():
        feats_preps = [Preprocessor(vocab) for vocab in feats_vocabs]
        feats_readers = [TextFileReader(path) for path in args.feats_path]

        feats_gen = [
            SentenceGenerator(reader,
                              vocab,
                              args.batch_size,
                              max_length=args.max_length,
                              preprocessor=prep,
                              allow_residual=True) for reader, vocab, prep in
            zip(feats_readers, feats_vocabs, feats_preps)
        ]

        return feats_gen
Example 13
    def run(self, model, log_dir):


        train_dataset = DATASETS[self._config["dataset"]["name"]](self._config["dataset"], 'train')
        val_dataset = DATASETS[self._config["dataset"]["name"]](self._config["dataset"], 'val')

        augmenter = Augmenter(self._config["augmenter"])
        preprocessor = Preprocessor(self._config["preprocessor"])

        from keras.callbacks import TensorBoard
        tensor_board = TensorBoard(log_dir=log_dir)

        from keras.optimizers import get as get_optimizer
        optimizer = get_optimizer(self._config["optimizer"])

        model.compile(optimizer=optimizer, loss="categorical_crossentropy", metrics=["categorical_accuracy"])

        train_data_producer = TrainDataProducer(config=self._config, dataset=train_dataset, augmenter=augmenter,
                                                preprocessor=preprocessor)
        valid_data_producer = ValidDataProducer(config=self._config, dataset=val_dataset, preprocessor=preprocessor)

        train_steps_per_epoch = train_dataset.num_samples() / self._config["batch_size"]
        val_steps_per_epoch = val_dataset.num_samples() / self._config["batch_size"]

        model.fit_generator(generator=train_data_producer,
                            steps_per_epoch=train_steps_per_epoch,
                            validation_data=valid_data_producer,
                            validation_steps=val_steps_per_epoch,
                            epochs=self._config["epochs"],
                            callbacks=[tensor_board])

        # save the model in the log directory
        import os
        trained_model_filename = os.path.join(log_dir, 'trained_model.h5')

        print("Saving trained model to %s" % trained_model_filename)

        # If the saving does not work, take a look at
        # https://github.com/keras-team/keras/issues/6766
        # and then upgrade keras!
        model.save(trained_model_filename)
Example 14
# df_without_undesired_words = remove_undesired_words(df_without_bot_posts)

# print("Row count after undesired words removal: ", len(df_without_undesired_words))

# output_filepath = OUTPUT_PATH + get_filename(original_data_path) + "[duplicates_bots_removed]" + FILE_EXTENSION

# os.makedirs(os.path.dirname(output_filepath), exist_ok=True)

# json.dump(df_without_undesired_words.to_dict(orient='records'), open(output_filepath, WRITE_MODE))

# print("Data without duplicates dumped to ", output_filepath)

data = np.array(original_data_frame[field_of_interest], dtype='object')

processor = Preprocessor(posCategories, lang, lemmatize_activated)

processed_data = processor.preprocess(data, stopwords_file)

print("Size of data after preprocessing: ", len(processed_data))

df_after_preprocessing = original_data_frame.assign(body=processed_data)

df_after_preprocessing = df_after_preprocessing[
    df_after_preprocessing['body'].map(lambda field: len(field)) > 0]

print(
    f'Row count after removal of rows with empty "{field_of_interest}" fields: {len(df_after_preprocessing)}'
)

output_filepath = OUTPUT_PATH + get_filename(
Example 15
    else:
        summary = (f'\n{classifier} Accuracy:\n' +
                   '  > Before preprocessing: N/A' +
                   '\n  > After preprocessing: {:0.2f}%'.format(
                       processedScore * 100) +
                   '\n  > Acceptable? {}'.format(acceptable))
    return summary


if __name__ == "__main__":
    predictor = Predictor(test_size=0.2,
                          random_state=27,
                          verbose=args.verbose,
                          save=args.save,
                          acceptance=args.acceptance)
    preprocessor = Preprocessor(verbose=args.verbose)
    samples = [1, 2, 3, 4, 5] if args.all else args.sample

    raw = []
    processed = []
    for i in samples:
        raw_res, processed_res = runTests(i)
        raw.append(raw_res)
        processed.append(processed_res)

    if not args.verbose:
        print('Format: [SVC, KNN, GNB, DTREE]\n')
        for i in range(len(raw)):
            print(f'Sample {samples[i]}:', f'\n  > Raw: {raw[i]}',
                  f'\n  > Processed: {processed[i]}\n')
Example 16
from utils.preprocessor import Preprocessor
import os
from sets.size import Size
from sets.intersections import Intersections
from sets.scorer import Scorer
from graphs.node_ranker import NodeRanker
from sets.distributed_ranks import RankDistributor



input_path = '/home/animesh/T-Sum/Data sets/Inception/'
files = [f for f in os.listdir(input_path) if os.path.isfile(input_path + f)]
prep = Preprocessor()
sentence_list = prep.extract_sentences(files, input_path)
preprocessed_words_in_each_sentence = []

for s in sentence_list:
    preprocessed_words_in_each_sentence.append(prep.preprocess_sentence(s)) 

size = Size()
intersections = Intersections()
scorer = Scorer()
ranker = NodeRanker()
rank_counter_in_0_to_1 = RankDistributor()

size_of_sets = size.calculate_size_of_set(preprocessed_words_in_each_sentence)
number_of_intersections_of_each_sentence = intersections.count_itersections_of_each_set(preprocessed_words_in_each_sentence)
scores = scorer.score_sentences(number_of_intersections_of_each_sentence, size_of_sets)

normalised_scores = scorer.normalise_score(scores)
distributed_ranks = rank_counter_in_0_to_1.distribute_ranks(normalised_scores)
Example 17
    parser.add_argument('-v',
                        '--test',
                        type=str,
                        help='Test dataset (filename, csv)', required=True)
    args = parser.parse_args()

    # Load data
    df_train = pd.read_csv(args.train, usecols=[1, 2])
    X_train_raw = df_train["question"].tolist()
    y_train_raw = df_train["intention"].tolist()
    df_test = pd.read_csv(args.test, usecols=[1, 2])
    X_test_raw = df_test["question"].tolist()
    y_test_raw = df_test["intention"].tolist()

    # Preprocessing
    preprocessor = Preprocessor()
    preprocessor.fit(X_train_raw, y_train_raw)
    X_train = preprocessor.build_sequence(X_train_raw)
    X_test = preprocessor.build_sequence(X_test_raw)
    y_train = preprocessor.label_transform(y_train_raw)
    y_test = preprocessor.label_transform(y_test_raw)

    # Intent classifier prediction
    params = {
        "batch_size": 64,
        "num_epochs": 1,
        "embedding_size": 32,
        "filter_sizes": [3, 4, 5],
        'num_filters': 258,
        "patience": 20,
        "dropout": 0.7,
Example 18
                        type=str,
                        default="../data/val.raw",
                        help="The path to output the validation data")
    parser.add_argument('--max_train_size', type=int, default=1e6)
    parser.add_argument('--max_val_size', type=int, default=0)
    args = parser.parse_args()

    if not (os.path.isfile(args.msg_path)):
        print("Downloading from gitter...")
        download_messages(args.gitter_token, args.chat_room, args.msg_path)

    with open(args.msg_path, 'r') as input:
        print("Loading messages form disk...")
        messages = json.load(input)

    preprocessor = Preprocessor()
    print("Preprocessing...")
    messages_ = []
    for idx, message in enumerate(messages):
        if "fromUser" in message:
            messages[idx]['text'] = preprocessor.process_text(message['text'])
            messages[idx]['fromUser']['username'] = preprocessor.process_text(
                message['fromUser']['username'], newline=False)
            messages_.append(message)
    messages = messages_

    encoder = Encoder()
    if not os.path.isfile(args.encoding_file):
        print("Generating encoding dictionary...")
        encoder.gen_dict(msg2txt(messages))
        encoder.save_enc_dict_json(path='../data/encoding.json')
Example 19
    #     labels.extend(subset_labels_tensor.cpu().detach().numpy())
    #     counter += 1

    # evaluator.evaluate(labels, outputs)

    if ("-predict" in sys.argv):
        # with open(conf.readValue("lstm_model_path"), "rb") as file:
        #     model = pickle.load(file)
        model = PolarityLSTM(embedding_dim, vocab_size, hidden_dim,
                             output_size, n_layers)
        model.load_state_dict(torch.load(conf.readValue("lstm_model_path")))

        model.eval()
        if ("-gpu" in sys.argv):
            model.cuda(device)
        prep = Preprocessor()

        index = sys.argv.index("-predict")

        text = sys.argv[index + 1]
        text = (prep.setText(text).correctSpelling()
                .setLemmatizeFlag().setStopWordsFlag().build())
        text = [text]

        vectorized_seqs = []
        for seq in text:
            vectorized_seqs.append([
                vocab_to_int.get(word, 1) for word in TOKENIZER.tokenize(seq)
            ])

        seq_lengths = torch.LongTensor(list(map(len, vectorized_seqs)))
Example 20
class ClassifierModel:
    """The model which helps in identifying the search term classes"""

    def __init__(self,
                 input_data=None,
                 tfidf_max_feat=500,
                 mlflow_local=False,
                 classifier_type: str = 'lgbm',
                 use_data_under_balancer: bool = False,
                 vectorizer_type: str = 'tfidf',
                 optimum_n_components: int = None,
                 pipeline=None,
                 mlflow_local_url='mlruns_new',
                 model_save_loc='Model_Save'):

        """
        Constructor for classifier
        :param input_data:
        :param tfidf_max_feat:
        :param mlflow_local:
        :param classifier_type:
        :param use_data_under_balancer:
        :param vectorizer_type:
        :param optimum_n_components:
        :param pipeline:
        :param mlflow_local_url:
        :param model_save_loc:
        """

        self.mlflow_local_url = mlflow_local_url
        self.model_save_loc = model_save_loc

        self.text_cleaner = CleanTextTransformer()
        self.classifier = None
        self.classifier_type = classifier_type
        self.vectorizer = None
        self.vectorizer_type = vectorizer_type
        self.pipeline = pipeline
        self.use_data_under_balancer = use_data_under_balancer
        self.data_under_balancer = None
        self.dimension_reduction = None
        self.optimum_n_components = optimum_n_components

        self.max_feat = tfidf_max_feat

        self.x_train = self.y_train = self.x_test = self.y_test = None
        self.input_data = input_data
        self.is_local = mlflow_local

    def __training_setup(self, input_data):
        """ Method to initialize all the sub models/objects used as part of the classifier model"""
        logger.info("Setting up model for classifier")
        # Get Data if provided

        self.preprocessor = Preprocessor(input_data)
        self.x_train, self.x_test, self.y_train, self.y_test = self.preprocessor.get_data()

        logger.info("Setting up Vectorizer")
        # Vectorizer
        if self.vectorizer_type == 'tfidf':
            self.vectorizer = TfidfLocVectorizer(max_feat=self.max_feat, maxdf=0.8,
                                                 mindf=15, n_gram_range=(1, 3))

        elif self.vectorizer_type == 'spacy':
            import spacy
            from utils.spacy_vectorizer import SpacyVectorTransformer
            nlp = spacy.load("en_core_web_md")
            self.vectorizer = SpacyVectorTransformer(nlp=nlp)
        else:
            raise ValueError("incorrect vectorizer_type, please use tfidf or spacy")
        # Balance the data
        if self.use_data_under_balancer:
            logger.info("Setting up Naive Balance the data")

            self.data_under_balancer = RandomUnderSampler(sampling_strategy=
                                                          {l: min(70, number - 1) for l, number in
                                                           self.y_test.value_counts().items()})

        logger.info("Run dimension reduction algorithm")
        self.dimension_reduction = TruncatedLocSVD(self.optimum_n_components, total_variance=0.8)

        logger.info("Setting up Classifier")
        # Classifier
        if self.classifier_type == 'xgb':
            self.classifier = XGBClassifier(colsample_bytree=0.7, learning_rate=0.05, max_depth=5,
                                            min_child_weight=11, n_estimators=1000, n_jobs=4,
                                            objective='multi:softprob',  # 'binary:multiclass' is not a valid XGBoost objective
                                            random_state=RANDOM_STATE, subsample=0.8)
        elif self.classifier_type == 'lgbm':
            params = {'num_leaves': 5,
                      'objective': 'multiclass',
                      'num_class': len(np.unique(self.y_train)),
                      'learning_rate': 0.01,
                      'max_depth': 5,
                      'random_state': RANDOM_STATE
                      }
            self.classifier = lgb.LGBMClassifier(**params)

        else:
            self.classifier = LogisticRegression(multi_class="multinomial",
                                                 class_weight='balanced',
                                                 solver='newton-cg',
                                                 max_iter=100)

        # MLFlow Config
        logger.info("Setting up MLFlow Config")
        mlflow.set_experiment('classifier-model')

    def train(self, train_x=None, train_y=None, grid_search=True, run_version=None):
        """ Method to train the model
        :param train_x: independent data to train model with
        :param train_y: dependent data to train model with
        :param grid_search: perform grid_search
        :param run_version: Load previous run_version for training if needed
        :return: None
        """
        self.__training_setup(self.input_data)

        logger.info("Training for search term classifier model")
        if not train_x:
            train_x = self.x_train
            train_y = self.y_train

        # Search for previous runs and get run_id if present
        logger.info("Searching for previous runs for given model type")
        df_runs = mlflow.search_runs(filter_string="tags.Model = '{0}'".format('XGB'))
        df_runs = df_runs.loc[~df_runs['tags.Version'].isna(), :] if 'tags.Version' in df_runs else pd.DataFrame()
        if not run_version:
            run_id = None
            load_prev = False
        else:
            try:
                run_id = df_runs.loc[df_runs['tags.Version'] == run_version, 'run_id'].iloc[0]
                load_prev = True
            except Exception as e:
                raise ValueError('run_id with version {0} not found'.format(run_version))
        run_version = len(df_runs) + 1

        # Start the MLFlow Run and train the model
        logger.info("Starting MLFlow run to train model")
        with mlflow.start_run(run_id=run_id):
            # Build pipeline. Load previous pipeline if needed
            if load_prev:
                artifact_uri = mlflow.get_artifact_uri(self.model_save_loc)
                try:
                    load_pipeline = mlflow.sklearn.load_model(artifact_uri)
                    self.pipeline = load_pipeline
                except Exception as e:
                    raise ValueError("Existing model not found / couldn't be loaded.\n" + str(e))
            else:
                if self.use_data_under_balancer:
                    self.pipeline = Pipeline([('clean_text', self.text_cleaner),
                                              (self.vectorizer_type, self.vectorizer),
                                              ('balancer', self.data_under_balancer),
                                              ('dimension_reduction', self.dimension_reduction),
                                              (self.classifier_type, self.classifier)])
                else:
                    self.pipeline = Pipeline([('clean_text', self.text_cleaner),
                                              (self.vectorizer_type, self.vectorizer),
                                              ('dimension_reduction', self.dimension_reduction),
                                              (self.classifier_type, self.classifier)])
                # Todo: Grid Search for LGBM
                if grid_search:
                    xgb_parameters = {
                        'clf__n_jobs': [4],
                        'clf__objective': ['multi:softprob'],  # 'multiclass' is not a valid XGBoost objective
                        'clf__learning_rate': [0.05],
                        'clf__max_depth': [6, 12, 18],
                        'clf__min_child_weight': [11, 13, 15],
                        'clf__subsample': [0.7, 0.8],
                        'clf__colsample_bytree': [0.6, 0.7],
                        'clf__n_estimators': [5, 50, 100, 1000],
                        'clf__missing': [-999],
                        'clf__random_state': [RANDOM_STATE]
                    }
                    if self.use_data_under_balancer:
                        xgb_pipeline = Pipeline([('clean_text', self.text_cleaner),
                                                 (self.vectorizer_type, self.vectorizer),
                                                 ('balancer', self.data_under_balancer),
                                                 ('dimension_reduction', self.dimension_reduction),
                                                 ('clf', XGBClassifier())])
                    else:
                        self.pipeline = Pipeline([('clean_text', self.text_cleaner),
                                                  (self.vectorizer_type, self.vectorizer),
                                                  ('dimension_reduction', self.dimension_reduction),
                                                  (self.classifier_type, self.classifier)])

                    self.pipeline = GridSearchCV(xgb_pipeline, xgb_parameters, n_jobs=1, verbose=2, refit=True,
                                                 cv=StratifiedKFold(n_splits=3, shuffle=True))

            # Train the model
            self.pipeline.fit(train_x, train_y)
            logger.info("train is done")
            train_pred = self.pipeline.predict(train_x)

            # read the dict with correct labels
            with open('data/relabeling_dict.yml', 'r') as f:
                relabeling_dict = yaml.safe_load(f)  # yaml.load without an explicit Loader is rejected by newer PyYAML
            labeling_dict = dict(map(reversed, relabeling_dict.items()))

            # classification report on train set
            df = pd.DataFrame(classification_report(train_y, train_pred, output_dict=True)).transpose()
            logger.info("test is done")
            # Save tags and model metrics
            logger.info("Training Complete. Logging results into MLFlow")

            mlflow.log_metric("insam_macro_f1", np.round(df.loc["macro avg", "f1-score"], 5))
            mlflow.log_metric("insam_weighted_f1", np.round(df.loc["weighted avg", "f1-score"], 5))
            df = df.reset_index()
            df.columns = ['category', 'precision', 'recall', 'f1-score', 'support']
            df.loc[:, 'category'] = df['category'].apply(lambda x: labeling_dict[int(x)] if x.isdigit() else x)
            df.to_csv("insam_full_report.csv")
            mlflow.log_artifact("insam_full_report.csv")
            os.remove("insam_full_report.csv")
            # Log params
            if self.classifier_type in ('lgbm', 'xgb'):
                if grid_search:
                    mlflow.log_param("Best Params", self.pipeline.best_params_)
                    mlflow.log_param("Best Score", self.pipeline.best_score_)
                else:
                    params = self.classifier.get_xgb_params() if self.classifier_type == 'xgb' \
                        else self.classifier.get_params()
                    for key in params:
                        mlflow.log_param(key, params[key])
            else:
                mlflow.log_param('class_weight', 'balanced')
                mlflow.log_param('solver', 'newton-cg')
                mlflow.log_param('max_iter', 100)

            if len(self.x_test):
                test_pred = self.pipeline.predict(self.x_test)
                # classification report on test set
                test_df = pd.DataFrame(classification_report(self.y_test, test_pred, output_dict=True)).transpose()

                mlflow.log_metric("macro_f1", np.round(test_df.loc["macro avg", "f1-score"], 5))
                mlflow.log_metric("weighted_f1", np.round(test_df.loc["weighted avg", "f1-score"], 5))
                test_df = test_df.reset_index()
                test_df.columns = ['category', 'precision', 'recall', 'f1-score', 'support']
                test_df.loc[:, 'category'] = test_df['category'].apply(lambda x: labeling_dict[int(x)] if x.isdigit() else x)
                test_df.to_csv("full_report.csv")
                mlflow.log_artifact("full_report.csv")
                os.remove("full_report.csv")

            mlflow.sklearn.log_model(self.pipeline, self.model_save_loc, serialization_format='pickle')
            mlflow.set_tag("Model", self.classifier_type)
            mlflow.set_tag("Version", run_version)
            logger.info("Model Trained and saved into MLFlow artifact location")

    def predict(self, data_x=None, proba=False):
        """ Method to use the model to predict
        :param data_x: input
        :param proba: result is probability
        :return:
        """
        logger.info("Predicting using classifier model")
        data_x = data_x.loc[:, 'search_term']

        if not proba:
            test_pred = self.pipeline.predict(data_x)

        else:
            dis_index = list(self.pipeline.classes_).index('category')
            test_pred = [x[dis_index] for x in self.pipeline.predict_proba(data_x)]
        return test_pred
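A minimal driving sketch for this class, assuming input_data is a labelled DataFrame in whatever layout Preprocessor.get_data() expects; the variable names below are illustrative:

# Hypothetical usage; the DataFrame layout is an assumption of this sketch.
model = ClassifierModel(input_data=labelled_df,
                        classifier_type='lgbm',
                        vectorizer_type='tfidf')
model.train(grid_search=False)       # builds the pipeline and logs the run to MLFlow
preds = model.predict(new_terms_df)  # predict() reads the 'search_term' column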
Example 21
 def __init__(self, weights, encoding, rnn_type, depth, hidden_size, softmax_temp=0.9, output_lim=144):
     self.preprocessor = Preprocessor()
     self.model = char_rnn.Model(weights, encoding, rnn_type, depth, hidden_size)
     self.output_lim = output_lim
     self.temperature = softmax_temp
     self.buffer = '\n'
Example 22
        'KNN_95_0426212533.pkl',
        'SVC_35_0426230110.pkl',
        'RF_97_0426211322.pkl',
        'RF_71_0426211600.pkl'
    ]

    models = []
    directory = os.path.join(os.getcwd(), 'models')
    for num, pickleFile in enumerate(pickleFiles, start=1):
        filePath = os.path.join(directory, f'Sample{num}', pickleFile)
        with open(filePath, 'rb') as file:
            print(f'Opening: {filePath}')
            models.append(pickle.load(file))

    tester = Tester()
    preprocessor = Preprocessor()
    for idx, model in enumerate(models, start=1):
        print(idx)
        logging.info('importing data')
        data = importTestData(idx)
        print(f'Data {idx}', data.head())
        # print(f'Data: {data_np}')
        # print(f'type: ', type(data_np))
        logging.info('cleaning data')
        data_mean_replacement = preprocessor.replaceMissingWithMean(data)
        # processed_data = preprocessor.select_features(data_mean_replacement)
        # newData = processed_data.to_numpy()
        # print(f'type: {type(processed_data)}')

        classifier = pickleFiles[idx - 1].split('_')[0]
        logging.info('sending to tester')
Example 23
                        help="The path to output the train data")
    parser.add_argument('--val_output', type=str, default="../data/val.raw",
                        help="The path to output the validation data")
    parser.add_argument('--max_train_size', type=int, default=1e6)
    parser.add_argument('--max_val_size', type=int, default=0)
    args = parser.parse_args()

    if not (os.path.isfile(args.msg_path)):
        print("Downloading from gitter...")
        download_messages(args.gitter_token, args.chat_room, args.msg_path)

    with open(args.msg_path, 'r') as input:
        print("Loading messages form disk...")
        messages = json.load(input)

    preprocessor = Preprocessor()
    print("Preprocessing...")
    for idx, message in enumerate(messages):
        messages[idx]['text'] = preprocessor.process_text(message['text'])
        messages[idx]['fromUser']['username'] = preprocessor.process_text(message['fromUser']['username'],
                                                                          newline=False)

    encoder = Encoder()
    if args.encoding_file is None:
        print("Generating encoding dictionary...")
        encoder.gen_dict(msg2txt(messages))
        encoder.save_enc_dict_json(path='../data/encoding.json')
        encoder.save_dec_dict_binary(path='../data/encoding.raw')
    else:
        print("Loading encoding dictionary from disk...")
        encoder.load_dict(args.encoding_file)
Example 24
def get_preprocessors(lang_in, data_cfg, model_cfg):
    preproc = Preprocessor(lang_in, data_cfg["train_set"], select_preprocessor_features(model_cfg, data_cfg))
    train_ldr = make_loader(lang_in, data_cfg["train_set"], preproc, batch_size)
    dev_ldr = make_loader(lang_in, data_cfg["dev_set"], preproc, batch_size)
    return preproc, train_ldr, dev_ldr
Example 25
def get_data(data_dir,
             source,
             target,
             source_train_path,
             target_train_path,
             source_extension,
             target_extension,
             height,
             width,
             batch_size,
             re=0,
             workers=8):

    dataset = DA(data_dir, source, target, source_train_path,
                 target_train_path, source_extension, target_extension)

    normalizer = T.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    source_num_classes = dataset.num_source_train_ids
    train_transformer = T.Compose([
        T.RandomSizedRectCrop(height, width),
        T.RandomHorizontalFlip(),
        T.ToTensor(),
        normalizer,
        T.RandomErasing(EPSILON=re),
    ])
    test_transformer = T.Compose([
        T.Resize((height, width), interpolation=3),
        T.ToTensor(),
        normalizer,
    ])
    source_train_loader = DataLoader(Preprocessor(
        dataset.source_train,
        root=osp.join(dataset.source_images_dir, dataset.source_train_path),
        transform=train_transformer),
                                     batch_size=batch_size,
                                     num_workers=0,
                                     shuffle=True,
                                     pin_memory=False,
                                     drop_last=True)
    target_train_loader = DataLoader(Preprocessor(
        dataset.target_train,
        root=osp.join(dataset.target_images_dir, dataset.target_train_path),
        transform=train_transformer),
                                     batch_size=batch_size,
                                     num_workers=0,
                                     shuffle=True,
                                     pin_memory=False,
                                     drop_last=True)
    # source_train_loader = DataLoader(
    #     UnsupervisedCamStylePreprocessor(dataset.source_train, root=osp.join(dataset.source_images_dir, dataset.source_train_path),
    #                                      camstyle_root=osp.join(dataset.source_images_dir, dataset.source_train_path),
    #                  transform=train_transformer),
    #     batch_size=batch_size, num_workers=0,
    #     shuffle=True, pin_memory=False, drop_last=True)
    # target_train_loader = DataLoader(
    #     UnsupervisedCamStylePreprocessor(dataset.target_train,
    #                                      root=osp.join(dataset.target_images_dir, dataset.target_train_path),
    #                                      camstyle_root=osp.join(dataset.target_images_dir,
    #                                                             dataset.target_train_camstyle_path),
    #                                      num_cam=dataset.target_num_cam, transform=train_transformer),
    #     batch_size=batch_size, num_workers=workers,
    #     shuffle=True, pin_memory=True, drop_last=True)
    query_loader = DataLoader(Preprocessor(dataset.query,
                                           root=osp.join(
                                               dataset.target_images_dir,
                                               dataset.query_path),
                                           transform=test_transformer),
                              batch_size=batch_size,
                              num_workers=workers,
                              shuffle=False,
                              pin_memory=True)
    gallery_loader = DataLoader(Preprocessor(dataset.gallery,
                                             root=osp.join(
                                                 dataset.target_images_dir,
                                                 dataset.gallery_path),
                                             transform=test_transformer),
                                batch_size=batch_size,
                                num_workers=workers,
                                shuffle=False,
                                pin_memory=True)
    return dataset, source_num_classes, source_train_loader, target_train_loader, query_loader, gallery_loader
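A hedged call sketch; the dataset root, domain names, file extensions and image size below are placeholders:

# Hypothetical usage; every argument value here is a placeholder.
(dataset, num_classes, source_train_loader, target_train_loader,
 query_loader, gallery_loader) = get_data(data_dir='./data',
                                          source='market1501',
                                          target='dukemtmc',
                                          source_train_path='bounding_box_train',
                                          target_train_path='bounding_box_train',
                                          source_extension='jpg',
                                          target_extension='jpg',
                                          height=256, width=128,
                                          batch_size=32)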
Example 26
import os
import subprocess
from utils.preprocessor import Preprocessor
from utils.test_set_splitter import main

DATA_FOLDER = "data"
AGGREGATED = "aggregated"
DATA_SET = "data_set"
TEST_SET = "test_set"
if __name__ == "__main__":
    listdir = os.listdir()
    if DATA_FOLDER not in listdir:
        subprocess.run(["./download_dataset.sh"])
    if AGGREGATED not in listdir:
        os.mkdir(AGGREGATED)
        Preprocessor.aggregateData()
    if DATA_SET not in listdir or TEST_SET not in listdir:
        main()
Example 27
logging.info(f'Is to remove stopwords? {remove_stopwords}')
logging.info(f'Is to remove POS categories? {remove_pos}')
logging.info(f'POS categories to keep: {posCategories}')

data_string = json.load(open(original_data_path, READ_MODE))
logging.info(f'Total of original documents: {len(data_string)}')

original_data_frame = pd.DataFrame.from_dict(data_string)

logging.info(original_data_frame.head())

data = np.array(original_data_frame[field_of_interest], dtype='object')

processor = Preprocessor(posCategories,
                         logger=logging.info,
                         language=lang,
                         lemmatize_activated=lemmatize_activated,
                         remove_pos=remove_pos,
                         remove_stopwords=remove_stopwords)

processed_data, stopwords = processor.preprocess(data, stopwords_file)
del data

logging.info(f'Size of data after preprocessing: {len(processed_data)}')

df_after_preprocessing = original_data_frame.assign(body=processed_data)

df_after_preprocessing = df_after_preprocessing[
    df_after_preprocessing['body'].map(lambda field: len(field)) > 0]

logging.info(
    f'Row count after removal of rows with empty "{field_of_interest}" fields: {len(df_after_preprocessing)}'
Example 28
def preprocessor():
    metadata_loader = MetadataLoader("D:\\shared/birdsong-recognition")
    return Preprocessor(metadata_loader)
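This reads like a pytest fixture (the @pytest.fixture decorator presumably sits just above the snippet); a hedged sketch of a test that would consume it:

# Hypothetical test; assumes the function above is registered as a pytest fixture.
def test_preprocessor_builds(preprocessor):
    assert preprocessor is not None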