Example #1
        def _data_loader_fn():
            feats_preps = [
                Preprocessor(vocab, add_bos=False, add_eos=False)
                for vocab in feats_vocabs
            ]
            labels_prep = Preprocessor(labels_vocab,
                                       add_bos=False,
                                       add_eos=False)
            feats_readers = [TextFileReader(path) for path in feats_paths]
            labels_reader = TextFileReader(labels_path)

            feats_gen = [
                SentenceGenerator(reader,
                                  vocab,
                                  args.batch_size,
                                  max_length=args.max_len,
                                  preprocessor=prep,
                                  allow_residual=True) for reader, vocab, prep
                in zip(feats_readers, feats_vocabs, feats_preps)
            ]
            labels_gen = SentenceGenerator(
                labels_reader,
                labels_vocab,
                args.batch_size,
                max_length=args.max_len,
                preprocessor=labels_prep,
                allow_residual=True,
            )

            return feats_gen + [labels_gen]
Example #2

    def __training_setup(self, input_data):
        """Initialize all the sub-models/objects used as part of the classifier model."""
        logger.info("Setting up model for classifier")
        # Get Data if provided

        self.preprocessor = Preprocessor(input_data)
        self.x_train, self.x_test, self.y_train, self.y_test = self.preprocessor.get_data()

        logger.info("Setting up Vectorizer")
        # Vectorizer
        if self.vectorizer_type == 'tfidf':
            self.vectorizer = TfidfLocVectorizer(max_feat=self.max_feat, maxdf=0.8,
                                                 mindf=15, n_gram_range=(1, 3))

        elif self.vectorizer_type == 'spacy':
            import spacy
            from utils.spacy_vectorizer import SpacyVectorTransformer
            nlp = spacy.load("en_core_web_md")
            self.vectorizer = SpacyVectorTransformer(nlp=nlp)
        else:
            raise ValueError("incorrect vectorizer_type, please use tfidf or spacy")
        # Balance the data
        if self.use_data_under_balancer:
            logger.info("Setting up Naive Balance the data")

            self.data_under_balancer = RandomUnderSampler(
                sampling_strategy={label: min(70, count - 1)
                                   for label, count in self.y_test.value_counts().items()})

        logger.info("Run dimension reduction algorithm")
        self.dimension_reduction = TruncatedLocSVD(self.optimum_n_components, total_variance=0.8)

        logger.info("Setting up Classifier")
        # Classifier
        if self.classifier_type == 'xgb':
            self.classifier = XGBClassifier(colsample_bytree=0.7, learning_rate=0.05, max_depth=5,
                                            min_child_weight=11, n_estimators=1000, n_jobs=4,
                                            objective='multi:softprob', random_state=RANDOM_STATE, subsample=0.8)
        elif self.classifier_type == 'lgbm':
            params = {'num_leaves': 5,
                      'objective': 'multiclass',
                      'num_class': len(np.unique(self.y_train)),
                      'learning_rate': 0.01,
                      'max_depth': 5,
                      'random_state': RANDOM_STATE
                      }
            self.classifier = lgb.LGBMClassifier(**params)

        else:
            self.classifier = LogisticRegression(multi_class="multinomial",
                                                 class_weight='balanced',
                                                 solver='newton-cg',
                                                 max_iter=100)

        # MLFlow Config
        logger.info("Setting up MLFlow Config")
        mlflow.set_experiment('classifier-model')
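
The method above only constructs the components; a minimal sketch of how they might be wired together afterwards, assuming the custom vectorizer and reducer follow the scikit-learn fit_transform/transform conventions (illustrative only, not the project's actual training code):

        # Illustrative wiring, not part of the original class:
        # vectorize -> optionally undersample -> reduce dimensionality -> fit classifier
        x_train_vec = self.vectorizer.fit_transform(self.x_train)
        y_train = self.y_train
        if self.use_data_under_balancer:
            # imblearn's RandomUnderSampler resamples features and labels together
            x_train_vec, y_train = self.data_under_balancer.fit_resample(x_train_vec, y_train)
        x_train_red = self.dimension_reduction.fit_transform(x_train_vec)
        self.classifier.fit(x_train_red, y_train)

        # Evaluate on the held-out split prepared by the Preprocessor
        x_test_red = self.dimension_reduction.transform(self.vectorizer.transform(self.x_test))
        test_accuracy = self.classifier.score(x_test_red, self.y_test)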
Example #3
    def _data_loader_fn():
        feats_preps = [Preprocessor(vocab) for vocab in feats_vocabs]
        feats_readers = [TextFileReader(path) for path in args.feats_path]

        feats_gen = [
            SentenceGenerator(reader,
                              vocab,
                              args.batch_size,
                              max_length=args.max_length,
                              preprocessor=prep,
                              allow_residual=True) for reader, vocab, prep in
            zip(feats_readers, feats_vocabs, feats_preps)
        ]

        return feats_gen
Example #4

 def __init__(self, exp_dir, iter):
     model_dir = exp_dir + '/models'
     parser = argparse.ArgumentParser()
     params = json.loads(open(exp_dir + '/config.json').read())
     for key, val in params.items():
         parser.add_argument("-" + key, default=val)
     self.args = parser.parse_args()
     self.policy = Policy4Toyota(self.args)
     self.policy.load_weights(model_dir, iter)
     self.preprocessor = Preprocessor((self.args.obs_dim, ),
                                      self.args.obs_preprocess_type,
                                      self.args.reward_preprocess_type,
                                      self.args.obs_scale,
                                      self.args.reward_scale,
                                      self.args.reward_shift,
                                      gamma=self.args.gamma)
Example #5
 def __init__(self, exp_dir, iter):
     model_dir = exp_dir + '/models'
     parser = argparse.ArgumentParser()
     params = json.loads(open(exp_dir + '/config.json').read())
     for key, val in params.items():
         parser.add_argument("-" + key, default=val)
     self.args = parser.parse_args()
     env = CrossroadEnd2end(training_task=self.args.env_kwargs_training_task,
                            num_future_data=self.args.env_kwargs_num_future_data)
     self.policy = Policy4Toyota(self.args)
     self.policy.load_weights(model_dir, iter)
     self.preprocessor = Preprocessor((self.args.obs_dim,), self.args.obs_preprocess_type, self.args.reward_preprocess_type,
                                      self.args.obs_scale, self.args.reward_scale, self.args.reward_shift,
                                      gamma=self.args.gamma)
     # self.preprocessor.load_params(load_dir)
     init_obs = env.reset()
     self.run_batch(init_obs[np.newaxis, :])
     self.obj_value_batch(init_obs[np.newaxis, :])
Example #6
    def run(self, model, log_dir):


        train_dataset = DATASETS[self._config["dataset"]["name"]](self._config["dataset"], 'train')
        val_dataset = DATASETS[self._config["dataset"]["name"]](self._config["dataset"], 'val')

        augmenter = Augmenter(self._config["augmenter"])
        preprocessor = Preprocessor(self._config["preprocessor"])

        from keras.callbacks import TensorBoard
        tensor_board = TensorBoard(log_dir=log_dir)

        from keras.optimizers import get as get_optimizer
        optimizer = get_optimizer(self._config["optimizer"])

        model.compile(optimizer=optimizer, loss="categorical_crossentropy", metrics=["categorical_accuracy"])

        train_data_producer = TrainDataProducer(config=self._config, dataset=train_dataset, augmenter=augmenter,
                                                preprocessor=preprocessor)
        valid_data_producer = ValidDataProducer(config=self._config, dataset=val_dataset, preprocessor=preprocessor)

        train_steps_per_epoch = train_dataset.num_samples() / self._config["batch_size"]
        val_steps_per_epoch = val_dataset.num_samples() / self._config["batch_size"]

        model.fit_generator(generator=train_data_producer,
                            steps_per_epoch=train_steps_per_epoch,
                            validation_data=valid_data_producer,
                            validation_steps=val_steps_per_epoch,
                            epochs=self._config["epochs"],
                            callbacks=[tensor_board])

        # save the model in the log directory
        import os
        trained_model_filename = os.path.join(log_dir, 'trained_model.h5')

        print("Saving trained model to %s" % trained_model_filename)

        # If the saving does not work, take a look at
        # https://github.com/keras-team/keras/issues/6766
        # and then upgrade keras!
        model.save(trained_model_filename)
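
Reloading the saved file later follows the standard Keras round trip; a minimal sketch, assuming the model was saved without exotic custom layers (otherwise custom_objects must be passed to load_model):

        # e.g. in a separate evaluation script
        from keras.models import load_model
        restored_model = load_model(trained_model_filename)
        restored_model.summary()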
Example #7
    def run(self, model, log_dir):
        test_dataset = DATASETS[self._config["dataset"]["name"]](self._config["dataset"], 'test', loop=False)

        preprocessor = Preprocessor(self._config["preprocessor"])

        from evaluation import EVALUATORS
        evaluator = EVALUATORS[self._config["evaluator"]]

        gt_filenames = []
        prediction_filenames = []

        for img_filename, label_filename in test_dataset:
            prediction_filename = self.trainLabelToEvalFilename(label_filename, self._config["eval_dir"])

            # only run prediction if prediction image does not exist yet
            if not os.path.exists(prediction_filename):

                prediction_dir = os.path.dirname(prediction_filename)
                if not os.path.exists(prediction_dir):
                    os.makedirs(prediction_dir)

                img = imread(img_filename)

                assert img is not None

                # feed the image through the network
                x = preprocessor.process(img)
                y_pred = model.predict(np.asarray([x])).squeeze()
                y_label_pred = np.argmax(y_pred, axis=2)
                y_label_pred = np.asarray(y_label_pred, dtype=np.uint8)

                y_label_pred = resize(y_label_pred, (img.shape[1], img.shape[0]), interpolation=INTER_NEAREST)

                # store it in the eval folder
                imwrite(prediction_filename, y_label_pred)

            gt_filenames.append(label_filename)
            prediction_filenames.append(prediction_filename)

        evaluator.run(prediction_filenames, gt_filenames)
Example #8
                        type=str,
                        default="../data/val.raw",
                        help="The path to output the validation data")
    parser.add_argument('--max_train_size', type=int, default=1e6)
    parser.add_argument('--max_val_size', type=int, default=0)
    args = parser.parse_args()

    if not (os.path.isfile(args.msg_path)):
        print("Downloading from gitter...")
        download_messages(args.gitter_token, args.chat_room, args.msg_path)

    with open(args.msg_path, 'r') as input:
        print("Loading messages form disk...")
        messages = json.load(input)

    preprocessor = Preprocessor()
    print("Preprocessing...")
    messages_ = []
    for idx, message in enumerate(messages):
        if "fromUser" in message:
            messages[idx]['text'] = preprocessor.process_text(message['text'])
            messages[idx]['fromUser']['username'] = preprocessor.process_text(
                message['fromUser']['username'], newline=False)
            messages_.append(message)
    messages = messages_

    encoder = Encoder()
    if not os.path.isfile(args.encoding_file):
        print("Generating encoding dictionary...")
        encoder.gen_dict(msg2txt(messages))
        encoder.save_enc_dict_json(path='../data/encoding.json')
Example #9
def get_data(data_dir,
             source,
             target,
             source_train_path,
             target_train_path,
             source_extension,
             target_extension,
             height,
             width,
             batch_size,
             re=0,
             workers=8):

    dataset = DA(data_dir, source, target, source_train_path,
                 target_train_path, source_extension, target_extension)

    normalizer = T.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    source_num_classes = dataset.num_source_train_ids
    train_transformer = T.Compose([
        T.RandomSizedRectCrop(height, width),
        T.RandomHorizontalFlip(),
        T.ToTensor(),
        normalizer,
        T.RandomErasing(EPSILON=re),
    ])
    test_transformer = T.Compose([
        T.Resize((height, width), interpolation=3),
        T.ToTensor(),
        normalizer,
    ])
    source_train_loader = DataLoader(Preprocessor(
        dataset.source_train,
        root=osp.join(dataset.source_images_dir, dataset.source_train_path),
        transform=train_transformer),
                                     batch_size=batch_size,
                                     num_workers=0,
                                     shuffle=True,
                                     pin_memory=False,
                                     drop_last=True)
    target_train_loader = DataLoader(Preprocessor(
        dataset.target_train,
        root=osp.join(dataset.target_images_dir, dataset.target_train_path),
        transform=train_transformer),
                                     batch_size=batch_size,
                                     num_workers=0,
                                     shuffle=True,
                                     pin_memory=False,
                                     drop_last=True)
    # source_train_loader = DataLoader(
    #     UnsupervisedCamStylePreprocessor(dataset.source_train, root=osp.join(dataset.source_images_dir, dataset.source_train_path),
    #                                      camstyle_root=osp.join(dataset.source_images_dir, dataset.source_train_path),
    #                  transform=train_transformer),
    #     batch_size=batch_size, num_workers=0,
    #     shuffle=True, pin_memory=False, drop_last=True)
    # target_train_loader = DataLoader(
    #     UnsupervisedCamStylePreprocessor(dataset.target_train,
    #                                      root=osp.join(dataset.target_images_dir, dataset.target_train_path),
    #                                      camstyle_root=osp.join(dataset.target_images_dir,
    #                                                             dataset.target_train_camstyle_path),
    #                                      num_cam=dataset.target_num_cam, transform=train_transformer),
    #     batch_size=batch_size, num_workers=workers,
    #     shuffle=True, pin_memory=True, drop_last=True)
    query_loader = DataLoader(Preprocessor(dataset.query,
                                           root=osp.join(
                                               dataset.target_images_dir,
                                               dataset.query_path),
                                           transform=test_transformer),
                              batch_size=batch_size,
                              num_workers=workers,
                              shuffle=False,
                              pin_memory=True)
    gallery_loader = DataLoader(Preprocessor(dataset.gallery,
                                             root=osp.join(
                                                 dataset.target_images_dir,
                                                 dataset.gallery_path),
                                             transform=test_transformer),
                                batch_size=batch_size,
                                num_workers=workers,
                                shuffle=False,
                                pin_memory=True)
    return dataset, source_num_classes, source_train_loader, target_train_loader, query_loader, gallery_loader
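
A possible call site; the unpacking order mirrors the return statement above, while the dataset names, paths, and the per-batch tuple layout are placeholders and assumptions rather than values taken from this project:

    dataset, source_num_classes, source_train_loader, target_train_loader, query_loader, gallery_loader = get_data(
        data_dir='data', source='source_dataset', target='target_dataset',   # placeholder names
        source_train_path='bounding_box_train', target_train_path='bounding_box_train',
        source_extension='jpg', target_extension='jpg',
        height=256, width=128, batch_size=32, re=0.5, workers=4)

    for batch in source_train_loader:
        # The exact tuple layout depends on Preprocessor.__getitem__
        # (re-id codebases commonly yield img, fname, pid, camid).
        imgs = batch[0]
        break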
Example #10
# df_without_undesired_words = remove_undesired_words(df_without_bot_posts)

# print("Row count after undesired words removal: ", len(df_without_undesired_words))

# output_filepath = OUTPUT_PATH + get_filename(original_data_path) + "[duplicates_bots_removed]" + FILE_EXTENSION

# os.makedirs(os.path.dirname(output_filepath), exist_ok=True)

# json.dump(df_without_undesired_words.to_dict(orient='records'), open(output_filepath, WRITE_MODE))

# print("Data without duplicates dumped to ", output_filepath)

data = np.array(original_data_frame[field_of_interest], dtype='object')

processor = Preprocessor(posCategories, lang, lemmatize_activated)

processed_data = processor.preprocess(data, stopwords_file)

print("Size of data after preprocessing: ", len(processed_data))

df_after_preprocessing = original_data_frame.assign(body=processed_data)

df_after_preprocessing = df_after_preprocessing[
    df_after_preprocessing['body'].map(lambda field: len(field)) > 0]

print(
    f'Row count after removal of rows with empty "{field_of_interest}" fields: {len(df_after_preprocessing)}'
)

output_filepath = OUTPUT_PATH + get_filename(
Example #11
    else:
        summary = (f'\n{classifier} Accuracy:\n' +
                   '  > Before preprocessing: N/A' +
                   '\n  > After preprocessing: {:0.2f}%'.format(
                       processedScore * 100) +
                   '\n  > Acceptable? {}'.format(acceptable))
    return summary


if __name__ == "__main__":
    predictor = Predictor(test_size=0.2,
                          random_state=27,
                          verbose=args.verbose,
                          save=args.save,
                          acceptance=args.acceptance)
    preprocessor = Preprocessor(verbose=args.verbose)
    samples = [1, 2, 3, 4, 5] if args.all else args.sample

    raw = []
    processed = []
    for i in samples:
        raw_res, processed_res = runTests(i)
        raw.append(raw_res)
        processed.append(processed_res)

    if not args.verbose:
        print('Format: [SVC, KNN, GNB, DTREE]\n')
        for i in range(len(raw)):
            print(f'Sample {samples[i]}:', f'\n  > Raw: {raw[i]}',
                  f'\n  > Processed: {processed[i]}\n')
Example #12
def get_preprocessors(lang_in, data_cfg, model_cfg):
    preproc = Preprocessor(lang_in, data_cfg["train_set"], select_preprocessor_features(model_cfg, data_cfg))
    train_ldr = make_loader(lang_in, data_cfg["train_set"], preproc, batch_size)
    dev_ldr = make_loader(lang_in, data_cfg["dev_set"], preproc, batch_size)
    return preproc, train_ldr, dev_ldr
Example #13
 def __init__(self, weights, encoding, rnn_type, depth, hidden_size, softmax_temp=0.9, output_lim=144):
     self.preprocessor = Preprocessor()
     self.model = char_rnn.Model(weights, encoding, rnn_type, depth, hidden_size)
     self.output_lim = output_lim
     self.temperature = softmax_temp
     self.buffer = '\n'
Example #14
def preprocessor():
    metadata_loader = MetadataLoader("D:\\shared/birdsong-recognition")
    return Preprocessor(metadata_loader)
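
If this factory is registered as a pytest fixture (an assumption; no decorator is shown above), a test could consume it by parameter name, e.g.:

import pytest

@pytest.fixture(name="preprocessor")          # hypothetical registration
def preprocessor_fixture():
    metadata_loader = MetadataLoader("D:\\shared/birdsong-recognition")
    return Preprocessor(metadata_loader)

def test_preprocessor_is_constructed(preprocessor):
    # pytest injects the fixture value by matching the argument name
    assert preprocessor is not None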
Example #15
    #     labels.extend(subset_labels_tensor.cpu().detach().numpy())
    #     counter += 1

    # evaluator.evaluate(labels, outputs)

    if ("-predict" in sys.argv):
        # with open(conf.readValue("lstm_model_path"), "rb") as file:
        #     model = pickle.load(file)
        model = PolarityLSTM(embedding_dim, vocab_size, hidden_dim,
                             output_size, n_layers)
        model.load_state_dict(torch.load(conf.readValue("lstm_model_path")))

        model.eval()
        if ("-gpu" in sys.argv):
            model.cuda(device)
        prep = Preprocessor()

        index = sys.argv.index("-predict")

        text = sys.argv[index + 1]
        text = (prep.setText(text).correctSpelling()
                .setLemmatizeFlag().setStopWordsFlag().build())
        text = [text]

        vectorized_seqs = []
        for seq in text:
            vectorized_seqs.append([
                vocab_to_int.get(word, 1) for word in TOKENIZER.tokenize(seq)
            ])

        seq_lengths = torch.LongTensor(list(map(len, vectorized_seqs)))
logging.info(f'Is to remove stopwords? {remove_stopwords}')
logging.info(f'Is to remove POS categories? {remove_pos}')
logging.info(f'POS categories to keep: {posCategories}')

data_string = json.load(open(original_data_path, READ_MODE))
logging.info(f'Total of original documents: {len(data_string)}')

original_data_frame = pd.DataFrame.from_dict(data_string)

logging.info(original_data_frame.head())

data = np.array(original_data_frame[field_of_interest], dtype='object')

processor = Preprocessor(posCategories,
                         logger=logging.info,
                         language=lang,
                         lemmatize_activated=lemmatize_activated,
                         remove_pos=remove_pos,
                         remove_stopwords=remove_stopwords)

processed_data, stopwords = processor.preprocess(data, stopwords_file)
del data

logging.info(f'Size of data after preprocessing: {len(processed_data)}')

df_after_preprocessing = original_data_frame.assign(body=processed_data)

df_after_preprocessing = df_after_preprocessing[
    df_after_preprocessing['body'].map(lambda field: len(field)) > 0]

logging.info(
    f'Row count after removal of rows with empty "{field_of_interest}" fields: {len(df_after_preprocessing)}'