Example #1
def main(args):

    # Load the validation set from a JSONL file (one JSON object per line)
    with open(args.input_data_path) as f:
        valid = [json.loads(line) for line in f]

    logging.info('Collecting documents...')
    documents = [sample['text'] for sample in valid]

    logging.info('Collecting words in documents...')
    tokenizer = Tokenizer(lower=True)
    words = tokenizer.collect_words(documents)

    logging.info('Loading embedding...')
    # One-time setup (commented out): build the embedding from GloVe vectors
    # restricted to the collected words, then cache it as a pickle for reuse.
    # embedding = Embedding("./glove.6B.300d.txt", words=words)
    # with open('./embedding.pkl', 'wb') as f:
    #     pickle.dump(embedding, f)
    with open('./embedding.pkl', 'rb') as f:
        embedding = pickle.load(f)
    tokenizer.set_vocab(embedding.vocab)

    logging.info('Creating valid dataset...')
    create_seq2seq_dataset(process_samples(tokenizer, valid), 'data.pkl',
                           tokenizer.pad_token_id)
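
Example #1 only defines main(args) and reads args.input_data_path; the entry point below is a minimal, hypothetical sketch of how such a script could be driven and is not part of the original snippet.

import argparse
import logging


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    # Hypothetical CLI wrapper; only input_data_path is implied by main() above.
    parser = argparse.ArgumentParser(description='Preprocess a JSONL file into a seq2seq dataset')
    parser.add_argument('input_data_path', help='path to the input .jsonl file')
    main(parser.parse_args())
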
Example #2
    def __init__(self,
                 topics_path,
                 min_depth=0,
                 max_depth=None,
                 metadata=True,
                 lemmatization=True,
                 use_stop=True,
                 pattern=None,
                 exclude_pattern=None,
                 **kwargs):
        super(TrecTopics, self).__init__(topics_path,
                                         dictionary={},
                                         metadata=metadata,
                                         min_depth=min_depth,
                                         max_depth=max_depth,
                                         pattern=pattern,
                                         exclude_pattern=exclude_pattern,
                                         lines_are_documents=True,
                                         **kwargs)

        self.topics = {}
        self.topics_vecs = None
        self.topic_row_maps = {}
        self.oov = {}

        self.tokenizer = Tokenizer(minimum_len=TOKEN_MIN_LEN,
                                   maximum_len=TOKEN_MAX_LEN,
                                   lowercase=True,
                                   output_lemma=lemmatization,
                                   use_stopwords=use_stop,
                                   extra_stopwords=EXTRA_STOPWORDS)
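
A brief, hypothetical instantiation of the class above; the topics path is a placeholder and only arguments from the constructor signature are used.

# Hypothetical usage; 'path/to/trec/topics' is a placeholder directory.
topics_corpus = TrecTopics('path/to/trec/topics',
                           lemmatization=True,
                           use_stop=True)
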
Example #3
def main(args):

    with open(args.test_input) as f:
        test = [json.loads(line) for line in f]

    logging.info('Collecting documents...')
    documents = [sample['text'] for sample in test]

    logging.info('Collecting words in documents...')
    tokenizer = Tokenizer(lower=True)
    words = tokenizer.collect_words(documents)

    logging.info('Loading embedding...')
    with open(args.embedding_file, 'rb') as f:
        embedding = pickle.load(f)

    tokenizer.set_vocab(embedding.vocab)

    logging.info('Creating test dataset...')
    create_seq2seq_dataset(
        process_samples(tokenizer, test),
        args.test_output,
        tokenizer.pad_token_id
    )
Example #4
def train_val(test_only=False):
    ''' Train on the training set, and validate on seen and unseen splits. '''
    setup()
    vocab = read_vocab(TRAIN_VOCAB)
    tok = Tokenizer(vocab=vocab, encoding_length=args.maxInput)

    feat_dict = read_img_features(features, test_only=test_only)

    if test_only:
        featurized_scans = None
        val_env_names = ['val_train_seen']
    else:
        featurized_scans = {key.split("_")[0] for key in feat_dict}
        val_env_names = ['val_train_seen', 'val_seen', 'val_unseen']

    if not args.test_obj:
        print('Loading compact pano-caffe object features ... (~3 seconds)')
        import pickle as pkl
        with open('img_features/objects/pano_object_class.pkl', 'rb') as f_pc:
            pano_caffe = pkl.load(f_pc)
    else:
        pano_caffe = None

    train_env = R2RBatch(feat_dict,
                         pano_caffe,
                         batch_size=args.batchSize,
                         splits=['train'],
                         tokenizer=tok)
    from collections import OrderedDict

    if args.submit:
        val_env_names.append('test')

    # Pair each validation split with its own batch environment and evaluator.
    val_envs = OrderedDict(
        (split, (R2RBatch(feat_dict,
                          pano_caffe,
                          batch_size=args.batchSize,
                          splits=[split],
                          tokenizer=tok),
                 Evaluation([split], featurized_scans, tok)))
        for split in val_env_names)

    if args.train == 'listener':
        train(train_env, tok, args.iters, val_envs=val_envs)
    elif args.train == 'validlistener':
        if args.beam:
            beam_valid(train_env, tok, val_envs=val_envs)
        else:
            valid(train_env, tok, val_envs=val_envs)
    elif args.train == 'speaker':
        train_speaker(train_env, tok, args.iters, val_envs=val_envs)
    elif args.train == 'validspeaker':
        valid_speaker(tok, val_envs)
    else:
        assert False
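
train_val() above reads several attributes from a module-level args object; the namespace below is a hypothetical stand-in listing just the attributes the function references, with placeholder values rather than the project's defaults.

from argparse import Namespace

# Hypothetical stand-in for the parsed CLI arguments; values are placeholders.
args = Namespace(
    maxInput=80,        # Tokenizer encoding length
    batchSize=64,       # R2RBatch batch size
    test_obj=False,     # when True, skip loading pano object features
    submit=False,       # when True, also evaluate on the 'test' split
    beam=False,         # use beam_valid() instead of valid()
    train='listener',   # listener | validlistener | speaker | validspeaker
    iters=80000,        # training iterations
)
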
Example #5
    def _pre_process(self):
        self.model_file = os.path.join(self.model_dir, 'model.ckpt')
        self.meta_file = os.path.join(self.model_dir, 'model.ckpt.meta')
        var_file = os.path.join(self.model_dir, 'var.pkl')
        with open(var_file, 'rb') as f:
            self.var, self.config = pickle.load(f)
        basic_config = config.basic_config()
        basic_config.__dict__.update(self.config)
        self.config = basic_config
        vocab_file = './data/vocab.txt'
        self.data_tools = Data(vocab_file, None, basic_config, logging)
        self.tokenizer = Tokenizer(logging)
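
_pre_process() only resolves the checkpoint paths and merges the pickled config; restoring the model itself is not shown. The method below is a hypothetical companion, assuming a TensorFlow 1.x-style checkpoint (model.ckpt / model.ckpt.meta).

    def _load_session(self):
        # Hypothetical follow-up to _pre_process(): rebuild the graph from the
        # exported meta file and restore the checkpointed weights.
        import tensorflow as tf
        tf.compat.v1.disable_eager_execution()  # needed when running under TF2
        self.sess = tf.compat.v1.Session()
        saver = tf.compat.v1.train.import_meta_graph(self.meta_file)
        saver.restore(self.sess, self.model_file)
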
Example #6
def main(args):
    # Read test file
    with open(args.input_dataname) as f:
        test = [json.loads(line) for line in f]
    # Read the cached embedding (output_dir is expected to be a pathlib.Path)
    with open(args.output_dir / 'embedding_tag.pkl', 'rb') as f:
        embedding = pickle.load(f)

    tokenizer = Tokenizer(lower=True)
    tokenizer.set_vocab(embedding.vocab)

    logging.info('Creating test dataset...')
    create_seq_tag_dataset(process_seq_tag_samples(tokenizer, test),
                           args.output_dir / 'test_tag.pkl',
                           tokenizer.pad_token_id)
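
Since main() joins args.output_dir with the / operator, output_dir is presumably a pathlib.Path; the wrapper below is a hypothetical entry point matching the two argument names read above.

import argparse
import logging
from pathlib import Path


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    # Hypothetical CLI wrapper; argument names follow the attributes read in main().
    parser = argparse.ArgumentParser(description='Build the sequence-tagging test set')
    parser.add_argument('input_dataname', help='path to the test .jsonl file')
    parser.add_argument('output_dir', type=Path,
                        help='directory holding embedding_tag.pkl; test_tag.pkl is written here')
    main(parser.parse_args())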