def main(args):
    # Load the validation set from a JSONL file (one JSON object per line).
    with open(args.input_data_path) as f:
        valid = [json.loads(line) for line in f]

    logging.info('Collecting documents...')
    documents = [sample['text'] for sample in valid]

    logging.info('Collecting words in documents...')
    tokenizer = Tokenizer(lower=True)
    # 'words' is only needed when (re)building the embedding below.
    words = tokenizer.collect_words(documents)

    logging.info('Loading embedding...')
    # First run only: build the embedding from GloVe vectors and cache it.
    # embedding = Embedding("./glove.6B.300d.txt", words=words)
    # with open('./embedding.pkl', 'wb') as f:
    #     pickle.dump(embedding, f)
    with open('./embedding.pkl', 'rb') as f:
        embedding = pickle.load(f)
    tokenizer.set_vocab(embedding.vocab)

    logging.info('Creating valid dataset...')
    create_seq2seq_dataset(process_samples(tokenizer, valid),
                           'data.pkl',
                           tokenizer.pad_token_id)
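# The helpers used above (process_samples, create_seq2seq_dataset) are defined
# elsewhere in the project. As a rough, hypothetical sketch of the contract
# implied by this call site -- raw samples in, token-id records out -- it might
# look like the following; the 'summary' field and the encode() method are
# assumptions, not the project's actual API.
def _process_samples_sketch(tokenizer, samples):
    processed = []
    for sample in samples:
        processed.append({
            'id': sample.get('id'),
            'text': tokenizer.encode(sample['text']),                 # source token ids
            'summary': tokenizer.encode(sample.get('summary', '')),  # target token ids
        })
    return processed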
def __init__(self, topics_path, min_depth=0, max_depth=None, metadata=True,
             lemmatization=True, use_stop=True, pattern=None,
             exclude_pattern=None, **kwargs):
    super(TrecTopics, self).__init__(topics_path,
                                     dictionary={},
                                     metadata=metadata,
                                     min_depth=min_depth,
                                     max_depth=max_depth,
                                     pattern=pattern,
                                     exclude_pattern=exclude_pattern,
                                     lines_are_documents=True,
                                     **kwargs)
    self.topics = {}
    self.topics_vecs = None
    self.topic_row_maps = {}
    self.oov = {}
    self.tokenizer = Tokenizer(minimum_len=TOKEN_MIN_LEN,
                               maximum_len=TOKEN_MAX_LEN,
                               lowercase=True,
                               output_lemma=lemmatization,
                               use_stopwords=use_stop,
                               extra_stopwords=EXTRA_STOPWORDS)
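# Usage sketch (illustrative; the topics path below is hypothetical and the
# enclosing class body is not shown in this snippet). TrecTopics streams one
# topic per line (lines_are_documents=True) and tokenizes with lemmatization
# and stop-word filtering enabled by default.
def _build_topics_corpus_sketch(topics_path='./data/trec_topics.txt'):
    return TrecTopics(topics_path, lemmatization=True, use_stop=True)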
def main(args):
    with open(args.test_input) as f:
        test = [json.loads(line) for line in f]

    logging.info('Collecting documents...')
    documents = [sample['text'] for sample in test]

    logging.info('Collecting words in documents...')
    tokenizer = Tokenizer(lower=True)
    words = tokenizer.collect_words(documents)

    logging.info('Loading embedding...')
    with open(args.embedding_file, 'rb') as f:
        embedding = pickle.load(f)
    tokenizer.set_vocab(embedding.vocab)

    logging.info('Creating test dataset...')
    create_seq2seq_dataset(process_samples(tokenizer, test),
                           args.test_output,
                           tokenizer.pad_token_id)
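# Hedged sketch of the argument parsing this entry point appears to expect;
# the attribute names match the usage above, but positional layout and help
# strings are assumptions, not the project's actual CLI.
import argparse

def _parse_args_sketch():
    parser = argparse.ArgumentParser(
        description='Build the pickled seq2seq test dataset.')
    parser.add_argument('test_input', help='JSONL file with test samples')
    parser.add_argument('embedding_file', help='pickled Embedding object')
    parser.add_argument('test_output', help='output path for the dataset pickle')
    return parser.parse_args()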
def train_val(test_only=False):
    ''' Train on the training set, and validate on seen and unseen splits. '''
    setup()
    vocab = read_vocab(TRAIN_VOCAB)
    tok = Tokenizer(vocab=vocab, encoding_length=args.maxInput)
    feat_dict = read_img_features(features, test_only=test_only)

    if test_only:
        featurized_scans = None
        val_env_names = ['val_train_seen']
    else:
        # Scan ids are the prefix of each feature key ("<scan>_<viewpoint>").
        featurized_scans = set(key.split("_")[0] for key in feat_dict.keys())
        val_env_names = ['val_train_seen', 'val_seen', 'val_unseen']

    if not args.test_obj:
        print('Loading compact pano-caffe object features ... (~3 seconds)')
        import pickle as pkl
        with open('img_features/objects/pano_object_class.pkl', 'rb') as f_pc:
            pano_caffe = pkl.load(f_pc)
    else:
        pano_caffe = None

    train_env = R2RBatch(feat_dict, pano_caffe, batch_size=args.batchSize,
                         splits=['train'], tokenizer=tok)

    if args.submit:
        val_env_names.append('test')

    # Each validation split gets its own environment paired with an evaluator.
    from collections import OrderedDict
    val_envs = OrderedDict(
        (split,
         (R2RBatch(feat_dict, pano_caffe, batch_size=args.batchSize,
                   splits=[split], tokenizer=tok),
          Evaluation([split], featurized_scans, tok)))
        for split in val_env_names)

    if args.train == 'listener':
        train(train_env, tok, args.iters, val_envs=val_envs)
    elif args.train == 'validlistener':
        if args.beam:
            beam_valid(train_env, tok, val_envs=val_envs)
        else:
            valid(train_env, tok, val_envs=val_envs)
    elif args.train == 'speaker':
        train_speaker(train_env, tok, args.iters, val_envs=val_envs)
    elif args.train == 'validspeaker':
        valid_speaker(tok, val_envs)
    else:
        assert False, 'unknown args.train mode: %s' % args.train
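# Illustration (an assumption, not project code): downstream validation would
# typically walk val_envs as split name -> (environment, evaluator) pairs:
def _iter_val_envs_sketch(val_envs):
    for split, (env, evaluator) in val_envs.items():
        print('validation split:', split,
              type(env).__name__, type(evaluator).__name__)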
def _pre_process(self):
    self.model_file = os.path.join(self.model_dir, 'model.ckpt')
    self.meta_file = os.path.join(self.model_dir, 'model.ckpt.meta')
    var_file = os.path.join(self.model_dir, 'var.pkl')
    with open(var_file, 'rb') as f:
        self.var, self.config = pickle.load(f)

    # Overlay the saved config on top of the defaults.
    basic_config = config.basic_config()
    basic_config.__dict__.update(self.config)
    self.config = basic_config

    vocab_file = './data/vocab.txt'
    self.data_tools = Data(vocab_file, None, basic_config, logging)
    self.tokenizer = Tokenizer(logging)
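# Hedged sketch of the write side that would produce var.pkl (the loader above
# unpickles a (var, config-dict) tuple; the function name and call site here
# are hypothetical):
def _save_vars_sketch(model_dir, var, config_dict):
    import os
    import pickle
    with open(os.path.join(model_dir, 'var.pkl'), 'wb') as f:
        pickle.dump((var, config_dict), f)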
def main(args):
    # Read the test file (JSONL, one sample per line).
    with open(args.input_dataname) as f:
        test = [json.loads(line) for line in f]

    # Read the cached embedding built during preprocessing.
    with open(args.output_dir / 'embedding_tag.pkl', 'rb') as f:
        embedding = pickle.load(f)
    tokenizer = Tokenizer(lower=True)
    tokenizer.set_vocab(embedding.vocab)

    logging.info('Creating test dataset...')
    create_seq_tag_dataset(process_seq_tag_samples(tokenizer, test),
                           args.output_dir / 'test_tag.pkl',
                           tokenizer.pad_token_id)
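# Generic padding helper of the kind create_seq_tag_dataset presumably relies
# on: a sketch under the assumption that token and tag sequences are padded to
# a common length with tokenizer.pad_token_id (not the project's actual code).
def _pad_to_max_len_sketch(seqs, pad_id):
    max_len = max(len(seq) for seq in seqs)
    return [seq + [pad_id] * (max_len - len(seq)) for seq in seqs]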