def run(options):
    logger = get_logger()

    validation_dataset = get_validation_dataset(options)
    validation_iterator = get_validation_iterator(options, validation_dataset)
    word2idx = validation_dataset['word2idx']
    embeddings = validation_dataset['embeddings']

    idx2word = {v: k for k, v in word2idx.items()}

    logger.info('Initializing model.')
    trainer = build_net(options, embeddings, validation_iterator)

    # Parse
    diora = trainer.net.diora

    # Monkey patch parsing specific methods.
    override_init_with_batch(diora)
    override_inside_hook(diora)

    # Turn off outside pass.
    trainer.net.diora.outside = False

    # Eval mode.
    trainer.net.eval()

    # Topk predictor.
    parse_predictor = CKY(net=diora, word2idx=word2idx)

    batches = validation_iterator.get_iterator(random_seed=options.seed)

    logger.info('Beginning to parse.')

    with torch.no_grad():
        for i, batch_map in enumerate(batches):
            sentences = batch_map['sentences']
            batch_size = sentences.shape[0]
            length = sentences.shape[1]

            # Rather than skipping, just log the trees
            # (they are trivially easy to find).
            if length <= 2:
                for j in range(batch_size):
                    example_id = batch_map['example_ids'][j]
                    tokens = sentences[j].tolist()
                    words = [idx2word[idx] for idx in tokens]
                    if length == 2:
                        o = dict(example_id=example_id, tree=(words[0], words[1]))
                    elif length == 1:
                        o = dict(example_id=example_id, tree=words[0])
                    print(json.dumps(o))
                continue

            trainer.step(batch_map, train=False, compute_loss=False)

            trees = parse_predictor.parse_batch(batch_map)

            for ii, tr in enumerate(trees):
                example_id = batch_map['example_ids'][ii]
                s = [idx2word[idx] for idx in sentences[ii].tolist()]
                tr = replace_leaves(tr, s)
                o = dict(example_id=example_id, tree=tr)
                print(json.dumps(o))
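# The trees returned by CKY.parse_batch are nested tuples whose leaves stand in
# for the sentence's tokens; replace_leaves is a repo helper that swaps those
# leaves for word strings. A minimal sketch of the assumed behavior, replacing
# leaves in left-to-right order (replace_leaves_sketch is hypothetical, not the
# repo's implementation):
def replace_leaves_sketch(tree, words):
    def helper(node, pos):
        if not isinstance(node, (tuple, list)):
            # Leaf: substitute the next word in left-to-right order.
            return words[pos], pos + 1
        children = []
        for child in node:
            child, pos = helper(child, pos)
            children.append(child)
        return tuple(children), pos
    new_tree, _ = helper(tree, 0)
    return new_tree

# Example: replace_leaves_sketch(((0, 1), 2), ['the', 'old', 'dog'])
# -> (('the', 'old'), 'dog')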
def run(options):
    logger = get_logger()

    validation_dataset = get_validation_dataset(options)
    validation_iterator = get_validation_iterator(options, validation_dataset)
    word2idx = validation_dataset['word2idx']
    embeddings = validation_dataset['embeddings']

    idx2word = {v: k for k, v in word2idx.items()}

    logger.info('Initializing model.')
    trainer = build_net(options, embeddings, validation_iterator)
    diora = trainer.net.diora

    tree_helper = TreeHelper(diora, word2idx)
    tree_helper.init(options)
    csv_helper = CSVHelper()

    # Eval mode.
    trainer.net.eval()

    batches = validation_iterator.get_iterator(random_seed=options.seed)

    meta_output_path = os.path.abspath(os.path.join(options.experiment_path, 'vectors.csv'))
    vec_output_path = os.path.abspath(os.path.join(options.experiment_path, 'vectors.npy'))

    logger.info('Beginning.')
    logger.info('Writing vectors to = {}'.format(vec_output_path))
    logger.info('Writing metadata to = {}'.format(meta_output_path))

    f_csv = open(meta_output_path, 'w')
    f_vec = open(vec_output_path, 'ab')
    csv_helper.write_header(f_csv)

    with torch.no_grad():
        for i, batch_map in tqdm(enumerate(batches)):
            sentences = batch_map['sentences']
            batch_size = sentences.shape[0]
            length = sentences.shape[1]

            # Skip very short sentences.
            if length <= 2:
                continue

            _ = trainer.step(batch_map, train=False, compute_loss=False)

            if options.parse_mode == 'all-spans':
                for ii in range(batch_size):
                    example_id = batch_map['example_ids'][ii]
                    for level in range(length):
                        size = level + 1
                        for pos in range(length - level):
                            # metadata
                            csv_helper.write_row(f_csv, collections.OrderedDict(
                                example_id=example_id,
                                position=str(pos),
                                size=str(size)
                            ))
                inside_vectors = diora.inside_h.view(-1, options.hidden_dim)
                outside_vectors = diora.outside_h.view(-1, options.hidden_dim)
            else:
                trees, spans = tree_helper.get_trees_for_batch(batch_map, options)

                batch_index = []
                cell_index = []
                offset_cache = diora.index.get_offset(length)

                for ii, sp_lst in enumerate(spans):
                    example_id = batch_map['example_ids'][ii]
                    for pos, size in sp_lst:
                        # metadata
                        csv_helper.write_row(f_csv, collections.OrderedDict(
                            example_id=example_id,
                            position=str(pos),
                            size=str(size)
                        ))
                        # for vectors
                        level = size - 1
                        cell = offset_cache[level] + pos
                        batch_index.append(ii)
                        cell_index.append(cell)

                inside_vectors = diora.inside_h[batch_index, cell_index]
                assert inside_vectors.shape == (len(batch_index), options.hidden_dim)
                outside_vectors = diora.outside_h[batch_index, cell_index]
                assert outside_vectors.shape == (len(batch_index), options.hidden_dim)

            vectors = np.concatenate([inside_vectors, outside_vectors], axis=1)
            np.savetxt(f_vec, vectors)

    f_csv.close()
    f_vec.close()
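# A hedged sketch of reading the dump back. Despite the .npy extension, the
# vectors are written with np.savetxt (appended text rows), so they are read
# with np.loadtxt rather than np.load. This assumes CSVHelper writes a plain
# comma-separated header (example_id, position, size); load_span_vectors is a
# hypothetical helper, not part of the repo.
def load_span_vectors(experiment_path, hidden_dim):
    import csv
    meta_path = os.path.join(experiment_path, 'vectors.csv')
    vec_path = os.path.join(experiment_path, 'vectors.npy')
    with open(meta_path) as f:
        rows = list(csv.DictReader(f))
    vectors = np.loadtxt(vec_path)
    # Each row is [inside ; outside] for one (example_id, position, size) span.
    assert vectors.shape == (len(rows), 2 * hidden_dim)
    inside, outside = vectors[:, :hidden_dim], vectors[:, hidden_dim:]
    return rows, inside, outside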
def run(options):
    logger = get_logger()

    validation_dataset = get_validation_dataset(options)
    validation_iterator = get_validation_iterator(options, validation_dataset)
    word2idx = validation_dataset['word2idx']
    embeddings = validation_dataset['embeddings']

    idx2word = {v: k for k, v in word2idx.items()}

    logger.info('Initializing model.')
    trainer = build_net(options, embeddings, validation_iterator)
    diora = trainer.net.diora

    # 1. Get all relevant phrase vectors.
    dtype = {
        'example_ids': 'list',
        'labels': 'list',
        'positions': 'list',
        'sizes': 'list',
        'phrases': 'list',
        'inside': 'torch',
        'outside': 'torch',
    }
    batch_recorder = BatchRecorder(dtype=dtype)

    # Eval mode.
    trainer.net.eval()

    batches = validation_iterator.get_iterator(random_seed=options.seed)

    logger.info('Beginning to embed phrases.')

    with torch.no_grad():
        for i, batch_map in enumerate(batches):
            sentences = batch_map['sentences']
            batch_size = sentences.shape[0]
            length = sentences.shape[1]

            # Skip very short examples.
            if length <= 2:
                continue

            _ = trainer.step(batch_map, train=False, compute_loss=False)

            entity_labels = batch_map['entity_labels']
            if len(entity_labels) == 0:
                continue

            try:
                batch_index, positions, sizes, labels = get_cell_index(entity_labels)
            except Exception:
                continue

            # Skip short phrases.
            batch_index = [x for x, y in zip(batch_index, sizes) if y >= 2]
            positions = [x for x, y in zip(positions, sizes) if y >= 2]
            labels = [x for x, y in zip(labels, sizes) if y >= 2]
            sizes = [y for y in sizes if y >= 2]

            cell_index = (batch_index, positions, sizes)

            batch_result = {}
            batch_result['example_ids'] = [batch_map['example_ids'][idx] for idx in cell_index[0]]
            batch_result['labels'] = labels
            batch_result['positions'] = cell_index[1]
            batch_result['sizes'] = cell_index[2]
            batch_result['phrases'] = get_many_phrases(sentences, *cell_index)
            batch_result['inside'] = get_many_cells(diora, diora.inside_h, *cell_index)
            batch_result['outside'] = get_many_cells(diora, diora.outside_h, *cell_index)

            batch_recorder.record(**batch_result)

    result = batch_recorder.get_flattened_result()

    # 2. Build an index of nearest neighbors.
    vectors = np.concatenate([result['inside'], result['outside']], axis=1)
    normalize_L2(vectors)

    index = Index(dim=vectors.shape[1])
    index.add(vectors)
    index.cache(vectors, options.k_candidates)

    # 3. Print a summary.
    example_ids = result['example_ids']
    phrases = result['phrases']
    labels = result['labels']

    assert len(example_ids) == len(phrases)
    assert len(example_ids) == vectors.shape[0]

    def stringify(phrase):
        return ' '.join([idx2word[idx] for idx in phrase])

    prec_1 = []
    prec_10 = []
    prec_100 = []

    for i in range(vectors.shape[0]):
        topk = []
        corr_lab = 0
        for j, score in index.topk(i, options.k_candidates):
            # Skip same example.
            if example_ids[i] == example_ids[j]:
                continue
            # Skip string match.
            if phrases[i] == phrases[j]:
                continue
            topk.append((j, score))
            corr_lab += 1. * (labels[i] == labels[j])
            if len(topk) == 1:
                prec_1.append(corr_lab)
            elif len(topk) == 10:
                prec_10.append(corr_lab)
            elif len(topk) == 100:
                prec_100.append(corr_lab)
            if len(topk) == options.k_top:
                break
        assert len(topk) == options.k_top, 'Did not find enough valid candidates.'

        # Print.
        # print('[query] example_id={} phrase={} lab={}'.format(
        #     example_ids[i], stringify(phrases[i]), labels[i]))
        # for rank, (j, score) in enumerate(topk[:2]):
        #     print('rank={} score={:.3f} example_id={} phrase={} lab={}'.format(
        #         rank, score, example_ids[j], stringify(phrases[j]), labels[j]))

    print(np.mean(prec_1), np.mean(prec_10) / 10)
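# The nearest-neighbor step relies on normalize_L2 from faiss: once each row is
# L2-normalized, inner product equals cosine similarity, so an inner-product
# index ranks neighbors by cosine. A minimal sketch assuming the repo's Index
# class wraps a flat faiss inner-product index (cosine_topk_sketch is
# hypothetical; the wrapper's internals are not shown here):
def cosine_topk_sketch(vectors, k):
    import faiss
    vectors = np.ascontiguousarray(vectors, dtype='float32')
    faiss.normalize_L2(vectors)                    # in-place row normalization
    index = faiss.IndexFlatIP(vectors.shape[1])    # inner product == cosine on unit vectors
    index.add(vectors)
    scores, neighbors = index.search(vectors, k)   # query the corpus against itself
    return scores, neighbors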
def run(options):
    logger = get_logger()

    validation_dataset = get_validation_dataset(options)
    validation_iterator = get_validation_iterator(options, validation_dataset)
    word2idx = validation_dataset['word2idx']
    embeddings = validation_dataset['embeddings']

    idx2word = {v: k for k, v in word2idx.items()}

    logger.info('Initializing model.')
    trainer = build_net(options, embeddings, validation_iterator)

    # Parse
    diora = trainer.net.diora

    # Monkey patch parsing specific methods.
    override_init_with_batch(diora)
    override_inside_hook(diora)

    # Turn off outside pass.
    trainer.net.diora.outside = False

    # Eval mode.
    trainer.net.eval()

    # Parse predictor.
    parse_predictor = CKY(net=diora, word2idx=word2idx)

    batches = validation_iterator.get_iterator(random_seed=options.seed)

    output_path = os.path.abspath(os.path.join(options.experiment_path, 'parse.jsonl'))

    logger.info('Beginning.')
    logger.info('Writing output to = {}'.format(output_path))

    f = open(output_path, 'w')

    with torch.no_grad():
        for i, batch_map in tqdm(enumerate(batches)):
            sentences = batch_map['sentences']
            batch_size = sentences.shape[0]
            length = sentences.shape[1]

            # Skip very short sentences.
            if length <= 2:
                continue

            _ = trainer.step(batch_map, train=False, compute_loss=False)

            trees = parse_predictor.parse_batch(batch_map)

            for ii, tr in enumerate(trees):
                example_id = batch_map['example_ids'][ii]
                s = [idx2word[idx] for idx in sentences[ii].tolist()]
                tr = replace_leaves(tr, s)
                if options.postprocess:
                    tr = postprocess(tr, s)
                o = collections.OrderedDict(example_id=example_id, tree=tr)
                f.write(json.dumps(o) + '\n')

    f.close()
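# A small sketch of consuming the parse.jsonl output: each line is a JSON
# object with an example_id and a nested tree whose leaves are word strings
# (tuples become lists after the json round-trip). to_brackets and
# print_parses are hypothetical helpers for pretty-printing, not repo code.
def to_brackets(tree):
    if isinstance(tree, str):
        return tree
    return '(' + ' '.join(to_brackets(child) for child in tree) + ')'

def print_parses(path):
    with open(path) as f:
        for line in f:
            o = json.loads(line)
            print(o['example_id'], to_brackets(o['tree']))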
def run(options):
    logger = get_logger()

    validation_dataset = get_validation_dataset(options)
    validation_iterator = get_validation_iterator(options, validation_dataset)
    word2idx = validation_dataset['word2idx']
    embeddings = validation_dataset['embeddings']

    idx2word = {v: k for k, v in word2idx.items()}

    logger.info('Initializing model.')
    trainer = build_net(options, embeddings, validation_iterator)
    diora = trainer.net.diora

    # 1. Get all relevant phrase vectors.
    dtype = {
        'example_ids': 'list',
        'labels': 'list',
        'positions': 'list',
        'sizes': 'list',
        'phrases': 'list',
        'inside': 'torch',
        'outside': 'torch',
    }
    batch_recorder = BatchRecorder(dtype=dtype)

    # Eval mode.
    trainer.net.eval()

    batches = validation_iterator.get_iterator(random_seed=options.seed)

    logger.info('Beginning to embed phrases.')

    strings = []

    with torch.no_grad():
        for i, batch_map in enumerate(batches):
            sentences = batch_map['sentences']
            length = sentences.shape[1]

            # Skip very short examples.
            if length <= 2:
                continue

            strings.extend([
                ' '.join([idx2word[idx] for idx in x])
                for x in sentences.numpy()
            ])

            trainer.step(batch_map, train=False, compute_loss=False)

            batch_result = {}
            batch_result['inside'] = diora.inside_h[:, -1]
            batch_result['outside'] = diora.outside_h[:, -1]

            batch_recorder.record(**batch_result)

    result = batch_recorder.get_flattened_result()

    # 2. Build an index of nearest neighbors.
    vectors = np.concatenate([result['inside'], result['outside']], axis=1)
    print(len(strings), vectors.shape)

    r = Reach(vectors, strings)

    for s in strings:
        print(s)
        print(r.most_similar(s))
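# Reach builds a vector space keyed by the sentence strings above. A
# dependency-free alternative sketch does the same nearest-sentence lookup with
# plain numpy cosine similarity over the concatenated inside/outside root
# vectors (most_similar_sketch is hypothetical, not the Reach API):
def most_similar_sketch(vectors, strings, query_idx, k=5):
    v = np.asarray(vectors, dtype='float32')
    v = v / np.linalg.norm(v, axis=1, keepdims=True)   # unit-normalize rows
    scores = v @ v[query_idx]                          # cosine against the query
    order = np.argsort(-scores)
    return [(strings[j], float(scores[j])) for j in order if j != query_idx][:k]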