Example #1
# Imports used below; project-specific helpers (get_logger,
# get_validation_dataset, get_validation_iterator, build_net, CKY,
# override_init_with_batch, override_inside_hook, replace_leaves)
# are assumed to be in scope.
import json

import torch


def run(options):
    logger = get_logger()

    validation_dataset = get_validation_dataset(options)
    validation_iterator = get_validation_iterator(options, validation_dataset)
    word2idx = validation_dataset['word2idx']
    embeddings = validation_dataset['embeddings']

    idx2word = {v: k for k, v in word2idx.items()}

    logger.info('Initializing model.')
    trainer = build_net(options, embeddings, validation_iterator)

    # Parse
    diora = trainer.net.diora

    # Monkey-patch parsing-specific methods.
    override_init_with_batch(diora)
    override_inside_hook(diora)

    # Turn off outside pass.
    trainer.net.diora.outside = False
    # Eval mode.
    trainer.net.eval()

    # Topk predictor.
    parse_predictor = CKY(net=diora, word2idx=word2idx)
    batches = validation_iterator.get_iterator(random_seed=options.seed)

    logger.info('Beginning to parse.')

    with torch.no_grad():
        for i, batch_map in enumerate(batches):
            sentences = batch_map['sentences']
            batch_size = sentences.shape[0]
            length = sentences.shape[1]

            # Rather than skipping short sentences, emit their trees
            # directly (they are trivial to build).
            if length <= 2:
                # Use a separate loop variable so the enumerate index `i`
                # above is not shadowed.
                for b in range(batch_size):
                    example_id = batch_map['example_ids'][b]
                    tokens = sentences[b].tolist()
                    words = [idx2word[idx] for idx in tokens]
                    if length == 2:
                        o = dict(example_id=example_id,
                                 tree=(words[0], words[1]))
                    elif length == 1:
                        o = dict(example_id=example_id, tree=words[0])
                    print(json.dumps(o))
                continue

            trainer.step(batch_map, train=False, compute_loss=False)
            trees = parse_predictor.parse_batch(batch_map)

            for ii, tr in enumerate(trees):
                example_id = batch_map['example_ids'][ii]
                s = [idx2word[idx] for idx in sentences[ii].tolist()]
                tr = replace_leaves(tr, s)
                o = dict(example_id=example_id, tree=tr)

                print(json.dumps(o))
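
The parse predictor returns trees as nested tuples whose leaves are integer token positions; replace_leaves swaps those leaves for the actual words. A minimal sketch of such a helper, assuming leaves are consumed left to right (the real implementation may differ):

def replace_leaves(tree, words):
    # Walk the tree left to right, replacing each leaf with the word at the
    # corresponding position; returns a new tree of the same shape.
    def helper(node, pos):
        if not isinstance(node, (list, tuple)):
            return pos + 1, words[pos]
        children = []
        for child in node:
            pos, child = helper(child, pos)
            children.append(child)
        return pos, tuple(children)
    _, new_tree = helper(tree, 0)
    return new_tree
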
Example #2
# Imports used below; project-specific helpers (get_logger, build_net,
# TreeHelper, CSVHelper, ...) are assumed to be in scope.
import collections
import os

import numpy as np
import torch
from tqdm import tqdm


def run(options):
    logger = get_logger()

    validation_dataset = get_validation_dataset(options)
    validation_iterator = get_validation_iterator(options, validation_dataset)
    word2idx = validation_dataset['word2idx']
    embeddings = validation_dataset['embeddings']

    idx2word = {v: k for k, v in word2idx.items()}

    logger.info('Initializing model.')
    trainer = build_net(options, embeddings, validation_iterator)
    diora = trainer.net.diora
    tree_helper = TreeHelper(diora, word2idx)
    tree_helper.init(options)
    csv_helper = CSVHelper()

    # Eval mode.
    trainer.net.eval()

    batches = validation_iterator.get_iterator(random_seed=options.seed)

    meta_output_path = os.path.abspath(os.path.join(options.experiment_path, 'vectors.csv'))
    vec_output_path = os.path.abspath(os.path.join(options.experiment_path, 'vectors.npy'))

    logger.info('Beginning.')
    logger.info('Writing vectors to = {}'.format(vec_output_path))
    logger.info('Writing metadata to = {}'.format(meta_output_path))

    f_csv = open(meta_output_path, 'w')
    # NOTE: np.savetxt writes text rows despite the .npy extension; binary
    # append mode ('ab') lets each batch's vectors be appended in turn.
    f_vec = open(vec_output_path, 'ab')
    csv_helper.write_header(f_csv)

    with torch.no_grad():
        for i, batch_map in tqdm(enumerate(batches)):
            sentences = batch_map['sentences']
            batch_size = sentences.shape[0]
            length = sentences.shape[1]

            # Skip very short sentences.
            if length <= 2:
                continue

            _ = trainer.step(batch_map, train=False, compute_loss=False)

            if options.parse_mode == 'all-spans':
                for ii in range(batch_size):
                    example_id = batch_map['example_ids'][ii]
                    for level in range(length):
                        size = level + 1
                        for pos in range(length - level):
                            # metadata
                            csv_helper.write_row(f_csv,
                                collections.OrderedDict(
                                    example_id=example_id,
                                    position=str(pos),
                                    size=str(size)
                            ))
                # Keep every chart cell: flatten (batch, ncells, hidden_dim)
                # to (batch * ncells, hidden_dim).
                inside_vectors = diora.inside_h.view(-1, options.hidden_dim)
                outside_vectors = diora.outside_h.view(-1, options.hidden_dim)

            else:
                trees, spans = tree_helper.get_trees_for_batch(batch_map, options)

                batch_index = []
                cell_index = []
                offset_cache = diora.index.get_offset(length)

                for ii, sp_lst in enumerate(spans):
                    example_id = batch_map['example_ids'][ii]
                    for pos, size in sp_lst:
                        # metadata
                        csv_helper.write_row(f_csv,
                            collections.OrderedDict(
                                example_id=example_id,
                                position=str(pos),
                                size=str(size)
                        ))
                        # for vectors
                        level = size - 1
                        cell = offset_cache[level] + pos
                        batch_index.append(ii)
                        cell_index.append(cell)

                inside_vectors = diora.inside_h[batch_index, cell_index]
                assert inside_vectors.shape == (len(batch_index), options.hidden_dim)
                outside_vectors = diora.outside_h[batch_index, cell_index]
                assert outside_vectors.shape == (len(batch_index), options.hidden_dim)

            vectors = np.concatenate([inside_vectors, outside_vectors], axis=1)
            np.savetxt(f_vec, vectors)

    f_csv.close()
    f_vec.close()
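
The flat cell index used above, offset_cache[level] + pos, assumes the chart stores all spans of size 1 first, then size 2, and so on, with length - level cells at each level. A sketch of that offset computation under this assumed layout:

def get_offset(length):
    # Flat index of the first cell at each level: level l holds spans of
    # size l + 1, and there are length - l of them.
    offset = {}
    total = 0
    for level in range(length):
        offset[level] = total
        total += length - level
    return offset
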
Example #3
# Imports used below; BatchRecorder, Index, get_cell_index, get_many_phrases,
# get_many_cells, and normalize_L2 (presumably from faiss) are assumed to be
# in scope.
import numpy as np
import torch


def run(options):
    logger = get_logger()

    validation_dataset = get_validation_dataset(options)
    validation_iterator = get_validation_iterator(options, validation_dataset)
    word2idx = validation_dataset['word2idx']
    embeddings = validation_dataset['embeddings']

    idx2word = {v: k for k, v in word2idx.items()}

    logger.info('Initializing model.')
    trainer = build_net(options, embeddings, validation_iterator)
    diora = trainer.net.diora

    # 1. Get all relevant phrase vectors.

    dtype = {
        'example_ids': 'list',
        'labels': 'list',
        'positions': 'list',
        'sizes': 'list',
        'phrases': 'list',
        'inside': 'torch',
        'outside': 'torch',
    }
    batch_recorder = BatchRecorder(dtype=dtype)

    # Eval mode.
    trainer.net.eval()

    batches = validation_iterator.get_iterator(random_seed=options.seed)

    logger.info('Beginning to embed phrases.')

    with torch.no_grad():
        for i, batch_map in enumerate(batches):
            sentences = batch_map['sentences']
            batch_size = sentences.shape[0]
            length = sentences.shape[1]

            # Skip very short examples.
            if length <= 2:
                continue

            _ = trainer.step(batch_map, train=False, compute_loss=False)

            entity_labels = batch_map['entity_labels']
            if len(entity_labels) == 0:
                continue
            try:
                batch_index, positions, sizes, labels = get_cell_index(entity_labels)
            except Exception:
                # Skip batches whose labels cannot be mapped to chart cells.
                continue
            # Skip short phrases.
            batch_index = [x for x, y in zip(batch_index, sizes) if y >= 2]
            positions = [x for x, y in zip(positions, sizes) if y >= 2]
            labels = [x for x, y in zip(labels, sizes) if y >= 2]
            sizes = [y for y in sizes if y >= 2]

            cell_index = (batch_index, positions, sizes)

            batch_result = {}
            batch_result['example_ids'] = [batch_map['example_ids'][idx] for idx in cell_index[0]]
            batch_result['labels'] = labels
            batch_result['positions'] = cell_index[1]
            batch_result['sizes'] = cell_index[2]
            batch_result['phrases'] = get_many_phrases(sentences, *cell_index)
            batch_result['inside'] = get_many_cells(diora, diora.inside_h, *cell_index)
            batch_result['outside'] = get_many_cells(diora, diora.outside_h, *cell_index)

            batch_recorder.record(**batch_result)

    result = batch_recorder.get_flattened_result()

    # 2. Build an index of nearest neighbors.

    vectors = np.concatenate([result['inside'], result['outside']], axis=1)
    normalize_L2(vectors)

    index = Index(dim=vectors.shape[1])
    index.add(vectors)
    index.cache(vectors, options.k_candidates)

    # 3. Print a summary.

    example_ids = result['example_ids']
    phrases = result['phrases']
    labels = result['labels']

    assert len(example_ids) == len(phrases)
    assert len(example_ids) == vectors.shape[0]

    def stringify(phrase):
        return ' '.join([idx2word[idx] for idx in phrase])

    prec_1 = []
    prec_10 = []
    prec_100 = []
    for i in range(vectors.shape[0]):
        topk = []
        corr_lab = 0
        for j, score in index.topk(i, options.k_candidates):
            # Skip same example.
            if example_ids[i] == example_ids[j]:
                continue
            # Skip string match.
            if phrases[i] == phrases[j]:
                continue
            topk.append((j, score))

            corr_lab += 1. * (labels[i] == labels[j])

            if len(topk) == 1:
                prec_1.append(corr_lab)
            elif len(topk) == 10:
                prec_10.append(corr_lab)
            elif len(topk) == 100:
                prec_100.append(corr_lab)

            if len(topk) == options.k_top:
                break
        assert len(topk) == options.k_top, 'Did not find enough valid candidates.'

        # Print.
        # print('[query] example_id={} phrase={} lab={}'.format(
        #   example_ids[i], stringify(phrases[i]),labels[i]))
        # for rank, (j, score) in enumerate(topk[:2]):
        #   print('rank={} score={:.3f} example_id={} phrase={} lab={}'.format(
        #       rank, score, example_ids[j], stringify(phrases[j]), labels[j]))
    # prec_100 is collected above but only P@1 and P@10 are reported.
    print(np.mean(prec_1), np.mean(prec_10) / 10)
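
Index and normalize_L2 are nearest-neighbor utilities; normalize_L2 is plausibly faiss.normalize_L2, and below is a hedged sketch of an Index wrapper with the add/cache/topk protocol used above, built on FAISS inner-product search (cosine similarity once vectors are L2-normalized):

import faiss
import numpy as np

class Index(object):
    def __init__(self, dim):
        self.index = faiss.IndexFlatIP(dim)
        self.D, self.I = None, None

    def add(self, vectors):
        self.index.add(vectors.astype(np.float32))

    def cache(self, vectors, k):
        # Precompute the k nearest neighbors of every row of `vectors`.
        self.D, self.I = self.index.search(vectors.astype(np.float32), k)

    def topk(self, i, k):
        # Yield (neighbor_row, score) pairs for query row i, best first.
        for j, score in zip(self.I[i][:k], self.D[i][:k]):
            yield int(j), float(score)
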
Example #4
# Imports used below; project-specific helpers (CKY, postprocess,
# replace_leaves, ...) are assumed to be in scope.
import collections
import json
import os

import torch
from tqdm import tqdm


def run(options):
    logger = get_logger()

    validation_dataset = get_validation_dataset(options)
    validation_iterator = get_validation_iterator(options, validation_dataset)
    word2idx = validation_dataset['word2idx']
    embeddings = validation_dataset['embeddings']

    idx2word = {v: k for k, v in word2idx.items()}

    logger.info('Initializing model.')
    trainer = build_net(options, embeddings, validation_iterator)

    # Parse

    diora = trainer.net.diora

    # Monkey-patch parsing-specific methods.
    override_init_with_batch(diora)
    override_inside_hook(diora)

    # Turn off outside pass.
    trainer.net.diora.outside = False

    # Eval mode.
    trainer.net.eval()

    # Parse predictor.
    parse_predictor = CKY(net=diora, word2idx=word2idx)
    parse_predictor = CKY(net=diora, word2idx=word2idx)

    batches = validation_iterator.get_iterator(random_seed=options.seed)

    output_path = os.path.abspath(os.path.join(options.experiment_path, 'parse.jsonl'))

    logger.info('Beginning.')
    logger.info('Writing output to = {}'.format(output_path))

    f = open(output_path, 'w')

    with torch.no_grad():
        for i, batch_map in tqdm(enumerate(batches)):
            sentences = batch_map['sentences']
            batch_size = sentences.shape[0]
            length = sentences.shape[1]

            # Skip very short sentences.
            if length <= 2:
                continue

            _ = trainer.step(batch_map, train=False, compute_loss=False)

            trees = parse_predictor.parse_batch(batch_map)

            for ii, tr in enumerate(trees):
                example_id = batch_map['example_ids'][ii]
                s = [idx2word[idx] for idx in sentences[ii].tolist()]
                tr = replace_leaves(tr, s)
                if options.postprocess:
                    tr = postprocess(tr, s)
                o = collections.OrderedDict(example_id=example_id, tree=tr)

                f.write(json.dumps(o) + '\n')

    f.close()
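
Each line of parse.jsonl is a single JSON object, so the output can be consumed with a plain line-by-line reader, e.g.:

import json

with open('parse.jsonl') as f:
    for line in f:
        record = json.loads(line)
        print(record['example_id'], record['tree'])
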
Example #5
# Imports used below; Reach is presumably from the `reach` nearest-neighbor
# package; BatchRecorder and the other helpers are assumed to be in scope.
import numpy as np
import torch


def run(options):
    logger = get_logger()

    validation_dataset = get_validation_dataset(options)
    validation_iterator = get_validation_iterator(options, validation_dataset)
    word2idx = validation_dataset['word2idx']
    embeddings = validation_dataset['embeddings']

    idx2word = {v: k for k, v in word2idx.items()}

    logger.info('Initializing model.')
    trainer = build_net(options, embeddings, validation_iterator)
    diora = trainer.net.diora

    # 1. Get all relevant phrase vectors.
    dtype = {
        'example_ids': 'list',
        'labels': 'list',
        'positions': 'list',
        'sizes': 'list',
        'phrases': 'list',
        'inside': 'torch',
        'outside': 'torch',
    }
    batch_recorder = BatchRecorder(dtype=dtype)
    # Eval mode.
    trainer.net.eval()

    batches = validation_iterator.get_iterator(random_seed=options.seed)

    logger.info('Beginning to embed phrases.')

    strings = []
    with torch.no_grad():
        for i, batch_map in enumerate(batches):
            sentences = batch_map['sentences']
            length = sentences.shape[1]

            # Skip very short examples.
            if length <= 2:
                continue
            # Join with spaces so each sentence becomes a readable string key.
            strings.extend([
                " ".join([idx2word[idx] for idx in x])
                for x in sentences.numpy()
            ])
            trainer.step(batch_map, train=False, compute_loss=False)

            batch_result = {}
            # The last chart cell holds the root vectors spanning the
            # whole sentence.
            batch_result['inside'] = diora.inside_h[:, -1]
            batch_result['outside'] = diora.outside_h[:, -1]
            batch_recorder.record(**batch_result)

    result = batch_recorder.get_flattened_result()

    # 2. Build an index of nearest neighbors.
    vectors = np.concatenate([result['inside'], result['outside']], axis=1)
    # Sanity check: one string per row of vectors.
    print(len(strings), vectors.shape)
    r = Reach(vectors, strings)

    for s in strings:
        print(s)
        print(r.most_similar(s))
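
BatchRecorder (used here and in Example #3) accumulates named per-batch outputs and flattens them at the end. A minimal sketch consistent with the dtype spec above, assuming 'torch' entries are concatenated along the batch dimension and 'list' entries are simply extended:

import numpy as np
import torch

class BatchRecorder(object):
    def __init__(self, dtype):
        self.dtype = dtype
        self.results = {key: [] for key in dtype}

    def record(self, **kwargs):
        for key, value in kwargs.items():
            self.results[key].append(value)

    def get_flattened_result(self):
        out = {}
        for key, chunks in self.results.items():
            if not chunks:
                continue  # Declared in dtype but never recorded.
            if self.dtype[key] == 'torch':
                out[key] = torch.cat(chunks, dim=0).cpu().numpy()
            else:  # 'list'
                out[key] = [x for chunk in chunks for x in chunk]
        return out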