Example #1
def to_jsonl(input_file, outprefix, train_or_test):
    import random
    from kb.common import JsonFile

    with open(input_file, 'r') as fin:
        lines = fin.readlines()

    examples = []
    k = 0
    while k < len(lines):
        line = lines[k]
        ls = line.split('\t')
        try:
            sent_id = int(ls[0])
        except ValueError:
            sent_id = None

        if sent_id is not None and sent_id >= 1 and sent_id <= 10717:
            # a line with a train or test example;
            # strip the surrounding double quotes from the sentence
            sentence = ls[1].strip()[1:-1]

            # get the label
            k += 1
            label = lines[k].strip()

            example = {'sentence': sentence, 'label': label, 'sent_id': sent_id}
            examples.append(example)

        # go to the next line
        k += 1

    if train_or_test == 'train':
        # split into train / dev
        random.shuffle(examples)

        with JsonFile(outprefix + '/train.json', 'w') as fout:
            for example in examples[:7500]:
                fout.write(example)

        with JsonFile(outprefix + '/dev.json', 'w') as fout:
            for example in examples[7500:]:
                fout.write(example)
    else:
        with JsonFile(outprefix + '/test.json', 'w') as fout:
            for example in examples:
                fout.write(example)
Example #2
def convert_all_wsd_datasets(outdir, wsd_framework_root):
    datasets = get_dataset_metadata(wsd_framework_root)

    for ds in datasets:
        ds_name, ds_root = ds
        data = read_wsd_data(ds_root + '.data.xml', ds_root + '.gold.key.txt')
        with JsonFile(os.path.join(outdir, ds_name + '.json'), 'w') as fout:
            for line in data:
                fout.write(line)
Example #3
def generate_wordnet_synset_vocab(entity_file, vocab_file):
    vocab = ['@@UNKNOWN@@']

    with JsonFile(entity_file, 'r') as fin:
        for node in fin:
            if node['type'] == 'synset':
                vocab.append(node['id'])

    vocab.append('@@MASK@@')
    vocab.append('@@NULL@@')

    with open(vocab_file, 'w') as fout:
        fout.write('\n'.join(vocab))
Example #4
    def _read(self, file_path: str):
        with JsonFile(cached_path(file_path), 'r') as fin:
            for sentence in fin:
                gold_annotations = unpack_wsd_training_instance(sentence)
                gold_span_to_entity_id = {
                    tuple(gs): self.mention_generator._lemma_to_synset[gi]
                    for gs, gi in zip(gold_annotations['gold_spans'],
                                      gold_annotations['gold_lemma_ids'])
                }
                gold_span_to_data_id = {
                    tuple(gs): gid
                    for gs, gid in zip(gold_annotations['gold_spans'],
                                       gold_annotations['gold_ids'])
                }

                # get the candidates
                if self.is_training:
                    candidates = self.mention_generator.get_mentions_with_gold_spans(
                        gold_annotations)
                else:
                    candidates = self.mention_generator.get_mentions_from_gold_span_lemma_pos(
                        gold_annotations)

                # map the original gold lemma_id to the synset_id
                gold_entities = [

                    # value is synset_id
                    gold_span_to_entity_id[tuple(candidate_span)]
                    for candidate_span in candidates['candidate_spans']
                ]

                gold_data_ids = [
                    gold_span_to_data_id[tuple(candidate_span)]
                    for candidate_span in candidates['candidate_spans']
                ]

                if len(candidates['candidate_spans']) > 0:
                    yield self.text_to_instance(
                        gold_annotations['tokenized_text'],
                        candidates['candidate_entities'],
                        candidates['candidate_spans'],
                        candidates['candidate_entity_priors'], gold_entities,
                        gold_data_ids)
Example #5
def load_candidate_maps(fname, topk=30, count_smoothing=1):
    """
    Load the candidate maps from the entity file.

    entity_file is the jsonl dump from extract_wordnet.py

    returns:
        candidates[Dict[normalized lemma string] -> candidates
        lemma_id_to_synset_id = Dict["able%3:00:00"] -> "able.a.01"

    each value candidates list is:
        [candidate1_metadata, candidate2_metadata, etc]
    where candidate_metadata is a dict with keys:
            synset_id, lemma_id, pos (n, v, a,   ), prior

    The lemmas are underscore and hyphen normalized for training.

    topk = keep this many of the top candidates for each lemma
    count_smoothing = use this for smoothing
        if count_smoothing < 0 then don't normalize lemmas, just return raw counts
    """
    def _update(d, key, m):
        if key not in d:
            d[key] = []
        d[key].append(m)

    def _trim_and_normalize(d, num, smoothing):
        for key in d:
            all_candidates = d[key]
            if len(all_candidates) > num:
                # sort by prior (count) and trim
                # sorted() sorts ascending by default; we want descending
                sorted_candidates = sorted(all_candidates,
                                           key=lambda x: x['prior'],
                                           reverse=True)
                trimmed_candidates = sorted_candidates[:num]
            else:
                trimmed_candidates = all_candidates

            if smoothing >= 0:
                sum_count = sum(ele['prior'] + smoothing
                                for ele in trimmed_candidates)
                for cand in trimmed_candidates:
                    cand['prior'] = (cand['prior'] + smoothing) / sum_count
            d[key] = trimmed_candidates

    candidates = {}
    lemma_id_to_synset_id = {}

    with JsonFile(cached_path(fname), 'r') as fin:
        for entity in fin:
            if entity['type'] == 'lemma':
                lemma_id = entity['id']
                lemma_str = lemma_id.partition('%')[0]
                synset_id = entity['synset']

                metadata = {
                    'synset_id': synset_id,
                    'lemma_id': lemma_id,
                    'pos': entity['pos'],
                    'prior': entity['count']
                }

                # normalize the lemma_str
                lemma_str_normalized = _norm_lemma(lemma_str)
                _update(candidates, lemma_str_normalized, metadata)

                lemma_id_to_synset_id[lemma_id] = synset_id

    # now trim to top k and normalize the prior
    _trim_and_normalize(candidates, topk, count_smoothing)

    return candidates, lemma_id_to_synset_id
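
A brief usage sketch of load_candidate_maps, assuming a jsonl entity dump produced by extract_wordnet.py (the file name below is hypothetical); it illustrates the shapes of the two returned maps described in the docstring.

candidates, lemma_id_to_synset_id = load_candidate_maps(
    'wordnet_entities.jsonl', topk=30, count_smoothing=1)

# candidates: normalized lemma string -> list of candidate metadata dicts
for cand in candidates.get('able', []):
    print(cand['synset_id'], cand['lemma_id'], cand['pos'], cand['prior'])

# lemma_id_to_synset_id: sense key -> synset id, e.g. 'able%3:00:00' -> 'able.a.01'
print(lemma_id_to_synset_id.get('able%3:00:00'))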
Example #6
    def __init__(self,
                 embedding_file: str,
                 entity_dim: int,
                 entity_file: str = None,
                 vocab_file: str = None,
                 entity_h5_key: str = 'conve_tucker_infersent_bert',
                 dropout: float = 0.1,
                 pos_embedding_dim: int = 25,
                 include_null_embedding: bool = False):
        """
        pass pos_emedding_dim = None to skip POS embeddings and all the
            entity stuff, using this as a pretrained embedding file
            with feedforward
        """

        super().__init__()

        if pos_embedding_dim is not None:
            # entity_id -> pos abbreviation, e.g.
            # 'cat.n.01' -> 'n'
            # includes special, e.g. '@@PADDING@@' -> '@@PADDING@@'
            entity_to_pos = {}
            with JsonFile(cached_path(entity_file), 'r') as fin:
                for node in fin:
                    if node['type'] == 'synset':
                        entity_to_pos[node['id']] = node['pos']
            for special in [
                    '@@PADDING@@', '@@MASK@@', '@@NULL@@', '@@UNKNOWN@@'
            ]:
                entity_to_pos[special] = special

            # list of entity ids
            entities = ['@@PADDING@@']
            with open(cached_path(vocab_file), 'r') as fin:
                for line in fin:
                    entities.append(line.strip())

            # the map from entity index id -> pos embedding id,
            # will use for POS embedding lookup
            entity_id_to_pos_index = [
                self.POS_MAP[entity_to_pos[ent]] for ent in entities
            ]
            self.register_buffer('entity_id_to_pos_index',
                                 torch.tensor(entity_id_to_pos_index))

            self.pos_embeddings = torch.nn.Embedding(len(entities),
                                                     pos_embedding_dim)
            init_bert_weights(self.pos_embeddings, 0.02)

            self.use_pos = True
        else:
            self.use_pos = False

        # load the embeddings
        with h5py.File(cached_path(embedding_file), 'r') as fin:
            entity_embeddings = fin[entity_h5_key][...]
        self.entity_embeddings = torch.nn.Embedding(entity_embeddings.shape[0],
                                                    entity_embeddings.shape[1],
                                                    padding_idx=0)
        self.entity_embeddings.weight.data.copy_(
            torch.tensor(entity_embeddings).contiguous())

        if pos_embedding_dim is not None:
            assert entity_embeddings.shape[0] == len(entities)
            concat_dim = entity_embeddings.shape[1] + pos_embedding_dim
        else:
            concat_dim = entity_embeddings.shape[1]

        self.proj_feed_forward = torch.nn.Linear(concat_dim, entity_dim)
        init_bert_weights(self.proj_feed_forward, 0.02)

        self.dropout = torch.nn.Dropout(dropout)

        self.entity_dim = entity_dim

        self.include_null_embedding = include_null_embedding
        if include_null_embedding:
            # a special embedding for null
            entities = ['@@PADDING@@']
            with open(cached_path(vocab_file), 'r') as fin:
                for line in fin:
                    entities.append(line.strip())
            self.null_id = entities.index("@@NULL@@")
            self.null_embedding = torch.nn.Parameter(torch.zeros(entity_dim))
            self.null_embedding.data.normal_(mean=0.0, std=0.02)
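
The class that owns this __init__ is not shown in the snippet. Assuming it is a torch.nn.Module subclass, here called WordNetEmbeddingModule purely for illustration, construction might look like the sketch below; all file paths are hypothetical, and the keyword arguments mirror the signature above.

# Hypothetical class name and file paths, for illustration only
embedder = WordNetEmbeddingModule(
    embedding_file='wordnet_embeddings.hdf5',  # h5 file containing the entity_h5_key dataset
    entity_dim=300,
    entity_file='wordnet_entities.jsonl',      # jsonl dump with 'synset' nodes (id, pos)
    vocab_file='wordnet_synset_vocab.txt',     # one entity id per line, as written in Example #3
    pos_embedding_dim=25,                      # pass None to skip POS / entity handling
    include_null_embedding=False,
)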
Example #7
def extract_gloss_examples_wordnet(entity_file, wic_root_dir, output_file, include_definitions=False):
    """
    WIC train: 4579 of 6330, 0.7233807266982623
    WIC dev: 931 of 1276, 0.7296238244514106
    WIC test: 2007 of 2800, 0.7167857142857142
    total examples, examples considered, examples lemma not found:  48247 45310 1024
    total definitions:  117659
    """
    import os
    from kb.common import JsonFile
    from kb.wordnet import WORDNET_TO_SEMCOR_POS_MAP
    import spacy
    from nltk.stem import PorterStemmer

    nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser', 'textcat'])
    stemmer = PorterStemmer()

    # make set of unique sentences in WIC
    wic_unique_sentences = {}
    for split in ['train', 'dev', 'test']:
        wic_unique_sentences[split] = set()
        data_file_name = os.path.join(wic_root_dir, split, split + '.data.txt')
        with open(data_file_name, 'r') as fin:
            for instance in fin:
                sentence1, sentence2 = instance.strip().split('\t')[-2:]
                for sent in [sentence1, sentence2]:
                    # remove all whitespace, lowercase, and strip trailing periods
                    s = ''.join(sent.strip().split()).lower().rstrip('.')
                    wic_unique_sentences[split].add(s)

    # read through the dump
    wic_keys = ['train', 'dev', 'test']
    wic_counts = {k: 0 for k in wic_keys}
    n_examples = 0
    n_definitions = 0
    n_examples_considered = 0
    n_examples_not_found = 0

    examples_to_write = []

    nn = 0
    with JsonFile(entity_file, 'r') as fin:
        for entity in fin:
            nn += 1
            if nn % 100 == 0:
                print(nn)

            if entity['type'] == 'lemma':
                continue

            pos = WORDNET_TO_SEMCOR_POS_MAP[entity['pos']]

            if len(entity['definition']) > 0:
                n_definitions += 1
                if include_definitions:
                    doc = [t for t in nlp(entity['definition']) if not t.is_space]
                    lemma_id = entity['lemmas'][0]
                    lemma = lemma_id.partition('%')[0].replace('_', ' ').replace('-', ' - ').split(' ')
                    ex = lemma + ['is', 'defined', 'as', ':'] + [t.text for t in doc]
                    examples_to_write.append([' '.join(ex), (0, len(lemma)), lemma_id, pos])

            for example in entity['examples']:
                s = ''.join(example.strip().split()).lower().rstrip('.')
                n_examples += 1
                skip = False
                for key in wic_keys:
                    if s in wic_unique_sentences[key]:
                        wic_counts[key] += 1
                        if key == 'test' or key == 'dev':
                            skip = True

                if not skip:
                    # get the location of the lemma in the example
                    doc = [t for t in nlp(example) if not t.is_space]
                    n_examples_considered += 1

                    # need to check all the lemmas
                    found = False
                    for lemma_id in entity['lemmas']:

                        lemma = lemma_id.partition('%')[0].replace('_', ' ').replace('-', ' - ').split(' ')
                        len_lemma = len(lemma)

                        # three match strategies: exact surface form, exact spaCy lemma, Porter-stemmed match
                        lemma_indices = [[], [], []]
                        for i, t in enumerate(doc):
                            span = [t.text.lower() for t in doc[i:(i+len_lemma)]]
                            if span == lemma:
                                lemma_indices[0].append(i)
                            span_lemma = [t.lemma_.lower() for t in doc[i:(i+len_lemma)]]
                            if span_lemma == lemma:
                                lemma_indices[1].append(i)
                            if [stemmer.stem(t) for t in span] == [stemmer.stem(t) for t in lemma]:
                                lemma_indices[2].append(i)
    
                        # get the index
                        index = None
                        for ii in range(3):
                            if len(lemma_indices[ii]) == 1:
                                index = lemma_indices[ii][0]
                                break
                            elif len(lemma_indices[ii]) > 1:
                                break

                        if index is not None:
                            found = True
                            break

                    if found:
                        examples_to_write.append([' '.join(t.text for t in doc), (index, index+len(lemma)), lemma_id, pos])
                    else:
                        n_examples_not_found += 1

    for key in wic_counts:
        print("WIC {}: {} of {}, {}".format(key, wic_counts[key], len(wic_unique_sentences[key]),
                                            wic_counts[key] / len(wic_unique_sentences[key])))
    print("total examples, examples considered, examples lemma not found: ", n_examples, n_examples_considered, n_examples_not_found)
    print("total definitions: ", n_definitions)

    # write the collected examples out to file
    with JsonFile(output_file, 'w') as fout:
        for ei, e_write in enumerate(examples_to_write):
            tokens = e_write[0].split()
            start, end = e_write[1]
            out = []
            for i in range(0, start):
                t = {'token': tokens[i], 'pos': '', 'lemma': ''}
                out.append(t)
    
            t = {'token': ' '.join(tokens[start:end]),
                 'pos': e_write[3],
                 'lemma': e_write[2].partition('%')[0],
                 'senses': [e_write[2].partition('%')[2]],
                 'id': 'example_definition.{}'.format(ei)}
            out.append(t)
    
            for i in range(end, len(tokens)):
                t = {'token': tokens[i], 'pos': '', 'lemma': ''}
                out.append(t)
    
            fout.write(out)
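
The span-finding logic above tries three increasingly lenient matches (exact surface form, spaCy lemma, Porter stem) and accepts a tier only when it yields exactly one hit. A stripped-down sketch of the same idea on plain token lists, with a hypothetical helper name and without the spaCy pipeline:

from nltk.stem import PorterStemmer

def find_unique_span_start(tokens, lemmas, target):
    """Hypothetical helper illustrating the tiered matching above: return the
    start index of `target` (a list of lowercase words) in `tokens`, trying
    exact surface match, then lemma match, then stem match, and accepting a
    tier only if it produces exactly one hit."""
    stemmer = PorterStemmer()
    n = len(target)
    stemmed_target = [stemmer.stem(w) for w in target]
    tiers = [[], [], []]
    for i in range(len(tokens)):
        surface = [w.lower() for w in tokens[i:i + n]]
        lemma_span = [w.lower() for w in lemmas[i:i + n]]
        if surface == target:
            tiers[0].append(i)
        if lemma_span == target:
            tiers[1].append(i)
        if [stemmer.stem(w) for w in surface] == stemmed_target:
            tiers[2].append(i)
    for hits in tiers:
        if len(hits) == 1:
            return hits[0]
        elif len(hits) > 1:
            break
    return None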
Example #8
def make_files_for_official_eval(model_archive_file, evaluation_files, output_file,
                                 cuda_device):

    archive = load_archive(model_archive_file)
    model = archive.model

    model.eval()
    if cuda_device != -1:
        model.cuda(cuda_device)

    def find_key(d, func):
        ret = None
        stack = [d]
        while len(stack) > 0 and ret is None:
            s = stack.pop()
            for k, v in s.items():
                if func(k, v):
                    ret = s
                    break
                elif isinstance(v, dict):
                    stack.append(v)
        return ret

    # load reader
    full_reader_params = copy.deepcopy(archive.config['dataset_reader'].as_dict())
    reader_params = find_key(full_reader_params,
                             lambda k, v: k == 'type' and v == 'wordnet_fine_grained')
    reader_params['is_training'] = False
    reader_params['should_remap_span_indices'] = True
    if 'extra_candidate_generators' in reader_params:
        candidate_generator_params = find_key(
                full_reader_params,
                lambda k, v: k == 'tokenizer_and_candidate_generator'
        )['tokenizer_and_candidate_generator']
        candidate_generator = TokenizerAndCandidateGenerator.from_params(
                Params(candidate_generator_params)
        )

    reader_params = Params(reader_params)

    print("====================")
    print(reader_params.as_dict())
    print("====================")

    reader = DatasetReader.from_params(reader_params)

    synset_to_lemmas = {}
    for lemma_id, synset_id in reader.mention_generator._lemma_to_synset.items():
        if synset_id not in synset_to_lemmas:
            synset_to_lemmas[synset_id] = []
        synset_to_lemmas[synset_id].append(lemma_id)

    vocab_params = archive.config['vocabulary']
    vocab = Vocabulary.from_params(vocab_params)

    iterator = BasicIterator(batch_size=24)
    iterator.index_with(vocab)

    fout = open(output_file, 'w')

    # evaluation_files is expected to be an iterable of dataset file paths
    for ds_file in evaluation_files:
        instances = reader.read(ds_file)

        # get the metadata ids from the raw file
        raw_lines = []
        with JsonFile(ds_file, 'r') as fin:
            for sentence in fin:
                raw_ids = [[token['id'], token['lemma']] for token in sentence if 'senses' in token]
                if len(raw_ids) > 0:
                    raw_lines.append(raw_ids)

        raw_i = 0
        for batch in iterator(instances, num_epochs=1, shuffle=False):
            print(raw_i)

            if cuda_device > -1:
                b = move_to_device(batch, cuda_device)
            else:
                b = batch

            b['candidates'] = {'wordnet': {
                    'candidate_entities': b.pop('candidate_entities'),
                    'candidate_entity_priors': b.pop('candidate_entity_prior'),
                    'candidate_segment_ids': b.pop('candidate_segment_ids'),
                    'candidate_spans': b.pop('candidate_spans')}}
            gold_entities = b.pop('gold_entities')
            b['gold_entities'] = {'wordnet': gold_entities}

            if 'extra_candidates' in b:
                extra_candidates = b.pop('extra_candidates')
                seq_len = b['tokens']['tokens'].shape[1]
                bbb = []
                for e in extra_candidates:
                    for k in e.keys():
                        e[k]['candidate_segment_ids'] = [0] * len(e[k]['candidate_spans'])
                    ee = {'tokens': ['[CLS]'] * seq_len, 'segment_ids': [0] * seq_len,
                          'candidates': e}
                    ee_fields = candidate_generator.convert_tokens_candidates_to_fields(ee)
                    bbb.append(Instance(ee_fields))
                eb = Batch(bbb)
                eb.index_instances(vocab)
                padding_lengths = eb.get_padding_lengths()
                tensor_dict = eb.as_tensor_dict(padding_lengths)
                b['candidates'].update(tensor_dict['candidates'])

            if cuda_device > -1:
                b = move_to_device(b, cuda_device)

            output = model(**b)
    
            # predicted entities is list of (batch_index, (start, end), entity_id)
            predicted_entities = model.soldered_kgs['wordnet'].entity_linker._decode(
                          output['wordnet']['linking_scores'], b['candidates']['wordnet']['candidate_spans'], 
                          b['candidates']['wordnet']['candidate_entities']['ids']
            )

            # make output file
            predicted_entities_batch_indices = []
            batch_size = batch['tokens']['tokens'].shape[0]
            for k in range(batch_size):
                predicted_entities_batch_indices.append([])
            for b_index, start_end, eid in predicted_entities:
                try:
                    synset_id = vocab.get_token_from_index(eid, 'entity')
                except KeyError:
                    synset_id = vocab.get_token_from_index(eid, 'entity_wordnet')
                all_lemma_ids = synset_to_lemmas[synset_id]
                predicted_entities_batch_indices[b_index].append(all_lemma_ids)

            # output lines look like semeval2013.d000.s001.t003 reader%1:19:00::
            for k in range(batch_size):
                raw_ids = raw_lines[raw_i]
                predicted_lemmas = predicted_entities_batch_indices[k]
                assert len(predicted_lemmas) == len(raw_ids)
                for (ii, gold_lemma), pl in zip(raw_ids, predicted_lemmas):
                    # get the predicted lemma_id
                    predicted_lemma_id = None
                    for pp in pl:
                        if pp.partition('%')[0] == gold_lemma:
                            predicted_lemma_id = pp
                    assert predicted_lemma_id is not None
                    line = "{} {}\n".format(ii, predicted_lemma_id)
                    fout.write(line)
                raw_i += 1

    fout.close()
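
find_key above is a small depth-first search over nested dicts that returns the first sub-dict containing a key/value pair satisfying func. If it were lifted to module level, a toy call (with purely illustrative config values) would look like:

config = {
    'dataset_reader': {
        'type': 'wordnet_fine_grained',
        'tokenizer_and_candidate_generator': {'type': 'bert-basic'},
    }
}

# returns config['dataset_reader'], the sub-dict whose 'type' is 'wordnet_fine_grained'
reader_params = find_key(config, lambda k, v: k == 'type' and v == 'wordnet_fine_grained')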
Example #9
    def _read(self, file_path: str) -> Iterable[Instance]:
        with JsonFile(cached_path(file_path), 'r') as fin:
            for example in fin:
                sentence = example['sentence']

                raw_tokens = self.tokenizer.bert_word_tokenizer.tokenize(sentence)
                tokens = []
                k = 0
                start_e1 = end_e1 = start_e2 = end_e2 = None
                while k < len(raw_tokens):
                    if raw_tokens[k:(k+3)] == ['<', 'e1', '>']:
                        start_e1 = len(tokens)
                        if self.entity_masking == 'entity_markers':
                            tokens.append('[e1start]')
                        k += 3
                    elif raw_tokens[k:(k+3)] == ['<', 'e2', '>']:
                        start_e2 = len(tokens)
                        if self.entity_masking == 'entity_markers':
                            tokens.append('[e2start]')
                        k += 3
                    elif raw_tokens[k:(k+4)] == ['<', '/', 'e1', '>']:
                        if self.entity_masking == 'entity_markers':
                            tokens.append('[e1end]')
                        end_e1 = len(tokens)
                        k += 4
                    elif raw_tokens[k:(k+4)] == ['<', '/', 'e2', '>']:
                        if self.entity_masking == 'entity_markers':
                            tokens.append('[e2end]')
                        end_e2 = len(tokens)
                        k += 4
                    else:
                        tokens.append(raw_tokens[k])
                        k += 1

                assert start_e1 is not None and end_e1 is not None and start_e2 is not None and end_e2 is not None

                tokens_and_candidates = self.tokenizer.tokenize_and_generate_candidates(' '.join(tokens))

                # set the segment ids
                # offsets is the beginning offset for each original token
                if self.entity_masking == 'segment':
                    offsets = [1] + tokens_and_candidates['offsets_a'][:-1]
                    segment_ids = list(tokens_and_candidates['segment_ids'])
                    for s, e, ii in [[start_e1, end_e1, 1], [start_e2, end_e2, 2]]:
                        ll = offsets[e] - offsets[s]
                        segment_ids[offsets[s]:offsets[e]] = [ii] * ll
                    tokens_and_candidates['segment_ids'] = segment_ids

                fields = self.tokenizer.convert_tokens_candidates_to_fields(tokens_and_candidates)

                fields['sentence_id'] = MetadataField(str(example['sent_id']))

                fields['label_ids'] = LabelField(LABEL_MAP[example['label']], skip_indexing=True)

                # get the indices of the entity starts
                offsets = [1] + tokens_and_candidates['offsets_a'][:-1]
                idx1_offset = offsets[start_e1]
                idx2_offset = offsets[start_e2]

                fields['index_a'] = LabelField(idx1_offset, skip_indexing=True)
                fields['index_b'] = LabelField(idx2_offset, skip_indexing=True)

                yield Instance(fields)
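
The marker-stripping loop in _read consumes the wordpiece sequences for <e1>, </e1>, <e2>, </e2> and records the token indices of both entity spans. For a toy tokenization like the one below (hypothetical, assuming the BERT word tokenizer splits each marker into separate pieces), the recovered values would be:

# hypothetical wordpiece output for "the <e1> cat </e1> sat on the <e2> mat </e2>"
raw_tokens = ['the', '<', 'e1', '>', 'cat', '<', '/', 'e1', '>',
              'sat', 'on', 'the', '<', 'e2', '>', 'mat', '<', '/', 'e2', '>']

# with entity_masking != 'entity_markers' no marker tokens are kept, so the loop yields
#   tokens  == ['the', 'cat', 'sat', 'on', 'the', 'mat']
#   start_e1, end_e1 == 1, 2     (the span of 'cat')
#   start_e2, end_e2 == 5, 6     (the span of 'mat')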