def to_jsonl(input_file, outprefix, train_or_test):
    import random

    with open(input_file, 'r') as fin:
        lines = fin.readlines()

    examples = []
    k = 0
    while k < len(lines):
        line = lines[k]
        ls = line.split('\t')
        try:
            sent_id = int(ls[0])
        except ValueError:
            sent_id = None

        if sent_id is not None and sent_id >= 1 and sent_id <= 10717:
            # a line with a train or test example
            # remove the surrounding " " quotes
            sentence = ls[1].strip()[1:-1]
            # get the label
            k += 1
            label = lines[k].strip()
            example = {'sentence': sentence, 'label': label, 'sent_id': sent_id}
            examples.append(example)

        # go to the next line
        k += 1

    if train_or_test == 'train':
        # split into train / dev
        random.shuffle(examples)
        with JsonFile(outprefix + '/train.json', 'w') as fout:
            for example in examples[:7500]:
                fout.write(example)
        with JsonFile(outprefix + '/dev.json', 'w') as fout:
            for example in examples[7500:]:
                fout.write(example)
    else:
        with JsonFile(outprefix + '/test.json', 'w') as fout:
            for example in examples:
                fout.write(example)
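# Hedged usage sketch: the sent_id range 1..10717 matches the SemEval-2010
# Task 8 distribution (ids 1-8000 train, 8001-10717 test), so the inputs are
# assumed to be those files; paths are hypothetical.
#
#   to_jsonl('TRAIN_FILE.TXT', 'semeval2010_task8', 'train')
#   # -> semeval2010_task8/train.json (7500 examples) and dev.json (the rest)
#   to_jsonl('TEST_FILE_FULL.TXT', 'semeval2010_task8', 'test')
#   # -> semeval2010_task8/test.json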
def convert_all_wsd_datasets(outdir, wsd_framework_root):
    datasets = get_dataset_metadata(wsd_framework_root)
    for ds in datasets:
        ds_name, ds_root = ds
        data = read_wsd_data(ds_root + '.data.xml', ds_root + '.gold.key.txt')
        with JsonFile(os.path.join(outdir, ds_name + '.json'), 'w') as fout:
            for line in data:
                fout.write(line)
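# Usage sketch, assuming the standard WSD Evaluation Framework layout where
# each dataset root expands to <root>.data.xml and <root>.gold.key.txt
# (paths hypothetical):
#
#   convert_all_wsd_datasets('wsd_jsonl', '/path/to/WSD_Evaluation_Framework')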
def generate_wordnet_synset_vocab(entity_file, vocab_file):
    vocab = ['@@UNKNOWN@@']

    with JsonFile(entity_file, 'r') as fin:
        for node in fin:
            if node['type'] == 'synset':
                vocab.append(node['id'])

    vocab.append('@@MASK@@')
    vocab.append('@@NULL@@')

    with open(vocab_file, 'w') as fout:
        fout.write('\n'.join(vocab))
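# The vocab file written above has one entity id per line, '@@UNKNOWN@@'
# first and the special tokens last (synset ids illustrative):
#
#   @@UNKNOWN@@
#   able.a.01
#   ...
#   @@MASK@@
#   @@NULL@@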
def _read(self, file_path: str):
    with JsonFile(cached_path(file_path), 'r') as fin:
        for sentence in fin:
            gold_annotations = unpack_wsd_training_instance(sentence)
            gold_span_to_entity_id = {
                tuple(gs): self.mention_generator._lemma_to_synset[gi]
                for gs, gi in zip(gold_annotations['gold_spans'],
                                  gold_annotations['gold_lemma_ids'])
            }
            gold_span_to_data_id = {
                tuple(gs): gid
                for gs, gid in zip(gold_annotations['gold_spans'],
                                   gold_annotations['gold_ids'])
            }

            # get the candidates
            if self.is_training:
                candidates = self.mention_generator.get_mentions_with_gold_spans(
                    gold_annotations)
            else:
                candidates = self.mention_generator.get_mentions_from_gold_span_lemma_pos(
                    gold_annotations)

            # map the original gold lemma_id to the synset_id
            gold_entities = [
                # value is synset_id
                gold_span_to_entity_id[tuple(candidate_span)]
                for candidate_span in candidates['candidate_spans']
            ]
            gold_data_ids = [
                gold_span_to_data_id[tuple(candidate_span)]
                for candidate_span in candidates['candidate_spans']
            ]

            if len(candidates['candidate_spans']) > 0:
                yield self.text_to_instance(
                    gold_annotations['tokenized_text'],
                    candidates['candidate_entities'],
                    candidates['candidate_spans'],
                    candidates['candidate_entity_priors'],
                    gold_entities,
                    gold_data_ids)
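# Minimal sketch of the gold-span maps built above, with made-up values.
# Spans arrive as [start, end] lists, and lists aren't hashable, hence the
# tuple(gs) keys:
#
#   gold_spans = [[0, 1], [3, 4]]
#   gold_lemma_ids = ['able%3:00:00', 'run%2:38:00']
#   {tuple(gs): gi for gs, gi in zip(gold_spans, gold_lemma_ids)}
#   # -> {(0, 1): 'able%3:00:00', (3, 4): 'run%2:38:00'}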
def load_candidate_maps(fname, topk=30, count_smoothing=1):
    """
    Load the candidate maps from the entity file.

    entity_file is the jsonl dump from extract_wordnet.py

    returns:
        candidates: Dict[normalized lemma string] -> candidate list
        lemma_id_to_synset_id: Dict["able%3:00:00"] -> "able.a.01"

    Each candidate list is [candidate1_metadata, candidate2_metadata, ...]
    where candidate_metadata is a dict with keys:
        synset_id, lemma_id, pos (n, v, a, ...), prior

    The lemmas are underscore and hyphen normalized for training.

    topk = keep this many of the top candidates for each lemma
    count_smoothing = use this for smoothing;
        if count_smoothing < 0 then don't normalize the priors, just return
        the raw counts
    """
    def _update(d, key, m):
        if key not in d:
            d[key] = []
        d[key].append(m)

    def _trim_and_normalize(d, num, smoothing):
        for key in d:
            all_candidates = d[key]
            if len(all_candidates) > num:
                # sort by count and trim
                # sorted sorts ascending by default, we want descending by count
                sorted_candidates = sorted(all_candidates,
                                           key=lambda x: x['prior'],
                                           reverse=True)
                trimmed_candidates = sorted_candidates[:num]
            else:
                trimmed_candidates = all_candidates

            if smoothing >= 0:
                sum_count = sum(ele['prior'] + smoothing
                                for ele in trimmed_candidates)
                for cand in trimmed_candidates:
                    cand['prior'] = (cand['prior'] + smoothing) / sum_count

            d[key] = trimmed_candidates

    candidates = {}
    lemma_id_to_synset_id = {}

    with JsonFile(cached_path(fname), 'r') as fin:
        for entity in fin:
            if entity['type'] == 'lemma':
                lemma_id = entity['id']
                lemma_str = lemma_id.partition('%')[0]
                synset_id = entity['synset']
                metadata = {
                    'synset_id': synset_id,
                    'lemma_id': lemma_id,
                    'pos': entity['pos'],
                    'prior': entity['count']
                }
                # normalize the lemma_str
                lemma_str_normalized = _norm_lemma(lemma_str)
                _update(candidates, lemma_str_normalized, metadata)
                lemma_id_to_synset_id[lemma_id] = synset_id

    # now trim to top k and normalize the prior
    _trim_and_normalize(candidates, topk, count_smoothing)

    return candidates, lemma_id_to_synset_id
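# Illustrative return values (priors made up). With the default
# count_smoothing=1 the priors in each candidate list are smoothed and
# normalized to sum to 1:
#
#   candidates['able'] -> [
#       {'synset_id': 'able.a.01', 'lemma_id': 'able%3:00:00',
#        'pos': 'a', 'prior': 0.92},
#       ...up to topk entries, sorted by descending prior...
#   ]
#   lemma_id_to_synset_id['able%3:00:00'] -> 'able.a.01'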
def __init__(self,
             embedding_file: str,
             entity_dim: int,
             entity_file: str = None,
             vocab_file: str = None,
             entity_h5_key: str = 'conve_tucker_infersent_bert',
             dropout: float = 0.1,
             pos_embedding_dim: int = 25,
             include_null_embedding: bool = False):
    """
    Pass pos_embedding_dim = None to skip the POS embeddings and all the
    entity stuff, using this as a pretrained embedding file with a feedforward.
    """
    super().__init__()

    if pos_embedding_dim is not None:
        # entity_id -> pos abbreviation, e.g.
        #   'cat.n.01' -> 'n'
        # includes special, e.g. '@@PADDING@@' -> '@@PADDING@@'
        entity_to_pos = {}
        with JsonFile(cached_path(entity_file), 'r') as fin:
            for node in fin:
                if node['type'] == 'synset':
                    entity_to_pos[node['id']] = node['pos']
        for special in ['@@PADDING@@', '@@MASK@@', '@@NULL@@', '@@UNKNOWN@@']:
            entity_to_pos[special] = special

        # list of entity ids
        entities = ['@@PADDING@@']
        with open(cached_path(vocab_file), 'r') as fin:
            for line in fin:
                entities.append(line.strip())

        # the map from entity index id -> pos embedding id,
        # will use for POS embedding lookup
        entity_id_to_pos_index = [
            self.POS_MAP[entity_to_pos[ent]] for ent in entities
        ]
        self.register_buffer('entity_id_to_pos_index',
                             torch.tensor(entity_id_to_pos_index))

        self.pos_embeddings = torch.nn.Embedding(len(entities),
                                                 pos_embedding_dim)
        init_bert_weights(self.pos_embeddings, 0.02)

        self.use_pos = True
    else:
        self.use_pos = False

    # load the embeddings
    with h5py.File(cached_path(embedding_file), 'r') as fin:
        entity_embeddings = fin[entity_h5_key][...]
    self.entity_embeddings = torch.nn.Embedding(entity_embeddings.shape[0],
                                                entity_embeddings.shape[1],
                                                padding_idx=0)
    self.entity_embeddings.weight.data.copy_(
        torch.tensor(entity_embeddings).contiguous())

    if pos_embedding_dim is not None:
        assert entity_embeddings.shape[0] == len(entities)
        concat_dim = entity_embeddings.shape[1] + pos_embedding_dim
    else:
        concat_dim = entity_embeddings.shape[1]

    self.proj_feed_forward = torch.nn.Linear(concat_dim, entity_dim)
    init_bert_weights(self.proj_feed_forward, 0.02)

    self.dropout = torch.nn.Dropout(dropout)

    self.entity_dim = entity_dim

    self.include_null_embedding = include_null_embedding
    if include_null_embedding:
        # a special embedding for null
        entities = ['@@PADDING@@']
        with open(cached_path(vocab_file), 'r') as fin:
            for line in fin:
                entities.append(line.strip())
        self.null_id = entities.index("@@NULL@@")
        self.null_embedding = torch.nn.Parameter(torch.zeros(entity_dim))
        self.null_embedding.data.normal_(mean=0.0, std=0.02)
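# Hedged construction sketch for the enclosing module (its class name is not
# shown in this excerpt and is hypothetical here; file names are hypothetical
# and the h5 key is the default):
#
#   embedding = SomeEntityEmbedding(
#       embedding_file='wordnet_embeddings.h5',  # h5py file keyed by entity_h5_key
#       entity_dim=300,
#       entity_file='wordnet_entities.jsonl',    # jsonl dump with 'synset' nodes
#       vocab_file='synset_vocab.txt',           # one entity id per line
#       pos_embedding_dim=25)                    # or None to skip POS embeddings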
def extract_gloss_examples_wordnet(entity_file, wic_root_dir, output_file,
                                   include_definitions=False):
    """
    WIC train: 4579 of 6330, 0.7233807266982623
    WIC dev: 931 of 1276, 0.7296238244514106
    WIC test: 2007 of 2800, 0.7167857142857142

    total examples, examples considered, examples lemma not found: 48247 45310 1024
    total definitions: 117659
    """
    import os
    from kb.common import JsonFile
    from kb.wordnet import WORDNET_TO_SEMCOR_POS_MAP

    import spacy
    from nltk.stem import PorterStemmer

    nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser', 'textcat'])
    stemmer = PorterStemmer()

    # make set of unique sentences in WIC
    wic_unique_sentences = {}
    for split in ['train', 'dev', 'test']:
        wic_unique_sentences[split] = set()
        data_file_name = os.path.join(wic_root_dir, split, split + '.data.txt')
        with open(data_file_name, 'r') as fin:
            for instance in fin:
                sentence1, sentence2 = instance.strip().split('\t')[-2:]
                for sent in [sentence1, sentence2]:
                    # remove all whitespace and strip punctuation
                    s = ''.join(sent.strip().split()).lower().rstrip('.')
                    wic_unique_sentences[split].add(s)

    # read through the dump
    wic_keys = ['train', 'dev', 'test']
    wic_counts = {k: 0 for k in wic_keys}
    n_examples = 0
    n_definitions = 0
    n_examples_considered = 0
    n_examples_not_found = 0
    examples_to_write = []

    nn = 0
    with JsonFile(entity_file, 'r') as fin:
        for entity in fin:
            nn += 1
            if nn % 100 == 0:
                print(nn)

            if entity['type'] == 'lemma':
                continue

            pos = WORDNET_TO_SEMCOR_POS_MAP[entity['pos']]

            if len(entity['definition']) > 0:
                n_definitions += 1
                if include_definitions:
                    doc = [t for t in nlp(entity['definition']) if not t.is_space]
                    lemma_id = entity['lemmas'][0]
                    lemma = lemma_id.partition('%')[0].replace('_', ' ').replace('-', ' - ').split(' ')
                    ex = lemma + ['is', 'defined', 'as', ':'] + [t.text for t in doc]
                    examples_to_write.append(
                        [' '.join(ex), (0, len(lemma)), lemma_id, pos])

            for example in entity['examples']:
                s = ''.join(example.strip().split()).lower().rstrip('.')
                n_examples += 1
                skip = False
                for key in wic_keys:
                    if s in wic_unique_sentences[key]:
                        wic_counts[key] += 1
                        if key == 'test' or key == 'dev':
                            skip = True

                if not skip:
                    # get the location of the lemma in the example
                    doc = [t for t in nlp(example) if not t.is_space]
                    n_examples_considered += 1

                    # need to check all the lemmas
                    found = False
                    for lemma_id in entity['lemmas']:
                        lemma = lemma_id.partition('%')[0].replace('_', ' ').replace('-', ' - ').split(' ')
                        len_lemma = len(lemma)

                        # exact match to the word, exact match to the token
                        # lemma, and stemmed match
                        lemma_indices = [[], [], []]
                        for i, t in enumerate(doc):
                            span = [t.text.lower() for t in doc[i:(i + len_lemma)]]
                            if span == lemma:
                                lemma_indices[0].append(i)
                            span_lemma = [t.lemma_.lower() for t in doc[i:(i + len_lemma)]]
                            if span_lemma == lemma:
                                lemma_indices[1].append(i)
                            if [stemmer.stem(t) for t in span] == [stemmer.stem(t) for t in lemma]:
                                lemma_indices[2].append(i)

                        # get the index
                        index = None
                        for ii in range(3):
                            if len(lemma_indices[ii]) == 1:
                                index = lemma_indices[ii][0]
                                break
                            elif len(lemma_indices[ii]) > 1:
                                break

                        if index is not None:
                            found = True
                            break

                    if found:
                        examples_to_write.append(
                            [' '.join(t.text for t in doc),
                             (index, index + len(lemma)), lemma_id, pos])
                    else:
                        n_examples_not_found += 1

    for key in wic_counts:
        print("WIC {}: {} of {}, {}".format(
            key, wic_counts[key], len(wic_unique_sentences[key]),
            wic_counts[key] / len(wic_unique_sentences[key])))
    print("total examples, examples considered, examples lemma not found: ",
          n_examples, n_examples_considered, n_examples_not_found)
    print("total definitions: ", n_definitions)

    # WRITE OUT TO FILE!!
    with JsonFile(output_file, 'w') as fout:
        for ei, e_write in enumerate(examples_to_write):
            tokens = e_write[0].split()
            start, end = e_write[1]
            out = []
            for i in range(0, start):
                t = {'token': tokens[i], 'pos': '', 'lemma': ''}
                out.append(t)
            t = {'token': ' '.join(tokens[start:end]),
                 'pos': e_write[3],
                 'lemma': e_write[2].partition('%')[0],
                 'senses': [e_write[2].partition('%')[2]],
                 'id': 'example_definition.{}'.format(ei)}
            out.append(t)
            for i in range(end, len(tokens)):
                t = {'token': tokens[i], 'pos': '', 'lemma': ''}
                out.append(t)
            fout.write(out)
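# Illustrative output record (values made up; the 'pos' value comes from
# WORDNET_TO_SEMCOR_POS_MAP and the sense key is the part of the lemma_id
# after '%'). For a gloss example "the able student" with lemma span (1, 2):
#
#   [{'token': 'the', 'pos': '', 'lemma': ''},
#    {'token': 'able', 'pos': <mapped pos>, 'lemma': 'able',
#     'senses': ['3:00:00'], 'id': 'example_definition.0'},
#    {'token': 'student', 'pos': '', 'lemma': ''}]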
def make_files_for_official_eval(model_archive_file, evaluation_files,
                                 output_file, cuda_device):
    archive = load_archive(model_archive_file)
    model = archive.model
    model.eval()
    if cuda_device != -1:
        model.cuda(cuda_device)

    def find_key(d, func):
        ret = None
        stack = [d]
        while len(stack) > 0 and ret is None:
            s = stack.pop()
            for k, v in s.items():
                if func(k, v):
                    ret = s
                    break
                elif isinstance(v, dict):
                    stack.append(v)
        return ret

    # load reader
    full_reader_params = copy.deepcopy(archive.config['dataset_reader'].as_dict())
    reader_params = find_key(full_reader_params,
                             lambda k, v: k == 'type' and v == 'wordnet_fine_grained')
    reader_params['is_training'] = False
    reader_params['should_remap_span_indices'] = True

    if 'extra_candidate_generators' in reader_params:
        candidate_generator_params = find_key(
            full_reader_params,
            lambda k, v: k == 'tokenizer_and_candidate_generator'
        )['tokenizer_and_candidate_generator']
        candidate_generator = TokenizerAndCandidateGenerator.from_params(
            Params(candidate_generator_params))

    reader_params = Params(reader_params)

    print("====================")
    print(reader_params.as_dict())
    print("====================")

    reader = DatasetReader.from_params(reader_params)

    synset_to_lemmas = {}
    for lemma_id, synset_id in reader.mention_generator._lemma_to_synset.items():
        if synset_id not in synset_to_lemmas:
            synset_to_lemmas[synset_id] = []
        synset_to_lemmas[synset_id].append(lemma_id)

    vocab_params = archive.config['vocabulary']
    vocab = Vocabulary.from_params(vocab_params)

    iterator = BasicIterator(batch_size=24)
    iterator.index_with(vocab)

    fout = open(output_file, 'w')

    for ds_file in evaluation_files:
        instances = reader.read(ds_file)

        # get the metadata ids from the raw file
        raw_lines = []
        with JsonFile(ds_file, 'r') as fin:
            for sentence in fin:
                raw_ids = [[token['id'], token['lemma']]
                           for token in sentence if 'senses' in token]
                if len(raw_ids) > 0:
                    raw_lines.append(raw_ids)

        raw_i = 0
        for batch in iterator(instances, num_epochs=1, shuffle=False):
            print(raw_i)
            if cuda_device > -1:
                b = move_to_device(batch, cuda_device)
            else:
                b = batch

            b['candidates'] = {'wordnet': {
                'candidate_entities': b.pop('candidate_entities'),
                'candidate_entity_priors': b.pop('candidate_entity_prior'),
                'candidate_segment_ids': b.pop('candidate_segment_ids'),
                'candidate_spans': b.pop('candidate_spans')}}
            gold_entities = b.pop('gold_entities')
            b['gold_entities'] = {'wordnet': gold_entities}

            if 'extra_candidates' in b:
                extra_candidates = b.pop('extra_candidates')
                seq_len = b['tokens']['tokens'].shape[1]
                bbb = []
                for e in extra_candidates:
                    for k in e.keys():
                        e[k]['candidate_segment_ids'] = \
                            [0] * len(e[k]['candidate_spans'])
                    ee = {'tokens': ['[CLS]'] * seq_len,
                          'segment_ids': [0] * seq_len,
                          'candidates': e}
                    ee_fields = candidate_generator.convert_tokens_candidates_to_fields(ee)
                    bbb.append(Instance(ee_fields))
                eb = Batch(bbb)
                eb.index_instances(vocab)
                padding_lengths = eb.get_padding_lengths()
                tensor_dict = eb.as_tensor_dict(padding_lengths)
                b['candidates'].update(tensor_dict['candidates'])
                if cuda_device > -1:
                    b = move_to_device(b, cuda_device)

            output = model(**b)

            # predicted entities is list of (batch_index, (start, end), entity_id)
            predicted_entities = model.soldered_kgs['wordnet'].entity_linker._decode(
                output['wordnet']['linking_scores'],
                b['candidates']['wordnet']['candidate_spans'],
                b['candidates']['wordnet']['candidate_entities']['ids'])

            # make output file
            predicted_entities_batch_indices = []
            batch_size = batch['tokens']['tokens'].shape[0]
            for k in range(batch_size):
                predicted_entities_batch_indices.append([])
            for b_index, start_end, eid in predicted_entities:
                try:
                    synset_id = vocab.get_token_from_index(eid, 'entity')
                except KeyError:
                    synset_id = vocab.get_token_from_index(eid, 'entity_wordnet')
                all_lemma_ids = synset_to_lemmas[synset_id]
                predicted_entities_batch_indices[b_index].append(all_lemma_ids)

            # output lines look like:
            #   semeval2013.d000.s001.t003 reader%1:19:00::
            for k in range(batch_size):
                raw_ids = raw_lines[raw_i]
                predicted_lemmas = predicted_entities_batch_indices[k]
                assert len(predicted_lemmas) == len(raw_ids)
                for (ii, gold_lemma), pl in zip(raw_ids, predicted_lemmas):
                    # get the predicted lemma_id
                    predicted_lemma_id = None
                    for pp in pl:
                        if pp.partition('%')[0] == gold_lemma:
                            predicted_lemma_id = pp
                    assert predicted_lemma_id is not None
                    line = "{} {}\n".format(ii, predicted_lemma_id)
                    fout.write(line)
                raw_i += 1

    fout.close()
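# Hedged usage sketch (archive and file names hypothetical). The output file
# is in the official scorer format, one "<instance_id> <lemma_id>" line per
# gold annotation, e.g. "semeval2013.d000.s001.t003 reader%1:19:00::":
#
#   make_files_for_official_eval('knowbert_wordnet.tar.gz',
#                                ['semeval2013.json'],
#                                'predictions.key.txt',
#                                cuda_device=-1)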
def _read(self, file_path: str) -> Iterable[Instance]:
    with JsonFile(cached_path(file_path), 'r') as fin:
        for example in fin:
            sentence = example['sentence']
            raw_tokens = self.tokenizer.bert_word_tokenizer.tokenize(sentence)

            tokens = []
            k = 0
            start_e1 = end_e1 = start_e2 = end_e2 = None
            while k < len(raw_tokens):
                if raw_tokens[k:(k + 3)] == ['<', 'e1', '>']:
                    start_e1 = len(tokens)
                    if self.entity_masking == 'entity_markers':
                        tokens.append('[e1start]')
                    k += 3
                elif raw_tokens[k:(k + 3)] == ['<', 'e2', '>']:
                    start_e2 = len(tokens)
                    if self.entity_masking == 'entity_markers':
                        tokens.append('[e2start]')
                    k += 3
                elif raw_tokens[k:(k + 4)] == ['<', '/', 'e1', '>']:
                    if self.entity_masking == 'entity_markers':
                        tokens.append('[e1end]')
                    end_e1 = len(tokens)
                    k += 4
                elif raw_tokens[k:(k + 4)] == ['<', '/', 'e2', '>']:
                    if self.entity_masking == 'entity_markers':
                        tokens.append('[e2end]')
                    end_e2 = len(tokens)
                    k += 4
                else:
                    tokens.append(raw_tokens[k])
                    k += 1

            assert (start_e1 is not None and end_e1 is not None
                    and start_e2 is not None and end_e2 is not None)

            tokens_and_candidates = self.tokenizer.tokenize_and_generate_candidates(
                ' '.join(tokens))

            # set the segment ids
            # offsets is the beginning offset for each original token
            if self.entity_masking == 'segment':
                offsets = [1] + tokens_and_candidates['offsets_a'][:-1]
                segment_ids = list(tokens_and_candidates['segment_ids'])
                for s, e, ii in [[start_e1, end_e1, 1], [start_e2, end_e2, 2]]:
                    ll = offsets[e] - offsets[s]
                    segment_ids[offsets[s]:offsets[e]] = [ii] * ll
                tokens_and_candidates['segment_ids'] = segment_ids

            fields = self.tokenizer.convert_tokens_candidates_to_fields(
                tokens_and_candidates)

            fields['sentence_id'] = MetadataField(str(example['sent_id']))
            fields['label_ids'] = LabelField(LABEL_MAP[example['label']],
                                             skip_indexing=True)

            # get the indices of the entity starts
            offsets = [1] + tokens_and_candidates['offsets_a'][:-1]
            idx1_offset = offsets[start_e1]
            idx2_offset = offsets[start_e2]
            fields['index_a'] = LabelField(idx1_offset, skip_indexing=True)
            fields['index_b'] = LabelField(idx2_offset, skip_indexing=True)

            yield Instance(fields)
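# Illustrative input record (SemEval-2010 Task 8 style, values made up; this
# matches what to_jsonl above writes). The reader strips the <e1>...</e1> /
# <e2>...</e2> tags, optionally replaces them with [e1start] / [e1end] /
# [e2start] / [e2end] marker tokens, and records the marker positions:
#
#   {"sentence": "The <e1>author</e1> wrote the <e2>book</e2> .",
#    "label": "Product-Producer(e2,e1)",
#    "sent_id": 1}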