Code example #1
 def _get_batch_mention_infos(self, closeby_page_ids):
     self._candidate_strs_lookup = {}
     mention_infos = {}
     mentions_covered = set()
     mentions_by_page_id = self._get_mention_infos_by_page_id(closeby_page_ids)
     candidate_ids = []
     for page_id, mentions in mentions_by_page_id.items():
         for mention_info in mentions:
             # Fetch candidates only once per distinct mention string.
             if mention_info['mention'] in mentions_covered:
                 continue
             mentions_covered.add(mention_info['mention'])
             label = self.entity_label_lookup[mention_info['entity_id']]
             candidate_ids.extend(
                 self._get_candidate_ids(mention_info['mention'],
                                         label).tolist())
         self._mentions_per_page_ctr[page_id] = len(mentions)
         mention_infos.update({mention['mention_id']: mention
                               for mention in mentions})
     # Resolve candidate ids to entity strings, querying the database in
     # chunks of at most 10000 ids at a time.
     candidate_strs = u.chunk_apply_at_lim(
         lambda ids: get_candidate_strs(self.cursor, ids),
         [self.entity_id_lookup[cand_id] for cand_id in candidate_ids],
         10000)
     self._candidate_strs_lookup.update(dict(zip(candidate_ids, candidate_strs)))
     return mention_infos
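
The `u.chunk_apply_at_lim` call above batches the candidate-string lookup so that no single `get_candidate_strs` query receives more than 10000 ids. The helper itself is not shown in this snippet; here is a minimal sketch of the behavior its call site implies (the exact semantics are an assumption):

 def chunk_apply_at_lim(fn, items, lim):
     # Hypothetical sketch: apply fn to successive slices of at most
     # `lim` items and concatenate the per-chunk results in order.
     results = []
     for start in range(0, len(items), lim):
         results.extend(fn(items[start:start + lim]))
     return results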
Code example #2
 def _getitem_sum_encoder(self, idx):
   idx = self.with_label[idx]
   # -1 marks an entity id that is missing from the label lookup.
   label = self.entity_label_lookup.get(self.labels[idx], -1)
   mention = self.mentions[idx]
   candidate_ids = get_candidate_ids(self.entity_candidates_prior,
                                     self.prior_approx_mapping,
                                     self.num_entities,
                                     self.num_candidates,
                                     mention,
                                     label)
   candidates = get_candidate_strs(
       self.cursor,
       [self.entity_id_lookup[cand_id] for cand_id in candidate_ids.tolist()])
   return {'mention_sentence': self.mention_sentences[idx],
           'page_token_cnts': self.page_token_cnts_lookup[self.mention_doc_id[idx]],
           'label': label,
           'p_prior': get_p_prior(self.entity_candidates_prior,
                                  self.prior_approx_mapping,
                                  mention,
                                  candidate_ids),
           'candidate_ids': candidate_ids,
           # Normalized edit-distance similarity between the mention and
           # each candidate string.
           'candidate_mention_sim': torch.tensor([Levenshtein.ratio(mention, candidate)
                                                  for candidate in candidates])}
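
Datasets with a `_getitem_*` method like this one are typically consumed through PyTorch's DataLoader. Because fields such as 'mention_sentence' are not fixed-size tensors, the default collation cannot stack them; one common workaround is a pass-through collate_fn. A usage sketch (the `dataset` variable and the collation policy are assumptions, not part of the project shown):

 from torch.utils.data import DataLoader

 def collate_samples(batch):
     # Hypothetical collate_fn: keep each field as a per-batch list
     # instead of stacking possibly variable-length values.
     return {key: [sample[key] for sample in batch] for key in batch[0]}

 loader = DataLoader(dataset, batch_size=32, collate_fn=collate_samples)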
Code example #3
 def _getitem_wiki2vec(self, idx):
   idx = self.with_label[idx]
   # -1 marks an entity id that is missing from the label lookup.
   label = self.entity_label_lookup.get(self.labels[idx], -1)
   mention = self.mentions[idx]
   candidate_ids = get_candidate_ids(self.entity_candidates_prior,
                                     self.prior_approx_mapping,
                                     self.num_entities,
                                     self.num_candidates,
                                     mention,
                                     label)
   # Context for the mention: the nouns of the containing document.
   bag_of_nouns = get_bag_of_nouns(self.documents[self.mention_doc_id[idx]])
   candidates = get_candidate_strs(
       self.cursor,
       [self.entity_id_lookup[cand_id] for cand_id in candidate_ids.tolist()])
   return {'label': label,
           'bag_of_nouns': bag_of_nouns,
           'p_prior': get_p_prior(self.entity_candidates_prior,
                                  self.prior_approx_mapping,
                                  mention,
                                  candidate_ids),
           'candidate_ids': candidate_ids,
           'candidate_mention_sim': torch.tensor([Levenshtein.ratio(mention, candidate)
                                                  for candidate in candidates])}
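
`get_bag_of_nouns` is not defined in this snippet; from its use here it reduces the document containing the mention to the nouns it contains, which serve as context features. A minimal sketch of such a function using NLTK (an illustration of the idea, not the project's actual implementation):

 import nltk

 def get_bag_of_nouns(document):
     # Hypothetical sketch: POS-tag the text and keep tokens whose
     # Penn Treebank tag marks a noun (NN, NNS, NNP, NNPS). Requires
     # the 'punkt' and 'averaged_perceptron_tagger' NLTK data packages.
     tokens = nltk.word_tokenize(document)
     return [token for token, tag in nltk.pos_tag(tokens)
             if tag.startswith('NN')]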
Code example #4
File: conll_dataset.py  Project: dmh43/entity-linking
 def __getitem__(self, idx):
   idx = self.with_label[idx]
   # Use .get with a default rather than `.get(...) or -1`: the `or`
   # idiom would also replace a legitimate label of 0 with -1.
   label = self.entity_label_lookup.get(self.labels[idx], -1)
   mention = self.mentions[idx]
   candidate_ids = get_candidate_ids(self.entity_candidates_prior,
                                     self.num_entities,
                                     self.num_candidates,
                                     mention,
                                     label)
   candidates = get_candidate_strs(
       self.cursor,
       [self.entity_id_lookup[cand_id] for cand_id in candidate_ids.tolist()])
   return {'sentence_splits': self.sentence_splits[idx],
           'label': label,
           'embedded_page_content': self.embedded_documents[self.mention_doc_id[idx]],
           # Embed the concatenation of every mention string occurring
           # in this document.
           'entity_page_mentions': embed_page_content(
               self.embedding,
               self.token_idx_lookup,
               ' '.join(self.mentions_by_doc_id[self.mention_doc_id[idx]])),
           'p_prior': get_p_prior(self.entity_candidates_prior, mention, candidate_ids),
           'candidate_ids': candidate_ids,
           'candidate_mention_sim': torch.tensor([Levenshtein.ratio(mention, candidate)
                                                  for candidate in candidates])}
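
All four examples score 'candidate_mention_sim' with `Levenshtein.ratio` from the python-Levenshtein package, a normalized similarity in [0, 1] derived from edit distance between the mention and each candidate string:

 import Levenshtein

 Levenshtein.ratio('Paris', 'Paris')         # 1.0: identical strings
 Levenshtein.ratio('Paris', 'Paris, Texas')  # ~0.59: partial overlap
 Levenshtein.ratio('Paris', 'Tokyo')         # 0.0: no characters in common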