def _get_batch_mention_infos(self, closeby_page_ids):
  self._candidate_strs_lookup = {}
  mention_infos = {}
  mentions_covered = set()
  mentions_by_page_id = self._get_mention_infos_by_page_id(closeby_page_ids)
  candidate_ids = []
  for page_id, mentions in mentions_by_page_id.items():
    for mention_info in mentions:
      if mention_info['mention'] in mentions_covered:
        continue
      mentions_covered.add(mention_info['mention'])
      label = self.entity_label_lookup[mention_info['entity_id']]
      candidate_ids.extend(self._get_candidate_ids(mention_info['mention'],
                                                   label).tolist())
    self._mentions_per_page_ctr[page_id] = len(mentions)
    mention_infos.update({mention['mention_id']: mention for mention in mentions})
  self._candidate_strs_lookup.update(
      dict(zip(candidate_ids,
               u.chunk_apply_at_lim(lambda ids: get_candidate_strs(self.cursor, ids),
                                    [self.entity_id_lookup[cand_id]
                                     for cand_id in candidate_ids],
                                    10000))))
  return mention_infos
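# Hedged sketch (not from the original module): `u.chunk_apply_at_lim` is
# assumed to split a list into chunks of at most `lim` items, apply `fn` to
# each chunk, and concatenate the results, so the lookup above stays within
# the database's query-size limits while the output remains aligned with
# `candidate_ids` for the zip. A minimal implementation under that assumption:
def chunk_apply_at_lim(fn, items, lim):
  results = []
  for start in range(0, len(items), lim):
    # Apply `fn` to one chunk at a time and flatten the chunked results.
    results.extend(fn(items[start:start + lim]))
  return results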
def _getitem_sum_encoder(self, idx):
  idx = self.with_label[idx]
  label = self.entity_label_lookup.get(self.labels[idx], -1)
  mention = self.mentions[idx]
  candidate_ids = get_candidate_ids(self.entity_candidates_prior,
                                    self.prior_approx_mapping,
                                    self.num_entities,
                                    self.num_candidates,
                                    mention,
                                    label)
  candidates = get_candidate_strs(self.cursor,
                                  [self.entity_id_lookup[cand_id]
                                   for cand_id in candidate_ids.tolist()])
  return {'mention_sentence': self.mention_sentences[idx],
          'page_token_cnts': self.page_token_cnts_lookup[self.mention_doc_id[idx]],
          'label': label,
          'p_prior': get_p_prior(self.entity_candidates_prior,
                                 self.prior_approx_mapping,
                                 mention,
                                 candidate_ids),
          'candidate_ids': candidate_ids,
          'candidate_mention_sim': torch.tensor([Levenshtein.ratio(mention, candidate)
                                                 for candidate in candidates])}
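# Hedged sketch: `get_p_prior` is assumed to turn mention-conditional entity
# counts (`entity_candidates_prior[mention][entity_id]`) into a normalized
# prior over the sampled `candidate_ids`; `prior_approx_mapping` is assumed to
# supply fallback surface forms for unseen mentions. The helper below is
# illustrative only — the real signature and fallback logic live elsewhere in
# the repo.
import torch

def p_prior_from_counts(counts_for_mention, candidate_ids):
  # counts_for_mention: dict mapping entity_id -> raw co-occurrence count.
  counts = torch.tensor([counts_for_mention.get(cand_id, 0)
                         for cand_id in candidate_ids.tolist()],
                        dtype=torch.float)
  total = counts.sum()
  # Normalize to a probability vector; all-zero counts stay all-zero.
  return counts / total if total > 0 else counts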
def _getitem_wiki2vec(self, idx):
  idx = self.with_label[idx]
  label = self.entity_label_lookup.get(self.labels[idx], -1)
  mention = self.mentions[idx]
  candidate_ids = get_candidate_ids(self.entity_candidates_prior,
                                    self.prior_approx_mapping,
                                    self.num_entities,
                                    self.num_candidates,
                                    mention,
                                    label)
  bag_of_nouns = get_bag_of_nouns(self.documents[self.mention_doc_id[idx]])
  candidates = get_candidate_strs(self.cursor,
                                  [self.entity_id_lookup[cand_id]
                                   for cand_id in candidate_ids.tolist()])
  return {'label': label,
          'bag_of_nouns': bag_of_nouns,
          'p_prior': get_p_prior(self.entity_candidates_prior,
                                 self.prior_approx_mapping,
                                 mention,
                                 candidate_ids),
          'candidate_ids': candidate_ids,
          'candidate_mention_sim': torch.tensor([Levenshtein.ratio(mention, candidate)
                                                 for candidate in candidates])}
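# Illustration: `Levenshtein.ratio` (from the python-Levenshtein package)
# returns a normalized string similarity in [0, 1]; the getters above use it
# to score surface-form overlap between the mention and each candidate title.
# The example mention and candidates below are illustrative only.
import Levenshtein
import torch

example_mention = 'New York'
example_candidates = ['New York City', 'New York (state)', 'York']
example_sim = torch.tensor([Levenshtein.ratio(example_mention, candidate)
                            for candidate in example_candidates])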
def __getitem__(self, idx):
  idx = self.with_label[idx]
  # Use the lookup's default arg rather than `.get(...) or -1`, which would
  # silently replace a valid label of 0 with -1.
  label = self.entity_label_lookup.get(self.labels[idx], -1)
  mention = self.mentions[idx]
  candidate_ids = get_candidate_ids(self.entity_candidates_prior,
                                    self.num_entities,
                                    self.num_candidates,
                                    mention,
                                    label)
  candidates = get_candidate_strs(self.cursor,
                                  [self.entity_id_lookup[cand_id]
                                   for cand_id in candidate_ids.tolist()])
  return {'sentence_splits': self.sentence_splits[idx],
          'label': label,
          'embedded_page_content': self.embedded_documents[self.mention_doc_id[idx]],
          'entity_page_mentions': embed_page_content(
              self.embedding,
              self.token_idx_lookup,
              ' '.join(self.mentions_by_doc_id[self.mention_doc_id[idx]])),
          'p_prior': get_p_prior(self.entity_candidates_prior, mention, candidate_ids),
          'candidate_ids': candidate_ids,
          'candidate_mention_sim': torch.tensor([Levenshtein.ratio(mention, candidate)
                                                 for candidate in candidates])}
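# Hedged usage sketch: the __getitem__ variants above return dicts with ragged
# per-mention fields ('candidate_ids' and 'candidate_mention_sim' vary in
# length with the candidate set), so batching through torch's DataLoader is
# assumed to need a custom collate_fn instead of default tensor stacking.
# `dataset` below stands for any instance exposing these getters.
from torch.utils.data import DataLoader

def collate_mention_samples(batch):
  # Keep every field as a per-sample list; downstream code can pad or stack
  # the fixed-size pieces as needed.
  return {key: [sample[key] for sample in batch] for key in batch[0]}

# loader = DataLoader(dataset, batch_size=32, collate_fn=collate_mention_samples)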