def embed_page_content(embedding, token_idx_lookup, page_content, page_mention_infos=[]):
  page_content_with_mention_flags = reduce(_insert_mention_flags,
                                           page_mention_infos,
                                           page_content)
  tokens = parse_text_for_tokens(page_content_with_mention_flags)
  return tokens_to_embeddings(embedding, token_idx_lookup, tokens)
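# Hypothetical usage sketch (the literal text and the mention-info keys shown
# here are assumptions for illustration, not fixed by this module): given an
# embedding module and a token-to-index lookup, `embed_page_content` flags the
# mention spans, tokenizes the flagged text, and returns one embedding per token.
#
#   page_embeds = embed_page_content(
#       embedding,
#       token_idx_lookup,
#       page_content='Berlin is the capital of Germany.',
#       page_mention_infos=[{'offset': 0, 'mention': 'Berlin'}])
#   # page_embeds is whatever tokens_to_embeddings returns, e.g. one vector per
#   # token produced by parse_text_for_tokens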
def _get_token_ctr_by_entity_id(self, cursor: Cursor, token_idx_lookup):
  cursor.execute(
    'select e.id as entity_id, left(p.content, 2000) as text from entities e join pages p on e.text = p.title'
  )
  entity_desc_bow = {}
  for row in cursor.fetchall():
    tokens = parse_text_for_tokens(row['text'])
    text_idxs = [to_idx(token_idx_lookup, token) for token in tokens]
    entity_desc_bow[row['entity_id']] = dict(Counter(text_idxs))
  return entity_desc_bow
def _get_desc_fs(cursor):
  stemmer = SnowballStemmer('english')
  cursor.execute(
    'select e.id as entity_id, left(p.content, 2000) as text from entities e join pages p on e.text = p.title'
  )
  entity_desc_bow = {}
  for row in cursor.fetchall():
    tokens = parse_text_for_tokens(row['text'])
    entity_desc_bow[row['entity_id']] = dict(
      Counter(stemmer.stem(token) for token in tokens))
  return entity_desc_bow
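# Note that the two description lookups above run the same query and differ
# only in how they key the counts (values below are made up for illustration):
# _get_token_ctr_by_entity_id counts token *indexes*, e.g.
#   {1234: {17: 2, 980: 1}}
# while _get_desc_fs counts *stemmed surface forms*, e.g.
#   {1234: {'run': 2, 'fast': 1}}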
def get_page_iobes(page, mentions, mention_link_titles, mention_link_titles_preredirect=None):
  """Returns a list of sentences, each a list of per-token IOBES entries for the
  page, based on `mentions` and `mention_link_titles`.

  ASSUMES `mentions` and `mention_link_titles` are in the same order.

  An entry is a `[token, 'O']` pair if the token is not part of a mention;
  otherwise it also carries the escaped link title and the IOBES tag, plus the
  escaped pre-redirect title when `mention_link_titles_preredirect` is given.
  """
  page_iobes = []
  page_content = page['content']
  flagged_page = reduce(_insert_mention_flags,
                        sorted(mentions, key=lambda pair: pair['offset'], reverse=True),
                        page_content)
  sentences = parse_for_sentences(flagged_page)
  sentences = _merge_sentences_with_straddling_mentions(sentences)
  link_title_ctr = 0
  in_a_mention = False
  for sentence in sentences:
    sentence_tokens = parse_text_for_tokens(sentence)
    sentence_iobes = []
    for token_ctr, current_token in enumerate(sentence_tokens):
      previous_token = sentence_tokens[token_ctr - 1] if token_ctr != 0 else None
      next_token = sentence_tokens[token_ctr + 1] if token_ctr + 1 != len(sentence_tokens) else None
      if current_token == mention_start_token or current_token == mention_end_token:
        continue
      elif previous_token == mention_start_token and next_token == mention_end_token:
        iobes = 'S'
      elif previous_token == mention_start_token:
        iobes = 'B'
        in_a_mention = True
      elif next_token == mention_end_token:
        iobes = 'E'
        in_a_mention = False
      elif in_a_mention:
        iobes = 'I'
      else:
        iobes = 'O'
      if iobes == 'O':
        sentence_iobes.append([current_token, iobes])
      else:
        if mention_link_titles_preredirect:
          sentence_iobes.append([current_token,
                                 u.escape_title(mention_link_titles_preredirect[link_title_ctr]),
                                 u.escape_title(mention_link_titles[link_title_ctr]),
                                 iobes])
        else:
          sentence_iobes.append([current_token,
                                 u.escape_title(mention_link_titles[link_title_ctr]),
                                 iobes])
        if iobes in ['S', 'E']:
          link_title_ctr += 1
    page_iobes.append(sentence_iobes)
  return page_iobes
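# Illustrative output shape (mention flag tokens shown symbolically, since the
# actual values of mention_start_token/mention_end_token are defined elsewhere):
# for a flagged sentence like "<mention-start> Paris <mention-end> is nice ."
# with link title 'Paris', the corresponding sentence entry would be
#
#   [['Paris', 'Paris', 'S'], ['is', 'O'], ['nice', 'O'], ['.', 'O']]
#
# When `mention_link_titles_preredirect` is given, the escaped pre-redirect
# title is inserted between the token and the resolved title, so each mention
# entry becomes a quadruple.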
def __init__(self, cursor, token_idx_lookup, full_token_idx_lookup, page_ids,
             lookups_path, idf_path, train_size, txt_dataset_path=None,
             pkl_dataset_prefix=None):
  super().__init__(cursor,
                   token_idx_lookup,
                   full_token_idx_lookup,
                   lookups_path,
                   idf_path,
                   train_size,
                   txt_dataset_path,
                   pkl_dataset_prefix=pkl_dataset_prefix)
  if txt_dataset_path is not None:
    return
  self.page_content_lim = 2000
  self.cursor = cursor
  self.page_ids = page_ids
  self.document_lookup = self.get_document_lookup(page_ids)
  self.mention_infos = self.get_mention_infos(page_ids)
  self.mentions = [info['mention'] for info in self.mention_infos]
  self.labels = [info['entity_id'] for info in self.mention_infos]
  self.mention_doc_id = [info['page_id'] for info in self.mention_infos]
  self.mention_sentences = get_mention_sentences_from_infos(self.document_lookup,
                                                            self.mention_infos)
  self.mention_fs = [self._to_f(sentence) for sentence in self.mention_sentences]
  self.mention_fs_unstemmed = [self._to_f(sentence, stem_p=False)
                               for sentence in self.mention_sentences]
  self.document_lookup = {page_id: parse_text_for_tokens(doc[:self.page_content_lim])
                          for page_id, doc in self.document_lookup.items()}
  self.page_f_lookup = {page_id: self._to_f(tokens)
                        for page_id, tokens in self.document_lookup.items()}
  self.page_f_lookup_unstemmed = {page_id: self._to_f(tokens, stem_p=False)
                                  for page_id, tokens in self.document_lookup.items()}
  self._post_init()
def _get_batch_page_token_cnts_lookup(self, page_ids):
  lim = self.page_content_lim
  lookup = {}
  for page_id in page_ids:
    if page_id not in self.to_entity_id:
      # No entry in to_entity_id: count token indexes directly from the
      # (truncated) page content, skipping near-empty pages
      page_content = self._page_content_lookup[page_id]
      if len(page_content.strip()) > 5:
        lookup[page_id] = dict(
          Counter(u.to_idx(self.token_idx_lookup, token)
                  for token in parse_text_for_tokens(page_content[:lim])))
    else:
      # Otherwise reuse the precomputed per-entity token counts
      entity_id = self.to_entity_id[page_id]
      lookup[page_id] = self.token_ctr_by_entity_id[entity_id]
  return lookup
def __init__(self, cursor, token_idx_lookup, full_token_idx_lookup, conll_path,
             lookups_path, idf_path, train_size, txt_dataset_path=None):
  super().__init__(cursor,
                   token_idx_lookup,
                   full_token_idx_lookup,
                   lookups_path,
                   idf_path,
                   train_size,
                   txt_dataset_path)
  if txt_dataset_path is not None:
    return
  with open(conll_path, 'r') as fh:
    lines = fh.read().strip().split('\n')[:-1]
  self.documents = get_documents(lines)
  self.mentions = get_mentions(lines)
  self.entity_page_ids = get_entity_page_ids(lines)
  self.labels = from_page_ids_to_entity_ids(self.cursor, self.entity_page_ids)
  self.mention_doc_id = get_doc_id_per_mention(lines)
  self.mentions_by_doc_id = get_mentions_by_doc_id(lines)
  self.mention_sentences = get_mention_sentences(self.documents, self.mentions)
  self.document_lookup = self.documents
  self.mention_fs = [self._to_f(sentence) for sentence in self.mention_sentences]
  self.mention_fs_unstemmed = [self._to_f(sentence, stem_p=False)
                               for sentence in self.mention_sentences]
  self.document_lookup = [parse_text_for_tokens(doc) for doc in self.document_lookup]
  self.page_f_lookup = [self._to_f(doc) for doc in self.document_lookup]
  self.page_f_lookup_unstemmed = [self._to_f(tokens, stem_p=False)
                                  for tokens in self.document_lookup]
  self._post_init()
def __init__(self, cursor, entity_candidates_prior, embedding, token_idx_lookup,
             num_entities, num_candidates, entity_label_lookup,
             path='./AIDA-YAGO2-dataset.tsv', use_wiki2vec=False, use_sum_encoder=False):
  self.cursor = cursor
  self.entity_candidates_prior = entity_candidates_prior
  self.embedding = embedding
  self.token_idx_lookup = token_idx_lookup
  self.num_entities = num_entities
  self.num_candidates = num_candidates
  with open(path, 'r') as fh:
    self.lines = fh.read().strip().split('\n')[:-1]
  self.documents = get_documents(self.lines)
  self.embedded_documents = [embed_page_content(self.embedding, self.token_idx_lookup, document)
                             for document in self.documents]
  self.mentions = get_mentions(self.lines)
  self.sentence_splits = get_splits(self.documents, self.mentions)
  self.mention_sentences = get_mention_sentences(self.documents, self.mentions)
  self.entity_page_ids = get_entity_page_ids(self.lines)
  self.labels = from_page_ids_to_entity_ids(cursor, self.entity_page_ids)
  self.with_label = [i for i, x in enumerate(self.labels) if x != -1]
  self.mention_doc_id = get_doc_id_per_mention(self.lines)
  self.mentions_by_doc_id = get_mentions_by_doc_id(self.lines)
  self.entity_label_lookup = entity_label_lookup
  self.entity_id_lookup = {int(label): entity_id
                           for entity_id, label in self.entity_label_lookup.items()}
  self.use_wiki2vec = use_wiki2vec
  self.prior_approx_mapping = u.get_prior_approx_mapping(self.entity_candidates_prior)
  self.use_sum_encoder = use_sum_encoder
  self.stemmer = SnowballStemmer('english')
  self.page_token_cnts_lookup = [
    dict(Counter(u.to_idx(self.token_idx_lookup, self._stem(token))
                 for token in parse_text_for_tokens(page_content)))
    for page_content in self.documents
  ]
def _get_bow(self, doc_id):
  text = self._get_doc_text(doc_id)
  tokenized = parse_text_for_tokens(text)
  counts = dict(Counter(tokenized))
  return counts
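# Worked example (assuming parse_text_for_tokens gives a plain word split for
# this input): a document whose text tokenizes to
#   ['the', 'cat', 'sat', 'on', 'the', 'mat']
# yields _get_bow(doc_id) == {'the': 2, 'cat': 1, 'sat': 1, 'on': 1, 'mat': 1}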