Example #1
def embed_page_content(embedding,
                       token_idx_lookup,
                       page_content,
                       page_mention_infos=[]):
    page_content_with_mention_flags = reduce(_insert_mention_flags,
                                             page_mention_infos, page_content)
    tokens = parse_text_for_tokens(page_content_with_mention_flags)
    return tokens_to_embeddings(embedding, token_idx_lookup, tokens)
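The reduce in Example #1 folds each mention info into the page text by inserting flag tokens around the mention span, so the later parse_text_for_tokens call preserves mention boundaries. Below is a minimal, self-contained sketch of that pattern, assuming mention infos carry 'mention' and 'offset' keys as in the later examples; the flag strings and the _insert_mention_flags_sketch helper are hypothetical stand-ins, not the project's real implementation.

from functools import reduce

MENTION_START = ' MENTION_START '   # hypothetical flag token
MENTION_END = ' MENTION_END '       # hypothetical flag token

def _insert_mention_flags_sketch(content, mention_info):
    # Hypothetical stand-in: wrap the mention span in flag tokens so a later
    # tokenizer can recover the mention boundaries.
    start = mention_info['offset']
    end = start + len(mention_info['mention'])
    return (content[:start] + MENTION_START + content[start:end] +
            MENTION_END + content[end:])

page_content = 'Barack Obama was born in Hawaii'
page_mention_infos = [{'mention': 'Barack Obama', 'offset': 0},
                      {'mention': 'Hawaii', 'offset': 25}]
# Fold mentions in from the end of the text so earlier offsets stay valid
# (get_page_iobes below sorts by offset in reverse for the same reason).
flagged = reduce(_insert_mention_flags_sketch,
                 sorted(page_mention_infos, key=lambda m: m['offset'], reverse=True),
                 page_content)
print(flagged)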
Example #2
 def _get_token_ctr_by_entity_id(self, cursor: Cursor, token_idx_lookup):
     cursor.execute(
         'select e.id as entity_id, left(p.content, 2000) as text from entities e join pages p on e.text = p.title'
     )
     entity_desc_bow = {}
     for row in cursor.fetchall():
         tokens = parse_text_for_tokens(row['text'])
         text_idxs = [to_idx(token_idx_lookup, token) for token in tokens]
         entity_desc_bow[row['entity_id']] = dict(Counter(text_idxs))
     return entity_desc_bow
Example #3
def _get_desc_fs(cursor):
    stemmer = SnowballStemmer('english')
    cursor.execute(
        'select e.id as entity_id, left(p.content, 2000) as text from entities e join pages p on e.text = p.title'
    )
    entity_desc_bow = {}
    for row in cursor.fetchall():
        tokens = parse_text_for_tokens(row['text'])
        entity_desc_bow[row['entity_id']] = dict(
            Counter(stemmer.stem(token) for token in tokens))
    return entity_desc_bow
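Examples #2 and #3 build the same entity-description bag of words in two ways: keyed by token index via a lookup table, or keyed by the Snowball-stemmed token. The sketch below contrasts the two, using a naive whitespace tokenizer as a stand-in for parse_text_for_tokens and assuming every token is already in the lookup (the real to_idx helper presumably also handles unknown tokens).

from collections import Counter
from nltk.stem.snowball import SnowballStemmer

def tokenize_sketch(text):
    # Stand-in for parse_text_for_tokens; the real tokenizer is project-specific.
    return text.lower().split()

tokens = tokenize_sketch('dogs bark and barking dogs seldom bite')

# Example #2 style: counts keyed by token index.
token_idx_lookup = {token: idx for idx, token in enumerate(sorted(set(tokens)))}
bow_by_idx = dict(Counter(token_idx_lookup[token] for token in tokens))

# Example #3 style: counts keyed by stemmed surface form ('barking' and 'bark'
# collapse into the same key).
stemmer = SnowballStemmer('english')
bow_by_stem = dict(Counter(stemmer.stem(token) for token in tokens))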
Example #4
def get_page_iobes(page, mentions, mention_link_titles, mention_link_titles_preredirect=None):
  """Returns a list of triples/pairs describing the iobes of the page
based on `mentions` and `mention_link_titles`. ASSUMES `mentions` and
`mention_link_titles` are in the same order. An element is a pair if
the token is not part of a mention, and is a triple otherwise.
  """
  page_iobes = []
  page_content = page['content']
  flagged_page = reduce(_insert_mention_flags,
                        sorted(mentions, key=lambda pair: pair['offset'], reverse=True),
                        page_content)
  sentences = parse_for_sentences(flagged_page)
  sentences = _merge_sentences_with_straddling_mentions(sentences)
  link_title_ctr = 0
  in_a_mention = False
  for sentence in sentences:
    sentence_tokens = parse_text_for_tokens(sentence)
    sentence_iobes = []
    for token_ctr, current_token in enumerate(sentence_tokens):
      previous_token = sentence_tokens[token_ctr - 1] if token_ctr != 0 else None
      next_token = sentence_tokens[token_ctr + 1] if token_ctr + 1 != len(sentence_tokens) else None
      if current_token == mention_start_token or current_token == mention_end_token:
        continue
      elif previous_token == mention_start_token and next_token == mention_end_token:
        iobes = 'S'
      elif previous_token == mention_start_token:
        iobes = 'B'
        in_a_mention = True
      elif next_token == mention_end_token:
        iobes = 'E'
        in_a_mention = False
      elif in_a_mention:
        iobes = 'I'
      else:
        iobes = 'O'
      if iobes == 'O':
        sentence_iobes.append([current_token, iobes])
      else:
        if mention_link_titles_preredirect:
          sentence_iobes.append([current_token,
                                 u.escape_title(mention_link_titles_preredirect[link_title_ctr]),
                                 u.escape_title(mention_link_titles[link_title_ctr]),
                                 iobes])
        else:
          sentence_iobes.append([current_token,
                                 u.escape_title(mention_link_titles[link_title_ctr]),
                                 iobes])
        if iobes in ['S', 'E']:
          link_title_ctr += 1
    page_iobes.append(sentence_iobes)
  return page_iobes
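To make the docstring concrete, here is the shape of one sentence of page_iobes for a hypothetical page "Barack Obama was born in Hawaii" with mentions "Barack Obama" and "Hawaii" linking to titles of the same names and no pre-redirect titles; it assumes u.escape_title replaces spaces with underscores, which is project-specific.

# Illustrative output only, not produced by running this listing:
#
#   [['Barack', 'Barack_Obama', 'B'],
#    ['Obama',  'Barack_Obama', 'E'],
#    ['was', 'O'],
#    ['born', 'O'],
#    ['in', 'O'],
#    ['Hawaii', 'Hawaii', 'S']]
#
# Tokens outside a mention get a [token, 'O'] pair; tokens inside a mention get
# the escaped link title plus a B/I/E/S tag, and link_title_ctr advances each
# time a mention ends ('E') or is a single-token mention ('S').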
 def __init__(self,
              cursor,
              token_idx_lookup,
              full_token_idx_lookup,
              page_ids,
              lookups_path,
              idf_path,
              train_size,
              txt_dataset_path=None,
              pkl_dataset_prefix=None):
     super().__init__(cursor,
                      token_idx_lookup,
                      full_token_idx_lookup,
                      lookups_path,
                      idf_path,
                      train_size,
                      txt_dataset_path,
                      pkl_dataset_prefix=pkl_dataset_prefix)
     if txt_dataset_path is not None: return
     self.page_content_lim = 2000
     self.cursor = cursor
     self.page_ids = page_ids
     self.document_lookup = self.get_document_lookup(page_ids)
     self.mention_infos = self.get_mention_infos(page_ids)
     self.mentions = [info['mention'] for info in self.mention_infos]
     self.labels = [info['entity_id'] for info in self.mention_infos]
     self.mention_doc_id = [info['page_id'] for info in self.mention_infos]
     self.mention_sentences = get_mention_sentences_from_infos(
         self.document_lookup, self.mention_infos)
     self.mention_fs = [
         self._to_f(sentence) for sentence in self.mention_sentences
     ]
     self.mention_fs_unstemmed = [
         self._to_f(sentence, stem_p=False)
         for sentence in self.mention_sentences
     ]
     self.document_lookup = {
         page_id: parse_text_for_tokens(doc[:self.page_content_lim])
         for page_id, doc in self.document_lookup.items()
     }
     self.page_f_lookup = {
         page_id: self._to_f(tokens)
         for page_id, tokens in self.document_lookup.items()
     }
     self.page_f_lookup_unstemmed = {
         page_id: self._to_f(tokens, stem_p=False)
         for page_id, tokens in self.document_lookup.items()
     }
     self._post_init()
 def _get_batch_page_token_cnts_lookup(self, page_ids):
     lim = self.page_content_lim
     lookup = {}
     for page_id in page_ids:
         if page_id not in self.to_entity_id:
             page_content = self._page_content_lookup[page_id]
             if len(page_content.strip()) > 5:
                 lookup[page_id] = dict(
                     Counter(
                         u.to_idx(self.token_idx_lookup, token) for token in
                         parse_text_for_tokens(page_content[:lim])))
         else:
             entity_id = self.to_entity_id[page_id]
             lookup[page_id] = self.token_ctr_by_entity_id[entity_id]
     return lookup
 def __init__(self,
              cursor,
              token_idx_lookup,
              full_token_idx_lookup,
              conll_path,
              lookups_path,
              idf_path,
              train_size,
              txt_dataset_path=None):
     super().__init__(cursor, token_idx_lookup, full_token_idx_lookup,
                      lookups_path, idf_path, train_size, txt_dataset_path)
     if txt_dataset_path is not None: return
     with open(conll_path, 'r') as fh:
         lines = fh.read().strip().split('\n')[:-1]
     self.documents = get_documents(lines)
     self.mentions = get_mentions(lines)
     self.entity_page_ids = get_entity_page_ids(lines)
     self.labels = from_page_ids_to_entity_ids(self.cursor,
                                               self.entity_page_ids)
     self.mention_doc_id = get_doc_id_per_mention(lines)
     self.mentions_by_doc_id = get_mentions_by_doc_id(lines)
     self.mention_sentences = get_mention_sentences(self.documents,
                                                    self.mentions)
     self.document_lookup = self.documents
     self.mention_fs = [
         self._to_f(sentence) for sentence in self.mention_sentences
     ]
     self.mention_fs_unstemmed = [
         self._to_f(sentence, stem_p=False)
         for sentence in self.mention_sentences
     ]
     self.document_lookup = [
         parse_text_for_tokens(doc) for doc in self.document_lookup
     ]
     self.page_f_lookup = [self._to_f(doc) for doc in self.document_lookup]
     self.page_f_lookup_unstemmed = [
         self._to_f(tokens, stem_p=False) for tokens in self.document_lookup
     ]
     self._post_init()
 def __init__(self,
              cursor,
              entity_candidates_prior,
              embedding,
              token_idx_lookup,
              num_entities,
              num_candidates,
              entity_label_lookup,
              path='./AIDA-YAGO2-dataset.tsv',
              use_wiki2vec=False,
              use_sum_encoder=False):
   self.cursor = cursor
   self.entity_candidates_prior = entity_candidates_prior
   self.embedding = embedding
   self.token_idx_lookup = token_idx_lookup
   self.num_entities = num_entities
   self.num_candidates = num_candidates
   with open(path, 'r') as fh:
     self.lines = fh.read().strip().split('\n')[:-1]
   self.documents = get_documents(self.lines)
   self.embedded_documents = [embed_page_content(self.embedding, self.token_idx_lookup, document)
                              for document in self.documents]
   self.mentions = get_mentions(self.lines)
   self.sentence_splits = get_splits(self.documents, self.mentions)
   self.mention_sentences = get_mention_sentences(self.documents, self.mentions)
   self.entity_page_ids = get_entity_page_ids(self.lines)
   self.labels = from_page_ids_to_entity_ids(cursor, self.entity_page_ids)
   self.with_label = [i for i, x in enumerate(self.labels) if x != -1]
   self.mention_doc_id = get_doc_id_per_mention(self.lines)
   self.mentions_by_doc_id = get_mentions_by_doc_id(self.lines)
   self.entity_label_lookup = entity_label_lookup
   self.entity_id_lookup = {int(label): entity_id for entity_id, label in self.entity_label_lookup.items()}
   self.use_wiki2vec = use_wiki2vec
   self.prior_approx_mapping = u.get_prior_approx_mapping(self.entity_candidates_prior)
   self.use_sum_encoder = use_sum_encoder
   self.stemmer = SnowballStemmer('english')
   self.page_token_cnts_lookup = [dict(Counter(u.to_idx(self.token_idx_lookup, self._stem(token))
                                               for token in parse_text_for_tokens(page_content)))
                                  for page_content in self.documents]
Example #9
 def _get_bow(self, doc_id):
     text = self._get_doc_text(doc_id)
     tokenized = parse_text_for_tokens(text)
     counts = dict(Counter(tokenized))
     return counts