Code Example #1
File: examples.py Project: gpldecha/cs-exercises
def example_path_search():
    # Shortest-path search on an n x n grid graph, from cell (0, 0) to (8, 8).
    n = 10
    adj_matrix = create_grid_adj(n)

    start_idx = to_idx(0, 0, n)
    end_idx = to_idx(8, 8, n)

    # path = breadth_first_search(start=start_idx, target=end_idx, adjacency=adj_matrix)
    # Note: the project spells Dijkstra's algorithm as djikstra_search.
    path = djikstra_search(start=start_idx,
                           target=end_idx,
                           adjacency=adj_matrix)
    plot_path(path, start_idx, end_idx, n)
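Here `to_idx(row, col, n)` flattens a 2-D grid cell into a single node index. Its definition is not shown on this page; a minimal sketch, assuming row-major ordering on an n x n grid, would be:

def to_idx(row, col, n):
    # Hypothetical sketch: row-major flattening, so cell (8, 8) on a
    # 10 x 10 grid maps to index 88.
    return row * n + col

assert to_idx(0, 0, 10) == 0
assert to_idx(8, 8, 10) == 88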
Code Example #2
File: examples.py Project: gpldecha/cs-exercises
def example_floyd_walsh():
    # All-pairs shortest paths with Floyd-Warshall, then reconstruct the
    # (0, 0) -> (8, 8) path from the successor matrix next_.
    n = 10
    adj_matrix = create_grid_adj(n)

    dist, next_ = floyd_warshall(adj_matrix)

    start_idx = to_idx(0, 0, n)
    end_idx = to_idx(8, 8, n)
    path = get_path(next_, start_idx, end_idx)

    plot_path(path, start_idx, end_idx, n)

    print('path     {}'.format(path))
    print('next_    {}'.format(next_))
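`get_path` is not defined on this page either. A minimal sketch of the usual Floyd-Warshall path reconstruction, assuming `next_[i][j]` holds the next node on a shortest path from i to j (or None when j is unreachable):

def get_path(next_, start, end):
    # Hypothetical sketch of successor-matrix path reconstruction.
    if next_[start][end] is None:
        return []                 # the two nodes are not connected
    path = [start]
    while start != end:
        start = next_[start][end]
        path.append(start)
    return path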
Code Example #3
def get_idf(token_idx_lookup, idf_path):
    # Load per-token IDF weights from a JSON file and re-key them by token
    # index; the special '<UNK>' and '<PAD>' tokens get an IDF of 0.
    with open(idf_path) as fh:
        idf = json.load(fh)
    lookup = {
        u.to_idx(token_idx_lookup, token): token_idf
        for token, token_idf in idf.items()
    }
    lookup[token_idx_lookup['<UNK>']] = 0
    lookup[token_idx_lookup['<PAD>']] = 0
    return lookup
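This and the following entity-linking snippets use a different `to_idx(token_idx_lookup, token)` variant (called via `u`, presumably the project's utils module). Its exact behaviour is not shown here; a plausible minimal sketch maps a token to its vocabulary index and falls back to the '<UNK>' index for unknown tokens:

def to_idx(token_idx_lookup, token):
    # Hypothetical sketch: out-of-vocabulary tokens collapse onto '<UNK>'.
    if token in token_idx_lookup:
        return token_idx_lookup[token]
    return token_idx_lookup['<UNK>']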
Code Example #4
def _get_token_ctr_by_entity_id(self, cursor: Cursor, token_idx_lookup):
    # Build a bag-of-words (token index -> count) for each entity from the
    # first 2000 characters of its linked page.
    cursor.execute(
        'select e.id as entity_id, left(p.content, 2000) as text from entities e join pages p on e.text = p.title'
    )
    entity_desc_bow = {}
    for row in cursor.fetchall():
        tokens = parse_text_for_tokens(row['text'])
        text_idxs = [to_idx(token_idx_lookup, token) for token in tokens]
        entity_desc_bow[row['entity_id']] = dict(Counter(text_idxs))
    return entity_desc_bow
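The `dict(Counter(...))` pattern above turns a sequence of token indices into a bag-of-words count dictionary. A toy illustration, using the hypothetical `to_idx` sketch shown after Code Example #3:

from collections import Counter

token_idx_lookup = {'<UNK>': 0, '<PAD>': 1, 'the': 2, 'cat': 3}
tokens = ['the', 'cat', 'the', 'dog']               # 'dog' is out of vocabulary
text_idxs = [to_idx(token_idx_lookup, token) for token in tokens]
print(dict(Counter(text_idxs)))                     # {2: 2, 3: 1, 0: 1}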
Code Example #5
def _get_batch_page_token_cnts_lookup(self, page_ids):
    # Token-count lookup per page: reuse precomputed counts when the page maps
    # to a known entity, otherwise tokenize the page content (truncated to
    # self.page_content_lim characters).
    lim = self.page_content_lim
    lookup = {}
    for page_id in page_ids:
        if page_id not in self.to_entity_id:
            page_content = self._page_content_lookup[page_id]
            if len(page_content.strip()) > 5:
                lookup[page_id] = dict(
                    Counter(
                        u.to_idx(self.token_idx_lookup, token) for token in
                        parse_text_for_tokens(page_content[:lim])))
        else:
            entity_id = self.to_entity_id[page_id]
            lookup[page_id] = self.token_ctr_by_entity_id[entity_id]
    return lookup
Code Example #6
File: inference.py Project: dmh-cs/entity-linking
def predict_sum_encoder(embedding, token_idx_lookup, p_prior, model, batch,
                        ablation, entity_embeds, use_stacker):
    # Encode each mention's sentence and its page as bags of token indices,
    # score the candidate entities, and return the index of the best candidate.
    model.eval()
    context_bows = [
        Counter(to_idx(token_idx_lookup, token) for token in sentence)
        for sentence in batch['mention_sentence']
    ]
    doc_bows = batch['page_token_cnts']
    encoded = model.encoder(context_bows, doc_bows)
    logits = Logits()
    calc_logits = lambda embeds, ids: logits(embeds, entity_embeds(ids))
    men_logits = calc_logits(encoded, batch['candidate_ids'])
    if use_stacker:
        p_text, __ = model.calc_scores(
            (men_logits, torch.zeros_like(men_logits)),
            batch['candidate_mention_sim'], p_prior)
    else:
        p_text = men_logits
    return torch.argmax(p_text, dim=1)
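`Logits()` scores each candidate entity against the encoded mention context. Its definition is not part of this page; purely as an illustrative sketch, one common way to implement such a scorer is a batched dot product between the context encoding and each candidate embedding:

import torch
import torch.nn as nn

class Logits(nn.Module):
    # Hypothetical sketch; the project's actual module may differ.
    def forward(self, encoded, candidate_embeds):
        # encoded:          (batch, embed_dim)
        # candidate_embeds: (batch, num_candidates, embed_dim)
        # returns:          (batch, num_candidates) dot-product scores
        return torch.sum(encoded.unsqueeze(1) * candidate_embeds, dim=2)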
Code Example #7
def __init__(self,
             cursor,
             entity_candidates_prior,
             embedding,
             token_idx_lookup,
             num_entities,
             num_candidates,
             entity_label_lookup,
             path='./AIDA-YAGO2-dataset.tsv',
             use_wiki2vec=False,
             use_sum_encoder=False):
  self.cursor = cursor
  self.entity_candidates_prior = entity_candidates_prior
  self.embedding = embedding
  self.token_idx_lookup = token_idx_lookup
  self.num_entities = num_entities
  self.num_candidates = num_candidates
  with open(path, 'r') as fh:
    self.lines = fh.read().strip().split('\n')[:-1]
  self.documents = get_documents(self.lines)
  self.embedded_documents = [embed_page_content(self.embedding, self.token_idx_lookup, document)
                             for document in self.documents]
  self.mentions = get_mentions(self.lines)
  self.sentence_splits = get_splits(self.documents, self.mentions)
  self.mention_sentences = get_mention_sentences(self.documents, self.mentions)
  self.entity_page_ids = get_entity_page_ids(self.lines)
  self.labels = from_page_ids_to_entity_ids(cursor, self.entity_page_ids)
  self.with_label = [i for i, x in enumerate(self.labels) if x != -1]
  self.mention_doc_id = get_doc_id_per_mention(self.lines)
  self.mentions_by_doc_id = get_mentions_by_doc_id(self.lines)
  self.entity_label_lookup = entity_label_lookup
  self.entity_id_lookup = {int(label): entity_id for entity_id, label in self.entity_label_lookup.items()}
  self.use_wiki2vec = use_wiki2vec
  self.prior_approx_mapping = u.get_prior_approx_mapping(self.entity_candidates_prior)
  self.use_sum_encoder = use_sum_encoder
  self.stemmer = SnowballStemmer('english')
  self.page_token_cnts_lookup = [dict(Counter(u.to_idx(self.token_idx_lookup, self._stem(token))
                                              for token in parse_text_for_tokens(page_content)))
                                 for page_content in self.documents]
Code Example #8
                batch_y = label_data[j * _batch_size:j * _batch_size +
                                     _batch_size]
                count = 0  # take 900 samples at a time and count within that chunk

                for i in range(_batch_size):
                    left_idx = i - 15
                    right_idx = i + 15
                    if left_idx >= 0 and right_idx < _batch_size:
                        count += 1
                        m1_data = np.append(m1_data,
                                            m1_batch_x[left_idx:right_idx])
                        m2_data = np.append(m2_data,
                                            m2_batch_x[left_idx:right_idx])
                        target = np.append(
                            target,
                            np.eye(n_class)[to_idx(batch_y[right_idx - 1])])

                        if count == batch_size:  # feed_dict step: runs once batch_size windows have accumulated
                            m1_data = m1_data.reshape(batch_size, 30, 72)
                            m2_data = m2_data.reshape(batch_size, 30, 9)
                            target = target.reshape(batch_size, 20)
                            summary, c, _ = sess.run(
                                [merged_summary, cost, optimizer],
                                feed_dict={
                                    m1_X: m1_data,
                                    m2_X: m2_data,
                                    Y: target
                                })

                            m1_data = np.array([])
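The `np.eye(n_class)[...]` indexing above is a compact way to one-hot encode a class index: row i of the identity matrix is the one-hot vector for class i. A standalone illustration with a smaller n_class than the snippet's 20:

import numpy as np

n_class = 4
print(np.eye(n_class)[2])        # [0. 0. 1. 0.]
print(np.eye(n_class)[[0, 3]])   # rows 0 and 3, i.e. two one-hot vectors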
Code Example #9
def train_sum_encoder(self):
    # Train the sum-encoder entity linker: for each batch, score candidate
    # entities against the mention context, mask out mentions without a gold
    # label, and backpropagate the loss.
    for epoch_num in range(self.num_epochs):
        self.experiment.update_epoch(epoch_num)
        self._dataset = self.get_dataset()
        dataloader = DataLoader(dataset=self._dataset,
                                batch_sampler=self.get_batch_sampler(),
                                collate_fn=collate_sum_encoder)
        for batch_num, batch in enumerate(dataloader):
            if self._dataset.use_fast_sampler:
                dataloader.batch_sampler.page_ctr = dataloader.dataset.page_ctr
            self.model.train()
            self.optimizer.zero_grad()
            batch = tensors_to_device(batch, self.device)
            labels = self._get_labels_for_batch(batch['label'],
                                                batch['candidate_ids'])
            context_bows = [
                Counter(
                    to_idx(self.token_idx_lookup, token)
                    for token in sentence)
                for sentence in batch['mention_sentence']
            ]
            doc_bows = batch['page_token_cnts']
            encoded = self.model.encoder(context_bows, doc_bows)
            logits = self.calc_logits(encoded, batch['candidate_ids'])
            if self.use_stacker:
                scores = self.model.calc_scores(
                    logits, batch['candidate_mention_sim'], batch['prior'])
            else:
                scores = logits
            # Drop mentions whose gold entity is not among the candidates
            # (marked with label -1).
            scores = scores[(labels != -1).nonzero().reshape(-1)]
            labels = labels[(labels != -1).nonzero().reshape(-1)]
            loss = self.calc_loss(scores, labels)
            assert torch.isnan(loss).sum() == 0
            loss.backward()
            if not self.dont_clip_grad:
                torch.nn.utils.clip_grad_norm_(
                    itertools.chain(self.model.parameters(),
                                    self._get_adaptive_logits_params()),
                    self.clip_grad)
            self.optimizer.step()
            # Track the post-update classification error on the same batch.
            with torch.no_grad():
                self.model.eval()
                encoded_test = self.model.encoder(context_bows, doc_bows)
                logits_test = self.calc_logits(encoded_test,
                                               batch['candidate_ids'])
                if self.use_stacker:
                    probas = self.model.calc_scores(
                        logits_test, batch['candidate_mention_sim'],
                        batch['prior'])
                else:
                    probas = logits_test
                context_error = self._classification_error(probas, labels)
            self.experiment.record_metrics(
                {
                    'error': context_error,
                    'loss': loss.item()
                },
                batch_num=batch_num)
        torch.save(
            self.model.state_dict(),
            './' + self.experiment.model_name + '_epoch_' + str(epoch_num))
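The `(labels != -1).nonzero().reshape(-1)` indexing above keeps only the rows whose label is present (-1 marks a missing label); it is equivalent to boolean-mask indexing. A small standalone check:

import torch

labels = torch.tensor([3, -1, 0, 2])
scores = torch.arange(8.0).reshape(4, 2)
keep = (labels != -1).nonzero().reshape(-1)      # tensor([0, 2, 3])
assert torch.equal(scores[keep], scores[labels != -1])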