def example_path_search():
    n = 10
    adj_matrix = create_grid_adj(n)
    start_idx = to_idx(0, 0, n)
    end_idx = to_idx(8, 8, n)
    # path = breadth_first_search(start=start_idx, target=end_idx, adjacency=adj_matrix)
    path = djikstra_search(start=start_idx, target=end_idx, adjacency=adj_matrix)
    plot_path(path, start_idx, end_idx, n)
def example_floyd_walsh():
    n = 10
    adj_matrix = create_grid_adj(n)
    dist, next_ = floyd_warshall(adj_matrix)
    start_idx = to_idx(0, 0, n)
    end_idx = to_idx(8, 8, n)
    path = get_path(next_, start_idx, end_idx)
    plot_path(path, start_idx, end_idx, n)
    print('path {}'.format(path))
    print('next_ {}'.format(next_))
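# A minimal sketch of the grid helpers the two examples above rely on
# (to_idx and create_grid_adj are not shown in this section). It assumes a
# row-major flattening of an n x n grid with 4-connected, unit-weight edges;
# the real helpers may differ.
import numpy as np

def to_idx(row, col, n):
    # Flatten a (row, col) coordinate on an n x n grid into a node index.
    return row * n + col

def create_grid_adj(n):
    # Dense adjacency matrix for an n x n grid graph.
    adj = np.zeros((n * n, n * n))
    for row in range(n):
        for col in range(n):
            for dr, dc in ((1, 0), (-1, 0), (0, 1), (0, -1)):
                r, c = row + dr, col + dc
                if 0 <= r < n and 0 <= c < n:
                    adj[to_idx(row, col, n), to_idx(r, c, n)] = 1
    return adj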
def get_idf(token_idx_lookup, idf_path):
  with open(idf_path) as fh:
    idf = json.load(fh)
  lookup = {
      u.to_idx(token_idx_lookup, token): token_idf
      for token, token_idf in idf.items()
  }
  lookup[token_idx_lookup['<UNK>']] = 0
  lookup[token_idx_lookup['<PAD>']] = 0
  return lookup
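# A plausible sketch of the u.to_idx / to_idx helper used by the token-handling
# code in this section: look the token up in token_idx_lookup and fall back to
# the '<UNK>' index for out-of-vocabulary tokens. This is an assumption; the
# project's actual helper may normalise or lowercase tokens before the lookup.
def to_idx(token_idx_lookup, token):
  if token in token_idx_lookup:
    return token_idx_lookup[token]
  return token_idx_lookup['<UNK>']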
def _get_token_ctr_by_entity_id(self, cursor: Cursor, token_idx_lookup):
  cursor.execute(
      'select e.id as entity_id, left(p.content, 2000) as text '
      'from entities e join pages p on e.text = p.title')
  entity_desc_bow = {}
  for row in cursor.fetchall():
    tokens = parse_text_for_tokens(row['text'])
    text_idxs = [to_idx(token_idx_lookup, token) for token in tokens]
    entity_desc_bow[row['entity_id']] = dict(Counter(text_idxs))
  return entity_desc_bow
def _get_batch_page_token_cnts_lookup(self, page_ids):
  lim = self.page_content_lim
  lookup = {}
  for page_id in page_ids:
    if page_id not in self.to_entity_id:
      page_content = self._page_content_lookup[page_id]
      if len(page_content.strip()) > 5:
        lookup[page_id] = dict(
            Counter(
                u.to_idx(self.token_idx_lookup, token)
                for token in parse_text_for_tokens(page_content[:lim])))
    else:
      entity_id = self.to_entity_id[page_id]
      lookup[page_id] = self.token_ctr_by_entity_id[entity_id]
  return lookup
def predict_sum_encoder(embedding, token_idx_lookup, p_prior, model, batch,
                        ablation, entity_embeds, use_stacker):
  model.eval()
  context_bows = [
      Counter(to_idx(token_idx_lookup, token) for token in sentence)
      for sentence in batch['mention_sentence']
  ]
  doc_bows = batch['page_token_cnts']
  encoded = model.encoder(context_bows, doc_bows)
  logits = Logits()
  calc_logits = lambda embeds, ids: logits(embeds, entity_embeds(ids))
  men_logits = calc_logits(encoded, batch['candidate_ids'])
  if use_stacker:
    p_text, __ = model.calc_scores((men_logits, torch.zeros_like(men_logits)),
                                   batch['candidate_mention_sim'],
                                   p_prior)
  else:
    p_text = men_logits
  return torch.argmax(p_text, dim=1)
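# A minimal sketch of the Logits module assumed by predict_sum_encoder (and by
# calc_logits in train_sum_encoder below): a batched dot product scoring each
# candidate entity embedding against the encoded mention/document vector.
# This is an illustrative assumption; the real module may add scaling or a bias.
import torch
import torch.nn as nn

class Logits(nn.Module):
  def forward(self, encoded, candidate_embeds):
    # encoded: (batch, dim); candidate_embeds: (batch, num_candidates, dim)
    # returns: (batch, num_candidates) similarity scores
    return torch.bmm(candidate_embeds, encoded.unsqueeze(2)).squeeze(2)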
def __init__(self,
             cursor,
             entity_candidates_prior,
             embedding,
             token_idx_lookup,
             num_entities,
             num_candidates,
             entity_label_lookup,
             path='./AIDA-YAGO2-dataset.tsv',
             use_wiki2vec=False,
             use_sum_encoder=False):
  self.cursor = cursor
  self.entity_candidates_prior = entity_candidates_prior
  self.embedding = embedding
  self.token_idx_lookup = token_idx_lookup
  self.num_entities = num_entities
  self.num_candidates = num_candidates
  with open(path, 'r') as fh:
    self.lines = fh.read().strip().split('\n')[:-1]
  self.documents = get_documents(self.lines)
  self.embedded_documents = [
      embed_page_content(self.embedding, self.token_idx_lookup, document)
      for document in self.documents
  ]
  self.mentions = get_mentions(self.lines)
  self.sentence_splits = get_splits(self.documents, self.mentions)
  self.mention_sentences = get_mention_sentences(self.documents, self.mentions)
  self.entity_page_ids = get_entity_page_ids(self.lines)
  self.labels = from_page_ids_to_entity_ids(cursor, self.entity_page_ids)
  self.with_label = [i for i, x in enumerate(self.labels) if x != -1]
  self.mention_doc_id = get_doc_id_per_mention(self.lines)
  self.mentions_by_doc_id = get_mentions_by_doc_id(self.lines)
  self.entity_label_lookup = entity_label_lookup
  self.entity_id_lookup = {
      int(label): entity_id
      for entity_id, label in self.entity_label_lookup.items()
  }
  self.use_wiki2vec = use_wiki2vec
  self.prior_approx_mapping = u.get_prior_approx_mapping(
      self.entity_candidates_prior)
  self.use_sum_encoder = use_sum_encoder
  self.stemmer = SnowballStemmer('english')
  self.page_token_cnts_lookup = [
      dict(
          Counter(
              u.to_idx(self.token_idx_lookup, self._stem(token))
              for token in parse_text_for_tokens(page_content)))
      for page_content in self.documents
  ]
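# A likely shape for the _stem helper referenced in the list comprehension
# above, delegating to the NLTK SnowballStemmer created in __init__. This is
# an assumption; the real method may also cache stems for repeated tokens.
def _stem(self, token):
  return self.stemmer.stem(token)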
batch_y = label_data[j * _batch_size:j * _batch_size + _batch_size]
count = 0  # take _batch_size (900) samples at a time and count windows within that range
for i in range(_batch_size):
    left_idx = i - 15
    right_idx = i + 15
    if left_idx >= 0 and right_idx < _batch_size:
        count += 1
        m1_data = np.append(m1_data, m1_batch_x[left_idx:right_idx])
        m2_data = np.append(m2_data, m2_batch_x[left_idx:right_idx])
        target = np.append(
            target, np.eye(n_class)[to_idx(batch_y[right_idx - 1])])
        if count == batch_size:  # feed_dict step: run once batch_size windows have accumulated
            m1_data = m1_data.reshape(batch_size, 30, 72)
            m2_data = m2_data.reshape(batch_size, 30, 9)
            target = target.reshape(batch_size, 20)
            summary, c, _ = sess.run(
                [merged_summary, cost, optimizer],
                feed_dict={
                    m1_X: m1_data,
                    m2_X: m2_data,
                    Y: target
                })
            m1_data = np.array([])
def train_sum_encoder(self):
  for epoch_num in range(self.num_epochs):
    self.experiment.update_epoch(epoch_num)
    self._dataset = self.get_dataset()
    dataloader = DataLoader(dataset=self._dataset,
                            batch_sampler=self.get_batch_sampler(),
                            collate_fn=collate_sum_encoder)
    for batch_num, batch in enumerate(dataloader):
      if self._dataset.use_fast_sampler:
        dataloader.batch_sampler.page_ctr = dataloader.dataset.page_ctr
      self.model.train()
      self.optimizer.zero_grad()
      batch = tensors_to_device(batch, self.device)
      labels = self._get_labels_for_batch(batch['label'],
                                          batch['candidate_ids'])
      context_bows = [
          Counter(
              to_idx(self.token_idx_lookup, token) for token in sentence)
          for sentence in batch['mention_sentence']
      ]
      doc_bows = batch['page_token_cnts']
      encoded = self.model.encoder(context_bows, doc_bows)
      logits = self.calc_logits(encoded, batch['candidate_ids'])
      if self.use_stacker:
        scores = self.model.calc_scores(logits,
                                        batch['candidate_mention_sim'],
                                        batch['prior'])
      else:
        scores = logits
      scores = scores[(labels != -1).nonzero().reshape(-1)]
      labels = labels[(labels != -1).nonzero().reshape(-1)]
      loss = self.calc_loss(scores, labels)
      assert torch.isnan(loss).sum() == 0
      loss.backward()
      if not self.dont_clip_grad:
        torch.nn.utils.clip_grad_norm_(
            itertools.chain(self.model.parameters(),
                            self._get_adaptive_logits_params()),
            self.clip_grad)
      self.optimizer.step()
      with torch.no_grad():
        self.model.eval()
        encoded_test = self.model.encoder(context_bows, doc_bows)
        logits_test = self.calc_logits(encoded_test, batch['candidate_ids'])
        if self.use_stacker:
          probas = self.model.calc_scores(logits_test,
                                          batch['candidate_mention_sim'],
                                          batch['prior'])
        else:
          probas = logits_test
        context_error = self._classification_error(probas, labels)
      self.experiment.record_metrics({'error': context_error,
                                      'loss': loss.item()},
                                     batch_num=batch_num)
    torch.save(self.model.state_dict(),
               './' + self.experiment.model_name + '_epoch_' + str(epoch_num))
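# A plausible sketch of the _classification_error helper recorded above: the
# fraction of mentions whose top-scoring candidate does not match the gold
# label. This is an assumption about the metric; the project's implementation
# may differ (e.g. in how it handles mentions with no valid label).
def _classification_error(self, scores, labels):
  predictions = torch.argmax(scores, dim=1)
  num_wrong = (predictions != labels).sum().item()
  return num_wrong / labels.shape[0]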