def extract_structures(model, test_batches, device, vocab, dirName):
    model.eval()
    dirName = dirName + "/structures"
    if not os.path.exists(dirName):
        os.mkdir(dirName)
        print("Directory ", dirName, " Created ")
    count = 0
    for ct, batch in test_batches:
        value, feed_dict = get_feed_dict(batch, device)
        if not value:
            continue
        output, sent_attention_matrix, doc_attention_matrix = model.forward(feed_dict)
        for i in range(len(batch)):
            fileName = dirName + "/" + str(count) + ".txt"
            count += 1
            fp = open(fileName, "w")
            fp.write("Doc: " + str(count) + "\n")
            l = len(batch[i].token_idxs)
            sent_no = 0
            for sent in batch[i].token_idxs:
                printstr = ''
                token_count = 0
                for token in sent:
                    printstr += vocab[token] + " "
                    token_count += 1
                fp.write(printstr + "\n")
                # Sentence-level tree: prepend a zero column and a one-valued row for
                # the artificial root to the token-token attention scores, then decode.
                scores = sent_attention_matrix[sent_no][0:token_count, 0:token_count]
                shape2 = scores.size()
                row = torch.ones([1, shape2[1] + 1]).to(device)
                column = torch.zeros([shape2[0], 1]).to(device)
                new_scores = torch.cat([column, scores], dim=1)
                new_scores = torch.cat([row, new_scores], dim=0)
                heads, tree_score = chu_liu_edmonds(new_scores.data.cpu().numpy().astype(np.float64))
                fp.write(str(heads) + " ")
                fp.write(str(tree_score) + "\n")
                sent_no += 1
            # Document-level tree over the sentence-sentence attention scores.
            scores = doc_attention_matrix[i][0:l, 0:l]
            shape2 = scores.size()
            row = torch.ones([1, shape2[1] + 1]).to(device)
            column = torch.zeros([shape2[0], 1]).to(device)
            new_scores = torch.cat([column, scores], dim=1)
            new_scores = torch.cat([row, new_scores], dim=0)
            heads, tree_score = chu_liu_edmonds(new_scores.data.cpu().numpy().astype(np.float64))
            fp.write("\n")
            fp.write(str(heads) + " ")
            fp.write(str(tree_score) + "\n")
            fp.close()
def predict(self, dataset, args):
    import io
    conllu, sentences = io.StringIO(), 0
    while not dataset.epoch_finished():
        sentence_lens, word_ids, charseq_ids, charseqs, charseq_lens = dataset.next_batch(args.batch_size)
        feeds = {
            self.is_training: False,
            self.sentence_lens: sentence_lens,
            self.charseqs: charseqs[train.FORMS],
            self.charseq_lens: charseq_lens[train.FORMS],
            self.word_ids: word_ids[train.FORMS],
            self.charseq_ids: charseq_ids[train.FORMS],
        }
        for tag in args.tags:
            feeds[self.tags[tag]] = word_ids[train.FACTORS_MAP[tag]]
        if args.parse:
            feeds[self.heads] = word_ids[train.HEAD]
            predictions, heads, _ = self.session.run(
                [self.predictions, self.heads_logs, self.update_loss], feeds)
        else:
            predictions, _ = self.session.run([self.predictions, self.update_loss], feeds)
        for i in range(len(sentence_lens)):
            overrides = [None] * dataset.FACTORS
            for tag in args.tags:
                overrides[dataset.FACTORS_MAP[tag]] = predictions[tag][i]
            if args.parse:
                # Prepend a row for the artificial root before decoding.
                padded_heads = np.pad(
                    heads[i][:sentence_lens[i], :sentence_lens[i] + 1].astype(np.float64),
                    ((1, 0), (0, 0)), mode="constant")
                roots, _ = dependency_decoding.chu_liu_edmonds(padded_heads)
                if np.count_nonzero(roots) != len(roots) - 1:
                    # The tree has several children of the artificial root: re-decode
                    # with each candidate root forced in turn (all other root edges
                    # disabled via NaN) and keep the highest-scoring tree.
                    best_score = None
                    padded_heads[:, 0] = np.nan
                    for r in range(len(roots)):
                        if roots[r] == 0:
                            padded_heads[r, 0] = heads[i][r - 1, 0]
                            current_roots, current_score = dependency_decoding.chu_liu_edmonds(padded_heads)
                            padded_heads[r, 0] = np.nan
                            if best_score is None or current_score > best_score:
                                best_score, best_roots = current_score, current_roots
                    roots = best_roots
                overrides[dataset.HEAD] = roots[1:]
            dataset.write_sentence(conllu, sentences, overrides)
            sentences += 1
    return conllu.getvalue()
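# The single-root repair above also works as a standalone helper. A minimal sketch,
# assuming the dependency_decoding layout used throughout these snippets (row 0 is
# the artificial root, column 0 holds root-attachment scores, forbidden edges are
# np.nan, and the returned head of the root itself is -1):
import numpy as np
import dependency_decoding

def decode_single_root(weights):
    heads, _ = dependency_decoding.chu_liu_edmonds(weights)
    if np.count_nonzero(heads) == len(heads) - 1:
        return heads  # exactly one child of the root already
    # Otherwise force each candidate root in turn and keep the best-scoring tree.
    best_score, best_heads = None, None
    root_scores = weights[:, 0].copy()
    weights[:, 0] = np.nan
    for r in range(1, len(heads)):
        if heads[r] == 0:  # token r was attached to the root in the first pass
            weights[r, 0] = root_scores[r]
            cur_heads, cur_score = dependency_decoding.chu_liu_edmonds(weights)
            weights[r, 0] = np.nan
            if best_score is None or cur_score > best_score:
                best_score, best_heads = cur_score, cur_heads
    weights[:, 0] = root_scores  # restore the caller's matrix
    return best_heads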
import torch
import dependency_decoding


def mst_decode(scores):
    """Decode an arc-factored parse as a maximum spanning tree."""
    device = scores.device
    scores = scores.cpu().double().numpy()
    heads, _ = dependency_decoding.chu_liu_edmonds(scores)
    heads[0] = 0  # the package returns -1 for the artificial root; make it point to itself
    return torch.LongTensor(heads).to(device)
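# A usage sketch for mst_decode; the (n+1) x (n+1) layout with the artificial root
# at index 0 is the convention the snippets here all assume:
import torch

torch.manual_seed(0)
scores = torch.randn(5, 5)   # 4 tokens plus the artificial root
heads = mst_decode(scores)   # heads[0] == 0, heads[i] is the head of token i
print(heads)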
def cle_loss(scores: torch.Tensor, lengths: torch.Tensor, gold_heads: torch.Tensor,
             normalize_wrt_seq_len: bool):
    """
    Parses a batch of sentences and computes a hinge loss (see code by Eliyahu Kiperwasser:
    https://github.com/elikip/bist-parser).

    :param scores: torch.Tensor of shape (batch_size, tokens, tokens) with arc scores
    :param lengths: actual lengths of the sentences, tensor of shape (batch_size,)
    :param gold_heads: tensor of shape (batch_size, tokens) that contains the correct head for every word
    :param normalize_wrt_seq_len: if True, divide each sentence's loss by its length
    :return: a scalar torch.Tensor with the hinge loss
    """
    losses: torch.Tensor = 0
    device = get_device_of(scores)
    scores = scores.cpu()
    gold_heads = gold_heads.cpu().numpy()
    lengths = lengths.cpu().numpy()
    for m, g, l in zip(scores, gold_heads, lengths):
        # m: shape (tokens, tokens); g: shape (tokens,); l: scalar sentence length
        token_range = np.arange(l)
        # Remove padding at the end:
        m = m[:l, :l]
        g = g[:l]  # -> shape (l,)
        # Cost augmentation: make the gold solution look worse by a margin of 1
        # (the original makes non-gold arcs look better instead).
        m[token_range, g] -= 1.0
        # Decode; discard the score of the solution -> r has shape (l,).
        r, _ = chu_liu_edmonds(m.detach().double().numpy())
        # This implementation says the head of the artificial root is -1, but the rest
        # of the pipeline expects the artificial root to be its own head (i.e. 0):
        r[0] = 0
        r = np.array(r)
        scores_of_solution = m[token_range, r]  # scores of the decoded edges -> shape (l,)
        scores_of_gold = m[token_range, g]      # scores of the gold edges -> shape (l,)
        r = torch.from_numpy(r)
        g = torch.from_numpy(g)
        zero = torch.zeros(1, dtype=torch.float32)
        # Where the predicted head differs from the gold head, add the score difference:
        loss_term = torch.sum(torch.where(torch.eq(r, g), zero, scores_of_solution - scores_of_gold))
        if normalize_wrt_seq_len:
            loss_term /= l
        losses += loss_term
    if device < 0:
        return losses
    return losses.to(device)
def cle_decode(scores, lengths):
    """
    Parses a batch of sentences.

    :param scores: torch.Tensor of shape (batch_size, tokens, tokens) with arc scores
    :param lengths: actual lengths of the sentences, tensor of shape (batch_size,)
    :return: a tensor of shape (batch_size, tokens) that contains the heads of the tokens;
        positions beyond the sentence length are filled with -1
    """
    heads = []
    scores = scores.detach().cpu().double().numpy()
    lengths = lengths.cpu().numpy()
    bs, toks, _ = scores.shape
    for m, l in zip(scores, lengths):
        r, _ = chu_liu_edmonds(m[:l, :l])  # discard the score of the solution
        h = np.concatenate([r, -np.ones(toks - l, dtype=np.int64)])
        heads.append(h)
    return torch.from_numpy(np.stack(heads))
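# Sketch: decoding a padded batch with cle_decode. The second sentence has length 3,
# so its final (padding) position comes back as -1:
import torch

scores = torch.randn(2, 4, 4)
lengths = torch.tensor([4, 3])
heads = cle_decode(scores, lengths)
print(heads)  # shape (2, 4); position 0 of each sentence is the artificial root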
def __call__(self, probs: List[np.ndarray]) -> List[List[int]]:
    """Applies the Chu-Liu-Edmonds algorithm to matrices of head probabilities.

    probs: a list of B matrices of shape L*(L+1), where column 0 is the root.
    """
    answer = []
    for elem in probs:
        m, n = elem.shape
        assert n == m + 1
        # Clip probabilities from below and shift into non-negative log space.
        elem = np.log10(np.maximum(self.min_edge_prob, elem)) - np.log10(self.min_edge_prob)
        # Prepend a row of zeros for the artificial root.
        elem = np.concatenate([np.zeros_like(elem[:1, :]), elem], axis=0)
        # Penalize root attachments so the tree cannot contain more than one edge 0->i.
        elem[1:, 0] += np.log10(self.min_edge_prob) * len(elem)
        chl_data = chu_liu_edmonds(elem.astype("float64"))
        answer.append(chl_data[0][1:])
    return answer
def calculate(targets):
    for target in targets:
        target.T = softmax(np.nansum(target.T, axis=2))
        # target.T = eliminate_all_nan_rows(target.T)
        target.T = chu_liu_edmonds(target.T)[0]
        pos_tags = []
        for pos_projection in target.pos_tags:
            # TODO: handle ties between equally frequent tags
            if len(pos_projection) != 0:
                most_common = Counter(pos_projection).most_common(1)[0][0]
                # Prefer a real tag over the '_' placeholder when one exists.
                if most_common == '_' and len(Counter(pos_projection)) != 1:
                    most_common = Counter(pos_projection).most_common(2)[1][0]
            else:
                most_common = '_'
            pos_tags.append(most_common)
        target.pos_tags = pos_tags
import time
import numpy as np
from dependency_decoding import chu_liu_edmonds


def solve_mst(g):
    start_overall = time.time()
    # Create a dummy root node connected to every word.
    nodes = list(g.nodes())
    gc = g.copy()
    gc.add_node('ROOT')
    for w in g.nodes():
        gc.add_edge(w, 'ROOT', weight=0.1)
    # Build the score matrix (row = dependent, column = head, ROOT at index 0).
    N = len(gc)
    gcmat = np.zeros((N, N))
    newnodes = ['ROOT'] + nodes
    for i, w1 in enumerate(newnodes):
        for j, w2 in enumerate(newnodes):
            if w1 == w2:
                continue
            gcmat[i][j] = gc[w1].get(w2, {}).get('weight', 0.)
    # Solve the MST.
    heads, score = chu_liu_edmonds(gcmat)
    # Keep only the edges in the MST.
    keptedges = []  # [(i, j, {'relation': 'hypernym', 'weight': X.X})]
    for i, headidx in enumerate(heads[1:]):  # skip ROOT
        if headidx == 0:
            continue
        w1 = nodes[i]
        w2 = nodes[headidx - 1]
        keptedges.append((w1, w2, g[w1][w2]))
    # Gather statistics.
    end_overall = time.time()
    stats = {
        'node_cnt': len(nodes),
        'runtime': end_overall - start_overall,
        'keptedge_cnt': len(keptedges),
    }
    return keptedges, stats
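# A hedged usage sketch for solve_mst on a small weighted networkx graph; the node
# names, 'relation' attributes, and weights are purely illustrative:
import networkx as nx

g = nx.Graph()
g.add_edge('animal', 'dog', relation='hypernym', weight=0.9)
g.add_edge('animal', 'cat', relation='hypernym', weight=0.8)
g.add_edge('dog', 'cat', relation='sibling', weight=0.2)
keptedges, stats = solve_mst(g)
print(keptedges)  # edges kept in the maximum spanning tree, with their attributes
print(stats)      # {'node_cnt': ..., 'runtime': ..., 'keptedge_cnt': ...}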
def predict(self, dataset, evaluating, args):
    import io
    conllu, sentences = io.StringIO(), 0
    if evaluating:
        self.session.run(self.reset_metrics)
    while not dataset.epoch_finished():
        sentence_lens, word_ids, charseq_ids, charseqs, charseq_lens = dataset.next_batch(args.batch_size)
        feeds = {
            self.is_training: False,
            self.sentence_lens: sentence_lens,
            self.charseqs: charseqs[train.FORMS],
            self.charseq_lens: charseq_lens[train.FORMS],
            self.word_ids: word_ids[train.FORMS],
            self.charseq_ids: charseq_ids[train.FORMS],
        }
        if args.embeddings:
            embeddings = np.zeros([word_ids[train.EMBEDDINGS].shape[0],
                                   word_ids[train.EMBEDDINGS].shape[1],
                                   args.embeddings_size])
            for i in range(embeddings.shape[0]):
                for j in range(embeddings.shape[1]):
                    if word_ids[train.EMBEDDINGS][i, j]:
                        embeddings[i, j] = args.embeddings_data[word_ids[train.EMBEDDINGS][i, j] - 1]
            feeds[self.embeddings] = embeddings
        if args.elmo_size:
            feeds[self.elmo] = word_ids[train.ELMO]
        if evaluating:
            for tag in args.tags:
                feeds[self.tags[tag]] = word_ids[train.FACTORS_MAP[tag]]
            if args.parse:
                feeds[self.heads] = word_ids[train.HEAD]
                feeds[self.deprels] = word_ids[train.DEPREL]
        targets = [self.predictions]
        if args.parse:
            targets.extend([self.heads_logs, self.deprel_hidden_layer])
        if evaluating:
            targets.append(self.update_loss)
        predictions, *other_values = self.session.run(targets, feeds)
        if args.parse:
            prior_heads, deprel_hidden_layer, *_ = other_values
            heads = np.zeros(prior_heads.shape[:2], dtype=np.int32)
            for i in range(len(sentence_lens)):
                # Prepend the artificial-root row, then forbid every root attachment
                # except the most probable one before decoding.
                padded_heads = np.pad(
                    prior_heads[i][:sentence_lens[i], :sentence_lens[i] + 1].astype(np.float64),
                    ((1, 0), (0, 0)), mode="constant")
                padded_heads[:, 0] = np.nan
                padded_heads[1 + np.argmax(prior_heads[i][:sentence_lens[i], 0]), 0] = 0
                chosen_heads, _ = dependency_decoding.chu_liu_edmonds(padded_heads)
                heads[i, :sentence_lens[i]] = chosen_heads[1:]
            deprels = self.session.run(self.predictions_deprel, {
                self.is_training: False,
                self.deprel_hidden_layer: deprel_hidden_layer,
                self.deprel_heads: heads,
            })
        for i in range(len(sentence_lens)):
            overrides = [None] * dataset.FACTORS
            for tag in args.tags:
                overrides[dataset.FACTORS_MAP[tag]] = predictions[tag][i]
            if args.parse:
                overrides[dataset.HEAD] = heads[i]
                overrides[dataset.DEPREL] = deprels[i]
            dataset.write_sentence(conllu, sentences, overrides)
            sentences += 1
    return conllu.getvalue()
import numpy as np
from dependency_decoding import chu_liu_edmonds

np.random.seed(43)
score_matrix = np.random.rand(3, 3)
heads, tree_score = chu_liu_edmonds(score_matrix)
print(score_matrix)
print(heads, tree_score)
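# The snippets in this section lean on two conventions of the dependency_decoding
# package: the returned head of the artificial root (index 0) is -1, and np.nan
# entries mark forbidden edges (as the predict() methods above use for root
# attachments). A small sketch of both:
import numpy as np
from dependency_decoding import chu_liu_edmonds

np.random.seed(43)
scores = np.random.rand(3, 3)
scores[1, 2] = np.nan  # forbid "token 1 takes token 2 as head"
heads, tree_score = chu_liu_edmonds(scores)
print(heads)  # heads[0] == -1 for the artificial root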
import numpy as np
from dependency_decoding import chu_liu_edmonds

np.random.seed(43)
score_arc = np.random.rand(2, 3, 3)
score_root = np.random.rand(2, 3)
lengths = np.array([3, 2], dtype=np.int32)
heads, scores = chu_liu_edmonds(score_arc, score_root, lengths)
print(f'heads => {heads}')
print(f'scores => {scores}')

# The same batched call with log-probabilities; exponentiating the returned
# scores recovers the tree probabilities.
score_arc = np.array([[[1, 1], [2, 1]], [[1, 2], [1, 1]]], dtype=np.float64)
score_arc = np.log(score_arc)
score_root = np.array([[3, 2], [1, 2]], dtype=np.float64)
score_root = np.log(score_root)
lengths = np.array([2, 2], dtype=np.int32)
heads, scores = chu_liu_edmonds(score_arc, score_root, lengths)
scores = np.exp(np.array(scores))
print(f'heads => {heads}')
print(f'scores => {scores}')
if args.parse:
    # Average head probabilities over the ensemble and move to log space.
    prior_heads = np.log(head_probs / len(networks))
    heads = np.zeros(prior_heads.shape[:2], dtype=np.int32)
    for i in range(len(sentence_lens)):
        # Prepend the artificial-root row, then forbid every root attachment
        # except the most probable one.
        padded_heads = np.pad(
            prior_heads[i][:sentence_lens[i], :sentence_lens[i] + 1].astype(np.float64),
            ((1, 0), (0, 0)), mode="constant")
        padded_heads[:, 0] = np.nan
        padded_heads[1 + np.argmax(prior_heads[i][:sentence_lens[i], 0]), 0] = 0
        chosen_heads, _ = dependency_decoding.chu_liu_edmonds(padded_heads)
        heads[i, :sentence_lens[i]] = chosen_heads[1:]
    # Sum the deprel probabilities predicted by each ensemble member.
    deprel_probs = None
    for network, deprel_hidden_layer in zip(networks, deprel_hidden_layers):
        deprels = network.session.run(network.predictions_deprel_probs, {
            network.is_training: False,
            network.deprel_hidden_layer: deprel_hidden_layer,
            network.deprel_heads: heads,
        })
        deprel_probs = deprel_probs + deprels if deprel_probs is not None else deprels
for i in range(len(sentence_lens)):
def extract_structures(self, batch, sent_attention_matrix, doc_attention_matrix,
                       count, use_cuda, sent_scores):
    fileName = os.path.join(self._structures_dir, "%06d_struct.txt" % count)
    fp = open(fileName, "w")
    fp.write("Doc: " + str(count) + "\n")
    doc_attention_matrix = doc_attention_matrix[:, :]  # this change is yet to be tested!
    l = batch.enc_doc_lens[0].item()
    doc_sent_no = 0
    # Sentence-level structure extraction (commented out in the original):
    # for i in range(l):
    #     printstr = ''
    #     sent = batch.enc_batch[0][i]
    #     token_count = 0
    #     for j in range(batch.enc_sent_lens[0][i].item()):
    #         token = sent[j].item()
    #         printstr += self.vocab.id2word(token) + " "
    #         token_count += 1
    #     fp.write(printstr + "\n")
    #     scores = sent_attention_matrix[doc_sent_no][0:token_count, 0:token_count]
    #     shape2 = scores.size()
    #     row = torch.ones([1, shape2[1] + 1]).cuda()
    #     column = torch.zeros([shape2[0], 1]).cuda()
    #     new_scores = torch.cat([column, scores], dim=1)
    #     new_scores = torch.cat([row, new_scores], dim=0)
    #     heads, tree_score = chu_liu_edmonds(new_scores.data.cpu().numpy().astype(np.float64))
    #     fp.write(str(heads) + " ")
    #     fp.write(str(tree_score) + "\n")
    #     doc_sent_no += 1
    # Document-level tree: prepend a zero row for the artificial root, then force a
    # single root edge by zeroing column 0 except at the highest-scoring sentence.
    shape2 = doc_attention_matrix[0:l, 0:l + 1].size()
    row = torch.zeros([1, shape2[1]]).cuda()
    scores = doc_attention_matrix[0:l, 0:l + 1]
    new_scores = torch.cat([row, scores], dim=0)
    val, root_edge = torch.max(new_scores[:, 0], dim=0)
    root_score = torch.zeros([shape2[0] + 1, 1]).cuda()
    root_score[root_edge] = 1
    new_scores[:, 0] = root_score.squeeze()
    heads, tree_score = chu_liu_edmonds(new_scores.data.cpu().numpy().astype(np.float64))
    height = find_height(heads)
    leaf_nodes = leaf_node_proportion(heads)
    fp.write("\n")
    sentences = str(batch.original_articles[0]).split("<split1>")
    for idx, sent in enumerate(sentences):
        fp.write(str(idx) + "\t" + str(sent) + "\n")
    fp.write(str(heads) + " ")
    fp.write(str(tree_score) + "\n")
    fp.write(str(height) + "\n")
    s = sent_scores[0].data.cpu().numpy()
    for val in s:
        fp.write(str(val))
    fp.close()
    structure_info = dict()
    structure_info['heads'] = heads
    structure_info['height'] = height
    structure_info['leaf_nodes'] = leaf_nodes
    return structure_info
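# find_height and leaf_node_proportion are not defined in this excerpt; a plausible
# sketch of both, assuming `heads` is the list returned by chu_liu_edmonds (where
# the artificial root at index 0 has head -1):
def find_height(heads):
    """Depth of the deepest node, following head pointers up to the root."""
    def depth(i):
        d = 0
        while heads[i] != -1:
            i = heads[i]
            d += 1
        return d
    return max(depth(i) for i in range(len(heads)))

def leaf_node_proportion(heads):
    """Fraction of non-root nodes that head no other node."""
    non_leaves = set(h for h in heads if h > 0)
    return 1.0 - len(non_leaves) / (len(heads) - 1)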