# NOTE(review): this chunk is a fragment of a larger script — the final
# `y_pred` comprehension is continued after this span (see the parallel
# "UAS_average" block later in the file for the completed pattern).
# Pick the output path by model size; filename encodes case, hidden dim,
# layer count and random seed.
if args.large:
    save_file = "../out/UAS_large_{}_{}_{}_{}".format(
        case, args.hidden_dim, args.layer, seed)
else:
    save_file = "../out/UAS_base_{}_{}_{}_{}".format(
        case, args.hidden_dim, args.layer, seed)
print(save_file)
# Write one "<word> <gold-head-word> <pred-head-word> <sentence-text>" row
# per token across all collected sentences.
with open(save_file, 'w') as fout:
    for words, is_heads, heads, sentences, idx2word, score in zip(
            Words, Is_heads, Heads, Sentences, Idx2word, Scores):
        # Keep only score-matrix rows/cols for positions flagged in
        # is_heads (presumably first sub-token of each word — TODO confirm
        # against the tokenizer used upstream).
        is_heads = np.array(is_heads) == 1
        score = np.array(score)[is_heads]
        score = score[:, is_heads]
        # Decode a single-root tree over the filtered scores; map each
        # predicted head index back to its word form, with the '[CLS]'
        # position standing in for 'root'.  The trailing element of the
        # decoded head list is dropped ([:-1]).
        preds = [
            words.split()[i] if words.split()[i] != '[CLS]' else 'root'
            for i in chuliu_edmonds_one_root(score)[:-1]
        ]
        # Length invariant: `words`/`heads` carry two wrapper tokens that
        # `sentences` does not — hence the +1 / +2 offsets.
        assert (len(preds) + 1) == len(words.split()) == len(
            heads.split()) == (len(sentences) + 2)
        # Strip the first/last wrapper tokens when emitting rows.
        for w, t, p, s in zip(words.split()[1:-1], heads.split()[1:-1],
                              preds[1:], sentences):
            fout.write('{} {} {} {}\n'.format(w, idx2word[int(t)], p,
                                              s['text']))
# Re-read the dump: column 1 is the gold head word, column 2 the predicted
# head word.  NOTE(review): the two open() handles here are never closed —
# consider a `with` block.
y_true = np.array([
    line.split()[1] for line in open(save_file, 'r').read().splitlines()
    if len(line) > 0
])
y_pred = np.array([
    line.split()[2] for line in open(save_file, 'r').read().splitlines()
    if len(line) > 0
def Train(self, trainData, options):
    """Run one training epoch over trainData (Python 2 / DyNet code).

    For every sentence: extract features, score all head-modifier arcs,
    decode a tree (projective decoder or Chu-Liu/Edmonds), accumulate a
    margin-style hinge loss on wrongly predicted arcs (and, optionally,
    on relation labels), then backprop and update per sentence.

    Side effects: shuffles trainData in place, updates model parameters
    via self.trainer, prints progress every 100 sentences.
    """
    # NOTE(review): `errors` and `batch` are initialised but never used
    # in this method; `ltotal` below is assigned but never read.
    errors = 0
    batch = 0
    eloss = 0.0    # loss accumulated since the last progress report
    mloss = 0.0    # loss accumulated over the whole epoch
    eerrors = 0    # unlabeled attachment errors since last report
    lerrors = 0    # labeled errors since last report
    etotal = 0     # tokens processed since last report
    beg = start = time.time()
    random.shuffle(trainData) # in certain cases the data will already have been shuffled after being read from file or while creating dev data
    errs = []      # pending arc-loss expressions (DyNet nodes)
    lerrs = []     # pending label-loss expressions (DyNet nodes)
    eeloss = 0.0
    self.feature_extractor.Init(options)
    for iSentence, sentence in enumerate(trainData,1):
        # Periodic progress report; counters reset afterwards.
        if iSentence % 100 == 0 and iSentence != 0:
            loss_message = 'Processing sentence number: %d'%iSentence + \
                ' Loss: %.3f'%(eloss / etotal)+ \
                ' Errors: %.3f'%((float(eerrors)) / etotal)+\
                ' Labeled Errors: %.3f'%(float(lerrors) / etotal)+\
                ' Time: %.2gs'%(time.time()-start)
            print loss_message
            start = time.time()
            eerrors = 0
            eloss = 0.0
            etotal = 0
            lerrors = 0
            ltotal = 0
        # Keep only real CoNLL token entries (drops multi-word/comment items).
        conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)]
        self.feature_extractor.getWordEmbeddings(conll_sentence, True, options)
        # scores: numeric arc score matrix; exprs: matching DyNet expressions.
        scores, exprs = self.__evaluate(conll_sentence, True)
        gold = [entry.parent_id for entry in conll_sentence]
        if self.proj:
            # Projective decoding; cost augmentation handled inside decoder.
            heads = decoder.parse_proj(scores, gold if self.costaugFlag else None)
        else:
            if self.costaugFlag:
                #augment the score of non-gold arcs
                for i in range(len(scores)):
                    for j in range(len(scores)):
                        if gold[j] != i:
                            scores[i][j] += 1.
            # Non-projective decoding; transpose so rows index modifiers.
            heads = chuliu_edmonds_one_root(scores.T)
            heads[0] = -1  # root token has no head
        if self.labelsFlag:
            # Hinge loss on relation labels, one per (gold head, modifier) arc.
            for modifier, head in enumerate(gold[1:]):
                rscores, rexprs = self.__evaluateLabel(conll_sentence, head, modifier+1)
                goldLabelInd = self.feature_extractor.rels[conll_sentence[modifier+1].relation]
                # Best-scoring incorrect label.
                wrongLabelInd = max(((l, scr) for l, scr in enumerate(rscores) if l != goldLabelInd), key=itemgetter(1))[0]
                # Margin of 1 between gold and best wrong label.
                if rscores[goldLabelInd] < rscores[wrongLabelInd] + 1:
                    lerrs.append(rexprs[wrongLabelInd] - rexprs[goldLabelInd])
                    lerrors += 1 #not quite right but gives some indication
        # Count arc errors and collect their loss expressions.
        e = sum([1 for h, g in zip(heads[1:], gold[1:]) if h != g])
        eerrors += e
        if e > 0:
            loss = [(exprs[h][i] - exprs[g][i]) for i, (h,g) in enumerate(zip(heads, gold)) if h != g]
            # NOTE(review): esum(loss).scalar_value() is computed twice;
            # it could be computed once and reused.
            eloss += dy.esum(loss).scalar_value()
            mloss += dy.esum(loss).scalar_value()
            errs.extend(loss)
        etotal += len(conll_sentence)
        # NOTE(review): `iSentence % 1 == 0` is always true, so this runs
        # every sentence (effectively batch size 1) and the post-loop
        # flush below can only fire if the last renew_cg was skipped.
        if iSentence % 1 == 0 or len(errs) > 0 or len(lerrs) > 0:
            eeloss = 0.0
            if len(errs) > 0 or len(lerrs) > 0:
                eerrs = (dy.esum(errs + lerrs))
                eerrs.scalar_value()   # forces forward pass
                eerrs.backward()
                self.trainer.update()
                errs = []
                lerrs = []
            dy.renew_cg()  # fresh computation graph per sentence
    # Flush any pending losses after the loop (see NOTE above).
    if len(errs) > 0:
        eerrs = (dy.esum(errs + lerrs))
        eerrs.scalar_value()
        eerrs.backward()
        self.trainer.update()
        errs = []
        lerrs = []
        eeloss = 0.0
        dy.renew_cg()
    self.trainer.update()
    print "Loss: ", mloss/iSentence
    print "Total Training Time: %.2gs"%(time.time()-beg)
def Predict(self, treebanks, datasplit, options):
    """Generator: parse each sentence of the given treebank split and yield
    it with predicted heads/relations filled in (Python 2 / DyNet code).

    Loads external word/char embeddings for test-time OOV vocabulary,
    decodes each sentence (projective decoder or Chu-Liu/Edmonds), and
    copies predictions back onto the original (un-deep-copied) entries.
    """
    char_map = {}
    if options.char_map_file:
        # NOTE(review): file handle is never closed — consider `with`.
        char_map_fh = codecs.open(options.char_map_file,encoding='utf-8')
        char_map = json.loads(char_map_fh.read())
    # should probably use a namedtuple in get_vocab to make this prettier
    _, test_words, test_chars, _, _, _, test_treebanks, test_langs = utils.get_vocab(treebanks,datasplit,char_map)

    # get external embeddings for the set of words and chars in the
    # test vocab but not in the training vocab
    test_embeddings = defaultdict(lambda: {})
    if options.word_emb_size > 0 and options.ext_word_emb_file:
        # Words seen at test time but unknown to the trained model.
        new_test_words = \
            set(test_words) - self.feature_extractor.words.viewkeys()
        print "Number of OOV word types at test time: %i (out of %i)" % (
            len(new_test_words), len(test_words))
        if len(new_test_words) > 0:
            # no point loading embeddings if there are no words to look for
            for lang in test_langs:
                embeddings = utils.get_external_embeddings(
                    options,
                    emb_file=options.ext_word_emb_file,
                    lang=lang,
                    words=new_test_words
                )
                test_embeddings["words"].update(embeddings)
            if len(test_langs) > 1 and test_embeddings["words"]:
                print "External embeddings found for %i words "\
                    "(out of %i)" % \
                    (len(test_embeddings["words"]), len(new_test_words))
    if options.char_emb_size > 0:
        # Same treatment for characters unseen during training.
        new_test_chars = \
            set(test_chars) - self.feature_extractor.chars.viewkeys()
        print "Number of OOV char types at test time: %i (out of %i)" % (
            len(new_test_chars), len(test_chars))
        if len(new_test_chars) > 0:
            for lang in test_langs:
                embeddings = utils.get_external_embeddings(
                    options,
                    emb_file=options.ext_char_emb_file,
                    lang=lang,
                    words=new_test_chars,
                    chars=True
                )
                test_embeddings["chars"].update(embeddings)
            if len(test_langs) > 1 and test_embeddings["chars"]:
                print "External embeddings found for %i chars "\
                    "(out of %i)" % \
                    (len(test_embeddings["chars"]), len(new_test_chars))

    data = utils.read_conll_dir(treebanks,datasplit,char_map=char_map)
    for iSentence, osentence in enumerate(data,1):
        # Work on a deep copy; predictions are copied back to `osentence`
        # at the end so the yielded object carries no DyNet vectors.
        sentence = deepcopy(osentence)
        self.feature_extractor.Init(options)
        conll_sentence = [entry for entry in sentence if isinstance(entry, utils.ConllEntry)]
        self.feature_extractor.getWordEmbeddings(conll_sentence, False, options, test_embeddings)
        scores, exprs = self.__evaluate(conll_sentence, True)
        if self.proj:
            heads = decoder.parse_proj(scores)

            #LATTICE solution to multiple roots
            # see https://github.com/jujbob/multilingual-bist-parser/blob/master/bist-parser/bmstparser/src/mstlstm.py
            ## ADD for handling multi-roots problem
            rootHead = [head for head in heads if head==0]
            if len(rootHead) != 1:
                print "it has multi-root, changing it for heading first root for other roots"
                # Reattach every root after the first to the first root.
                rootHead = [seq for seq, head in enumerate(heads) if head == 0]
                for seq in rootHead[1:]:heads[seq] = rootHead[0]
            ## finish to multi-roots
        else:
            # Chu-Liu/Edmonds already guarantees a single root.
            heads = chuliu_edmonds_one_root(scores.T)
        for entry, head in zip(conll_sentence, heads):
            entry.pred_parent_id = head
            entry.pred_relation = '_'  # placeholder until labels are scored
        if self.labelsFlag:
            # Pick the argmax relation label for each predicted arc.
            for modifier, head in enumerate(heads[1:]):
                scores, exprs = self.__evaluateLabel(conll_sentence, head, modifier+1)
                conll_sentence[modifier+1].pred_relation = self.feature_extractor.irels[max(enumerate(scores), key=itemgetter(1))[0]]
        dy.renew_cg()
        #keep in memory the information we need, not all the vectors
        oconll_sentence = [entry for entry in osentence if isinstance(entry, utils.ConllEntry)]
        for tok_o, tok in zip(oconll_sentence, conll_sentence):
            tok_o.pred_relation = tok.pred_relation
            tok_o.pred_parent_id = tok.pred_parent_id
        yield osentence
# NOTE(review): fragment — the four extend() calls below are the tail of a
# collection loop whose header lies before this chunk, and the trailing
# `def eval_accuracy` is cut off at `with torch.no_grad():` and continues
# after it.
Heads.extend(heads)
Y.extend(y.numpy().tolist())
Sentences.extend(sentences)
Idx2word.extend(idx2word)
# Pick the output path by model size; filename encodes case, hidden dim
# and random seed ("average" variant — no layer index here).
if args.large:
    save_file = "../out/UAS_average_large_{}_{}_{}".format(case, args.hidden_dim, seed)
else:
    save_file = "../out/UAS_average_base_{}_{}_{}".format(case, args.hidden_dim, seed)
print(save_file)
# Write one "<word> <gold-head-word> <pred-head-word> <sentence-text>" row
# per token across all collected sentences.
with open(save_file, 'w') as fout:
    for words, is_heads, heads, sentences, idx2word, score in zip(Words, Is_heads, Heads, Sentences, Idx2word, Scores):
        # Keep only score-matrix rows/cols for positions flagged in
        # is_heads (presumably first sub-token of each word — TODO confirm
        # against the tokenizer used upstream).
        is_heads = np.array(is_heads)==1
        score = np.array(score)[is_heads]
        score = score[:,is_heads]
        # Decode a single-root tree; '[CLS]' position stands in for 'root';
        # the trailing decoded head is dropped ([:-1]).
        preds = [words.split()[i] if words.split()[i]!='[CLS]' else 'root' for i in chuliu_edmonds_one_root(score)[:-1]]
        # Length invariant: words/heads carry two wrapper tokens that
        # `sentences` does not — hence the +1 / +2 offsets.
        assert (len(preds) + 1) == len(words.split()) == len(heads.split()) == (len(sentences) + 2)
        for w, t, p, s in zip(words.split()[1:-1], heads.split()[1:-1], preds[1:], sentences):
            fout.write('{} {} {} {}\n'.format(w, idx2word[int(t)], p, s['text']))
# Re-read the dump: column 1 is gold head word, column 2 predicted head word.
# NOTE(review): the two open() handles are never closed — consider `with`.
y_true = np.array([line.split()[1] for line in open(save_file, 'r').read().splitlines() if len(line) > 0])
y_pred = np.array([line.split()[2] for line in open(save_file, 'r').read().splitlines() if len(line) > 0])
right = (y_true == y_pred).astype(np.int32).sum()
total = len(y_true)
# NOTE(review): `right/total` is integer division unless this runs under
# Python 3 or a __future__ import — verify against the file header.
print("Dev set accuracy = %.4f" % (right/total) + "({}/{})".format(right, total))

def eval_accuracy(model, iterator, case, seed):
    # Evaluate `model` over `iterator` without gradients; accumulators for
    # per-batch outputs (body continues beyond this chunk).
    model.eval()
    Words, Is_heads, Heads, Y,Sentences, Idx2word, Scores = [], [], [], [], [], [], []
    with torch.no_grad():