def test_empty_sort_with_indices():
    """Sorting an empty list gives empty outputs, and unsorting them gives [] back."""
    sorted_items, original_positions = utils.sort_with_indices([])
    assert len(sorted_items) == 0
    assert len(original_positions) == 0

    restored = utils.unsort(sorted_items, original_positions)
    assert [] == restored
def predict(self, batch, unsort=True):
    """Viterbi-decode a tag sequence for every sentence in ``batch``.

    Args:
        batch: a batch accepted by ``unpack_batch``.
        unsort: when true, the result is put back through ``utils.unsort``
            using the ``orig_idx`` that came out of ``unpack_batch``.

    Returns:
        A list with one unmapped tag sequence per sentence.
    """
    (inputs, orig_idx, word_orig_idx, char_orig_idx, sentlens, wordlens,
     charlens, charoffsets) = unpack_batch(batch, self.use_cuda)
    word, word_mask, wordchars, wordchars_mask, chars, tags = inputs

    self.model.eval()
    batch_size = word.size(0)
    _, logits, trans = self.model(word, word_mask, wordchars, wordchars_mask,
                                  tags, word_orig_idx, sentlens, wordlens,
                                  chars, charoffsets, charlens, char_orig_idx)

    # decode
    transitions = trans.data.cpu().numpy()
    emission_scores = logits.data.cpu().numpy()
    num_sentences = logits.size(0)
    tag_seqs = []
    for sent_idx in range(num_sentences):
        # only the first sentlens[sent_idx] positions of a row are decoded
        sentence_scores = emission_scores[sent_idx, :sentlens[sent_idx]]
        best_path, _ = viterbi_decode(sentence_scores, transitions)
        tag_seqs.append(self.vocab['tag'].unmap(best_path))

    if unsort:
        tag_seqs = utils.unsort(tag_seqs, orig_idx)
    return tag_seqs
def process(self, document):
    """Predict HEAD/DEPREL for ``document`` and build sentence dependencies.

    Predictions from all batches are concatenated, unsorted when the loader
    exposes ``data_orig_idx``, and written onto ``batch.doc``.

    Raises:
        RuntimeError: a CUDA out-of-memory error is re-raised with an extra
            hint about batching options; any other RuntimeError propagates
            unchanged.
    """
    try:
        sort_during_eval = self.config.get('sort_during_eval', True)
        separate_length = self.config.get('min_length_to_batch_separately',
                                          DEFAULT_SEPARATE_BATCH)
        batch = DataLoader(document,
                           self.config['batch_size'],
                           self.config,
                           self.pretrain,
                           vocab=self.vocab,
                           evaluation=True,
                           sort_during_eval=sort_during_eval,
                           min_length_to_batch_separately=separate_length)

        preds = []
        for b in batch:
            preds.extend(self.trainer.predict(b))
        if batch.data_orig_idx is not None:
            preds = unsort(preds, batch.data_orig_idx)

        flat_preds = [token for sentence in preds for token in sentence]
        batch.doc.set([doc.HEAD, doc.DEPREL], flat_preds)

        # build dependencies based on predictions
        for sentence in batch.doc.sentences:
            sentence.build_dependencies()
        return batch.doc
    except RuntimeError as e:
        if not str(e).startswith("CUDA out of memory. Tried to allocate"):
            raise
        new_message = str(e) + " ... You may be able to compensate for this by separating long sentences into their own batch with a parameter such as depparse_min_length_to_batch_separately=150 or by limiting the overall batch size with depparse_batch_size=400."
        raise RuntimeError(new_message) from e
def test_split_into_batches():
    """Check split_into_batches intervals, plus sort_with_indices key/reverse and unsort."""
    data = [["Unban", "mox", "opal", str(i)] for i in range(5)]
    data.append(["Do", "n't", "ban", "Urza", "'s", "Saga", "that", "card", "is", "great"])
    data.append(["Ban", "Ragavan"])

    # small batches will put one element in each interval
    expected_small = [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7)]
    assert utils.split_into_batches(data, 5) == expected_small

    # this one has a batch interrupted in the middle by a large element
    expected_interrupted = [(0, 2), (2, 4), (4, 5), (5, 6), (6, 7)]
    assert utils.split_into_batches(data, 8) == expected_interrupted

    # this one has the large element at the start of its own batch
    expected_shifted = [(0, 2), (2, 4), (4, 5), (5, 6)]
    assert utils.split_into_batches(data[1:], 8) == expected_shifted

    # overloading the test!  assert that the key & reverse is working
    ordered, orig_idx = utils.sort_with_indices(data, key=len, reverse=True)
    lengths = [len(item) for item in ordered]
    assert lengths == [10, 4, 4, 4, 4, 4, 2]

    # this has the large element at the start
    expected_sorted = [(0, 1), (1, 3), (3, 5), (5, 7)]
    assert utils.split_into_batches(ordered, 8) == expected_sorted

    # double check that unsort is working as expected
    assert data == utils.unsort(ordered, orig_idx)
def predict(self, batch, unsort=True):
    """Parse one batch into per-token ``[head, deprel, deprel_prob]`` triples.

    Heads come from ``chuliu_edmonds_one_root`` run on ``preds[0]``, relation
    labels are unmapped from ``preds[1]``, and the third entry of each triple
    is read from ``preds[2]``.

    Args:
        batch: a batch accepted by ``unpack_batch``.
        unsort: when true, sentences are reordered with ``utils.unsort``.

    Returns:
        One list per sentence, holding one triple per non-root position
        (``sentlens[i] - 1`` of them); the head is stored as a string.
    """
    inputs, orig_idx, word_orig_idx, sentlens, wordlens = unpack_batch(batch, self.use_cuda)
    (word, word_mask, wordchars, wordchars_mask, upos, xpos, ufeats,
     pretrained, lemma, head, deprel) = inputs

    self.model.eval()
    batch_size = word.size(0)
    _, preds = self.model(word, word_mask, wordchars, wordchars_mask, upos,
                          xpos, ufeats, pretrained, lemma, head, deprel,
                          word_orig_idx, sentlens, wordlens)

    head_seqs = []
    for adjacency, length in zip(preds[0], sentlens):
        # remove attachment for the root
        head_seqs.append(chuliu_edmonds_one_root(adjacency[:length, :length])[1:])

    deprel_seqs = []
    for i, heads in enumerate(head_seqs):
        label_ids = [preds[1][i][j + 1][h] for j, h in enumerate(heads)]
        deprel_seqs.append(self.vocab['deprel'].unmap(label_ids))

    deprel_prob_seqs = []
    for i, heads in enumerate(head_seqs):
        deprel_prob_seqs.append([preds[2][i][j + 1] for j in range(len(heads))])

    pred_tokens = []
    for i in range(batch_size):
        sentence = []
        for j in range(sentlens[i] - 1):
            sentence.append([str(head_seqs[i][j]), deprel_seqs[i][j], deprel_prob_seqs[i][j]])
        pred_tokens.append(sentence)

    if unsort:
        pred_tokens = utils.unsort(pred_tokens, orig_idx)
    return pred_tokens
def predict(self, batch, unsort=True):
    """Tag one batch and return per-token ``[upos, xpos, feats]`` triples.

    Args:
        batch: a batch accepted by ``unpack_batch``.
        unsort: when true, sentences are reordered with ``utils.unsort``.

    Returns:
        One list per sentence with ``sentlens[i]`` triples each.
    """
    inputs, orig_idx, word_orig_idx, sentlens, wordlens = unpack_batch(batch, self.use_cuda)
    word, word_mask, wordchars, wordchars_mask, upos, xpos, ufeats, pretrained = inputs

    self.model.eval()
    batch_size = word.size(0)
    _, preds = self.model(word, word_mask, wordchars, wordchars_mask, upos,
                          xpos, ufeats, pretrained, word_orig_idx, sentlens,
                          wordlens)

    def decode(field, position):
        # unmap every sentence of preds[position] with the vocab for `field`
        return [self.vocab[field].unmap(sent) for sent in preds[position].tolist()]

    upos_seqs = decode('upos', 0)
    xpos_seqs = decode('xpos', 1)
    feats_seqs = decode('feats', 2)

    pred_tokens = []
    for i in range(batch_size):
        sentence = [[upos_seqs[i][j], xpos_seqs[i][j], feats_seqs[i][j]]
                    for j in range(sentlens[i])]
        pred_tokens.append(sentence)

    if unsort:
        pred_tokens = utils.unsort(pred_tokens, orig_idx)
    return pred_tokens
def test_sort_with_indices():
    """Sorting by length returns tuples of items and indices; unsort restores the input."""
    data = [[1, 2, 3], [4, 5], [6]]

    ordered, orig_idx = utils.sort_with_indices(data, key=len)
    expected_order = ([6], [4, 5], [1, 2, 3])
    expected_indices = (2, 1, 0)
    assert ordered == expected_order
    assert orig_idx == expected_indices

    restored = utils.unsort(ordered, orig_idx)
    assert data == restored
def predict(self, batch, beam_size=1):
    """Decode output strings, and optionally edit ids, for one batch.

    Args:
        batch: a batch accepted by ``unpack_batch``.
        beam_size: forwarded to ``self.model.predict``.

    Returns:
        ``(pred_tokens, edits)``, both unsorted with ``orig_idx``.
        ``edits`` is ``None`` unless ``self.args['edit']`` is truthy, in
        which case it is the per-item argmax over ``edit_logits``.
    """
    inputs, orig_idx = unpack_batch(batch, self.use_cuda)
    src, src_mask, tgt, tgt_mask, pos, edits = inputs

    self.model.eval()
    batch_size = src.size(0)
    preds, edit_logits = self.model.predict(src, src_mask, pos=pos, beam_size=beam_size)

    # unmap to tokens
    char_seqs = [self.vocab['char'].unmap(ids) for ids in preds]
    char_seqs = utils.prune_decoded_seqs(char_seqs)
    # join chars to be tokens
    pred_tokens = utils.unsort(["".join(seq) for seq in char_seqs], orig_idx)

    if not self.args.get('edit', False):
        return pred_tokens, None

    assert edit_logits is not None
    logits = edit_logits.data.cpu().numpy()
    edits = np.argmax(logits, axis=1).reshape([batch_size]).tolist()
    edits = utils.unsort(edits, orig_idx)
    return pred_tokens, edits
def get_representation(self, chars, charoffsets, charlens, char_orig_idx):
    """Return the forward-pass outputs at the given offsets as a sequence batch.

    Runs ``self.forward`` without gradients, selects ``output[i, offsets]``
    for each row, unsorts the rows with ``char_orig_idx`` and packs them.

    Returns:
        The packed sequence, or its padded (batch-first) tensor when
        ``self.pad`` is truthy.
    """
    with torch.no_grad():
        hidden, _, _ = self.forward(chars, charlens)

        selected = []
        for row, offsets in enumerate(charoffsets):
            selected.append(hidden[row, offsets])

        packed = pack_sequence(unsort(selected, char_orig_idx))
        if not self.pad:
            return packed
        return pad_packed_sequence(packed, batch_first=True)[0]
def evaluate(args):
    """Run a saved tagger over ``args['eval_file']``, write CoNLL output and score it.

    Predictions are written to ``args['output_file']``. A score is printed
    only when ``args['gold_file']`` is not None.
    """
    # file paths
    system_pred_file = args['output_file']
    gold_file = args['gold_file']
    if args['save_name'] is not None:
        model_file = args['save_dir'] + '/' + args['save_name']
    else:
        model_file = '{}/{}_tagger.pt'.format(args['save_dir'], args['shorthand'])

    # load pretrain; note that we allow the pretrain_file to be non-existent
    pretrain_file = '{}/{}.pretrain.pt'.format(args['save_dir'], args['shorthand'])
    pretrain = Pretrain(pretrain_file)

    # load model
    print("Loading model from: {}".format(model_file))
    use_cuda = args['cuda'] and not args['cpu']
    trainer = Trainer(pretrain=pretrain, model_file=model_file, use_cuda=use_cuda)
    loaded_args, vocab = trainer.args, trainer.vocab

    # load config: path-like options, shorthand and mode come from the caller
    for key in args:
        if key.endswith(('_dir', '_file')) or key in ('shorthand', 'mode'):
            loaded_args[key] = args[key]

    # load data
    print("Loading data with batch size {}...".format(args['batch_size']))
    doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    batch = DataLoader(doc, args['batch_size'], loaded_args, pretrain,
                       vocab=vocab, evaluation=True, sort_during_eval=True)

    # skip eval if dev data does not exist (preds simply stays empty)
    preds = []
    if len(batch) > 0:
        print("Start evaluation...")
        for b in batch:
            preds.extend(trainer.predict(b))
    preds = utils.unsort(preds, batch.data_orig_idx)

    # write to file and score
    flat_preds = [token for sentence in preds for token in sentence]
    batch.doc.set([UPOS, XPOS, FEATS], flat_preds)
    CoNLL.dict2conll(batch.doc.to_dict(), system_pred_file)

    if gold_file is not None:
        _, _, score = scorer.score(system_pred_file, gold_file)

        print("Tagger score:")
        print("{} {:.2f}".format(args['shorthand'], score * 100))
def evaluate(args):
    """Run a saved parser over ``args['eval_file']``, write CoNLL output and score it.

    Predictions are written to ``args['output_file']``. A score is logged
    only when ``args['gold_file']`` is not None.
    """
    # file paths
    system_pred_file = args['output_file']
    gold_file = args['gold_file']
    model_file = model_file_name(args)

    # load pretrained vectors if needed
    pretrain = load_pretrain(args)

    # load model
    logger.info("Loading model from: {}".format(model_file))
    use_cuda = args['cuda'] and not args['cpu']
    trainer = Trainer(pretrain=pretrain, model_file=model_file, use_cuda=use_cuda)
    loaded_args, vocab = trainer.args, trainer.vocab

    # load config: path-like options, shorthand and mode come from the caller
    for key in args:
        if key.endswith(('_dir', '_file')) or key in ('shorthand', 'mode'):
            loaded_args[key] = args[key]

    # load data
    logger.info("Loading data with batch size {}...".format(args['batch_size']))
    doc = CoNLL.conll2doc(input_file=args['eval_file'])
    batch = DataLoader(doc, args['batch_size'], loaded_args, pretrain,
                       vocab=vocab, evaluation=True, sort_during_eval=True)

    # skip eval if dev data does not exist (preds simply stays empty)
    preds = []
    if len(batch) > 0:
        logger.info("Start evaluation...")
        for b in batch:
            preds.extend(trainer.predict(b))
    preds = utils.unsort(preds, batch.data_orig_idx)

    # write to file and score
    flat_preds = [token for sentence in preds for token in sentence]
    batch.doc.set([HEAD, DEPREL], flat_preds)
    CoNLL.write_doc2conll(batch.doc, system_pred_file)

    if gold_file is not None:
        _, _, score = scorer.score(system_pred_file, gold_file)

        logger.info("Parser score:")
        logger.info("{} {:.2f}".format(args['shorthand'], score * 100))
def predict(self, batch, unsort=True):
    """Decode one output string per input item in ``batch``.

    Args:
        batch: a batch accepted by ``unpack_batch``.
        unsort: when true, results are reordered with ``utils.unsort``.

    Returns:
        A list of strings, each the joined characters of one decoded sequence.
    """
    inputs, orig_idx = unpack_batch(batch, self.use_cuda)
    src, src_mask, tgt, tgt_mask = inputs

    self.model.eval()
    batch_size = src.size(0)
    preds, _ = self.model.predict(src, src_mask, self.args['beam_size'])

    # unmap to tokens
    char_seqs = utils.prune_decoded_seqs([self.vocab.unmap(ids) for ids in preds])
    # join chars to be tokens
    pred_tokens = ["".join(seq) for seq in char_seqs]

    if not unsort:
        return pred_tokens
    return utils.unsort(pred_tokens, orig_idx)
def process(self, document):
    """Predict UPOS/XPOS/FEATS for ``document`` and store them on the loader's doc.

    Returns:
        ``batch.doc`` after the flattened predictions have been set on it.
    """
    batch = DataLoader(document,
                       self.config['batch_size'],
                       self.config,
                       self.pretrain,
                       vocab=self.vocab,
                       evaluation=True,
                       sort_during_eval=True)

    preds = []
    for b in batch:
        preds.extend(self.trainer.predict(b))
    preds = unsort(preds, batch.data_orig_idx)

    flat_preds = [token for sentence in preds for token in sentence]
    batch.doc.set([doc.UPOS, doc.XPOS, doc.FEATS], flat_preds)
    return batch.doc
def process(self, document):
    """Predict HEAD/DEPREL for ``document`` and build each sentence's dependencies.

    Returns:
        ``batch.doc`` after predictions are set and dependencies are built.
    """
    batch = DataLoader(document,
                       self.config['batch_size'],
                       self.config,
                       self.pretrain,
                       vocab=self.vocab,
                       evaluation=True,
                       sort_during_eval=True)

    preds = []
    for b in batch:
        preds.extend(self.trainer.predict(b))
    preds = unsort(preds, batch.data_orig_idx)

    flat_preds = [token for sentence in preds for token in sentence]
    batch.doc.set([doc.HEAD, doc.DEPREL], flat_preds)

    # build dependencies based on predictions
    for sentence in batch.doc.sentences:
        sentence.build_dependencies()
    return batch.doc
def build_char_representation(self, all_word_labels, device, forward):
    """Run a character LM over each word sequence and return the outputs at word ends.

    Args:
        all_word_labels: an iterable of word sequences, one per item.
        device: device the padded character tensor is moved to.
        forward: selects the forward charlm and vocab when truthy, otherwise
            the backward ones. For the forward model the word sequence is
            iterated in reverse; for the backward model each word is
            reversed and the offsets are reversed afterwards.

    Returns:
        A list in the original item order, each entry being the charlm
        output indexed at that item's offsets.
    """
    CHARLM_START = "\n"
    CHARLM_END = " "

    if forward:
        charlm, vocab = self.forward_charlm, self.forward_charlm_vocab
    else:
        charlm, vocab = self.backward_charlm, self.backward_charlm_vocab

    all_data = []
    for word_labels in all_word_labels:
        if forward:
            word_labels = reversed(word_labels)
        else:
            word_labels = [w[::-1] for w in word_labels]

        chars = [CHARLM_START]
        offsets = []
        for w in word_labels:
            chars.extend(w)
            chars.append(CHARLM_END)
            # position of the separator that was just appended
            offsets.append(len(chars) - 1)
        if not forward:
            offsets.reverse()

        mapped = vocab.map(chars)
        all_data.append((mapped, offsets, len(mapped), len(all_data)))

    # longest sequence first; the last tuple field remembers the input position
    all_data = sorted(all_data, key=itemgetter(2), reverse=True)
    chars, char_offsets, char_lens, orig_idx = zip(*all_data)
    chars = get_long_tensor(chars, len(all_data), pad_id=vocab.unit2id(' ')).to(device=device)

    # TODO: surely this should be stuffed in the charlm model itself rather than done here
    with torch.no_grad():
        output, _, _ = charlm.forward(chars, char_lens)
        res = []
        for row, offsets in enumerate(char_offsets):
            res.append(output[row, offsets])
        res = unsort(res, orig_idx)

    return res
def predict(self, eval_file_or_string):
    """Parse CoNLL-U input and return the predictions as a CoNLL string.

    Args:
        eval_file_or_string: passed to ``_read_conllu_arg`` together with
            ``self.feature_config``.

    Returns:
        The document with HEAD/DEPREL set, as produced by
        ``CoNLL.conll_as_string``.
    """
    eval_file = _read_conllu_arg(eval_file_or_string, self.feature_config, predict=True)
    doc = Document(CoNLL.conll2dict(input_file=eval_file))
    batch = DataLoader(doc,
                       self.batch_size,
                       self.loaded_args,
                       self.pretrain,
                       vocab=self.vocab,
                       evaluation=True,
                       sort_during_eval=True)

    preds = []
    if len(batch) > 0:
        for b in batch:
            preds.extend(self.trainer.predict(b))
    preds = utils.unsort(preds, batch.data_orig_idx)

    flat_preds = [token for sentence in preds for token in sentence]
    batch.doc.set([HEAD, DEPREL], flat_preds)

    doc_conll = CoNLL.convert_dict(batch.doc.to_dict())
    return CoNLL.conll_as_string(doc_conll)
def process(self, document):
    """Predict alternative heads/relations for ``document`` and store them on its doc.

    Each sentence's ``alt_score`` is taken from the third entry of its first
    token prediction. The flattened predictions are then set as
    ALT_HEAD/ALT_DEPREL.

    Returns:
        ``batch.doc`` after the updates.
    """
    sort_during_eval = self.config.get('sort_during_eval', True)
    max_sentence_size = self.config.get('max_sentence_size', None)
    batch = DataLoader(document,
                       self.config['batch_size'],
                       self.config,
                       self.pretrain,
                       vocab=self.vocab,
                       evaluation=True,
                       sort_during_eval=sort_during_eval,
                       max_sentence_size=max_sentence_size)

    preds = []
    for b in batch:
        preds.extend(self.trainer.predict(b))
    if batch.data_orig_idx is not None:
        preds = unsort(preds, batch.data_orig_idx)

    for sent_idx, sentence in enumerate(batch.doc.sentences):
        first_token = preds[sent_idx][0]
        sentence.alt_score = first_token[2]

    flat_preds = [token for sent_preds in preds for token in sent_preds]
    batch.doc.set([doc.ALT_HEAD, doc.ALT_DEPREL], flat_preds)
    return batch.doc
def predict(self, batch, unsort=True):
    """Return the ``self.n_pred`` best (label, score) pairs per token for one batch.

    For UPOS and XPOS, each token's score vector (``preds[0]`` / ``preds[1]``)
    is ranked and the top ``n_pred`` indices are unmapped through the
    matching vocab. For FEATS, ``preds[2]`` is iterated as
    feature -> sentence -> word, ranked per word, transposed so that
    sentence and word come first, and unmapped with the ``feats`` vocab.

    Args:
        batch: a batch accepted by ``unpack_batch``.
        unsort: when true, sentences are reordered with ``utils.unsort``.

    Returns:
        One list per sentence holding ``[upos, xpos, feats]`` per token, each
        component being a tuple of (label, score) pairs, best first.
    """
    inputs, orig_idx, word_orig_idx, sentlens, wordlens = unpack_batch(batch, self.use_cuda)
    word, word_mask, wordchars, wordchars_mask, upos, xpos, ufeats, pretrained = inputs

    self.model.eval()
    batch_size = word.size(0)
    _, preds = self.model(word, word_mask, wordchars, wordchars_mask, upos,
                          xpos, ufeats, pretrained, word_orig_idx, sentlens,
                          wordlens)

    n_pred = self.n_pred

    def bpi(scores):
        # indices of the n_pred highest scores, best first
        return np.argsort(scores)[-1:-(n_pred + 1):-1]

    def best_scores(scores):
        # the n_pred highest scores themselves, best first
        return np.sort(scores)[-1:-(n_pred + 1):-1]

    def best_predictions(vocab, scores):
        return self.vocab[vocab].unmap(bpi(scores))

    def zipper(vocab, scores):
        return tuple(zip(best_predictions(vocab, scores), best_scores(scores)))

    def feats_zip(scores):
        # .cpu() is required before .numpy(): converting a CUDA tensor
        # directly raises, and this method runs on GPU when self.use_cuda is
        # set.  The conversion is done once and shared by both rankings.
        values = scores.detach().cpu().numpy()
        return tuple(zip(bpi(values), best_scores(values)))

    def feats_zipper(ranked):
        # ranked[0] holds label indices, ranked[1] the matching scores
        return tuple(zip(self.vocab['feats'].unmap(ranked[0].astype(int)), ranked[1].tolist()))

    upos_seqs = [[zipper('upos', w) for w in sent] for sent in preds[0].tolist()]
    xpos_seqs = [[zipper('xpos', w) for w in sent] for sent in preds[1].tolist()]

    feats_seqs = [[[feats_zip(w) for w in sent] for sent in feat] for feat in preds[2]]
    # reorder (feature, sentence, word, rank, pair) -> (sentence, word, pair, rank, feature)
    feats_seqs = np.array(feats_seqs).transpose((1, 2, 4, 3, 0))
    feats_seqs = [[feats_zipper(w) for w in sent] for sent in feats_seqs]

    pred_tokens = [[[upos_seqs[i][j], xpos_seqs[i][j], feats_seqs[i][j]]
                    for j in range(sentlens[i])]
                   for i in range(batch_size)]
    if unsort:
        pred_tokens = utils.unsort(pred_tokens, orig_idx)
    return pred_tokens
def train(args):
    """Train a parser, evaluating on dev data periodically and saving the best model.

    The loop runs over ``train_batch`` repeatedly (reshuffling between
    passes) until either ``args['max_steps']`` is reached or no dev
    improvement has been seen for ``args['max_steps_before_stop']`` steps
    twice: the first time the optimizer is replaced by Adam with
    ``amsgrad=True``, the second time training stops.

    Args:
        args: option mapping; the keys read here include ``batch_size``,
            ``train_file``, ``eval_file``, ``output_file``, ``gold_file``,
            ``augment_nopunct``, ``cuda``, ``max_steps``, ``lr``, ``beta2``,
            ``log_step``, ``eval_interval`` and ``max_steps_before_stop``.

    Side effects:
        Writes dev predictions to ``args['output_file']``, saves the best
        model to the path from ``model_file_name(args)``, and calls
        ``sys.exit(0)`` when either data loader is empty.
    """
    model_file = model_file_name(args)
    utils.ensure_dir(os.path.split(model_file)[0])

    # load pretrained vectors if needed
    pretrain = load_pretrain(args)

    # load data
    logger.info("Loading data with batch size {}...".format(
        args['batch_size']))
    train_data, _ = CoNLL.conll2dict(input_file=args['train_file'])
    # possibly augment the training data with some amount of fake data
    # based on the options chosen
    logger.info("Original data size: {}".format(len(train_data)))
    train_data.extend(
        augment_punct(train_data, args['augment_nopunct'],
                      keep_original_sentences=False))
    logger.info("Augmented data size: {}".format(len(train_data)))
    train_doc = Document(train_data)
    train_batch = DataLoader(train_doc, args['batch_size'], args, pretrain,
                             evaluation=False)
    # the dev loader reuses the vocab built from the training data
    vocab = train_batch.vocab
    dev_doc = CoNLL.conll2doc(input_file=args['eval_file'])
    dev_batch = DataLoader(dev_doc, args['batch_size'], args, pretrain,
                           vocab=vocab, evaluation=True, sort_during_eval=True)

    # pred and gold path
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        logger.info("Skip training because no data available...")
        sys.exit(0)

    logger.info("Training parser...")
    trainer = Trainer(args=args, vocab=vocab, pretrain=pretrain,
                      use_cuda=args['cuda'])

    global_step = 0
    max_steps = args['max_steps']
    dev_score_history = []
    best_dev_preds = []
    # only used in the progress log line below; never updated in this function
    current_lr = args['lr']
    global_start_time = time.time()
    format_str = 'Finished STEP {}/{}, loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

    using_amsgrad = False
    last_best_step = 0
    # start training
    train_loss = 0
    while True:
        # set when the inner loop wants the outer `while` to stop as well
        do_break = False
        for i, batch in enumerate(train_batch):
            start_time = time.time()
            global_step += 1
            loss = trainer.update(batch, eval=False)  # update step
            train_loss += loss
            if global_step % args['log_step'] == 0:
                duration = time.time() - start_time
                logger.info(
                    format_str.format(global_step, max_steps, loss, duration,
                                      current_lr))

            if global_step % args['eval_interval'] == 0:
                # eval on dev
                logger.info("Evaluating on dev set...")
                dev_preds = []
                for batch in dev_batch:
                    preds = trainer.predict(batch)
                    dev_preds += preds
                dev_preds = utils.unsort(dev_preds, dev_batch.data_orig_idx)

                dev_batch.doc.set([HEAD, DEPREL],
                                  [y for x in dev_preds for y in x])
                CoNLL.write_doc2conll(dev_batch.doc, system_pred_file)
                _, _, dev_score = scorer.score(system_pred_file, gold_file)

                train_loss = train_loss / args[
                    'eval_interval']  # avg loss per batch
                logger.info(
                    "step {}: train_loss = {:.6f}, dev_score = {:.4f}".format(
                        global_step, train_loss, dev_score))
                # restart the running loss for the next evaluation window
                train_loss = 0

                # save best model
                if len(dev_score_history
                       ) == 0 or dev_score > max(dev_score_history):
                    last_best_step = global_step
                    trainer.save(model_file)
                    logger.info("new best model saved.")
                    best_dev_preds = dev_preds

                dev_score_history += [dev_score]

            if global_step - last_best_step >= args['max_steps_before_stop']:
                if not using_amsgrad:
                    # first stall: swap the optimizer and restart the patience
                    # counter instead of stopping
                    logger.info("Switching to AMSGrad")
                    last_best_step = global_step
                    using_amsgrad = True
                    trainer.optimizer = optim.Adam(trainer.model.parameters(),
                                                   amsgrad=True,
                                                   lr=args['lr'],
                                                   betas=(.9, args['beta2']),
                                                   eps=1e-6)
                else:
                    # second stall: give up
                    do_break = True
                    break

            if global_step >= args['max_steps']:
                do_break = True
                break

        if do_break:
            break

        train_batch.reshuffle()

    logger.info("Training ended with {} steps.".format(global_step))

    # best_eval is the 1-based index of the best evaluation round
    best_f, best_eval = max(dev_score_history) * 100, np.argmax(
        dev_score_history) + 1
    logger.info("Best dev F1 = {:.2f}, at iteration = {}".format(
        best_f, best_eval * args['eval_interval']))
def train(args):
    """Train a parser, evaluating on dev data periodically and saving the best model.

    The loop runs over ``train_batch`` repeatedly (reshuffling between
    passes) until either ``args['max_steps']`` is reached or no dev
    improvement has been seen for ``args['max_steps_before_stop']`` steps
    twice: the first time the optimizer is replaced by Adam with
    ``amsgrad=True``, the second time training stops.

    Args:
        args: option mapping; the keys read here include ``save_dir``,
            ``save_name``, ``shorthand``, ``pretrain``, ``wordvec_file``,
            ``wordvec_dir``, ``pretrain_max_vocab``, ``batch_size``,
            ``train_file``, ``eval_file``, ``output_file``, ``gold_file``,
            ``cuda``, ``max_steps``, ``lr``, ``beta2``, ``log_step``,
            ``eval_interval`` and ``max_steps_before_stop``.

    Side effects:
        Creates ``args['save_dir']``, writes dev predictions to
        ``args['output_file']``, saves the best model to ``model_file``, and
        calls ``sys.exit(0)`` when either data loader is empty.
    """
    utils.ensure_dir(args['save_dir'])
    # an explicit save_name wins; otherwise the name is derived from the shorthand
    model_file = args['save_dir'] + '/' + args['save_name'] if args['save_name'] is not None \
            else '{}/{}_parser.pt'.format(args['save_dir'], args['shorthand'])

    # load pretrained vectors if needed
    pretrain = None
    if args['pretrain']:
        vec_file = args['wordvec_file'] if args['wordvec_file'] else utils.get_wordvec_file(args['wordvec_dir'], args['shorthand'])
        pretrain_file = '{}/{}.pretrain.pt'.format(args['save_dir'], args['shorthand'])
        pretrain = Pretrain(pretrain_file, vec_file, args['pretrain_max_vocab'])

    # load data
    print("Loading data with batch size {}...".format(args['batch_size']))
    train_doc = Document(CoNLL.conll2dict(input_file=args['train_file']))
    train_batch = DataLoader(train_doc, args['batch_size'], args, pretrain, evaluation=False)
    # the dev loader reuses the vocab built from the training data
    vocab = train_batch.vocab
    dev_doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    dev_batch = DataLoader(dev_doc, args['batch_size'], args, pretrain, vocab=vocab, evaluation=True, sort_during_eval=True)

    # pred and gold path
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        print("Skip training because no data available...")
        sys.exit(0)

    print("Training parser...")
    trainer = Trainer(args=args, vocab=vocab, pretrain=pretrain, use_cuda=args['cuda'])

    global_step = 0
    max_steps = args['max_steps']
    dev_score_history = []
    best_dev_preds = []
    # only used in the progress line below; never updated in this function
    current_lr = args['lr']
    global_start_time = time.time()
    format_str = '{}: step {}/{}, loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

    using_amsgrad = False
    last_best_step = 0
    # start training
    train_loss = 0
    while True:
        # set when the inner loop wants the outer `while` to stop as well
        do_break = False
        for i, batch in enumerate(train_batch):
            start_time = time.time()
            global_step += 1
            loss = trainer.update(batch, eval=False) # update step
            train_loss += loss
            if global_step % args['log_step'] == 0:
                duration = time.time() - start_time
                print(format_str.format(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), global_step,\
                        max_steps, loss, duration, current_lr))

            if global_step % args['eval_interval'] == 0:
                # eval on dev
                print("Evaluating on dev set...")
                dev_preds = []
                for batch in dev_batch:
                    preds = trainer.predict(batch)
                    dev_preds += preds
                dev_preds = utils.unsort(dev_preds, dev_batch.data_orig_idx)

                dev_batch.doc.set([HEAD, DEPREL], [y for x in dev_preds for y in x])
                CoNLL.dict2conll(dev_batch.doc.to_dict(), system_pred_file)
                _, _, dev_score = scorer.score(system_pred_file, gold_file)

                train_loss = train_loss / args['eval_interval'] # avg loss per batch
                print("step {}: train_loss = {:.6f}, dev_score = {:.4f}".format(global_step, train_loss, dev_score))
                # restart the running loss for the next evaluation window
                train_loss = 0

                # save best model
                if len(dev_score_history) == 0 or dev_score > max(dev_score_history):
                    last_best_step = global_step
                    trainer.save(model_file)
                    print("new best model saved.")
                    best_dev_preds = dev_preds

                dev_score_history += [dev_score]
                print("")

            if global_step - last_best_step >= args['max_steps_before_stop']:
                if not using_amsgrad:
                    # first stall: swap the optimizer and restart the patience
                    # counter instead of stopping
                    print("Switching to AMSGrad")
                    last_best_step = global_step
                    using_amsgrad = True
                    trainer.optimizer = optim.Adam(trainer.model.parameters(), amsgrad=True, lr=args['lr'], betas=(.9, args['beta2']), eps=1e-6)
                else:
                    # second stall: give up
                    do_break = True
                    break

            if global_step >= args['max_steps']:
                do_break = True
                break

        if do_break: break

        train_batch.reshuffle()

    print("Training ended with {} steps.".format(global_step))

    # best_eval is the 1-based index of the best evaluation round
    best_f, best_eval = max(dev_score_history)*100, np.argmax(dev_score_history)+1
    print("Best dev F1 = {:.2f}, at iteration = {}".format(best_f, best_eval * args['eval_interval']))
def forward(self, word, word_mask, wordchars, wordchars_mask, upos, xpos, ufeats, pretrained, word_orig_idx, sentlens, wordlens, orig_idx=None, morph_dict=None, start=None, end=None):
    """Compute the tagging loss and predictions, optionally post-filtered by a dictionary.

    The first part embeds the inputs, runs ``self.taggerlstm`` and produces
    UPOS, XPOS and UFeats predictions while accumulating the loss from
    ``self.crit``.  When ``morph_dict`` is truthy, the predictions are then
    compared word by word with the analyses returned by ``morph_dict.find``
    (and, when ``self.args['lang'] == 'lt'``, by a Hunspell checker) and
    entries of ``preds`` are overwritten in place.

    Args:
        orig_idx: order used to unsort the batch in the post-filter; it is
            also searched with ``orig_idx.index(p)``, so it must support
            ``.index``.
        morph_dict: object with a ``find(word)`` method returning
            ``(lemma, upos, xpos, feats)``; post-filtering is skipped when
            it is falsy.
        start, end: slice bounds into ``self.doc.sentences`` selecting the
            sentences of this batch.

    Returns:
        ``(loss, preds)`` where ``preds`` is the list
        ``[upos, xpos, ufeats]`` of predicted index tensors.

    NOTE(review): ``best_5`` and ``upos_temp`` are only bound when
    ``self.share_hid`` is false and ``self.training`` is false, yet the
    post-filter reads both; calling with ``morph_dict`` in any other mode
    looks like it would raise NameError - confirm with callers.
    NOTE(review): ``pad`` reads ``word_emb.batch_sizes``, which is only bound
    when ``self.args['word_emb_dim'] > 0``.
    """

    def pack(x):
        # Packs a Tensor containing padded sequences of variable length.
        return pack_padded_sequence(x, sentlens, batch_first=True)

    inputs = []
    if self.args['word_emb_dim'] > 0:
        word_emb = self.word_emb(word)
        word_emb = pack(word_emb)
        inputs += [word_emb]

    if self.args['pretrain']:
        pretrained_emb = self.pretrained_emb(pretrained)
        pretrained_emb = self.trans_pretrained(pretrained_emb)
        pretrained_emb = pack(pretrained_emb)
        inputs += [pretrained_emb]

    def pad(x):
        # inverse operation to pack_padded_sequence(). Pads a packed batch of variable length sequences.
        return pad_packed_sequence(PackedSequence(x, word_emb.batch_sizes), batch_first=True)[0]

    if self.args['char'] and self.args['char_emb_dim'] > 0:
        char_reps = self.charmodel(wordchars, wordchars_mask, word_orig_idx, sentlens, wordlens)
        char_reps = PackedSequence(self.trans_char(self.drop(char_reps.data)), char_reps.batch_sizes)
        inputs += [char_reps]

    # concatenate the packed data of every enabled input along dim 1
    lstm_inputs = torch.cat([x.data for x in inputs],1)
    lstm_inputs = self.worddrop(lstm_inputs, self.drop_replacement)
    lstm_inputs = self.drop(lstm_inputs)
    lstm_inputs = PackedSequence(lstm_inputs, inputs[0].batch_sizes)

    lstm_outputs, _ = self.taggerlstm(lstm_inputs, sentlens, hx=(
        self.taggerlstm_h_init.expand(2 * self.args['num_layers'], word.size(0), self.args['hidden_dim']).contiguous(),
        self.taggerlstm_c_init.expand(2 * self.args['num_layers'], word.size(0), self.args['hidden_dim']).contiguous()))
    lstm_outputs = lstm_outputs.data

    upos_hid = F.relu(self.upos_hid(self.drop(lstm_outputs)))
    upos_pred = self.upos_clf(self.drop(upos_hid))

    # preds[0]: argmax UPOS index per padded position
    preds = [pad(upos_pred).max(2)[1]]

    upos = pack(upos).data
    loss = self.crit(upos_pred.view(-1, upos_pred.size(-1)), upos.view(-1))

    if self.share_hid:
        xpos_hid = upos_hid
        ufeats_hid = upos_hid

        clffunc = lambda clf, hid: clf(self.drop(hid))
    else:
        xpos_hid = F.relu(self.xpos_hid(self.drop(lstm_outputs)))
        ufeats_hid = F.relu(self.ufeats_hid(self.drop(lstm_outputs)))

        # this is where we get upos embeddings
        if self.training:
            upos_emb = self.upos_emb(upos)
        else:
            # get the top 5 upos predictions
            best_5 = [sorted(range(len(x)), key=lambda i: x[i], reverse=True)[:5] for x in upos_pred]
            # save upos emb for later
            upos_temp = self.upos_emb
            upos_emb = self.upos_emb(upos_pred.max(1)[1])

        clffunc = lambda clf, hid: clf(self.drop(hid), self.drop(upos_emb))  # ORG

    xpos = pack(xpos).data
    if isinstance(self.vocab['xpos'], CompositeVocab):
        # composite XPOS: one classifier per component, concatenated on dim 2
        xpos_preds = []
        for i in range(len(self.vocab['xpos'])):
            xpos_pred = clffunc(self.xpos_clf[i], xpos_hid)
            loss += self.crit(xpos_pred.view(-1, xpos_pred.size(-1)), xpos[:, i].view(-1))
            xpos_preds.append(pad(xpos_pred).max(2, keepdim=True)[1])
        preds.append(torch.cat(xpos_preds, 2))
    else:
        xpos_pred = clffunc(self.xpos_clf, xpos_hid)
        loss += self.crit(xpos_pred.view(-1, xpos_pred.size(-1)), xpos.view(-1))
        preds.append(pad(xpos_pred).max(2)[1])

    ufeats_preds = []
    ufeats = pack(ufeats).data
    for i in range(len(self.vocab['feats'])):
        ufeats_pred = clffunc(self.ufeats_clf[i], ufeats_hid)
        loss += self.crit(ufeats_pred.view(-1, ufeats_pred.size(-1)), ufeats[:, i].view(-1))
        ufeats_preds.append(pad(ufeats_pred).max(2, keepdim=True)[1])
    preds.append(torch.cat(ufeats_preds,2))

    # post-filter only if a morphological dictionary is present
    if morph_dict:
        # get the most likely ufeats tag for each top 5 upos tags predicted for a word
        feats_coeffs = list()
        for r in range(5):
            # condition ufeats on a different upos tag embedding each time
            upos_2 = torch.LongTensor([x[r] for x in best_5])
            upos_emb2 = upos_temp(upos_2)
            clffunc_temp = lambda clf, hid: clf(self.drop(hid), self.drop(upos_emb2))
            ufeats_preds_temp = []
            for i in range(len(self.vocab['feats'])):
                ufeats_pred = clffunc_temp(self.ufeats_clf[i], ufeats_hid)
                ufeats_preds_temp.append(pad(ufeats_pred).max(2, keepdim=True)[1])
            feats_coeffs.append(torch.cat(ufeats_preds_temp, 2))

        # unmap all tags into readable format and unsort them into the original order that matches the sentence order
        upos_seqs = [self.vocab['upos'].unmap(up) for up in preds[0].tolist()]
        xpos_seqs = [self.vocab['xpos'].unmap(up) for up in preds[1].tolist()]
        feats_seqs = [self.vocab['feats'].unmap(up) for up in preds[2].tolist()]
        pred_tokens = [[[upos_seqs[i][j], xpos_seqs[i][j], feats_seqs[i][j]] for j in range(sentlens[i])] for i in range(word.size(0))]
        pred_tokens = utils.unsort(pred_tokens, orig_idx)

        # pair the tags with the right words in the right sentences.
        sntncs = self.doc.sentences[start:end]
        sent_tokens = [[x.text for x in sent.tokens] for sent in sntncs]
        pair = [x for x in zip(sent_tokens, pred_tokens)]

        # 5 most likely upos tags for the token
        coeff = utils.unsort(pad(upos_pred).tolist(), orig_idx)
        coeff_max = [[sorted(range(len(x)), key=lambda i: x[i], reverse=True)[:5] for x in y] for y in coeff]

        # the most likely feats tag for each top 5 predicted upos tag
        fct = []
        for f in feats_coeffs:
            fct.append(utils.unsort(f, orig_idx))
        # regroup so the five alternatives of a sentence sit side by side
        fct2 = [list(zip(*[fct[0][i], fct[1][i], fct[2][i], fct[3][i], fct[4][i]])) for i in range(len(fct[0]))]
        feats_coeffs = [[list(j[i]) for i in range(len(j))] for j in fct2]

        # initialise hunspell for Lithuanian
        if self.args['lang'] == 'lt':
            root = os.path.dirname(os.getcwd())
            hunspell = Hunchecker('lt-LT_morphology', root + '/data_files/hunspell')

        print('Post-filtering...')
        for p in range(len(pair)):  # get a sentence
            words = pair[p][0]
            tags = pair[p][1]
            a = 0
            while a < len(words):
                # from here on lemma/upos/xpos/feats are the dictionary's
                # candidate analyses for this word, shadowing the arguments
                lemma, upos, xpos, feats = morph_dict.find(words[a])
                if upos is None:
                    lemma, upos, xpos, feats = morph_dict.find(words[a].lower())
                else:
                    # merge in analyses of the lowercased form that are new
                    lemma2, upos2, xpos2, feats2 = morph_dict.find(words[a].lower())
                    if lemma2:
                        for i in range(len(lemma2)):
                            if upos2[i] not in upos or feats2[i] not in feats:
                                lemma += [lemma2[i]]
                                upos += [upos2[i]]
                                xpos += [xpos2[i]]
                                feats += [feats2[i]]
                if self.args['lang'] == 'lt':
                    if upos is None:
                        lemma, upos, xpos, feats = hunspell.hunspell_to_conll(words[a])
                    else:
                        lemma_h, upos_h, xpos_h, feats_h = hunspell.hunspell_to_conll(words[a])
                        if upos_h is not None:
                            for i in range(len(upos_h)):
                                if upos_h[i] not in upos or feats_h[i] not in feats:
                                    lemma += [lemma_h[i]]
                                    upos += [upos_h[i]]
                                    xpos += [xpos_h[i]]
                                    feats += [feats_h[i]]
                if upos is not None:
                    if tags[a][0] not in upos:
                        # predicted UPOS is not among the candidates: pick a replacement
                        new_upos = None
                        tag_idx = None
                        if len(upos) > 1:
                            # the top-ranked tag is skipped: it is the prediction already rejected above
                            max_values = self.vocab['upos'].unmap(coeff_max[p][a][1:])
                            # go through the values in the order of the most likely one
                            for m in range(len(max_values)):  # for every max upos tag
                                # found one of the possible predicted values in the upos list
                                if max_values[m] in upos:
                                    indices = [i for i, x in enumerate(upos) if x == max_values[m]]
                                    if len(indices) > 1:  # more than one upos list items matches the max value item
                                        # check if an exact match can be found, using the most informative ufeats tag
                                        for d in indices:
                                            if feats[d] == self.vocab['feats'].unmap(feats_coeffs[p][a][1:])[m] and \
                                                    upos[d] == max_values[m]:
                                                new_upos = upos[d]
                                                tag_idx = d
                                                break
                                    if len(indices) == 1 or new_upos is None:
                                        new_upos = max_values[m]
                                        tag_idx = upos.index(max_values[m])
                                    break
                            if new_upos is None:  # last resort
                                new_upos = upos[0]
                                tag_idx = 0
                        else:  # only one item in upos list
                            new_upos = upos[0]
                            tag_idx = 0
                        new_xpos = xpos[tag_idx]
                        new_feats = feats[tag_idx]
                        # let the tagger deal with multiword tokens itself
                        if ('Hyph=Yes' not in new_feats and 'Hyph=Yes' in tags[a][2]) or (
                                'Hyph=Yes' in new_feats and 'Hyph=Yes' not in tags[a][2]):
                            new_upos = new_xpos = new_feats = None
                        if new_upos is not None:
                            # orig_idx.index(p) is the batch row that sentence p came from
                            preds[0][orig_idx.index(p)][a] = self.vocab['upos'].map([new_upos])[0]
                            # sme has a 2D torch here, LT has 3D
                            if not isinstance(self.vocab['xpos'], CompositeVocab):
                                preds[1][orig_idx.index(p)][a] = self.vocab['xpos'].map([new_xpos])[0]
                            else:
                                preds[1][orig_idx.index(p)][a] = torch.LongTensor(
                                    self.vocab['xpos'].map([new_xpos])[0])
                            preds[2][orig_idx.index(p)][a] = torch.LongTensor(
                                self.vocab['feats'].map([new_feats])[0])
                    else:
                        # predicted UPOS is a candidate: only XPOS/feats may be corrected
                        new_xpos = new_feats = None
                        all_found = False
                        # 1) exact (upos, xpos, feats) match: nothing to change
                        for x in range(len(xpos)):
                            if tags[a][1] == xpos[x] and tags[a][2] == feats[x] and upos[x] == tags[a][0]:
                                all_found = True
                                break
                        # 2) a single candidate, or all candidates agree on upos and feats
                        if not all_found:
                            if len(upos) == 1 or (False not in [feats[a] == feats[a + 1] for a in range(len(feats) - 1)] and False not in [
                                    upos[a] == upos[a + 1] for a in range(len(upos) - 1)]):
                                new_feats = feats[0]
                                if '*' not in tags[a][1]:
                                    new_xpos = xpos[0]
                                all_found = True
                        # 3) exactly one candidate carries the predicted UPOS
                        if not all_found:
                            if len([i for i, x in enumerate(upos) if x == tags[a][0]]) == 1:
                                new_feats = feats[upos.index(tags[a][0])]
                                if '*' not in tags[a][1]:
                                    new_xpos = xpos[upos.index(tags[a][0])]
                                all_found = True
                        # 4) match on feats first, then fall back to matching on xpos
                        if not all_found:
                            found_ft = False
                            for x in range(len(xpos)):
                                if tags[a][2] == feats[x] and upos[x] == tags[a][0]:
                                    found_ft = True
                                    if xpos[x] != tags[a][1] and '*' not in tags[a][1]:
                                        new_xpos = xpos[x]
                                    break
                            if not found_ft:
                                for x in range(len(xpos)):
                                    if tags[a][1] == xpos[x] and tags[a][2] != feats[x] and upos[x] == tags[a][0]:
                                        new_feats = feats[x]
                                        break
                        if new_feats:
                            if ('Hyph=Yes' not in new_feats and 'Hyph=Yes' in tags[a][2]) or (
                                    'Hyph=Yes' in new_feats and 'Hyph=Yes' not in tags[a][2]):
                                # let the tagger deal with multiword tokens itself
                                new_xpos = new_feats = None
                        if new_xpos is not None:
                            # non composite has a 2D torch here, composite has 3D
                            if not isinstance(self.vocab['xpos'], CompositeVocab):
                                preds[1][orig_idx.index(p)][a] = self.vocab['xpos'].map([new_xpos])[0]
                            else:
                                preds[1][orig_idx.index(p)][a] = torch.LongTensor(
                                    self.vocab['xpos'].map([new_xpos])[0])
                        if new_feats is not None:
                            preds[2][orig_idx.index(p)][a] = torch.LongTensor(
                                self.vocab['feats'].map([new_feats])[0])
                a += 1
        print('Post-filtering complete.')
    return loss, preds
def predict(self, batch, unsort=True):
    """Return, per token, the head, relation and score from each of the k-best trees.

    For every sentence, ``alternatives.GetKBest`` is called on the head
    scores with the settings stored on ``self``; each non-root position then
    collects one head, one unmapped relation and one tree score per returned
    tree.

    Args:
        batch: a batch accepted by ``unpack_batch``.
        unsort: when true, sentences are reordered with ``utils.unsort``.

    Returns:
        One list per sentence, holding ``[heads, deprels, scores]`` for each
        of its ``sentlens[i] - 1`` non-root positions.
    """
    inputs, orig_idx, word_orig_idx, sentlens, wordlens = unpack_batch(batch, self.use_cuda)
    (word, word_mask, wordchars, wordchars_mask, upos, xpos, ufeats,
     pretrained, lemma, head, deprel) = inputs

    self.model.eval()
    batch_size = word.size(0)
    _, preds = self.model(word, word_mask, wordchars, wordchars_mask, upos,
                          xpos, ufeats, pretrained, lemma, head, deprel,
                          word_orig_idx, sentlens, wordlens)

    # head_seqs is dimension (sentence, word)
    # preds[0][i] is the adjacency matrix for the i-th sentence
    # preds[1][i] is the deprel adjacency matrix for the i-th sentence
    head_seqs, deprel_seqs, score_seqs = [], [], []

    # get the head graph and the deprel map for each sentence
    for i, (head, deps) in enumerate(zip(preds[0], preds[1])):

        def edge_type(edge):
            # relation label of the edge u -> v in the current sentence
            return self.vocab['deprel'].unmap((deps[edge.v][edge.u], ))[0]

        k_best = alternatives.GetKBest(
            head,
            self._n_trees,
            self._kalm_shuffle,
            edge_type,
            self._automatic_n_parses,
        )

        sent_heads, sent_deprels, sent_scores = [], [], []
        for j in range(sentlens[i] - 1):
            token_heads, token_deprels, token_scores = [], [], []
            for tree, score in k_best:
                token_scores.append(score)
                # position 0 is the root, so token j lives at index j + 1
                in_edge = tree[j + 1]
                source = in_edge.u - 1
                token_heads.append(source + 1)
                label = self.vocab['deprel'].unmap((deps[j + 1][source + 1], ))[0]
                token_deprels.append(label)
            sent_heads.append(token_heads)
            sent_deprels.append(token_deprels)
            sent_scores.append(token_scores)

        head_seqs.append(sent_heads)
        deprel_seqs.append(sent_deprels)
        score_seqs.append(sent_scores)

    pred_tokens = []
    for i in range(batch_size):
        sentence = [[head_seqs[i][j], deprel_seqs[i][j], score_seqs[i][j]]
                    for j in range(sentlens[i] - 1)]
        pred_tokens.append(sentence)

    if unsort:
        pred_tokens = utils.unsort(pred_tokens, orig_idx)
    return pred_tokens