Code example #1
File: test_utils.py Project: stanfordnlp/stanza
def test_empty_sort_with_indices():
    ordered, orig_idx = utils.sort_with_indices([])
    assert len(ordered) == 0
    assert len(orig_idx) == 0

    unsorted = utils.unsort(ordered, orig_idx)
    assert [] == unsorted
Code example #2
File: trainer.py Project: Pandinosaurus/stanfordnlp
    def predict(self, batch, unsort=True):
        inputs, orig_idx, word_orig_idx, char_orig_idx, sentlens, wordlens, charlens, charoffsets = unpack_batch(
            batch, self.use_cuda)
        word, word_mask, wordchars, wordchars_mask, chars, tags = inputs

        self.model.eval()
        batch_size = word.size(0)
        _, logits, trans = self.model(word, word_mask, wordchars,
                                      wordchars_mask, tags, word_orig_idx,
                                      sentlens, wordlens, chars, charoffsets,
                                      charlens, char_orig_idx)

        # decode
        trans = trans.data.cpu().numpy()
        scores = logits.data.cpu().numpy()
        bs = logits.size(0)
        tag_seqs = []
        for i in range(bs):
            tags, _ = viterbi_decode(scores[i, :sentlens[i]], trans)
            tags = self.vocab['tag'].unmap(tags)
            tag_seqs += [tags]

        if unsort:
            tag_seqs = utils.unsort(tag_seqs, orig_idx)
        return tag_seqs
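
The viterbi_decode call above is the standard first-order Viterbi decoder over per-token tag scores plus a tag-transition matrix. A minimal numpy sketch of a compatible function, assuming scores of shape [seq_len, num_tags] and trans of shape [num_tags, num_tags] (an illustration of the technique, not necessarily stanza's exact code):

import numpy as np

def viterbi_decode(score, transition_params):
    # trellis[t, j]: best score of any tag path that ends in tag j at step t
    trellis = np.zeros_like(score)
    backpointers = np.zeros_like(score, dtype=np.int32)
    trellis[0] = score[0]
    for t in range(1, score.shape[0]):
        # v[i, j] = score of being in tag i at t-1 and transitioning to tag j
        v = np.expand_dims(trellis[t - 1], 1) + transition_params
        trellis[t] = score[t] + np.max(v, 0)
        backpointers[t] = np.argmax(v, 0)
    # follow the backpointers from the best final tag
    viterbi = [int(np.argmax(trellis[-1]))]
    for bp in reversed(backpointers[1:]):
        viterbi.append(bp[viterbi[-1]])
    viterbi.reverse()
    return viterbi, np.max(trellis[-1])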
Code example #3
    def process(self, document):
        try:
            batch = DataLoader(document,
                               self.config['batch_size'],
                               self.config,
                               self.pretrain,
                               vocab=self.vocab,
                               evaluation=True,
                               sort_during_eval=self.config.get(
                                   'sort_during_eval', True),
                               min_length_to_batch_separately=self.config.get(
                                   'min_length_to_batch_separately',
                                   DEFAULT_SEPARATE_BATCH))
            preds = []
            for i, b in enumerate(batch):
                preds += self.trainer.predict(b)
            if batch.data_orig_idx is not None:
                preds = unsort(preds, batch.data_orig_idx)
            batch.doc.set([doc.HEAD, doc.DEPREL],
                          [y for x in preds for y in x])
            # build dependencies based on predictions
            for sentence in batch.doc.sentences:
                sentence.build_dependencies()
            return batch.doc
        except RuntimeError as e:
            if str(e).startswith("CUDA out of memory. Tried to allocate"):
                new_message = (str(e) +
                               " ... You may be able to compensate for this by separating long sentences into their own batch with a parameter such as depparse_min_length_to_batch_separately=150 or by limiting the overall batch size with depparse_batch_size=400.")
                raise RuntimeError(new_message) from e
            else:
                raise
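
The error message above names the two knobs for working around CUDA out-of-memory failures in the dependency parser. A hypothetical usage sketch (the exact Pipeline keyword names depend on the stanza version; the values here are just the ones suggested in the message):

import stanza

# Parse unusually long sentences in their own batches and cap the
# overall depparse batch size to reduce peak GPU memory.
nlp = stanza.Pipeline('en',
                      processors='tokenize,pos,lemma,depparse',
                      depparse_min_length_to_batch_separately=150,
                      depparse_batch_size=400)
doc = nlp('Some long document ...')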
Code example #4
File: test_utils.py Project: stanfordnlp/stanza
def test_split_into_batches():
    data = []
    for i in range(5):
        data.append(["Unban", "mox", "opal", str(i)])

    data.append(["Do", "n't", "ban", "Urza", "'s", "Saga", "that", "card", "is", "great"])
    data.append(["Ban", "Ragavan"])

    # small batches will put one element in each interval
    batches = utils.split_into_batches(data, 5)
    assert batches == [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7)]

    # this one has a batch interrupted in the middle by a large element
    batches = utils.split_into_batches(data, 8)
    assert batches == [(0, 2), (2, 4), (4, 5), (5, 6), (6, 7)]

    # this one has the large element at the start of its own batch
    batches = utils.split_into_batches(data[1:], 8)
    assert batches == [(0, 2), (2, 4), (4, 5), (5, 6)]

    # overloading the test: assert that key & reverse are working
    ordered, orig_idx = utils.sort_with_indices(data, key=len, reverse=True)
    assert [len(x) for x in ordered] == [10, 4, 4, 4, 4, 4, 2]

    # this has the large element at the start
    batches = utils.split_into_batches(ordered, 8)
    assert batches == [(0, 1), (1, 3), (3, 5), (5, 7)]

    # double check that unsort is working as expected
    assert data == utils.unsort(ordered, orig_idx)
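
These assertions are consistent with a simple greedy batching strategy: scan the data in order and close the current (start, end) interval whenever adding the next item would push the running token count past the limit, so an oversized item always lands in an interval of its own. A minimal sketch under that assumption (the real stanza implementation may differ in detail):

def split_into_batches(data, batch_size):
    # Greedy pass: each interval holds at least one item, and grows only
    # while the total token count stays within batch_size.
    intervals = []
    start, current = 0, 0
    for end, item in enumerate(data):
        if end > start and current + len(item) > batch_size:
            intervals.append((start, end))
            start, current = end, 0
        current += len(item)
    if start < len(data):
        intervals.append((start, len(data)))
    return intervals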
Code example #5
    def predict(self, batch, unsort=True):
        inputs, orig_idx, word_orig_idx, sentlens, wordlens = unpack_batch(
            batch, self.use_cuda)
        word, word_mask, wordchars, wordchars_mask, upos, xpos, ufeats, pretrained, lemma, head, deprel = inputs

        self.model.eval()
        batch_size = word.size(0)
        _, preds = self.model(word, word_mask, wordchars, wordchars_mask, upos,
                              xpos, ufeats, pretrained, lemma, head, deprel,
                              word_orig_idx, sentlens, wordlens)
        head_seqs = [
            chuliu_edmonds_one_root(adj[:l, :l])[1:]
            for adj, l in zip(preds[0], sentlens)
        ]  # remove attachment for the root
        deprel_seqs = [
            self.vocab['deprel'].unmap(
                [preds[1][i][j + 1][h] for j, h in enumerate(hs)])
            for i, hs in enumerate(head_seqs)
        ]
        deprel_prob_seqs = [[preds[2][i][j + 1] for j, h in enumerate(hs)]
                            for i, hs in enumerate(head_seqs)]

        pred_tokens = [[[
            str(head_seqs[i][j]), deprel_seqs[i][j], deprel_prob_seqs[i][j]
        ] for j in range(sentlens[i] - 1)] for i in range(batch_size)]
        if unsort:
            pred_tokens = utils.unsort(pred_tokens, orig_idx)
        return pred_tokens
Code example #6
File: trainer.py Project: Pandinosaurus/stanfordnlp
    def predict(self, batch, unsort=True):
        inputs, orig_idx, word_orig_idx, sentlens, wordlens = unpack_batch(
            batch, self.use_cuda)
        word, word_mask, wordchars, wordchars_mask, upos, xpos, ufeats, pretrained = inputs

        self.model.eval()
        batch_size = word.size(0)
        _, preds = self.model(word, word_mask, wordchars, wordchars_mask, upos,
                              xpos, ufeats, pretrained, word_orig_idx,
                              sentlens, wordlens)
        upos_seqs = [
            self.vocab['upos'].unmap(sent) for sent in preds[0].tolist()
        ]
        xpos_seqs = [
            self.vocab['xpos'].unmap(sent) for sent in preds[1].tolist()
        ]
        feats_seqs = [
            self.vocab['feats'].unmap(sent) for sent in preds[2].tolist()
        ]

        pred_tokens = [[[upos_seqs[i][j], xpos_seqs[i][j], feats_seqs[i][j]]
                        for j in range(sentlens[i])]
                       for i in range(batch_size)]
        if unsort:
            pred_tokens = utils.unsort(pred_tokens, orig_idx)
        return pred_tokens
Code example #7
File: test_utils.py Project: stanfordnlp/stanza
def test_sort_with_indices():
    data = [[1, 2, 3], [4, 5], [6]]
    ordered, orig_idx = utils.sort_with_indices(data, key=len)
    assert ordered == ([6], [4, 5], [1, 2, 3])
    assert orig_idx == (2, 1, 0)

    unsorted = utils.unsort(ordered, orig_idx)
    assert data == unsorted
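
Taken together with example #1, this test pins down the contract: sort_with_indices returns the sorted items together with each item's original position, and unsort inverts the permutation. A minimal sketch consistent with these assertions (not necessarily stanza's exact implementation):

def sort_with_indices(data, key=None, reverse=False):
    # Pair each item with its original index, sort the pairs, and return
    # the items and indices as two parallel tuples.
    if not data:
        return [], []
    wrapped_key = None if key is None else (lambda pair: key(pair[0]))
    pairs = sorted(zip(data, range(len(data))), key=wrapped_key, reverse=reverse)
    ordered, orig_idx = zip(*pairs)
    return ordered, orig_idx

def unsort(sorted_data, orig_idx):
    # Put each sorted element back at the index it originally came from.
    result = [None] * len(sorted_data)
    for item, idx in zip(sorted_data, orig_idx):
        result[idx] = item
    return result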
Code example #8
    def predict(self, batch, beam_size=1):
        inputs, orig_idx = unpack_batch(batch, self.use_cuda)
        src, src_mask, tgt, tgt_mask, pos, edits = inputs

        self.model.eval()
        batch_size = src.size(0)
        preds, edit_logits = self.model.predict(src, src_mask, pos=pos, beam_size=beam_size)
        pred_seqs = [self.vocab['char'].unmap(ids) for ids in preds] # unmap to tokens
        pred_seqs = utils.prune_decoded_seqs(pred_seqs)
        pred_tokens = ["".join(seq) for seq in pred_seqs] # join chars to be tokens
        pred_tokens = utils.unsort(pred_tokens, orig_idx)
        if self.args.get('edit', False):
            assert edit_logits is not None
            edits = np.argmax(edit_logits.data.cpu().numpy(), axis=1).reshape([batch_size]).tolist()
            edits = utils.unsort(edits, orig_idx)
        else:
            edits = None
        return pred_tokens, edits
Code example #9
    def get_representation(self, chars, charoffsets, charlens, char_orig_idx):
        with torch.no_grad():
            output, _, _ = self.forward(chars, charlens)
            res = [output[i, offsets] for i, offsets in enumerate(charoffsets)]
            res = unsort(res, char_orig_idx)
            res = pack_sequence(res)
            if self.pad:
                res = pad_packed_sequence(res, batch_first=True)[0]
        return res
Code example #10
def evaluate(args):
    # file paths
    system_pred_file = args['output_file']
    gold_file = args['gold_file']
    model_file = args['save_dir'] + '/' + args['save_name'] if args['save_name'] is not None \
            else '{}/{}_tagger.pt'.format(args['save_dir'], args['shorthand'])

    # load pretrain; note that we allow the pretrain_file to be non-existent
    pretrain_file = '{}/{}.pretrain.pt'.format(args['save_dir'],
                                               args['shorthand'])
    pretrain = Pretrain(pretrain_file)

    # load model
    print("Loading model from: {}".format(model_file))
    use_cuda = args['cuda'] and not args['cpu']
    trainer = Trainer(pretrain=pretrain,
                      model_file=model_file,
                      use_cuda=use_cuda)
    loaded_args, vocab = trainer.args, trainer.vocab

    # load config
    for k in args:
        if k.endswith('_dir') or k.endswith('_file') or k == 'shorthand' or k == 'mode':
            loaded_args[k] = args[k]

    # load data
    print("Loading data with batch size {}...".format(args['batch_size']))
    doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    batch = DataLoader(doc,
                       args['batch_size'],
                       loaded_args,
                       pretrain,
                       vocab=vocab,
                       evaluation=True,
                       sort_during_eval=True)
    if len(batch) > 0:
        print("Start evaluation...")
        preds = []
        for i, b in enumerate(batch):
            preds += trainer.predict(b)
    else:
        # skip eval if dev data does not exist
        preds = []
    preds = utils.unsort(preds, batch.data_orig_idx)

    # write to file and score
    batch.doc.set([UPOS, XPOS, FEATS], [y for x in preds for y in x])
    CoNLL.dict2conll(batch.doc.to_dict(), system_pred_file)

    if gold_file is not None:
        _, _, score = scorer.score(system_pred_file, gold_file)

        print("Tagger score:")
        print("{} {:.2f}".format(args['shorthand'], score * 100))
Code example #11
def evaluate(args):
    # file paths
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    model_file = model_file_name(args)
    # load pretrained vectors if needed
    pretrain = load_pretrain(args)

    # load model
    logger.info("Loading model from: {}".format(model_file))
    use_cuda = args['cuda'] and not args['cpu']
    trainer = Trainer(pretrain=pretrain,
                      model_file=model_file,
                      use_cuda=use_cuda)
    loaded_args, vocab = trainer.args, trainer.vocab

    # load config
    for k in args:
        if k.endswith('_dir') or k.endswith('_file') or k == 'shorthand' or k == 'mode':
            loaded_args[k] = args[k]

    # load data
    logger.info("Loading data with batch size {}...".format(
        args['batch_size']))
    doc = CoNLL.conll2doc(input_file=args['eval_file'])
    batch = DataLoader(doc,
                       args['batch_size'],
                       loaded_args,
                       pretrain,
                       vocab=vocab,
                       evaluation=True,
                       sort_during_eval=True)

    if len(batch) > 0:
        logger.info("Start evaluation...")
        preds = []
        for i, b in enumerate(batch):
            preds += trainer.predict(b)
    else:
        # skip eval if dev data does not exist
        preds = []
    preds = utils.unsort(preds, batch.data_orig_idx)

    # write to file and score
    batch.doc.set([HEAD, DEPREL], [y for x in preds for y in x])
    CoNLL.write_doc2conll(batch.doc, system_pred_file)

    if gold_file is not None:
        _, _, score = scorer.score(system_pred_file, gold_file)

        logger.info("Parser score:")
        logger.info("{} {:.2f}".format(args['shorthand'], score * 100))
Code example #12
File: trainer.py Project: sarves/stanza-for-Tamil
    def predict(self, batch, unsort=True):
        inputs, orig_idx = unpack_batch(batch, self.use_cuda)
        src, src_mask, tgt, tgt_mask = inputs

        self.model.eval()
        batch_size = src.size(0)
        preds, _ = self.model.predict(src, src_mask, self.args['beam_size'])
        pred_seqs = [self.vocab.unmap(ids) for ids in preds]  # unmap to tokens
        pred_seqs = utils.prune_decoded_seqs(pred_seqs)
        pred_tokens = ["".join(seq)
                       for seq in pred_seqs]  # join chars to be tokens
        if unsort:
            pred_tokens = utils.unsort(pred_tokens, orig_idx)
        return pred_tokens
Code example #13
    def process(self, document):
        batch = DataLoader(document,
                           self.config['batch_size'],
                           self.config,
                           self.pretrain,
                           vocab=self.vocab,
                           evaluation=True,
                           sort_during_eval=True)
        preds = []
        for i, b in enumerate(batch):
            preds += self.trainer.predict(b)
        preds = unsort(preds, batch.data_orig_idx)
        batch.doc.set([doc.UPOS, doc.XPOS, doc.FEATS],
                      [y for x in preds for y in x])
        return batch.doc
Code example #14
    def process(self, document):
        batch = DataLoader(document,
                           self.config['batch_size'],
                           self.config,
                           self.pretrain,
                           vocab=self.vocab,
                           evaluation=True,
                           sort_during_eval=True)
        preds = []
        for i, b in enumerate(batch):
            preds += self.trainer.predict(b)
        preds = unsort(preds, batch.data_orig_idx)
        batch.doc.set([doc.HEAD, doc.DEPREL], [y for x in preds for y in x])
        # build dependencies based on predictions
        for sentence in batch.doc.sentences:
            sentence.build_dependencies()
        return batch.doc
Code example #15
    def build_char_representation(self, all_word_labels, device, forward):
        CHARLM_START = "\n"
        CHARLM_END = " "

        if forward:
            charlm = self.forward_charlm
            vocab = self.forward_charlm_vocab
        else:
            charlm = self.backward_charlm
            vocab = self.backward_charlm_vocab

        all_data = []
        for idx, word_labels in enumerate(all_word_labels):
            if not forward:
                # the backward charlm consumes each word's characters
                # reversed, with the words themselves in reverse order
                # (the offsets are reversed back below to compensate)
                word_labels = [x[::-1] for x in reversed(word_labels)]

            chars = [CHARLM_START]
            offsets = []
            for w in word_labels:
                chars.extend(w)
                chars.append(CHARLM_END)
                offsets.append(len(chars) - 1)
            if not forward:
                offsets.reverse()
            chars = vocab.map(chars)
            all_data.append((chars, offsets, len(chars), len(all_data)))

        all_data.sort(key=itemgetter(2), reverse=True)
        chars, char_offsets, char_lens, orig_idx = tuple(zip(*all_data))
        chars = get_long_tensor(chars,
                                len(all_data),
                                pad_id=vocab.unit2id(' ')).to(device=device)

        # TODO: surely this should be stuffed in the charlm model itself rather than done here
        with torch.no_grad():
            output, _, _ = charlm.forward(chars, char_lens)
            res = [
                output[i, offsets] for i, offsets in enumerate(char_offsets)
            ]
            res = unsort(res, orig_idx)

        return res
Code example #16
File: coptic.py Project: CopticScriptorium/stanza
    def predict(self, eval_file_or_string):
        eval_file = _read_conllu_arg(eval_file_or_string,
                                     self.feature_config,
                                     predict=True)
        doc = Document(CoNLL.conll2dict(input_file=eval_file))
        batch = DataLoader(doc,
                           self.batch_size,
                           self.loaded_args,
                           self.pretrain,
                           vocab=self.vocab,
                           evaluation=True,
                           sort_during_eval=True)

        preds = []
        if len(batch) > 0:
            for i, b in enumerate(batch):
                preds += self.trainer.predict(b)
        preds = utils.unsort(preds, batch.data_orig_idx)
        batch.doc.set([HEAD, DEPREL], [y for x in preds for y in x])

        doc_conll = CoNLL.convert_dict(batch.doc.to_dict())
        conll_string = CoNLL.conll_as_string(doc_conll)
        return conll_string
Code example #17
    def process(self, document):
        batch = DataLoader(
            document,
            self.config['batch_size'],
            self.config,
            self.pretrain,
            vocab=self.vocab,
            evaluation=True,
            sort_during_eval=self.config.get('sort_during_eval', True),
            max_sentence_size=self.config.get('max_sentence_size', None))
        preds = []
        for i, b in enumerate(batch):
            preds += self.trainer.predict(b)
        if batch.data_orig_idx is not None:
            preds = unsort(preds, batch.data_orig_idx)

        for i, sentence in enumerate(batch.doc.sentences):
            sentence.alt_score = preds[i][0][2]

        batch.doc.set([doc.ALT_HEAD, doc.ALT_DEPREL],
                      [y for x in preds for y in x])

        return batch.doc
Code example #18
    def predict(self, batch, unsort=True):
        inputs, orig_idx, word_orig_idx, sentlens, wordlens = unpack_batch(batch, self.use_cuda)
        word, word_mask, wordchars, wordchars_mask, upos, xpos, ufeats, pretrained = inputs

        self.model.eval()
        batch_size = word.size(0)
        _, preds = self.model(word, word_mask, wordchars, wordchars_mask, upos, xpos, ufeats, pretrained, word_orig_idx, sentlens, wordlens)
        n_pred = self.n_pred
        # indices of the n_pred highest-scoring entries, best first
        bpi = lambda word: np.argsort(word)[-1:-(n_pred + 1):-1]
        best_predictions = lambda vocab, word: self.vocab[vocab].unmap(bpi(word))
        # the matching scores, also in descending order
        best_scores = lambda word: np.sort(word)[-1:-(n_pred + 1):-1]
        zipper = lambda vocab, word: tuple(
            zip(best_predictions(vocab, word), best_scores(word)))

        feats_zip = lambda word: tuple(
            zip(bpi(word.detach().numpy()),
                best_scores(word.detach().numpy())))
        feats_zipper = lambda word: tuple(
            zip(self.vocab['feats'].unmap(word[0].astype(int)),
                word[1].tolist()))

        upos_seqs = [[zipper('upos', word) for word in sent] for sent in preds[0].tolist()]
        xpos_seqs = [[zipper('xpos', word) for word in sent] for sent in preds[1].tolist()]
        feats_seqs = [[[feats_zip(word) for word in sent] for sent in feat] for feat in preds[2]]
        feats_seqs = np.array(feats_seqs).transpose((1, 2, 4, 3, 0))
        feats_seqs = [[feats_zipper(word) for word in sent] for sent in feats_seqs]
        pred_tokens = [[[upos_seqs[i][j], xpos_seqs[i][j], feats_seqs[i][j]] for j in range(sentlens[i])] for i in range(batch_size)]
        if unsort:
            pred_tokens = utils.unsort(pred_tokens, orig_idx)
        return pred_tokens
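
The bpi slice above is just "indices of the n_pred largest entries, best first": argsort ascending, then walk backwards n_pred steps. A quick check:

import numpy as np

x = np.array([0.1, 0.7, 0.2])
n_pred = 2
print(np.argsort(x)[-1:-(n_pred + 1):-1])  # -> [1 2], i.e. 0.7 then 0.2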
Code example #19
def train(args):
    model_file = model_file_name(args)
    utils.ensure_dir(os.path.split(model_file)[0])

    # load pretrained vectors if needed
    pretrain = load_pretrain(args)

    # load data
    logger.info("Loading data with batch size {}...".format(
        args['batch_size']))
    train_data, _ = CoNLL.conll2dict(input_file=args['train_file'])
    # possibly augment the training data with some amount of fake data
    # based on the options chosen
    logger.info("Original data size: {}".format(len(train_data)))
    train_data.extend(
        augment_punct(train_data,
                      args['augment_nopunct'],
                      keep_original_sentences=False))
    logger.info("Augmented data size: {}".format(len(train_data)))
    train_doc = Document(train_data)
    train_batch = DataLoader(train_doc,
                             args['batch_size'],
                             args,
                             pretrain,
                             evaluation=False)
    vocab = train_batch.vocab
    dev_doc = CoNLL.conll2doc(input_file=args['eval_file'])
    dev_batch = DataLoader(dev_doc,
                           args['batch_size'],
                           args,
                           pretrain,
                           vocab=vocab,
                           evaluation=True,
                           sort_during_eval=True)

    # pred and gold path
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        logger.info("Skip training because no data available...")
        sys.exit(0)

    logger.info("Training parser...")
    trainer = Trainer(args=args,
                      vocab=vocab,
                      pretrain=pretrain,
                      use_cuda=args['cuda'])

    global_step = 0
    max_steps = args['max_steps']
    dev_score_history = []
    best_dev_preds = []
    current_lr = args['lr']
    global_start_time = time.time()
    format_str = 'Finished STEP {}/{}, loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

    using_amsgrad = False
    last_best_step = 0
    # start training
    train_loss = 0
    while True:
        do_break = False
        for i, batch in enumerate(train_batch):
            start_time = time.time()
            global_step += 1
            loss = trainer.update(batch, eval=False)  # update step
            train_loss += loss
            if global_step % args['log_step'] == 0:
                duration = time.time() - start_time
                logger.info(
                    format_str.format(global_step, max_steps, loss, duration,
                                      current_lr))

            if global_step % args['eval_interval'] == 0:
                # eval on dev
                logger.info("Evaluating on dev set...")
                dev_preds = []
                for batch in dev_batch:
                    preds = trainer.predict(batch)
                    dev_preds += preds
                dev_preds = utils.unsort(dev_preds, dev_batch.data_orig_idx)

                dev_batch.doc.set([HEAD, DEPREL],
                                  [y for x in dev_preds for y in x])
                CoNLL.write_doc2conll(dev_batch.doc, system_pred_file)
                _, _, dev_score = scorer.score(system_pred_file, gold_file)

                train_loss = train_loss / args['eval_interval']  # avg loss per batch
                logger.info(
                    "step {}: train_loss = {:.6f}, dev_score = {:.4f}".format(
                        global_step, train_loss, dev_score))
                train_loss = 0

                # save best model
                if len(dev_score_history) == 0 or dev_score > max(dev_score_history):
                    last_best_step = global_step
                    trainer.save(model_file)
                    logger.info("new best model saved.")
                    best_dev_preds = dev_preds

                dev_score_history += [dev_score]

            if global_step - last_best_step >= args['max_steps_before_stop']:
                if not using_amsgrad:
                    logger.info("Switching to AMSGrad")
                    last_best_step = global_step
                    using_amsgrad = True
                    trainer.optimizer = optim.Adam(trainer.model.parameters(),
                                                   amsgrad=True,
                                                   lr=args['lr'],
                                                   betas=(.9, args['beta2']),
                                                   eps=1e-6)
                else:
                    do_break = True
                    break

            if global_step >= args['max_steps']:
                do_break = True
                break

        if do_break: break

        train_batch.reshuffle()

    logger.info("Training ended with {} steps.".format(global_step))

    best_f = max(dev_score_history) * 100
    best_eval = np.argmax(dev_score_history) + 1
    logger.info("Best dev F1 = {:.2f}, at iteration = {}".format(
        best_f, best_eval * args['eval_interval']))
Code example #20
def train(args):
    utils.ensure_dir(args['save_dir'])
    model_file = args['save_dir'] + '/' + args['save_name'] if args['save_name'] is not None \
            else '{}/{}_parser.pt'.format(args['save_dir'], args['shorthand'])

    # load pretrained vectors if needed
    pretrain = None
    if args['pretrain']:
        vec_file = args['wordvec_file'] if args['wordvec_file'] else utils.get_wordvec_file(args['wordvec_dir'], args['shorthand'])
        pretrain_file = '{}/{}.pretrain.pt'.format(args['save_dir'], args['shorthand'])
        pretrain = Pretrain(pretrain_file, vec_file, args['pretrain_max_vocab'])

    # load data
    print("Loading data with batch size {}...".format(args['batch_size']))
    train_doc = Document(CoNLL.conll2dict(input_file=args['train_file']))
    train_batch = DataLoader(train_doc, args['batch_size'], args, pretrain, evaluation=False)
    vocab = train_batch.vocab
    dev_doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    dev_batch = DataLoader(dev_doc, args['batch_size'], args, pretrain, vocab=vocab, evaluation=True, sort_during_eval=True)

    # pred and gold path
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        print("Skip training because no data available...")
        sys.exit(0)

    print("Training parser...")
    trainer = Trainer(args=args, vocab=vocab, pretrain=pretrain, use_cuda=args['cuda'])

    global_step = 0
    max_steps = args['max_steps']
    dev_score_history = []
    best_dev_preds = []
    current_lr = args['lr']
    global_start_time = time.time()
    format_str = '{}: step {}/{}, loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

    using_amsgrad = False
    last_best_step = 0
    # start training
    train_loss = 0
    while True:
        do_break = False
        for i, batch in enumerate(train_batch):
            start_time = time.time()
            global_step += 1
            loss = trainer.update(batch, eval=False) # update step
            train_loss += loss
            if global_step % args['log_step'] == 0:
                duration = time.time() - start_time
                print(format_str.format(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), global_step,\
                        max_steps, loss, duration, current_lr))

            if global_step % args['eval_interval'] == 0:
                # eval on dev
                print("Evaluating on dev set...")
                dev_preds = []
                for batch in dev_batch:
                    preds = trainer.predict(batch)
                    dev_preds += preds
                dev_preds = utils.unsort(dev_preds, dev_batch.data_orig_idx)

                dev_batch.doc.set([HEAD, DEPREL], [y for x in dev_preds for y in x])
                CoNLL.dict2conll(dev_batch.doc.to_dict(), system_pred_file)
                _, _, dev_score = scorer.score(system_pred_file, gold_file)

                train_loss = train_loss / args['eval_interval'] # avg loss per batch
                print("step {}: train_loss = {:.6f}, dev_score = {:.4f}".format(global_step, train_loss, dev_score))
                train_loss = 0

                # save best model
                if len(dev_score_history) == 0 or dev_score > max(dev_score_history):
                    last_best_step = global_step
                    trainer.save(model_file)
                    print("new best model saved.")
                    best_dev_preds = dev_preds

                dev_score_history += [dev_score]
                print("")

            if global_step - last_best_step >= args['max_steps_before_stop']:
                if not using_amsgrad:
                    print("Switching to AMSGrad")
                    last_best_step = global_step
                    using_amsgrad = True
                    trainer.optimizer = optim.Adam(trainer.model.parameters(), amsgrad=True, lr=args['lr'], betas=(.9, args['beta2']), eps=1e-6)
                else:
                    do_break = True
                    break

            if global_step >= args['max_steps']:
                do_break = True
                break

        if do_break: break

        train_batch.reshuffle()

    print("Training ended with {} steps.".format(global_step))

    best_f, best_eval = max(dev_score_history)*100, np.argmax(dev_score_history)+1
    print("Best dev F1 = {:.2f}, at iteration = {}".format(best_f, best_eval * args['eval_interval']))
Code example #21
File: model.py Project: rasimuvaikas/stanza
    def forward(self, word, word_mask, wordchars, wordchars_mask, upos, xpos, ufeats, pretrained, word_orig_idx,
                sentlens, wordlens, orig_idx=None, morph_dict=None, start=None, end=None):

        def pack(x):  # Packs a Tensor containing padded sequences of variable length.
            return pack_padded_sequence(x, sentlens, batch_first=True)

        inputs = []
        if self.args['word_emb_dim'] > 0:
            word_emb = self.word_emb(word)
            word_emb = pack(word_emb)
            inputs += [word_emb]

        if self.args['pretrain']:
            pretrained_emb = self.pretrained_emb(pretrained)
            pretrained_emb = self.trans_pretrained(pretrained_emb)
            pretrained_emb = pack(pretrained_emb)
            inputs += [pretrained_emb]

        def pad(x):  # inverse operation to pack_padded_sequence(). Pads a packed batch of variable length sequences.
            return pad_packed_sequence(PackedSequence(x, word_emb.batch_sizes), batch_first=True)[0]

        if self.args['char'] and self.args['char_emb_dim'] > 0:
            char_reps = self.charmodel(wordchars, wordchars_mask, word_orig_idx, sentlens, wordlens)
            char_reps = PackedSequence(self.trans_char(self.drop(char_reps.data)), char_reps.batch_sizes)
            inputs += [char_reps]

        lstm_inputs = torch.cat([x.data for x in inputs],1)
        lstm_inputs = self.worddrop(lstm_inputs, self.drop_replacement)
        lstm_inputs = self.drop(lstm_inputs)
        lstm_inputs = PackedSequence(lstm_inputs, inputs[0].batch_sizes)

        lstm_outputs, _ = self.taggerlstm(lstm_inputs, sentlens, hx=(
        self.taggerlstm_h_init.expand(2 * self.args['num_layers'], word.size(0), self.args['hidden_dim']).contiguous(),
        self.taggerlstm_c_init.expand(2 * self.args['num_layers'], word.size(0), self.args['hidden_dim']).contiguous()))
        lstm_outputs = lstm_outputs.data

        upos_hid = F.relu(self.upos_hid(self.drop(lstm_outputs)))
        upos_pred = self.upos_clf(self.drop(upos_hid))
        preds = [pad(upos_pred).max(2)[1]]

        upos = pack(upos).data
        loss = self.crit(upos_pred.view(-1, upos_pred.size(-1)), upos.view(-1))

        if self.share_hid:
            xpos_hid = upos_hid
            ufeats_hid = upos_hid

            clffunc = lambda clf, hid: clf(self.drop(hid))
        else:
            xpos_hid = F.relu(self.xpos_hid(self.drop(lstm_outputs)))
            ufeats_hid = F.relu(self.ufeats_hid(self.drop(lstm_outputs)))

            # this is where we get upos embeddings
            if self.training:
                upos_emb = self.upos_emb(upos)
            else:
                # get the top 5 upos predictions
                best_5 = [sorted(range(len(x)), key=lambda i: x[i], reverse=True)[:5] for x in upos_pred]
                # save upos emb for later
                upos_temp = self.upos_emb
                upos_emb = self.upos_emb(upos_pred.max(1)[1])

            clffunc = lambda clf, hid: clf(self.drop(hid), self.drop(upos_emb))  # ORG

        xpos = pack(xpos).data
        if isinstance(self.vocab['xpos'], CompositeVocab):
            xpos_preds = []
            for i in range(len(self.vocab['xpos'])):
                xpos_pred = clffunc(self.xpos_clf[i], xpos_hid)
                loss += self.crit(xpos_pred.view(-1, xpos_pred.size(-1)), xpos[:, i].view(-1))
                xpos_preds.append(pad(xpos_pred).max(2, keepdim=True)[1])
            preds.append(torch.cat(xpos_preds, 2))
        else:
            xpos_pred = clffunc(self.xpos_clf, xpos_hid)
            loss += self.crit(xpos_pred.view(-1, xpos_pred.size(-1)), xpos.view(-1))
            preds.append(pad(xpos_pred).max(2)[1])

        ufeats_preds = []
        ufeats = pack(ufeats).data
        for i in range(len(self.vocab['feats'])):
            ufeats_pred = clffunc(self.ufeats_clf[i], ufeats_hid)
            loss += self.crit(ufeats_pred.view(-1, ufeats_pred.size(-1)), ufeats[:, i].view(-1))
            ufeats_preds.append(pad(ufeats_pred).max(2, keepdim=True)[1])
        preds.append(torch.cat(ufeats_preds,2))

        # post-filter only if a morphological dictionary is present
        if morph_dict:

            # get the most likely ufeats tag for each top 5 upos tags predicted for a word
            feats_coeffs = list()
            for r in range(5):  # condition ufeats on a different upos tag embedding each time
                upos_2 = torch.LongTensor([x[r] for x in best_5])
                upos_emb2 = upos_temp(upos_2)
                clffunc_temp = lambda clf, hid: clf(self.drop(hid), self.drop(upos_emb2))

                ufeats_preds_temp = []
                for i in range(len(self.vocab['feats'])):
                    ufeats_pred = clffunc_temp(self.ufeats_clf[i], ufeats_hid)
                    ufeats_preds_temp.append(pad(ufeats_pred).max(2, keepdim=True)[1])
                feats_coeffs.append(torch.cat(ufeats_preds_temp, 2))

            # unmap all tags into readable format and unsort them into the original order that matches the sentence order
            upos_seqs = [self.vocab['upos'].unmap(up) for up in preds[0].tolist()]
            xpos_seqs = [self.vocab['xpos'].unmap(up) for up in preds[1].tolist()]
            feats_seqs = [self.vocab['feats'].unmap(up) for up in preds[2].tolist()]
            pred_tokens = [[[upos_seqs[i][j], xpos_seqs[i][j], feats_seqs[i][j]] for j in range(sentlens[i])] for i in
                           range(word.size(0))]
            pred_tokens = utils.unsort(pred_tokens, orig_idx)

            # pair the tags with the right words in the right sentences.
            sntncs = self.doc.sentences[start:end]
            sent_tokens = [[x.text for x in sent.tokens] for sent in sntncs]
            pair = [x for x in zip(sent_tokens, pred_tokens)]

            # 5 most likely upos tags for the token
            coeff = utils.unsort(pad(upos_pred).tolist(), orig_idx)
            coeff_max = [[sorted(range(len(x)), key=lambda i: x[i], reverse=True)[:5] for x in y] for y in coeff]

            # the most likely feats tag for each top 5 predicted upos tag
            fct = []
            for f in feats_coeffs:
                fct.append(utils.unsort(f, orig_idx))
            fct2 = [list(zip(*[fct[0][i], fct[1][i], fct[2][i], fct[3][i], fct[4][i]])) for i in range(len(fct[0]))]
            feats_coeffs = [[list(j[i]) for i in range(len(j))] for j in fct2]

            # initialise hunspell for Lithuanian
            if self.args['lang'] == 'lt':
                root = os.path.dirname(os.getcwd())
                hunspell = Hunchecker('lt-LT_morphology', root + '/data_files/hunspell')

            print('Post-filtering...')
            for p in range(len(pair)):  # get a sentence
                words = pair[p][0]
                tags = pair[p][1]

                a = 0
                while a < len(words):

                    lemma, upos, xpos, feats = morph_dict.find(words[a])
                    if upos is None:
                        lemma, upos, xpos, feats = morph_dict.find(words[a].lower())
                    else:
                        lemma2, upos2, xpos2, feats2 = morph_dict.find(words[a].lower())
                        if lemma2:
                            for i in range(len(lemma2)):
                                if upos2[i] not in upos or feats2[i] not in feats:
                                    lemma += [lemma2[i]]
                                    upos += [upos2[i]]
                                    xpos += [xpos2[i]]
                                    feats += [feats2[i]]

                    if self.args['lang'] == 'lt':
                        if upos is None:
                            lemma, upos, xpos, feats = hunspell.hunspell_to_conll(words[a])
                        else:
                            lemma_h, upos_h, xpos_h, feats_h = hunspell.hunspell_to_conll(words[a])
                            if upos_h is not None:
                                for i in range(len(upos_h)):
                                    if upos_h[i] not in upos or feats_h[i] not in feats:
                                        lemma += [lemma_h[i]]
                                        upos += [upos_h[i]]
                                        xpos += [xpos_h[i]]
                                        feats += [feats_h[i]]

                    if upos is not None:
                        if tags[a][0] not in upos:
                            new_upos = None
                            tag_idx = None
                            if len(upos) > 1:
                                max_values = self.vocab['upos'].unmap(coeff_max[p][a][1:])
                                # go through the values in the order of the most likely one
                                for m in range(len(max_values)):  # for every max upos tag
                                    # found one of the possible predicted values in the upos list
                                    if max_values[m] in upos:
                                        indices = [i for i, x in enumerate(upos) if x == max_values[m]]
                                        if len(indices) > 1:  # more than one upos list items matches the max value item
                                            # check if an exact match can be found, using the most informative ufeats tag
                                            for d in indices:
                                                if feats[d] == self.vocab['feats'].unmap(feats_coeffs[p][a][1:])[m] and \
                                                        upos[d] == max_values[m]:
                                                    new_upos = upos[d]
                                                    tag_idx = d
                                                    break
                                        if len(indices) == 1 or new_upos is None:
                                            new_upos = max_values[m]
                                            tag_idx = upos.index(max_values[m])
                                        break
                                if new_upos is None:  # last resort
                                    new_upos = upos[0]
                                    tag_idx = 0
                            else:  # only one item in upos list
                                new_upos = upos[0]
                                tag_idx = 0

                            new_xpos = xpos[tag_idx]
                            new_feats = feats[tag_idx]
                            # let the tagger deal with multiword tokens itself
                            if ('Hyph=Yes' not in new_feats and 'Hyph=Yes' in tags[a][2]) or (
                                    'Hyph=Yes' in new_feats and 'Hyph=Yes' not in tags[a][2]):
                                new_upos = new_xpos = new_feats = None

                            if new_upos is not None:
                                preds[0][orig_idx.index(p)][a] = self.vocab['upos'].map([new_upos])[0]
                                # sme has a 2D torch here, LT has 3D
                                if not isinstance(self.vocab['xpos'], CompositeVocab):
                                    preds[1][orig_idx.index(p)][a] = self.vocab['xpos'].map([new_xpos])[0]
                                else:
                                    preds[1][orig_idx.index(p)][a] = torch.LongTensor(
                                        self.vocab['xpos'].map([new_xpos])[0])
                                preds[2][orig_idx.index(p)][a] = torch.LongTensor(
                                    self.vocab['feats'].map([new_feats])[0])

                        else:
                            new_xpos = new_feats = None
                            all_found = False
                            for x in range(len(xpos)):
                                if tags[a][1] == xpos[x] and tags[a][2] == feats[x] and upos[x] == tags[a][0]:
                                    all_found = True
                                    break

                            if not all_found:
                                if len(upos) == 1 or (False not in [feats[a] == feats[a + 1] for a in
                                                                    range(len(feats) - 1)] and False not in [
                                                          upos[a] == upos[a + 1] for a in range(len(upos) - 1)]):
                                    new_feats = feats[0]
                                    if '*' not in tags[a][1]:
                                        new_xpos = xpos[0]
                                    all_found = True

                            if not all_found:
                                if len([i for i, x in enumerate(upos) if x == tags[a][0]]) == 1:
                                    new_feats = feats[upos.index(tags[a][0])]
                                    if '*' not in tags[a][1]:
                                        new_xpos = xpos[upos.index(tags[a][0])]
                                    all_found = True

                            if not all_found:
                                found_ft = False
                                for x in range(len(xpos)):
                                    if tags[a][2] == feats[x] and upos[x] == tags[a][0]:
                                        found_ft = True
                                        if xpos[x] != tags[a][1] and '*' not in tags[a][1]:
                                            new_xpos = xpos[x]
                                        break

                                if not found_ft:
                                    for x in range(len(xpos)):
                                        if tags[a][1] == xpos[x] and tags[a][2] != feats[x] and upos[x] == tags[a][0]:
                                            new_feats = feats[x]
                                            break

                            if new_feats:
                                if ('Hyph=Yes' not in new_feats and 'Hyph=Yes' in tags[a][2]) or (
                                        'Hyph=Yes' in new_feats and 'Hyph=Yes' not in tags[a][2]):
                                    # let the tagger deal with multiword tokens itself
                                    new_xpos = new_feats = None

                            if new_xpos is not None:
                                # non composite has a 2D torch here, composite has 3D
                                if not isinstance(self.vocab['xpos'], CompositeVocab):
                                    preds[1][orig_idx.index(p)][a] = self.vocab['xpos'].map([new_xpos])[0]
                                else:
                                    preds[1][orig_idx.index(p)][a] = torch.LongTensor(
                                        self.vocab['xpos'].map([new_xpos])[0])
                            if new_feats is not None:
                                preds[2][orig_idx.index(p)][a] = torch.LongTensor(
                                    self.vocab['feats'].map([new_feats])[0])

                    a += 1

            print('Post-filtering complete.')
        return loss, preds
Code example #22
    def predict(self, batch, unsort=True):
        inputs, orig_idx, word_orig_idx, sentlens, wordlens = unpack_batch(
            batch, self.use_cuda)
        word, word_mask, wordchars, wordchars_mask, upos, xpos, ufeats, pretrained, lemma, head, deprel = inputs

        self.model.eval()
        batch_size = word.size(0)
        _, preds = self.model(word, word_mask, wordchars, wordchars_mask, upos,
                              xpos, ufeats, pretrained, lemma, head, deprel,
                              word_orig_idx, sentlens, wordlens)
        # preds[0][i] is the head adjacency matrix for the i-th sentence:
        # adj[:l, :l] tells which word is the head of which
        # preds[1][i] is the deprel adjacency matrix for the i-th sentence

        # for each sentence, extract the k-best head sequences along with
        # the matching deprels and scores
        head_seqs = []
        deprel_seqs = []
        score_seqs = []
        # get the head graph and the deprel map for each sentence
        for i, (head, deps) in enumerate(zip(preds[0], preds[1])):
            head_seq = []
            deprel_seq = []
            score_seq = []
            edge_type = lambda edge: self.vocab['deprel'].unmap(
                (deps[edge.v][edge.u], ))[0]
            k_best = alternatives.GetKBest(
                head,
                self._n_trees,
                self._kalm_shuffle,
                edge_type,
                self._automatic_n_parses,
            )
            for j in range(sentlens[i] - 1):
                headc = []
                deprelc = []
                scorec = []
                for n_tree, (tree, score) in enumerate(k_best):
                    scorec.append(score)
                    best_in_edge = tree[j + 1]
                    source = best_in_edge.u - 1
                    headc.append(source + 1)
                    edge = self.vocab['deprel'].unmap(
                        (deps[j + 1][source + 1], ))[0]
                    deprelc.append(edge)
                head_seq.append(headc)
                deprel_seq.append(deprelc)
                score_seq.append(scorec)

            head_seqs.append(head_seq)
            deprel_seqs.append(deprel_seq)
            score_seqs.append(score_seq)

        pred_tokens = [[[head_seqs[i][j], deprel_seqs[i][j], score_seqs[i][j]]
                        for j in range(sentlens[i] - 1)]
                       for i in range(batch_size)]
        if unsort:
            pred_tokens = utils.unsort(pred_tokens, orig_idx)

        return pred_tokens