Example #1
0
 def on_epoch_train_end(self, data):
     """Log the training records of every enabled task, then reset them.

     The records are fetched from the wrapped model once; the tagging and
     parsing lines are emitted only when the corresponding task flag is set.
     ``data`` is part of the callback signature and is not read here.
     """
     stats = self._model.get_records()
     if self._has_tagging_task:
         tagging_line = "tag_loss: {:.8f}, tag_accuracy: {:.8f}"
         Log.i(tagging_line.format(stats['tag_loss'], stats['tag_accuracy']))
     if self._has_parsing_task:
         parsing_line = "action_loss: {:.8f}, action_accuracy: {:.8f}"
         Log.i(parsing_line.format(stats['action_loss'],
                                   stats['action_accuracy']))
     # Start the next epoch from a clean slate.
     self._model.reset_records()
Example #2
0
 def report(self, target):
     """Log tagging accuracy and, for parsing, the external scorer's output.

     When the tagging task is enabled, each buffered sentence is compared
     token by token against the buffered predictions (the first token of
     every sentence is skipped) and the accuracy is logged as a percentage.
     When the parsing task is enabled, the scoring script ``self.SCRIPT`` is
     executed with ``self.PERL`` against the gold file and its standard
     output is logged; on a non-zero exit status its standard error is
     logged instead.

     Args:
         target: path of the system output file, passed to the scorer
             through its ``-s`` option.
     """
     if self._has_tagging_task:
         postag_count = 0
         postag_correct = 0
         for tokens, postags in \
                 zip(self._buffer['sentences'], self._buffer['postags']):
             for j, token in enumerate(tokens):
                 if j == 0:
                     # The first token is never scored (presumably an
                     # artificial ROOT entry -- confirm against the loader).
                     # Skipping by index also tolerates an empty sentence,
                     # where an explicit next() would raise StopIteration.
                     continue
                 postag_count += 1
                 if token['postag'] == \
                         self._loader.tag_map.lookup(postags[j]):
                     postag_correct += 1
         # Nothing buffered means the ratio is undefined; report 0 rather
         # than aborting the whole evaluation with ZeroDivisionError.
         if postag_count > 0:
             accuracy = (postag_correct / postag_count) * 100
         else:
             accuracy = 0.0
         Log.i("[evaluation] tagging accuracy: {:.6f}".format(accuracy))
     if not self._has_parsing_task:
         return
     # The command is passed as an argument list, so no shell is involved.
     command = [
         self.PERL, self.SCRIPT, '-g', self._gold_file, '-s', target, '-q'
     ]
     Log.v("exec command: {}".format(' '.join(command)))
     p = subprocess.run(command,
                        stdout=subprocess.PIPE,
                        stderr=subprocess.PIPE,
                        encoding='utf-8')
     if p.returncode == 0:
         Log.i("[evaluation]\n" + p.stdout.rstrip())
     else:
         Log.i("[evaluation] ERROR: " + p.stderr.rstrip())
Example #3
0
 def report(self):
     """Log the accumulated UAS and LAS as percentages.

     Both scores are the ``'UAS'`` / ``'LAS'`` totals in ``self.record``
     divided by ``self.record['count']`` and scaled by 100.  When the count
     is zero the scores are reported as 0 instead of raising
     ``ZeroDivisionError``.
     """
     count = self.record['count']
     if count:
         uas = self.record['UAS'] / count * 100
         las = self.record['LAS'] / count * 100
     else:
         # Nothing has been evaluated yet, so the ratio is undefined.
         uas = las = 0.0
     Log.i("[evaluation] UAS: {:.8f}, LAS: {:.8f}".format(uas, las))
Example #4
0
def decode(model_file, target_file, gpu=-1, save_to=None):
    """Rebuild a saved multi-task model and run it over ``target_file``.

    The context stored alongside ``model_file`` provides the random seed, the
    data loader, the batch size and the list of layer classes.  The network
    is reconstructed from that information, its weights are read from
    ``model_file`` and every batch of the target dataset is passed through
    the model while a ``models.Evaluator`` receives the batch and epoch
    callbacks.

    Args:
        model_file: path of the saved model; also used to locate the context.
        target_file: path of the dataset to decode.
        gpu: device id; the model is moved to a device only when ``gpu >= 0``.
        save_to: forwarded unchanged to ``models.Evaluator``.
    """
    context = utils.load_context(model_file)
    seed = context.seed
    if seed is not None:
        utils.set_random_seed(seed, gpu)
        Log.i("random seed: {}".format(seed))
    framework_utils.set_debug(App.debug)

    loader = context.loader
    Log.i('load test dataset from {}'.format(target_file))
    # Only a small sample is loaded when running in development mode.
    sample_limit = 16 if utils.is_dev() else None
    test_dataset = loader.load(target_file, train=False, size=sample_limit)
    Log.i('#samples {}'.format(len(test_dataset)))

    Log.v('')
    Log.v("initialize ...")
    Log.v('--------------------------------')
    Log.i('# gpu: {}'.format(gpu))
    Log.i('# tagset size: {}'.format(len(loader.tag_map)))
    Log.i('# model layers: {}'.format(context.models))
    Log.i('# context: {}'.format(context))
    Log.v('--------------------------------')
    Log.v('')

    models.USE_ORTHONORMAL = False

    # Shared encoder: embeddings followed by the recurrent layer.
    input_layer = models.Input(
        word_embeddings=loader.get_embeddings('word'),
        char_embeddings=loader.get_embeddings('char'),
        char_feature_size=50,
        dropout=0.5,
    )
    recurrent_layer = models.Recurrent(
        n_layers=2,
        in_size=loader.get_embeddings('word').shape[1] + 50,
        out_size=400,
        dropout=0.5,
    )
    # The third layer follows what the saved context recorded: a trained
    # tagger, or the gold-tag layer otherwise.
    if context.models[2] is models.Tagger:
        tagging_layer = models.Tagger(in_size=400 * 2,
                                      out_size=len(loader.tag_map),
                                      units=100,
                                      dropout=0.5)
    else:
        tagging_layer = models.GoldTagger(out_size=len(loader.tag_map))
    layers = [input_layer, recurrent_layer, tagging_layer]

    if models.Parser in context.models:
        layers.append(
            models.Connection(in_size=400 * 2,
                              out_size=800,
                              tagset_size=len(loader.tag_map),
                              tag_embed_size=50,
                              dropout=0.5))
        layers.append(
            models.Parser(in_size=850,
                          n_deprels=len(loader.rel_map),
                          n_blstm_layers=1,
                          lstm_hidden_size=400,
                          parser_mlp_units=800,
                          dropout=0.50))

    model = models.MTL(*layers)
    chainer.serializers.load_npz(model_file, model)
    if gpu >= 0:
        framework_utils.set_model_to_device(model, device_id=gpu)

    # The evaluator observes the model through the callbacks fired below.
    evaluator = models.Evaluator(loader, target_file, save_to)
    evaluator.add_target(model)

    # Decode batch by batch with training mode switched off.
    framework_utils.chainer_train_off()
    evaluator.on_epoch_validate_begin({'epoch': 0})
    batches = test_dataset.batch(context.batch_size,
                                 colwise=True,
                                 shuffle=False)
    for batch in batches:
        # The last column holds the targets, everything before it the inputs.
        xs = batch[:-1]
        ts = batch[-1]
        evaluator.on_batch_begin(dict(train=False, xs=xs, ts=ts))
        model(*xs)
        evaluator.on_batch_end(dict(train=False, xs=xs, ts=ts))
    evaluator.on_epoch_validate_end({'epoch': 0})
Example #5
0
def test(model_file, target_file, decode=False, gpu=-1):
    """Evaluate a saved biaffine parser on ``target_file`` and log UAS/LAS.

    The context stored with ``model_file`` selects the backend (``'chainer'``
    or ``'pytorch'``), the model class, its parameters, the data loader and
    the batch size.  The model is rebuilt, its weights are restored, and the
    dataset is parsed batch by batch; per-sentence scores returned by
    ``utils.Evaluator`` are summed and logged as percentages at the end.

    Args:
        model_file: path of the saved model; also used to locate the context.
        target_file: path of the dataset to evaluate.
        decode: when true, print every predicted parse to stdout as
            tab-separated lines of word, POS tag, predicted arc and label
            (the first element of each sequence is skipped), with a blank
            line after each sentence.
        gpu: device id handed to ``set_model_to_device``.

    Raises:
        ValueError: if the backend recorded in the context is not supported.
    """
    # Load context
    context = teras.utils.load_context(model_file)
    if context.backend == 'chainer':
        import chainer
        import chainer_model as models
        import teras.framework.chainer as framework_utils
        framework_utils.set_debug(App.debug)

        def _load_test_model(model, file, device_id=-1):
            chainer.serializers.load_npz(file, model)
            framework_utils.set_model_to_device(model, device_id)
            framework_utils.chainer_train_off()
    elif context.backend == 'pytorch':
        import torch
        import pytorch_model as models
        import teras.framework.pytorch as framework_utils

        def _load_test_model(model, file, device_id=-1):
            # Deserialize onto the CPU first: a checkpoint written on a GPU
            # would otherwise fail to load on a host without that device.
            # The model is moved to the requested device right afterwards.
            state = torch.load(file,
                               map_location=lambda storage, loc: storage)
            model.load_state_dict(state)
            framework_utils.set_model_to_device(model, device_id)
            model.eval()
    else:
        raise ValueError("backend={} is not supported.".format(
            context.backend))

    # Load files
    Log.i('load dataset from {}'.format(target_file))
    loader = context.loader
    dataset = loader.load(target_file, train=False)

    Log.v('')
    Log.v("initialize ...")
    Log.v('--------------------------------')
    Log.i('# gpu: {}'.format(gpu))
    Log.i('# model: {}'.format(context.model_cls))
    Log.i('# context: {}'.format(context))
    Log.v('--------------------------------')
    Log.v('')

    # Set up a neural network model
    model = context.model_cls(
        embeddings=({
            'initialW':
            loader.get_embeddings('word_pretrained', normalize='l2'),
            'fixed_weight':
            True
        }, {
            'initialW': loader.get_embeddings('word'),
            'fixed_weight': False
        }, loader.get_embeddings('pos')),
        n_labels=len(loader.label_map),
        **context.model_params,
    )
    _load_test_model(model, model_file, device_id=gpu)

    parser = models.BiaffineParser(model)
    pos_map = loader.get_processor('pos').vocabulary
    label_map = loader.label_map
    evaluator = utils.Evaluator(parser, pos_map, ignore_punct=True)

    # Start testing
    UAS, LAS, count = 0.0, 0.0, 0.0
    for batch in dataset.batch(context.batch_size, shuffle=False):
        # Every column but the last is an input; the last holds the targets.
        pretrained_word_tokens, word_tokens, pos_tokens = batch[:-1]
        true_arcs, true_labels = batch[-1].T
        arcs_batch, labels_batch = parser.parse(pretrained_word_tokens,
                                                word_tokens, pos_tokens)
        for i, (p_arcs, p_labels, t_arcs, t_labels) in enumerate(
                zip(arcs_batch, labels_batch, true_arcs, true_labels)):
            mask = evaluator.create_ignore_mask(pos_tokens[i])
            _uas, _las, _count = evaluator.evaluate(p_arcs, p_labels, t_arcs,
                                                    t_labels, mask)
            if decode:
                words = loader.get_sentence(word_tokens[i])
                # Index 0 of each sequence is skipped in the printed output.
                for word, pos_id, arc, label_id in zip(words[1:],
                                                       pos_tokens[i][1:],
                                                       p_arcs[1:],
                                                       p_labels[1:]):
                    print("\t".join([
                        word,
                        pos_map.lookup(pos_id),
                        str(arc),
                        label_map.lookup(label_id)
                    ]))
                print()
            UAS, LAS, count = UAS + _uas, LAS + _las, count + _count
    # An empty dataset leaves count at zero; report 0 rather than raising
    # ZeroDivisionError after all the work above.
    if count > 0:
        uas_percent, las_percent = UAS / count * 100, LAS / count * 100
    else:
        uas_percent = las_percent = 0.0
    Log.i("[evaluation] UAS: {:.8f}, LAS: {:.8f}".format(
        uas_percent, las_percent))
Example #6
0
def train(train_file,
          test_file=None,
          embed_file=None,
          embed_size=100,
          n_epoch=20,
          batch_size=32,
          lr=0.002,
          model_params=None,
          gpu=-1,
          save_to=None,
          seed=None,
          backend='chainer'):
    """Train a deep biaffine parser with the Chainer or PyTorch backend.

    Args:
        train_file: path of the training dataset.
        test_file: optional path of a validation dataset; when given, a
            ``utils.Evaluator`` callback is attached to the trainer.
        embed_file: word embedding file passed to ``utils.DataLoader``.
        embed_size: size used for both word and POS embeddings.
        n_epoch: number of training epochs.
        batch_size: mini-batch size.
        lr: Adam learning rate; also the initial value of the exponential
            decay (rate 0.75 per 5000 steps).
        model_params: extra keyword arguments for the model constructor;
            ``None`` (the default) means no extra arguments.
        gpu: device id; the model is moved to a device only when ``gpu >= 0``.
        save_to: directory for saved models; nothing is saved when ``None``.
        seed: random seed; left untouched when ``None``.
        backend: ``'chainer'`` or ``'pytorch'``.

    Raises:
        ValueError: if ``backend`` is not one of the supported values.
    """
    # A fresh dict per call avoids sharing one mutable default object.
    if model_params is None:
        model_params = {}

    if backend == 'chainer':
        import chainer
        import chainer_model as models
        import teras.framework.chainer as framework_utils
        framework_utils.set_debug(App.debug)
        if seed is not None:
            import random
            import numpy
            random.seed(seed)
            numpy.random.seed(seed)
            if gpu >= 0:
                # Seeding the GPU generator is best effort: a failure is
                # logged and training continues.
                try:
                    import cupy
                    cupy.cuda.runtime.setDevice(gpu)
                    cupy.random.seed(seed)
                except Exception as e:
                    Log.e(str(e))
            Log.i("random seed: {}".format(seed))
    elif backend == 'pytorch':
        import torch
        import pytorch_model as models
        import teras.framework.pytorch as framework_utils
        if seed is not None:
            import random
            import numpy
            random.seed(seed)
            numpy.random.seed(seed)
            torch.manual_seed(seed)
            Log.i("random seed: {}".format(seed))
    else:
        raise ValueError("backend={} is not supported.".format(backend))

    # Load files
    Log.i('initialize DataLoader with embed_file={} and embed_size={}'.format(
        embed_file, embed_size))
    loader = utils.DataLoader(word_embed_file=embed_file,
                              word_embed_size=embed_size,
                              pos_embed_size=embed_size)
    Log.i('load train dataset from {}'.format(train_file))
    train_dataset = loader.load(train_file, train=True)
    if test_file:
        Log.i('load test dataset from {}'.format(test_file))
        test_dataset = loader.load(test_file, train=False)
    else:
        test_dataset = None

    model_cls = models.DeepBiaffine

    Log.v('')
    Log.v("initialize ...")
    Log.v('--------------------------------')
    Log.i('# Minibatch-size: {}'.format(batch_size))
    Log.i('# epoch: {}'.format(n_epoch))
    Log.i('# gpu: {}'.format(gpu))
    Log.i('# model: {}'.format(model_cls))
    Log.i('# model params: {}'.format(model_params))
    Log.v('--------------------------------')
    Log.v('')

    # Set up a neural network model
    model = model_cls(
        embeddings=({
            'initialW':
            loader.get_embeddings('word_pretrained', normalize='l2'),
            'fixed_weight':
            True
        }, {
            'initialW': loader.get_embeddings('word'),
            'fixed_weight': False
        }, loader.get_embeddings('pos')),
        n_labels=len(loader.label_map),
        **model_params,
    )
    if gpu >= 0:
        framework_utils.set_model_to_device(model, device_id=gpu)

    # Setup an optimizer
    if backend == 'chainer':
        optimizer = chainer.optimizers.Adam(alpha=lr,
                                            beta1=0.9,
                                            beta2=0.9,
                                            eps=1e-12)
        optimizer.setup(model)
        optimizer.add_hook(chainer.optimizer.GradientClipping(5.0))
        optimizer.add_hook(
            framework_utils.optimizers.ExponentialDecayAnnealing(
                initial_lr=lr,
                decay_rate=0.75,
                decay_step=5000,
                lr_key='alpha'))
    elif backend == 'pytorch':
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=lr,
                                     betas=(0.9, 0.9),
                                     eps=1e-12)

        # Clipping only has an effect on gradients that already exist, so it
        # must run before every parameter update rather than once at setup
        # time (where no gradient has been computed yet).
        # NOTE(review): this assumes the trainer updates parameters through
        # optimizer.step() after the backward pass -- confirm in teras.
        _clip_grad_norm = (getattr(torch.nn.utils, 'clip_grad_norm_', None)
                           or torch.nn.utils.clip_grad_norm)
        _plain_step = optimizer.step

        def _step_with_grad_clipping(*args, **kwargs):
            _clip_grad_norm(model.parameters(), max_norm=5.0)
            return _plain_step(*args, **kwargs)

        optimizer.step = _step_with_grad_clipping

        class Annealing(object):
            """Exponentially decay the learning rate on each training batch."""

            def __init__(self, optimizer):
                self.step = 0
                self.optimizer = optimizer

            def __call__(self, data):
                # Validation batches do not advance the schedule.
                if not data['train']:
                    return
                self.step = self.step + 1
                decay, decay_step = 0.75, 5000
                decay_rate = decay**(self.step / decay_step)
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = lr * decay_rate

        annealing = Annealing(optimizer)
    Log.i('optimizer: Adam(alpha={}, beta1=0.9, '
          'beta2=0.9, eps=1e-12), grad_clip=5.0'.format(lr))

    # Setup a trainer
    parser = models.BiaffineParser(model)

    trainer = Trainer(optimizer,
                      parser,
                      loss_func=parser.compute_loss,
                      accuracy_func=parser.compute_accuracy)
    trainer.configure(framework_utils.config)
    if backend == 'pytorch':
        # Switch the module between train and eval mode with each phase.
        trainer.add_hook(Event.EPOCH_TRAIN_BEGIN, lambda data: model.train())
        trainer.add_hook(Event.EPOCH_VALIDATE_BEGIN, lambda data: model.eval())
        trainer.add_hook(Event.BATCH_BEGIN, annealing)
    if test_dataset:
        trainer.attach_callback(
            utils.Evaluator(parser,
                            pos_map=loader.get_processor('pos').vocabulary,
                            ignore_punct=True))

    if save_to is not None:
        accessid = Log.getLogger().accessid
        date = Log.getLogger().accesstime.strftime('%Y%m%d')
        # The model class and loader are stored with the context alongside
        # the saved model.
        trainer.attach_callback(
            framework_utils.callbacks.Saver(model,
                                            basename="{}-{}".format(
                                                date, accessid),
                                            directory=save_to,
                                            context=dict(App.context,
                                                         model_cls=model_cls,
                                                         loader=loader)))

    # Start training
    trainer.fit(train_dataset,
                None,
                batch_size=batch_size,
                epochs=n_epoch,
                validation_data=test_dataset,
                verbose=App.verbose)
Example #7
0
def train(
        train_file,
        test_file=None,
        embed_file=None,
        embed_size=100,
        n_epoch=20,
        batch_size=32,
        lr=0.001,
        l2_lambda=0.0,
        grad_clip=5.0,
        tasks='tp',
        gpu=-1,
        save_to=None,
        seed=None):
    """Train the multi-task (tagging and/or transition-based parsing) model.

    Args:
        train_file: path of the training dataset.
        test_file: optional path of a validation dataset; when given, a
            ``models.Evaluator`` is attached to the trainer.
        embed_file: word embedding file passed to ``dataset.DataLoader``.
        embed_size: word embedding size.
        n_epoch: number of training epochs.
        batch_size: mini-batch size.
        lr: Adam learning rate (``alpha``).
        l2_lambda: weight-decay coefficient; the hook is added only when > 0.
        grad_clip: gradient-clipping threshold; the hook is added only
            when > 0.
        tasks: task codes, ``'t'`` for tagging and ``'p'`` for parsing.
        gpu: device id; the model is moved to a device only when ``gpu >= 0``.
        save_to: directory for saved models; nothing is saved when ``None``.
        seed: random seed; left untouched when ``None``.

    Raises:
        ValueError: if ``tasks`` contains a code other than ``'t'``/``'p'``.
        RuntimeError: if ``tasks`` selects no task at all.
    """
    if seed is not None:
        utils.set_random_seed(seed, gpu)
        Log.i("random seed: {}".format(seed))
    framework_utils.set_debug(App.debug)

    # Work out which tasks were requested, rejecting unknown codes.
    with_tagging_task = False
    with_parsing_task = False
    for task_code in tasks:
        if task_code not in ('t', 'p'):
            raise ValueError("Invalid task specified: {}".format(task_code))
        with_tagging_task = with_tagging_task or task_code == 't'
        with_parsing_task = with_parsing_task or task_code == 'p'
    if not (with_tagging_task or with_parsing_task):
        raise RuntimeError("No valid task specified")
    task_summary = 'Task: tagging={}, parsing={}'
    Log.i(task_summary.format(with_tagging_task, with_parsing_task))

    # The transition system is only announced when parsing is enabled.
    transition_system = transition.ArcStandard
    if with_parsing_task:
        Log.i('Transition System: {}'.format(transition_system))

    # Data loading
    loader_message = \
        'initialize DataLoader with embed_file={} and embed_size={}'
    Log.i(loader_message.format(embed_file, embed_size))
    loader = dataset.DataLoader(word_embed_file=embed_file,
                                word_embed_size=embed_size,
                                char_embed_size=10,
                                transition_system=transition_system)
    Log.i('load train dataset from {}'.format(train_file))
    # Development mode works on a small sample of each dataset.
    train_limit = 120 if utils.is_dev() else None
    train_dataset = loader.load(train_file, train=True, size=train_limit)
    test_dataset = None
    if test_file:
        Log.i('load test dataset from {}'.format(test_file))
        test_limit = 16 if utils.is_dev() else None
        test_dataset = loader.load(test_file, train=False, size=test_limit)

    Log.v('')
    Log.v("initialize ...")
    Log.v('--------------------------------')
    Log.i('# Minibatch-size: {}'.format(batch_size))
    Log.i('# epoch: {}'.format(n_epoch))
    Log.i('# gpu: {}'.format(gpu))
    Log.i('# tagset size: {}'.format(len(loader.tag_map)))
    Log.v('--------------------------------')
    Log.v('')

    # Network: shared encoder, then a tagger (or the gold-tag layer when
    # tagging is off), then the optional parser stack.
    layers = [
        models.Input(
            word_embeddings=loader.get_embeddings('word'),
            char_embeddings=loader.get_embeddings('char'),
            char_feature_size=50,
            dropout=0.5,
        ),
        models.Recurrent(
            n_layers=2,
            in_size=loader.get_embeddings('word').shape[1] + 50,
            out_size=400,
            dropout=0.5,
        ),
    ]
    if with_tagging_task:
        layers.append(
            models.Tagger(
                in_size=400 * 2,
                out_size=len(loader.tag_map),
                units=100,
                dropout=0.5))
    else:
        layers.append(models.GoldTagger(out_size=len(loader.tag_map)))
    if with_parsing_task:
        layers.append(
            models.Connection(
                tagset_size=len(loader.tag_map),
                tag_embed_size=50,
                dropout=0.5))
        layers.append(
            models.Parser(
                in_size=850,
                n_deprels=len(loader.rel_map),
                n_blstm_layers=2,
                lstm_hidden_size=400,
                parser_mlp_units=800,
                dropout=0.50,
                transition_system=transition_system))
    model = models.MTL(*layers)
    if gpu >= 0:
        framework_utils.set_model_to_device(model, device_id=gpu)

    # Optimizer and its optional hooks
    optimizer = chainer.optimizers.Adam(
        alpha=lr, beta1=0.9, beta2=0.999, eps=1e-08)
    optimizer.setup(model)
    use_weight_decay = l2_lambda > 0.0
    if use_weight_decay:
        optimizer.add_hook(chainer.optimizer.WeightDecay(l2_lambda))
    use_grad_clip = grad_clip > 0.0
    if use_grad_clip:
        optimizer.add_hook(chainer.optimizer.GradientClipping(grad_clip))
    # NOTE: an ExponentialDecayAnnealing hook (initial_lr=lr, decay_rate=0.75,
    # decay_step=5000, lr_key='alpha') is not registered here.
    # A disabled hook is reported as False in the summary line.
    logged_grad_clip = grad_clip if use_grad_clip else False
    logged_l2_lambda = l2_lambda if use_weight_decay else False
    Log.i('optimizer: Adam(alpha={}, beta1=0.9, '
          'beta2=0.999, eps=1e-08), grad_clip={}, '
          'regularization: WeightDecay(lambda={})'
          .format(lr, logged_grad_clip, logged_l2_lambda))

    # Trainer and callbacks
    trainer = Trainer(optimizer, model,
                      loss_func=model.compute_loss,
                      accuracy_func=model.compute_accuracy)
    trainer.configure(framework_utils.config)
    if test_dataset:
        evaluator = models.Evaluator(loader, test_file, save_to)
        evaluator.add_target(model)
        trainer.attach_callback(evaluator)

    if save_to is not None:
        logger = Log.getLogger()
        accessid = logger.accessid
        date = Log.getLogger().accesstime.strftime('%Y%m%d')
        # The layer classes and the loader are stored with the context
        # alongside the saved model.
        saved_context = dict(App.context,
                             models=[type(layer) for layer in layers],
                             loader=loader)
        saver = framework_utils.callbacks.Saver(
            model,
            basename="{}-{}".format(date, accessid),
            directory=save_to,
            context=saved_context)
        trainer.attach_callback(saver)

    # Run the training loop
    trainer.fit(train_dataset, None,
                batch_size=batch_size,
                epochs=n_epoch,
                validation_data=test_dataset,
                verbose=App.verbose)