Code example #1
def run(options):
    words_field = fields.WordsField()
    tags_field = fields.TagsField()
    fields_tuples = [('words', words_field), ('tags', tags_field)]
    fields_tuples += features.load(options.load)

    if options.test_path is None and options.text is None:
        raise Exception('You should provide a path to test data or a text.')

    if options.test_path is not None and options.text is not None:
        raise Exception('You cannot provide both a path to test data and a text.')

    dataset_iter = None

    if options.test_path is not None and options.text is None:
        logging.info('Building test dataset: {}'.format(options.test_path))
        test_tuples = list(filter(lambda x: x[0] != 'tags', fields_tuples))
        test_dataset = dataset.build(options.test_path, test_tuples, options)

        logging.info('Building test iterator...')
        dataset_iter = iterator.build(test_dataset,
                                      options.gpu_id,
                                      options.dev_batch_size,
                                      is_train=False)

    if options.text is not None and options.test_path is None:
        logging.info('Preparing text...')
        test_tuples = list(filter(lambda x: x[0] != 'tags', fields_tuples))
        test_dataset = dataset.build_texts(options.text, test_tuples, options)

        logging.info('Building iterator...')
        dataset_iter = iterator.build(test_dataset,
                                      options.gpu_id,
                                      options.dev_batch_size,
                                      is_train=False)

    logging.info('Loading vocabularies...')
    fields.load_vocabs(options.load, fields_tuples)

    logging.info('Loading model...')
    model = models.load(options.load, fields_tuples)

    logging.info('Predicting...')
    predicter = Predicter(dataset_iter, model)
    predictions = predicter.predict(options.prediction_type)

    logging.info('Preparing to save...')
    if options.prediction_type == 'classes':
        prediction_tags = transform_classes_to_tags(tags_field, predictions)
        predictions_str = transform_predictions_to_text(prediction_tags)
    else:
        predictions_str = transform_predictions_to_text(predictions)

    if options.test_path is not None:
        save_predictions(options.output_dir, predictions_str)
    else:
        logging.info(options.text)
        logging.info(predictions_str)

    return predictions
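
For orientation, here is a minimal driver for the prediction run() above. It is a sketch only: the attributes set on options are exactly the ones this snippet reads directly (the features/dataset helpers may read more), and the use of argparse.Namespace, the paths and the sample text are illustrative assumptions rather than a confirmed deeptagger CLI.

from argparse import Namespace

# Hypothetical options object for a raw-text prediction; every attribute below
# is accessed somewhere in run() above.
options = Namespace(
    load='runs/example-model/',   # assumed directory holding the saved vocabs and model
    test_path=None,               # either a path to test data...
    text='The quick brown fox jumps over the lazy dog',  # ...or a raw text, never both
    gpu_id=None,                  # None keeps everything on the CPU
    dev_batch_size=32,
    prediction_type='classes',    # 'classes' maps predicted ids back to tag strings
    output_dir='predictions/',    # only used when test_path is given
)

predictions = run(options)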
Code example #2
def __init__(self, gpu_id=None):
    words_field = fields.WordsField()
    tags_field = fields.TagsField()
    self.fields_tuples = [('words', words_field), ('tags', tags_field)]
    self._loaded = False
    self.options = None
    self.model = None
    self.optimizer = None
    self.scheduler = None
    self.gpu_id = gpu_id
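
This constructor only wires up empty slots: the fields are created eagerly, while the model, optimizer and scheduler stay None until a later build or load step fills them, as in the surrounding examples. A minimal instantiation sketch follows, assuming the excerpt comes from the project's tagger wrapper class; Tagger is a placeholder name, since the class itself is not shown here.

tagger = Tagger(gpu_id=0)        # placeholder class name; gpu_id=None would stay on CPU
assert tagger.model is None      # nothing has been loaded yet
assert tagger._loaded is False   # presumably flipped by a subsequent load step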
Code example #3
File: train.py  Project: gazzola/deeptagger
def run(options):
    words_field = fields.WordsField()
    tags_field = fields.TagsField()
    fields_tuples = [('words', words_field), ('tags', tags_field)]
    fields_tuples += features.build(options)

    logging.info('Building train corpus: {}'.format(options.train_path))
    train_dataset = dataset.build(options.train_path, fields_tuples, options)
    logging.info('Building train iterator...')
    train_iter = iterator.build(train_dataset,
                                options.gpu_id,
                                options.train_batch_size,
                                is_train=True)

    dev_dataset = None
    dev_iter = None
    if options.dev_path is not None:
        logging.info('Building dev dataset: {}'.format(options.dev_path))
        dev_dataset = dataset.build(options.dev_path, fields_tuples, options)
        logging.info('Building dev iterator...')
        dev_iter = iterator.build(dev_dataset,
                                  options.gpu_id,
                                  options.dev_batch_size,
                                  is_train=False)

    test_dataset = None
    test_iter = None
    if options.test_path is not None:
        logging.info('Building test dataset: {}'.format(options.test_path))
        test_dataset = dataset.build(options.test_path, fields_tuples, options)
        logging.info('Building test iterator...')
        test_iter = iterator.build(test_dataset,
                                   options.gpu_id,
                                   options.dev_batch_size,
                                   is_train=False)

    datasets = [train_dataset, dev_dataset, test_dataset]
    datasets = list(filter(lambda x: x is not None, datasets))
    if options.load:
        logging.info('Loading vocabularies...')
        fields.load_vocabs(options.load, fields_tuples)
        logging.info('Word vocab size: {}'.format(len(words_field.vocab)))
        logging.info('Tag vocab size: {}'.format(len(tags_field.vocab)))
        logging.info('Loading model...')
        model = models.load(options.load, fields_tuples)
        logging.info('Loading optimizer...')
        optim = optimizer.load(options.load, model.parameters())
        logging.info('Loading scheduler...')
        sched = scheduler.load(options.load, optim)
    else:
        logging.info('Building vocabulary...')
        fields.build_vocabs(fields_tuples, train_dataset, datasets, options)
        logging.info('Word vocab size: {}'.format(len(words_field.vocab)))
        logging.info('Tag vocab size: {}'.format(len(tags_field.vocab)))
        logging.info('Building model...')
        model = models.build(options, fields_tuples)
        logging.info('Building optimizer...')
        optim = optimizer.build(options, model.parameters())
        logging.info('Building scheduler...')
        sched = scheduler.build(options, optim)

    logging.info('Building trainer...')
    trainer = Trainer(train_iter,
                      model,
                      optim,
                      sched,
                      options,
                      dev_iter=dev_iter,
                      test_iter=test_iter)

    if options.resume_epoch and options.load is None:
        logging.info('Resuming training...')
        trainer.resume(options.resume_epoch)

    trainer.train()

    if options.save:
        logging.info('Saving path: {}'.format(options.save))
        config_path = Path(options.save)
        config_path.mkdir(parents=True, exist_ok=True)
        logging.info('Saving config options...')
        opts.save(config_path, options)
        logging.info('Saving vocabularies...')
        fields.save_vocabs(config_path, fields_tuples)
        logging.info('Saving model...')
        models.save(config_path, model)
        logging.info('Saving optimizer...')
        optimizer.save(config_path, optim)
        logging.info('Saving scheduler...')
        scheduler.save(config_path, sched)

    return fields_tuples, model, optim, sched
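
As with the prediction entry point, options is just an attribute bag. The sketch below sets the attributes this training run() touches directly; the features, models, optimizer, scheduler and Trainer builders read further hyperparameters from the same object that the excerpt does not show, so the values here are illustrative assumptions only.

from argparse import Namespace

options = Namespace(
    train_path='data/train.txt',  # assumed corpus paths
    dev_path='data/dev.txt',
    test_path=None,
    gpu_id=None,
    train_batch_size=64,
    dev_batch_size=64,
    load=None,                    # None -> build vocabs, model, optimizer, scheduler from scratch
    resume_epoch=None,            # set to an epoch number to resume a previous run
    save='runs/exp1/',            # directory where config, vocabs, model, optimizer, scheduler go
)

fields_tuples, model, optim, sched = run(options)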
Code example #4
def run(options):
    logging.info('Running with options: {}'.format(options))

    words_field = fields.WordsField()
    tags_field = fields.TagsField()
    fields_tuples = [('words', words_field), ('tags', tags_field)]
    fields_tuples += features.build(options)

    logging.info('Building train corpus: {}'.format(options.train_path))
    train_dataset = dataset.build(options.train_path, fields_tuples, options)

    logging.info('Building train iterator...')
    train_iter = iterator.build(train_dataset,
                                options.gpu_id,
                                options.train_batch_size,
                                is_train=True)

    dev_dataset = None
    dev_iter = None
    if options.dev_path is not None:
        logging.info('Building dev dataset: {}'.format(options.dev_path))
        dev_dataset = dataset.build(options.dev_path, fields_tuples, options)
        logging.info('Building dev iterator...')
        dev_iter = iterator.build(dev_dataset,
                                  options.gpu_id,
                                  options.dev_batch_size,
                                  is_train=False)

    test_dataset = None
    test_iter = None
    if options.test_path is not None:
        logging.info('Building test dataset: {}'.format(options.test_path))
        test_dataset = dataset.build(options.test_path, fields_tuples, options)
        logging.info('Building test iterator...')
        test_iter = iterator.build(test_dataset,
                                   options.gpu_id,
                                   options.dev_batch_size,
                                   is_train=False)

    datasets = [train_dataset, dev_dataset, test_dataset]
    datasets = list(filter(lambda x: x is not None, datasets))

    # BUILD
    if not options.load:
        logging.info('Building vocabulary...')
        fields.build_vocabs(fields_tuples, train_dataset, datasets, options)
        loss_weights = None
        if options.loss_weights == 'balanced':
            loss_weights = train_dataset.get_loss_weights()
        logging.info('Building model...')
        model = models.build(options, fields_tuples, loss_weights)
        logging.info('Building optimizer...')
        optim = optimizer.build(options, model.parameters())
        logging.info('Building scheduler...')
        sched = scheduler.build(options, optim)

    # OR LOAD
    else:
        logging.info('Loading vocabularies...')
        fields.load_vocabs(options.load, fields_tuples)
        logging.info('Loading model...')
        model = models.load(options.load, fields_tuples)
        logging.info('Loading optimizer...')
        optim = optimizer.load(options.load, model.parameters())
        logging.info('Loading scheduler...')
        sched = scheduler.load(options.load, optim)

    # STATS
    logging.info('Word vocab size: {}'.format(len(words_field.vocab)))
    logging.info('Tag vocab size: {}'.format(len(tags_field.vocab) - 1))
    logging.info('Number of training examples: {}'.format(len(train_dataset)))
    if dev_dataset:
        logging.info('Number of dev examples: {}'.format(len(dev_dataset)))
    if test_dataset:
        logging.info('Number of test examples: {}'.format(len(test_dataset)))

    logging.info('Model info: ')
    logging.info(str(model))
    logging.info('Optimizer info: ')
    logging.info(str(optim))
    logging.info('Scheduler info: ')
    logging.info(str(sched))

    # TRAIN
    logging.info('Building trainer...')
    trainer = Trainer(train_iter,
                      model,
                      optim,
                      sched,
                      options,
                      dev_iter=dev_iter,
                      test_iter=test_iter)

    if options.resume_epoch and options.load is None:
        logging.info('Resuming training...')
        trainer.resume(options.resume_epoch)

    trainer.train()

    # SAVE
    if options.save:
        logging.info('Saving path: {}'.format(options.save))
        config_path = Path(options.save)
        config_path.mkdir(parents=True, exist_ok=True)
        logging.info('Saving config options...')
        opts.save(config_path, options)
        logging.info('Saving vocabularies...')
        fields.save_vocabs(config_path, fields_tuples)
        logging.info('Saving model...')
        models.save(config_path, model)
        logging.info('Saving optimizer...')
        optimizer.save(config_path, optim)
        logging.info('Saving scheduler...')
        scheduler.save(config_path, sched)

    return fields_tuples, model, optim, sched
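
The only substantive difference from code example #3 is the optional loss_weights == 'balanced' branch, which asks the training set for per-class weights before the model is built. The excerpt does not show how get_loss_weights() is implemented; a common convention for 'balanced' weights, and only a guess at what this project does, is inverse class frequency, as sketched below. Note also that the save block at the end writes exactly the artifacts that the options.load branch (and the prediction script in code example #1) reads back, so one directory can serve both training resumption and prediction.

from collections import Counter

def balanced_weights(tag_sequences):
    """Generic inverse-frequency class weights; a sketch, not deeptagger's own get_loss_weights()."""
    counts = Counter(tag for seq in tag_sequences for tag in seq)
    n_classes = len(counts)
    total = sum(counts.values())
    # Each weight is total / (n_classes * count), so rare tags get larger weights.
    return {tag: total / (n_classes * count) for tag, count in counts.items()}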