Beispiel #1
0
    def __call__(self, config):
        """Evaluate a saved parser on the corpus at ``config.fdata``.

        Loads the vocabulary from ``config.vocab`` and the parser checkpoint
        resolved by ``fetch_best_ckpt_name(config.parser_model)``. When
        ``config.pred_tag`` is truthy a POS tagger checkpoint is loaded as
        well and handed to the evaluation; otherwise ``None`` is passed.
        The resulting loss and metric are printed.

        Args:
            config: run configuration; reads ``vocab``, ``parser_model``,
                ``pred_tag``, ``tagger_model``, ``fdata``, ``batch_size``,
                ``buckets`` and ``punct``.
        """
        print("Load the models")
        # NOTE(review): torch.load unpickles the file -- presumably
        # config.vocab always points at a trusted artifact; confirm.
        vocab = torch.load(config.vocab)
        parser_ckpt = fetch_best_ckpt_name(config.parser_model)
        task = ParserTask(vocab, load_parser(parser_ckpt))

        # The tagger is optional and only loaded on request.
        tagger = None
        if config.pred_tag:
            tagger_ckpt = fetch_best_ckpt_name(config.tagger_model)
            tagger = PosTagger.load(tagger_ckpt)

        print("Load the dataset")
        corpus = Corpus.load(config.fdata)
        encoded = vocab.numericalize(corpus)
        # Wrap the numericalized corpus and build the data loader.
        loader = batchify(TextDataset(encoded),
                          config.batch_size,
                          config.buckets)

        print("Evaluate the dataset")
        result = task.evaluate(loader, config.punct, tagger, True)
        loss, metric = result
        print(f"Loss: {loss:.4f} {metric}")
Beispiel #2
0
    def __call__(self, config):
        """Predict tags, heads and relations for ``config.fdata`` and save them.

        Loads the vocabulary and the parser checkpoint (plus a POS tagger
        when ``config.pred_tag`` is truthy), runs ``task.predict`` over the
        corpus, stores the three returned sequences on the corpus as
        ``tags``, ``heads`` and ``rels``, and writes the corpus to
        ``<config.result_path>/raw_result.conllx``.

        Args:
            config: run configuration; reads ``vocab``, ``parser_model``,
                ``pred_tag``, ``tagger_model``, ``fdata``, ``batch_size``
                and ``result_path``.
        """
        print("Load the models")
        # NOTE(review): torch.load unpickles the file -- presumably
        # config.vocab always points at a trusted artifact; confirm.
        vocab = torch.load(config.vocab)
        parser_ckpt = fetch_best_ckpt_name(config.parser_model)
        task = ParserTask(vocab, load_parser(parser_ckpt))
        tagger = (PosTagger.load(fetch_best_ckpt_name(config.tagger_model))
                  if config.pred_tag else None)

        print("Load the dataset")
        corpus = Corpus.load(config.fdata)
        encoded = vocab.numericalize(corpus, training=False)
        # No bucket count is passed here, unlike the evaluation command --
        # presumably so predictions stay aligned with corpus order; verify
        # against batchify.
        loader = batchify(TextDataset(encoded), config.batch_size)

        print("Make predictions on the dataset")
        predictions = task.predict(loader, tagger)
        corpus.tags, corpus.heads, corpus.rels = predictions

        saved_path = f"{config.result_path}/raw_result.conllx"
        print(f"Save the predicted result to {saved_path}")
        corpus.save(saved_path)
Beispiel #3
0
    def __call__(self, config):
        """Train a parser or a POS tagger and report dev/test scores.

        Steps, in order:

        1. Load the train/dev/test corpora. Load the vocabulary from
           ``config.vocab`` if that path exists; otherwise build it from the
           training corpus, attach the pretrained embeddings and save it.
        2. Copy vocabulary sizes and special indices into ``config``.
        3. Build the data loaders (plus an extra "augmentation" test loader
           when ``config.augmentation_training`` is truthy).
        4. Create the model/task selected by ``config.train_task`` and train
           it with Adam and an exponential learning-rate decay.
        5. After each epoch evaluate on train/dev/test. A checkpoint named
           ``<model path>.<epoch>`` is written when the dev metric improves
           and ``epoch > config.patience``. Training stops once
           ``epoch - best_epoch >= config.patience``.
        6. Copy the best checkpoint to ``<model path>.best``, reload it and
           print the final test score(s) and timing.

        If no epoch ever qualified for a checkpoint (for example when
        ``config.epochs <= config.patience``), the final in-memory weights
        are saved and used as the best checkpoint, so the closing
        copy/reload step cannot hit a missing or stale file.

        Args:
            config: run configuration. Must support ``update(dict)`` and
                provide ``ftrain``, ``fdev``, ``ftest``, ``vocab``,
                ``fembed``, ``unk``, ``batch_size``, ``buckets``,
                ``train_task``, ``augmentation_training``,
                ``augmentation_test_file`` (when augmentation is on),
                ``parser_model`` or ``tagger_model`` (depending on the
                task), ``lr``, ``beta_1``, ``beta_2``, ``epsilon``,
                ``decay``, ``steps``, ``epochs``, ``patience`` and
                ``punct``.

        Raises:
            AssertionError: if ``config.train_task`` is neither
                ``'parser'`` nor ``'tagger'``.
        """
        print("Preprocess the data")
        train = Corpus.load(config.ftrain)
        dev = Corpus.load(config.fdev)
        test = Corpus.load(config.ftest)
        if os.path.exists(config.vocab):
            # NOTE(review): torch.load unpickles the file -- only point
            # config.vocab at trusted artifacts.
            vocab = torch.load(config.vocab)
        else:
            vocab = Vocab.from_corpus(corpus=train, min_freq=2)
            vocab.read_embeddings(Pretrained.load(config.fembed, config.unk))
            torch.save(vocab, config.vocab)
        config.update({
            'n_words': vocab.n_train_words,
            'n_tags': vocab.n_tags,
            'n_rels': vocab.n_rels,
            'n_chars': vocab.n_chars,
            'pad_index': vocab.pad_index,
            'unk_index': vocab.unk_index
        })
        print(vocab)

        print("Load the dataset")
        trainset = TextDataset(vocab.numericalize(train))
        devset = TextDataset(vocab.numericalize(dev))
        testset = TextDataset(vocab.numericalize(test))
        # set the data loaders; only the training loader is shuffled
        train_loader = batchify(dataset=trainset,
                                batch_size=config.batch_size,
                                n_buckets=config.buckets,
                                shuffle=True)
        dev_loader = batchify(dataset=devset,
                              batch_size=config.batch_size,
                              n_buckets=config.buckets)
        test_loader = batchify(dataset=testset,
                               batch_size=config.batch_size,
                               n_buckets=config.buckets)
        print(f"{'train:':6} {len(trainset):5} sentences in total, "
              f"{len(train_loader):3} batches provided")
        print(f"{'dev:':6} {len(devset):5} sentences in total, "
              f"{len(dev_loader):3} batches provided")
        print(f"{'test:':6} {len(testset):5} sentences in total, "
              f"{len(test_loader):3} batches provided")

        print("Create the models")
        assert config.train_task in ['parser', 'tagger']
        is_training_parser = config.train_task == 'parser'

        if config.augmentation_training:
            # Extra held-out set that is evaluated alongside the regular
            # test set (it is reported under the same 'test:' label).
            aug_test = Corpus.load(config.augmentation_test_file)
            aug_testset = TextDataset(vocab.numericalize(aug_test))
            aug_test_loader = batchify(dataset=aug_testset,
                                       batch_size=config.batch_size,
                                       n_buckets=config.buckets)
            print(f"{'test:':6} {len(aug_testset):5} sentences in total, "
                  f"{len(aug_test_loader):3} batches provided")

        if is_training_parser:
            model = init_parser(config, vocab.embeddings)
            task = ParserTask(vocab, model)
            best_e, best_metric = 1, ParserMetric()
            ckpt_prefix = config.parser_model
        else:
            model = PosTagger(config, vocab.embeddings)
            task = TaggerTask(vocab, model)
            best_e, best_metric = 1, TaggerMetric()
            ckpt_prefix = config.tagger_model

        if torch.cuda.is_available():
            model = model.cuda()
        print(f"{model}\n")
        total_time = timedelta()
        task.optimizer = Adam(task.model.parameters(), config.lr,
                              (config.beta_1, config.beta_2), config.epsilon)
        # Per-step decay factor chosen so that the learning rate is
        # multiplied by config.decay after config.steps scheduler steps.
        task.scheduler = ExponentialLR(task.optimizer,
                                       config.decay**(1 / config.steps))

        # Track these explicitly: the loop variable is unbound when
        # config.epochs < 1, and best_e alone cannot tell us whether a
        # checkpoint file was actually written during this run.
        epochs_run = 0
        checkpoint_saved = False
        for epoch in range(1, config.epochs + 1):
            start = datetime.now()
            # train one epoch and update the parameters
            task.train(train_loader)

            print(f"Epoch {epoch} / {config.epochs}:")
            loss, train_metric = task.evaluate(train_loader, config.punct)
            print(f"{'train:':6} Loss: {loss:.4f} {train_metric}")
            loss, dev_metric = task.evaluate(dev_loader, config.punct)
            print(f"{'dev:':6} Loss: {loss:.4f} {dev_metric}")
            loss, test_metric = task.evaluate(test_loader, config.punct)
            print(f"{'test:':6} Loss: {loss:.4f} {test_metric}")
            if config.augmentation_training:
                loss, aug_test_metric = task.evaluate(aug_test_loader,
                                                      config.punct)
                print(f"{'test:':6} Loss: {loss:.4f} {aug_test_metric}")

            t = datetime.now() - start
            epochs_run = epoch

            # Checkpoints are only written after the first
            # config.patience epochs, and only on a dev improvement.
            if dev_metric > best_metric and epoch > config.patience:
                best_e, best_metric = epoch, dev_metric
                task.model.save(f"{ckpt_prefix}.{best_e}")
                checkpoint_saved = True
                print(f"{t}s elapsed (saved)\n")
            else:
                print(f"{t}s elapsed\n")
            sys.stdout.flush()
            total_time += t
            # early stopping: no new best for config.patience epochs
            if epoch - best_e >= config.patience:
                break

        if not checkpoint_saved:
            # No epoch qualified for a checkpoint, so "<prefix>.<best_e>"
            # either does not exist or is left over from an earlier run.
            # Keep the final weights instead of crashing (or silently
            # reloading a stale file) in the copy/reload step below.
            if epochs_run == 0:
                # Nothing was trained; score the initial model so the
                # summary below has a dev metric to report.
                _, dev_metric = task.evaluate(dev_loader, config.punct)
            best_e, best_metric = epochs_run, dev_metric
            task.model.save(f"{ckpt_prefix}.{best_e}")
            print(f"no checkpoint was saved during training; "
                  f"kept the model from epoch {best_e}")

        best_ckpt = f"{ckpt_prefix}.{best_e}"
        copyfile(best_ckpt, ckpt_prefix + '.best')
        # Reload the best weights so the final scores are not those of the
        # last (possibly worse) epoch.
        if is_training_parser:
            task.model = load_parser(best_ckpt)
        else:
            task.model = PosTagger.load(best_ckpt)
        loss, metric = task.evaluate(test_loader, config.punct)

        print(f"max score of dev is {best_metric.score:.2%} at epoch {best_e}")
        print(f"the score of test at epoch {best_e} is {metric.score:.2%}")

        if config.augmentation_training:
            loss, metric = task.evaluate(aug_test_loader, config.punct)
            print(
                f"the score of aug test at epoch {best_e} is {metric.score:.2%}"
            )

        # max(..., 1) avoids dividing by zero when no epoch was run.
        print(f"average time of each epoch is "
              f"{total_time / max(epochs_run, 1)}s")
        print(f"{total_time}s elapsed")