Example #1
    def __call__(self, args):
        print("Load the model")

        modelpath = args.mainpath + args.model + args.modelname + "/model_weights"
        vocabpath = args.mainpath + args.vocab + args.modelname + "/vocab.tag"

        config = torch.load(modelpath)['config']
        config.batch_size = 2
        config.buckets = 2

        vocab = torch.load(vocabpath)
        parser = BiaffineParser.load(modelpath)
        model = Model(vocab, parser, config, vocab.n_rels)

        print("Load the dataset")
        if args.input_type == "conllu":
            corpus = UniversalDependenciesDatasetReader()
            corpus.load(args.fdata)
        elif args.input_type == "conllx":
            corpus = Corpus.load(args.fdata)
        elif args.input_type == "raw":
            corpus = UniversalDependenciesRawDatasetReader(args.language)
            corpus.load(args.fdata)
        if args.use_predicted:
            if args.input_type == "conllu":
                corpus_predicted = UniversalDependenciesDatasetReader()
                corpus_predicted.load(args.finit)
            else:
                corpus_predicted = Corpus.load(args.finit)

        if args.use_predicted:
            dataset = TextDataset(vocab.numericalize(corpus, corpus_predicted))
        else:
            dataset = TextDataset(vocab.numericalize(corpus, training=False))
        # set the data loader
        loader, ids = batchify(dataset, config.batch_size, config.buckets)

        print("Make predictions on the dataset")
        if args.use_predicted:
            heads_pred, rels_pred, metric = model.predict_predicted(loader)
        else:
            heads_pred, rels_pred, metric = model.predict(loader)

        print(f"Save the predicted result to {args.fpred}")

        heads_pred = self.rearange(heads_pred, ids)
        rels_pred = self.rearange(rels_pred, ids)

        corpus.heads = heads_pred
        corpus.rels = rels_pred
        corpus.save(args.fpred)
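
A note on the helpers used above: rearange and batchify are not defined in the snippet. A hypothetical reconstruction of rearange (name kept as spelled in the snippet; the behavior is an assumption), restoring corpus order by treating ids as the original sentence index of each prediction:

    def rearange(preds, ids):
        # sort predictions back into their original sentence order
        return [pred for _, pred in sorted(zip(ids, preds), key=lambda item: item[0])]

    # e.g. rearange(['b', 'c', 'a'], [1, 2, 0]) == ['a', 'b', 'c']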
Example #2
    def __call__(self, args):
        print("Load the model")

        modelpath = args.mainpath + args.model + args.modelname + "/model_weights"
        vocabpath = args.mainpath + args.vocab + args.modelname + "/vocab.tag"

        config = torch.load(modelpath)['config']

        vocab = torch.load(vocabpath)
        parser = Parser.load(modelpath)
        model = Model(vocab, parser, config, vocab.n_rels)

        print("Load the dataset")
        corpus = Corpus.load(args.fdata)
        dataset = TextDataset(vocab.numericalize(corpus))
        # set the data loader
        loader, ids = batchify(dataset, 5 * config.batch_size, config.buckets)

        print("Make predictions on the dataset")
        heads_pred, rels_pred, metric = model.predict(loader)

        print(metric)
        print(f"Save the predicted result to {args.fpred}")

        heads_pred = self.rearange(heads_pred, ids)
        rels_pred = self.rearange(rels_pred, ids)

        corpus.heads = heads_pred
        corpus.rels = rels_pred
        corpus.save(args.fpred)
Example #3
    def __call__(self, args):
        super(Predict, self).__call__(args)

        print("Load the dataset")
        corpus = Corpus.load(args.fdata, self.fields)
        dataset = TextDataset(corpus,
                              self.fields[:-1],
                              args.buckets)
        # set the data loader
        dataset.loader = batchify(dataset, args.batch_size)
        print(f"{len(dataset)} sentences, "
              f"{len(dataset.loader)} batches")

        print("Load the model")
        self.model = Model.load(args.model)
        print(f"{self.model}\n")

        print("Make predictions on the dataset")
        start = datetime.now()
        pred_labels = self.predict(dataset.loader)
        total_time = datetime.now() - start
        # restore the order of sentences in the buckets
        indices = torch.tensor([i
                                for bucket in dataset.buckets.values()
                                for i in bucket]).argsort()
        corpus.labels = [pred_labels[i] for i in indices]
        print(f"Save the predicted result to {args.fpred}")
        corpus.save(args.fpred)
        print(f"{total_time}s elapsed, "
              f"{len(dataset) / total_time.total_seconds():.2f} Sents/s")
Example #4
    def __call__(self, config):
        print("Load the model")
        vocab = torch.load(config.vocab)
        parser = BiaffineParser.load(config.model)
        model = Model(vocab, parser)

        print("Load the dataset")
        corpus = Corpus.load(config.fdata)
        dataset = TextDataset(vocab.numericalize(corpus))
        # set the data loader
        loader = batchify(dataset, config.batch_size, config.buckets)

        print("Evaluate the dataset")
        loss, metric = model.evaluate(loader, config.punct)
        print(f"Loss: {loss:.4f} {metric}")
Example #5
    def __call__(self, config):
        print("Load the model")
        vocab = torch.load(config.vocab)
        parser = BiaffineParser.load(config.model)
        model = Model(config, vocab, parser)

        print("Load the dataset")
        corpus = Corpus.load(config.fdata)
        dataset = TextDataset(vocab.numericalize(corpus), config.buckets)
        # set the data loader
        loader = batchify(dataset, config.batch_size)

        print("Evaluate the dataset")
        _, loss, _, metric_t, metric_p = model.evaluate(None, loader)
        print(f"Loss: {loss:.4f} {metric_t}, {metric_p}")
Example #6
    def __call__(self, config):
        print("Load the model")
        vocab = torch.load(config.vocab)
        parser = BiaffineParser.load(config.model)
        model = Model(vocab, parser)

        print("Load the dataset")
        corpus = Corpus.load(config.fdata)
        dataset = TextDataset(vocab.numericalize(corpus, False))
        # set the data loader
        loader = batchify(dataset, config.batch_size)

        print("Make predictions on the dataset")
        corpus.heads, corpus.rels = model.predict(loader)

        print(f"Save the predicted result to {config.fpred}")
        corpus.save(config.fpred)
Example #7
    def __call__(self, args):
        logger.info("Load the model")
        self.model = Model.load(args.model)
        # override from CLI args
        args = self.model.args.update(vars(args))

        super().__call__(args)

        logger.info("Load the dataset")
        if args.prob:
            self.fields = self.fields._replace(PHEAD=Field('probs'))
        if args.text:
            corpus = TextCorpus.load(args.fdata,
                                     self.fields,
                                     args.text,
                                     args.tokenizer_dir,
                                     use_gpu=args.device != 1)
        else:
            corpus = Corpus.load(args.fdata, self.fields)
        dataset = TextDataset(corpus, [self.WORD, self.FEAT], args.buckets)
        # set the data loader
        dataset.loader = batchify(dataset, args.batch_size)
        logger.info(f"{len(dataset)} sentences, "
                    f"{len(dataset.loader)} batches")

        logger.info("Make predictions on the dataset")
        start = datetime.now()
        pred_arcs, pred_rels, pred_probs = self.predict(dataset.loader)
        total_time = datetime.now() - start
        # restore the order of sentences in the buckets
        indices = torch.tensor([
            i for bucket in dataset.buckets.values() for i in bucket
        ]).argsort()
        corpus.arcs = [pred_arcs[i] for i in indices]
        corpus.rels = [pred_rels[i] for i in indices]
        if args.prob:
            corpus.probs = [pred_probs[i] for i in indices]
        logger.info(f"Save the predicted result to {args.fpred}")
        corpus.save(args.fpred)
        logger.info(f"{total_time}s elapsed, "
                    f"{len(dataset) / total_time.total_seconds():.2f} Sents/s")
Example #8
    def __call__(self, args):
        super(Predict, self).__call__(args)

        print("Load the dataset")
        corpus = Corpus.load(args.fdata, self.fields)
        dataset = TextDataset(corpus, [self.WORD, self.FEAT])
        # set the data loader
        dataset.loader = batchify(dataset, args.batch_size)
        print(f"{len(dataset)} sentences, " f"{len(dataset.loader)} batches")

        print("Load the model")
        self.model = Model.load(args.model)
        print(f"{self.model}\n")

        print("Make predictions on the dataset")
        start = datetime.now()
        corpus.heads, corpus.rels = self.predict(dataset.loader)
        print(f"Save the predicted result to {args.fpred}")
        corpus.save(args.fpred)
        total_time = datetime.now() - start
        print(f"{total_time}s elapsed, "
              f"{len(dataset) / total_time.total_seconds():.2f} Sents/s")
Example #9
    def __call__(self, args):
        super(Evaluate, self).__call__(args)

        print("Load the dataset")
        corpus = Corpus.load(args.fdata, self.fields)
        dataset = TextDataset(corpus, self.fields, args.buckets)
        # set the data loader
        dataset.loader = batchify(dataset, args.batch_size)
        print(f"{len(dataset)} sentences, "
              f"{len(dataset.loader)} batches, "
              f"{len(dataset.buckets)} buckets")

        print("Load the model")
        self.model = Model.load(args.model)
        print(f"{self.model}\n")

        print("Evaluate the dataset")
        start = datetime.now()
        loss, metric = self.evaluate(dataset.loader)
        total_time = datetime.now() - start
        print(f"Loss: {loss:.4f} {metric}")
        print(f"{total_time}s elapsed, "
              f"{len(dataset) / total_time.total_seconds():.2f} Sents/s")
Example #10
    def __call__(self, config):
        print("Preprocess the data")
        train = Corpus.load(config.ftrain)
        dev = Corpus.load(config.fdev)
        test = Corpus.load(config.ftest)
        if os.path.exists(config.vocab):
            vocab = torch.load(config.vocab)
        else:
            vocab = Vocab.from_corpus(corpus=train, min_freq=2)
            vocab.read_embeddings(Embedding.load(config.fembed, config.unk))
            torch.save(vocab, config.vocab)
        config.update({
            'n_words': vocab.n_train_words,
            'n_tags': vocab.n_tags,
            'n_rels': vocab.n_rels,
            'pad_index': vocab.pad_index,
            'unk_index': vocab.unk_index
        })
        print(vocab)

        print("Load the dataset")
        trainset = TextDataset(vocab.numericalize(train))
        devset = TextDataset(vocab.numericalize(dev))
        testset = TextDataset(vocab.numericalize(test))
        # set the data loaders
        train_loader = batchify(dataset=trainset,
                                batch_size=config.batch_size,
                                n_buckets=config.buckets,
                                shuffle=True)
        dev_loader = batchify(dataset=devset,
                              batch_size=config.batch_size,
                              n_buckets=config.buckets)
        test_loader = batchify(dataset=testset,
                               batch_size=config.batch_size,
                               n_buckets=config.buckets)
        print(f"{'train:':6} {len(trainset):5} sentences in total, "
              f"{len(train_loader):3} batches provided")
        print(f"{'dev:':6} {len(devset):5} sentences in total, "
              f"{len(dev_loader):3} batches provided")
        print(f"{'test:':6} {len(testset):5} sentences in total, "
              f"{len(test_loader):3} batches provided")

        print("Create the model")
        parser = BiaffineParser(config, vocab.embeddings)
        if torch.cuda.is_available():
            parser = parser.cuda()
        print(f"{parser}\n")

        model = Model(vocab, parser)

        total_time = timedelta()
        best_e, best_metric = 1, Metric()
        model.optimizer = Adam(model.parser.parameters(),
                               config.lr,
                               (config.beta_1, config.beta_2),
                               config.epsilon)
        model.scheduler = ExponentialLR(model.optimizer,
                                        config.decay ** (1 / config.steps))

        for epoch in range(1, config.epochs + 1):
            start = datetime.now()
            # train one epoch and update the parameters
            model.train(train_loader)

            print(f"Epoch {epoch} / {config.epochs}:")
            loss, train_metric = model.evaluate(train_loader, config.punct)
            print(f"{'train:':6} Loss: {loss:.4f} {train_metric}")
            loss, dev_metric = model.evaluate(dev_loader, config.punct)
            print(f"{'dev:':6} Loss: {loss:.4f} {dev_metric}")
            loss, test_metric = model.evaluate(test_loader, config.punct)
            print(f"{'test:':6} Loss: {loss:.4f} {test_metric}")

            t = datetime.now() - start
            # save the model if it is the best so far
            if dev_metric > best_metric and epoch > config.patience:
                best_e, best_metric = epoch, dev_metric
                model.parser.save(config.model + f".{best_e}")
                print(f"{t}s elapsed (saved)\n")
            else:
                print(f"{t}s elapsed\n")
            total_time += t
            if epoch - best_e >= config.patience:
                break
        model.parser = BiaffineParser.load(config.model + f".{best_e}")
        loss, metric = model.evaluate(test_loader, config.punct)

        print(f"max score of dev is {best_metric.score:.2%} at epoch {best_e}")
        print(f"the score of test at epoch {best_e} is {metric.score:.2%}")
        print(f"average time of each epoch is {total_time / epoch}s")
        print(f"{total_time}s elapsed")
Example #11
    def __call__(self, args):
        super(Train, self).__call__(args)

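        # Reserve GPU memory up front: read total/used memory (in MiB) from
        # nvidia-smi, then allocate and free a tensor of roughly the free
        # amount (256 * 1024 floats = 1 MiB per unit of block_mem), so the
        # caching allocator claims the space before other jobs can.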
        rrr = os.popen(
            '"/usr/bin/nvidia-smi" --query-gpu=memory.total,memory.used --format=csv,nounits,noheader'
        )
        devices_info = rrr.read().strip().split("\n")
        total, used = devices_info[int(
            os.environ["CUDA_VISIBLE_DEVICES"])].split(',')
        total = int(total)
        used = int(used)
        max_mem = int(total * random.uniform(0.95, 0.97))
        block_mem = max_mem - used
        x = torch.cuda.FloatTensor(256, 1024, block_mem)
        del x
        rrr.close()

        logging.basicConfig(filename=args.output,
                            filemode='w',
                            format='%(asctime)s %(levelname)-8s %(message)s',
                            level=logging.INFO,
                            datefmt='%Y-%m-%d %H:%M:%S')
        train_corpus = Corpus.load(args.ftrain, self.fields, args.max_len)
        dev_corpus = Corpus.load(args.fdev, self.fields)
        dev40_corpus = Corpus.load(args.fdev, self.fields, args.max_len)
        test_corpus = Corpus.load(args.ftest, self.fields)
        test40_corpus = Corpus.load(args.ftest, self.fields, args.max_len)

        train = TextDataset(train_corpus,
                            self.fields,
                            args.buckets,
                            crf=args.crf)
        dev = TextDataset(dev_corpus, self.fields, args.buckets, crf=args.crf)
        dev40 = TextDataset(dev40_corpus,
                            self.fields,
                            args.buckets,
                            crf=args.crf)
        test = TextDataset(test_corpus,
                           self.fields,
                           args.buckets,
                           crf=args.crf)
        test40 = TextDataset(test40_corpus,
                             self.fields,
                             args.buckets,
                             crf=args.crf)
        # set the data loaders
        if args.self_train:
            train.loader = batchify(train, args.batch_size)
        else:
            train.loader = batchify(train, args.batch_size, True)
        dev.loader = batchify(dev, args.batch_size)
        dev40.loader = batchify(dev40, args.batch_size)
        test.loader = batchify(test, args.batch_size)
        test40.loader = batchify(test40, args.batch_size)
        logging.info(f"{'train:':6} {len(train):5} sentences, "
                     f"{len(train.loader):3} batches, "
                     f"{len(train.buckets)} buckets")
        logging.info(f"{'dev:':6} {len(dev):5} sentences, "
                     f"{len(dev.loader):3} batches, "
                     f"{len(dev.buckets)} buckets")
        logging.info(f"{'dev40:':6} {len(dev40):5} sentences, "
                     f"{len(dev40.loader):3} batches, "
                     f"{len(dev40.buckets)} buckets")
        logging.info(f"{'test:':6} {len(test):5} sentences, "
                     f"{len(test.loader):3} batches, "
                     f"{len(test.buckets)} buckets")
        logging.info(f"{'test40:':6} {len(test40):5} sentences, "
                     f"{len(test40.loader):3} batches, "
                     f"{len(test40.buckets)} buckets")

        logging.info("Create the model")
        self.model = Model(args)
        self.model = self.model.to(args.device)

        if args.E_Reg or args.T_Reg:
            source_model = Model(args)
            source_model = source_model.to(args.device)

        # load model
        if args.load != '':
            logging.info("Load source model")
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
            state = torch.load(args.load, map_location=device)['state_dict']
            state_dict = self.model.state_dict()
            for k, v in state.items():
                if k in ['word_embed.weight']:
                    continue
                state_dict.update({k: v})
            self.model.load_state_dict(state_dict)
            init_params = {}
            for name, param in self.model.named_parameters():
                init_params[name] = param.clone()
            self.model.init_params = init_params

            if args.E_Reg or args.T_Reg:
                state_dict = source_model.state_dict()
                for k, v in state.items():
                    if k in ['word_embed.weight']:
                        continue
                    state_dict.update({k: v})
                source_model.load_state_dict(state_dict)
                init_params = {}
                for name, param in source_model.named_parameters():
                    init_params[name] = param.clone()
                source_model.init_params = init_params

        self.model = self.model.load_pretrained(self.WORD.embed)
        self.model = self.model.to(args.device)

        if args.self_train:
            train_arcs_preds = self.get_preds(train.loader)
            del self.model
            self.model = Model(args)
            self.model = self.model.load_pretrained(self.WORD.embed)
            self.model = self.model.to(args.device)

        if args.E_Reg or args.T_Reg:
            source_model = source_model.load_pretrained(self.WORD.embed)
            source_model = source_model.to(args.device)
            args.source_model = source_model

        self.optimizer = Adam(self.model.parameters(), args.lr,
                              (args.mu, args.nu), args.epsilon)
        self.scheduler = ExponentialLR(self.optimizer,
                                       args.decay**(1 / args.decay_steps))

        # test before train
        if args.load != '':
            logging.info('\n')

            dev_loss, dev_metric = self.evaluate(dev40.loader)
            test_loss, test_metric = self.evaluate(test40.loader)
            logging.info(f"{'dev40:':4} Loss: {dev_loss:.4f} {dev_metric}")
            logging.info(f"{'test40:':4} Loss: {test_loss:.4f} {test_metric}")

            dev_loss, dev_metric = self.evaluate(dev.loader)
            test_loss, test_metric = self.evaluate(test.loader)
            logging.info(f"{'dev:':4} Loss: {dev_loss:.4f} {dev_metric}")
            logging.info(f"{'test:':4} Loss: {test_loss:.4f} {test_metric}")

        total_time = timedelta()
        best_e, best_metric = 1, Metric()
        logging.info("Begin training")
        if args.unsupervised:
            max_uas = 0.
            cnt = 0
            for epoch in range(1, args.epochs + 1):
                start = datetime.now()

                self.train(train.loader)

                logging.info(f"Epoch {epoch} / {args.epochs}:")

                dev_loss, dev_metric = self.evaluate(dev40.loader)
                test_loss, test_metric = self.evaluate(test40.loader)
                logging.info(f"{'dev40:':4} Loss: {dev_loss:.4f} {dev_metric}")
                logging.info(
                    f"{'test40:':4} Loss: {test_loss:.4f} {test_metric}")

                dev_loss, dev_metric = self.evaluate(dev.loader)
                test_loss, test_metric = self.evaluate(test.loader)
                logging.info(f"{'dev:':4} Loss: {dev_loss:.4f} {dev_metric}")
                logging.info(
                    f"{'test:':4} Loss: {test_loss:.4f} {test_metric}")

                t = datetime.now() - start
                logging.info(f"{t}s elapsed\n")
        else:
            for epoch in range(1, args.epochs + 1):
                start = datetime.now()

                if args.self_train:
                    self.train(train.loader, train_arcs_preds)
                else:
                    self.train(train.loader)

                logging.info(f"Epoch {epoch} / {args.epochs}:")
                if not args.self_train:
                    dev_loss, dev_metric = self.evaluate(dev.loader)
                    logging.info(
                        f"{'dev:':4} Loss: {dev_loss:.4f} {dev_metric}")

                t = datetime.now() - start

                # save the model if it is the best so far
                if args.self_train:
                    loss, test_metric = self.evaluate(test.loader)
                    logging.info(f"{'test:':6} Loss: {loss:.4f} {test_metric}")
                else:
                    if dev_metric > best_metric and epoch > args.patience:
                        loss, test_metric = self.evaluate(test.loader)
                        logging.info(
                            f"{'test:':6} Loss: {loss:.4f} {test_metric}")

                        best_e, best_metric = epoch, dev_metric
                        if hasattr(self.model, 'module'):
                            self.model.module.save(args.model)
                        else:
                            self.model.save(args.model)
                        logging.info(
                            f"{t}s elapsed, best epoch {best_e} {best_metric} (saved)\n"
                        )
                    else:
                        logging.info(
                            f"{t}s elapsed, best epoch {best_e} {best_metric}\n"
                        )
                    total_time += t

                    if epoch - best_e >= args.patience:
                        break

            if not args.self_train:
                self.model = Model.load(args.model)
                logging.info(
                    f"max score of dev is {best_metric.score:.2%} at epoch {best_e}"
                )
                loss, metric = self.evaluate(test.loader)
                logging.info(
                    f"the score of test at epoch {best_e} is {metric.score:.2%}"
                )
                logging.info(
                    f"average time of each epoch is {total_time / epoch}s, {total_time}s elapsed"
                )
Example #12
    def __call__(self, config):
        print("Preprocess the data")
        train = Corpus.load(config.ftrain)
        dev = Corpus.load(config.fdev)
        test = Corpus.load(config.ftest)

        if not path.exists(config.model):
            os.mkdir(config.model)

        if not path.exists("model/"):
            os.mkdir("model/")

        if not path.exists(config.model + config.modelname):
            os.mkdir(config.model + config.modelname)

        if config.checkpoint:
            vocab = torch.load(config.main_path + config.vocab +
                               config.modelname + "/vocab.tag")
        else:
            vocab = Vocab.from_corpus(config=config,
                                      corpus=train,
                                      corpus_dev=dev,
                                      corpus_test=test,
                                      min_freq=0)
        train_seq = read_seq(config.ftrain_seq, vocab)
        total_act = sum(len(x) for x in train_seq)
        print(f"number of transitions: {total_act}")

        torch.save(vocab, config.vocab + config.modelname + "/vocab.tag")

        config.update({
            'n_words': vocab.n_train_words,
            'n_tags': vocab.n_tags,
            'n_rels': vocab.n_rels,
            'n_trans': vocab.n_trans,
            'pad_index': vocab.pad_index,
            'unk_index': vocab.unk_index
        })

        print("Load the dataset")
        trainset = TextDataset(vocab.numericalize(train, train_seq))
        devset = TextDataset(vocab.numericalize(dev))
        testset = TextDataset(vocab.numericalize(test))

        # set the data loaders
        train_loader, _ = batchify(dataset=trainset,
                                   batch_size=config.batch_size,
                                   n_buckets=config.buckets,
                                   shuffle=True)
        dev_loader, _ = batchify(dataset=devset,
                                 batch_size=config.batch_size,
                                 n_buckets=config.buckets)
        test_loader, _ = batchify(dataset=testset,
                                  batch_size=config.batch_size,
                                  n_buckets=config.buckets)

        print(f"{'train:':6} {len(trainset):5} sentences in total, "
              f"{len(train_loader):3} batches provided")
        print(f"{'dev:':6} {len(devset):5} sentences in total, "
              f"{len(dev_loader):3} batches provided")
        print(f"{'test:':6} {len(testset):5} sentences in total, "
              f"{len(test_loader):3} batches provided")
        print("Create the model")

        if config.checkpoint:
            parser = Parser.load(config.main_path + config.model +
                                 config.modelname + "/parser-checkpoint")
        else:
            parser = Parser(config, vocab.bertmodel)

        print("number of parameters:{}".format(
            sum(p.numel() for p in parser.parameters() if p.requires_grad)))
        if torch.cuda.is_available():
            print('Train/Evaluate on GPU')
            device = torch.device('cuda')
            parser = parser.to(device)

        model = Model(vocab, parser, config, vocab.n_rels)
        total_time = timedelta()
        best_e, best_metric = 1, Metric()

        ## prepare optimisers
        num_train_optimization_steps = int(config.epochs * len(train_loader))
        warmup_steps = int(config.warmupproportion *
                           num_train_optimization_steps)
        ## one for parsing parameters, one for BERT parameters
        if config.use_two_opts:
            model_nonbert = []
            model_bert = []
            layernorm_params = [
                'layernorm_key_layer', 'layernorm_value_layer',
                'dp_relation_k', 'dp_relation_v'
            ]
            for name, param in parser.named_parameters():
                if 'bert' in name and not any(nd in name
                                              for nd in layernorm_params):
                    model_bert.append((name, param))
                else:
                    model_nonbert.append((name, param))

            # Prepare optimizer and schedule (linear warmup and decay) for Non-bert parameters
            no_decay = ['bias', 'LayerNorm.weight']
            optimizer_grouped_parameters_nonbert = [{
                'params': [
                    p for n, p in model_nonbert
                    if not any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                config.weight_decay
            }, {
                'params': [
                    p for n, p in model_nonbert
                    if any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.0
            }]
            model.optimizer_nonbert = AdamW(
                optimizer_grouped_parameters_nonbert, lr=config.lr2)

            model.scheduler_nonbert = get_linear_schedule_with_warmup(
                model.optimizer_nonbert,
                num_warmup_steps=warmup_steps,
                num_training_steps=num_train_optimization_steps)

            # Prepare optimizer and schedule (linear warmup and decay) for Bert parameters
            optimizer_grouped_parameters_bert = [{
                'params': [
                    p for n, p in model_bert
                    if not any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                config.weight_decay
            }, {
                'params':
                [p for n, p in model_bert if any(nd in n for nd in no_decay)],
                'weight_decay':
                0.0
            }]

            model.optimizer_bert = AdamW(optimizer_grouped_parameters_bert,
                                         lr=config.lr)
            model.scheduler_bert = get_linear_schedule_with_warmup(
                model.optimizer_bert,
                num_warmup_steps=warmup_steps,
                num_training_steps=num_train_optimization_steps)

        else:
            # Prepare optimizer and schedule (linear warmup and decay)
            no_decay = ['bias', 'LayerNorm.weight']
            optimizer_grouped_parameters = [{
                'params': [
                    p for n, p in parser.named_parameters()
                    if not any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                config.weight_decay
            }, {
                'params': [
                    p for n, p in parser.named_parameters()
                    if any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.0
            }]
            model.optimizer = AdamW(optimizer_grouped_parameters, lr=config.lr)
            model.scheduler = get_linear_schedule_with_warmup(
                model.optimizer,
                num_warmup_steps=warmup_steps,
                num_training_steps=num_train_optimization_steps)

        start_epoch = 1

        ## load model, optimiser, and other parameters from a checkpoint
        if config.checkpoint:
            check_load = torch.load(config.main_path + config.model +
                                    config.modelname + "/checkpoint")
            if config.use_two_opts:
                model.optimizer_bert.load_state_dict(
                    check_load['optimizer_bert'])
                model.optimizer_nonbert.load_state_dict(
                    check_load['optimizer_nonbert'])
                model.scheduler_bert.load_state_dict(
                    check_load['lr_schedule_bert'])
                model.scheduler_nonbert.load_state_dict(
                    check_load['lr_schedule_nonbert'])
                start_epoch = check_load['epoch'] + 1
                best_e = check_load['best_e']
                best_metric = check_load['best_metric']
            else:
                model.optimizer.load_state_dict(check_load['optimizer'])
                model.scheduler.load_state_dict(check_load['lr_schedule'])
                start_epoch = check_load['epoch'] + 1
                best_e = check_load['best_e']
                best_metric = check_load['best_metric']

        with open(config.model + config.modelname + "/baseline.txt", "a") as f1:
            f1.write("New Model:\n")
        for epoch in range(start_epoch, config.epochs + 1):
            start = datetime.now()
            # train one epoch and update the parameters
            model.train(train_loader)
            print(f"Epoch {epoch} / {config.epochs}:")
            dev_metric = model.evaluate(dev_loader, config.punct)
            print(f"{'dev:':6} {dev_metric}")
            with open(config.model + config.modelname + "/baseline.txt", "a") as f1:
                f1.write(f"{epoch}\n")
                f1.write(f"{'dev:':6} {dev_metric}\n")

            t = datetime.now() - start
            # save the model if it is the best so far
            if dev_metric > best_metric:
                best_e, best_metric = epoch, dev_metric
                print(config.model + config.modelname + "/model_weights")
                model.parser.save(config.model + config.modelname +
                                  "/model_weights")
                print(f"{t}s elapsed (saved)\n")
            else:
                print(f"{t}s elapsed\n")
            total_time += t
            if epoch - best_e >= config.patience:
                break

            ## save checkpoint
            if config.use_two_opts:
                checkpoint = {
                    "epoch": epoch,
                    "optimizer_bert": model.optimizer_bert.state_dict(),
                    "lr_schedule_bert": model.scheduler_bert.state_dict(),
                    "lr_schedule_nonbert":
                    model.scheduler_nonbert.state_dict(),
                    "optimizer_nonbert": model.optimizer_nonbert.state_dict(),
                    'best_metric': best_metric,
                    'best_e': best_e
                }
                torch.save(
                    checkpoint, config.main_path + config.model +
                    config.modelname + "/checkpoint")
                parser.save(config.main_path + config.model +
                            config.modelname + "/parser-checkpoint")
            else:
                checkpoint = {
                    "epoch": epoch,
                    "optimizer": model.optimizer.state_dict(),
                    "lr_schedule": model.scheduler.state_dict(),
                    'best_metric': best_metric,
                    'best_e': best_e
                }
                torch.save(
                    checkpoint, config.main_path + config.model +
                    config.modelname + "/checkpoint")
                parser.save(config.main_path + config.model +
                            config.modelname + "/parser-checkpoint")
        model.parser = Parser.load(config.model + config.modelname +
                                   "/model_weights")
        metric = model.evaluate(test_loader, config.punct)
        print(metric)
        print(f"max score of dev is {best_metric.score:.2%} at epoch {best_e}")
        print(f"the score of test at epoch {best_e} is {metric.score:.2%}")
        print(f"average time of each epoch is {total_time / epoch}s")
        print(f"{total_time}s elapsed")
Example #13
    def __call__(self, args):
        # override config from CLI parameters
        args = Config(args.conf).update(vars(args))
        args.n_attentions = args.use_attentions  # backward compatibility

        # loads train corpus into self.trainset
        super().__call__(args)

        logger.info(f"Configuration parameters:\n{args}")

        #train = Corpus.load(args.ftrain, self.fields, args.max_sent_length)
        train = self.trainset
        dev = Corpus.load(args.fdev, self.fields, args.max_sent_length)
        if args.ftest:
            test = Corpus.load(args.ftest, self.fields, args.max_sent_length)

        train = TextDataset(train, self.fields, args.buckets)
        dev = TextDataset(dev, self.fields, args.buckets)
        if args.ftest:
            test = TextDataset(test, self.fields, args.buckets)
        # set the data loaders
        train.loader = batchify(train, args.batch_size, True)
        dev.loader = batchify(dev, args.batch_size)
        if args.ftest:
            test.loader = batchify(test, args.batch_size)
        logger.info(f"{'train:':6} {len(train):5} sentences, "
                    f"{len(train.loader):3} batches, "
                    f"{len(train.buckets)} buckets")
        logger.info(f"{'dev:':6} {len(dev):5} sentences, "
                    f"{len(dev.loader):3} batches, "
                    f"{len(train.buckets)} buckets")
        if args.ftest:
            logger.info(f"{'test:':6} {len(test):5} sentences, "
                        f"{len(test.loader):3} batches, "
                        f"{len(train.buckets)} buckets")

        logger.info("Create the model")
        self.model = Model(args, mask_token_id=self.FEAT.mask_token_id)
        if self.WORD:
            self.model.load_pretrained(self.WORD.embed)
        self.model = self.model.to(args.device)
        if torch.cuda.device_count() > 1:
            self.model = TransparentDataParallel(self.model)
        logger.info(f"{self.model}\n")
        if args.optimizer == 'adamw':
            self.optimizer = AdamW(self.model.parameters(), args.lr,
                                   (args.mu, args.nu), args.epsilon,
                                   args.decay)
            training_steps = len(train.loader) // self.args.accumulation_steps \
                             * self.args.epochs
            warmup_steps = math.ceil(training_steps *
                                     self.args.warmup_steps_ratio)
            self.scheduler = get_linear_schedule_with_warmup(
                self.optimizer,
                num_warmup_steps=warmup_steps,
                num_training_steps=training_steps)
        else:
            self.optimizer = Adam(self.model.parameters(), args.lr,
                                  (args.mu, args.nu), args.epsilon)
            self.scheduler = ExponentialLR(self.optimizer,
                                           args.decay**(1 / args.decay_steps))

        total_time = timedelta()
        best_e, best_metric = 1, Metric()

        for epoch in range(1, args.epochs + 1):
            start = datetime.now()

            logger.info(f"Epoch {epoch} / {args.epochs}:")
            loss, train_metric = self.train(train.loader)
            logger.info(f"{'train:':6} Loss: {loss:.4f} {train_metric}")
            loss, dev_metric = self.evaluate(dev.loader)
            logger.info(f"{'dev:':6} Loss: {loss:.4f} {dev_metric}")
            if args.ftest:
                loss, test_metric = self.evaluate(test.loader)
                logger.info(f"{'test:':6} Loss: {loss:.4f} {test_metric}")

            t = datetime.now() - start
            # save the model if it is the best so far
            if dev_metric > best_metric and epoch > args.patience // 10:
                best_e, best_metric = epoch, dev_metric
                if hasattr(self.model, 'module'):
                    self.model.module.save(args.model)
                else:
                    self.model.save(args.model)
                logger.info(f"{t}s elapsed (saved)\n")
            else:
                logger.info(f"{t}s elapsed\n")
            total_time += t
            if epoch - best_e >= args.patience:
                break
        self.model = Model.load(args.model)
        if args.ftest:
            loss, metric = self.evaluate(test.loader)

        logger.info(
            f"max score of dev is {best_metric.score:.2%} at epoch {best_e}")
        if args.ftest:
            logger.info(
                f"the score of test at epoch {best_e} is {metric.score:.2%}")
        logger.info(f"average time of each epoch is {total_time / epoch}s")
        logger.info(f"{total_time}s elapsed")
Example #14
    def __call__(self, args):
        super(Train, self).__call__(args)

        train = Corpus.load(args.ftrain, self.fields)
        dev = Corpus.load(args.fdev, self.fields)
        test = Corpus.load(args.ftest, self.fields)

        train = TextDataset(train, self.fields, args.buckets)
        dev = TextDataset(dev, self.fields, args.buckets)
        test = TextDataset(test, self.fields, args.buckets)
        # set the data loaders
        train.loader = batchify(train, args.batch_size, True)
        dev.loader = batchify(dev, args.batch_size)
        test.loader = batchify(test, args.batch_size)
        print(f"{'train:':6} {len(train):5} sentences, "
              f"{len(train.loader):3} batches, "
              f"{len(train.buckets)} buckets")
        print(f"{'dev:':6} {len(dev):5} sentences, "
              f"{len(dev.loader):3} batches, "
              f"{len(train.buckets)} buckets")
        print(f"{'test:':6} {len(test):5} sentences, "
              f"{len(test.loader):3} batches, "
              f"{len(train.buckets)} buckets")

        print("Create the model")
        self.model = Model(args).load_pretrained(self.WORD.embed)
        print(f"{self.model}\n")
        self.model = self.model.to(args.device)
        if torch.cuda.device_count() > 1:
            self.model = nn.DataParallel(self.model)
        self.optimizer = Adam(self.model.parameters(), args.lr,
                              (args.mu, args.nu), args.epsilon)
        self.scheduler = ExponentialLR(self.optimizer,
                                       args.decay**(1 / args.decay_steps))

        total_time = timedelta()
        best_e, best_metric = 1, Metric()

        for epoch in range(1, args.epochs + 1):
            start = datetime.now()
            # train one epoch and update the parameters
            self.train(train.loader)

            print(f"Epoch {epoch} / {args.epochs}:")
            loss, train_metric = self.evaluate(train.loader)
            print(f"{'train:':6} Loss: {loss:.4f} {train_metric}")
            loss, dev_metric = self.evaluate(dev.loader)
            print(f"{'dev:':6} Loss: {loss:.4f} {dev_metric}")
            loss, test_metric = self.evaluate(test.loader)
            print(f"{'test:':6} Loss: {loss:.4f} {test_metric}")

            t = datetime.now() - start
            # save the model if it is the best so far
            if dev_metric > best_metric and epoch > args.patience:
                best_e, best_metric = epoch, dev_metric
                if hasattr(self.model, 'module'):
                    self.model.module.save(args.model)
                else:
                    self.model.save(args.model)
                print(f"{t}s elapsed (saved)\n")
            else:
                print(f"{t}s elapsed\n")
            total_time += t
            if epoch - best_e >= args.patience:
                break

        if hasattr(self.model, 'module'):
            self.model.module.save(args.model)
        else:
            self.model.save(args.model)
        print(f"{t}s elapsed (saved)\n")

        self.model = Model.load(args.model)
        loss, metric = self.evaluate(test.loader)

        print(f"max score of dev is {best_metric.score:.2%} at epoch {best_e}")
        print(f"the score of test at epoch {best_e} is {metric.score:.2%}")
        print(f"average time of each epoch is {total_time / epoch}s")
        print(f"{total_time}s elapsed")
Example #15
    def __call__(self, config):
        print("Preprocess the data")

        if config.input_type == "conllu":
            train = UniversalDependenciesDatasetReader()
            train.load(config.ftrain)
            dev = UniversalDependenciesDatasetReader()
            dev.load(config.fdev)
            test = UniversalDependenciesDatasetReader()
            test.load(config.ftest)
        else:
            train = Corpus.load(config.ftrain)
            dev = Corpus.load(config.fdev)
            test = Corpus.load(config.ftest)

        if config.use_predicted:
            if config.input_type == "conllu":
                train_predicted = UniversalDependenciesDatasetReader()
                train_predicted.load(config.fpredicted_train)
                dev_predicted = UniversalDependenciesDatasetReader()
                dev_predicted.load(config.fpredicted_dev)
                test_predicted = UniversalDependenciesDatasetReader()
                test_predicted.load(config.fpredicted_test)
            else:
                train_predicted = Corpus.load(config.fpredicted_train)
                dev_predicted = Corpus.load(config.fpredicted_dev)
                test_predicted = Corpus.load(config.fpredicted_test)

        if not path.exists(config.main_path + "/exp"):
            os.mkdir(config.main_path + "/exp")

        if not path.exists(config.main_path + "/model"):
            os.mkdir(config.main_path + "/model")

        if not path.exists(config.main_path + config.model + config.modelname):
            os.mkdir(config.main_path + config.model + config.modelname)

        vocab = Vocab.from_corpus(config=config, corpus=train, min_freq=2)

        torch.save(vocab, config.main_path + config.vocab + config.modelname + "/vocab.tag")

        config.update({
            'n_words': vocab.n_train_words,
            'n_tags': vocab.n_tags,
            'n_rels': vocab.n_rels,
            'pad_index': vocab.pad_index,
            'unk_index': vocab.unk_index
        })

        print("Load the dataset")

        if config.use_predicted:
            trainset = TextDataset(vocab.numericalize(train, train_predicted))
            devset = TextDataset(vocab.numericalize(dev, dev_predicted))
            testset = TextDataset(vocab.numericalize(test, test_predicted))
        else:
            trainset = TextDataset(vocab.numericalize(train))
            devset = TextDataset(vocab.numericalize(dev))
            testset = TextDataset(vocab.numericalize(test))

        # set the data loaders
        train_loader, _ = batchify(dataset=trainset,
                                   batch_size=config.batch_size,
                                   n_buckets=config.buckets,
                                   shuffle=True)
        dev_loader, _ = batchify(dataset=devset,
                                 batch_size=config.batch_size,
                                 n_buckets=config.buckets)
        test_loader, _ = batchify(dataset=testset,
                                  batch_size=config.batch_size,
                                  n_buckets=config.buckets)
        print(f"{'train:':6} {len(trainset):5} sentences in total, "
              f"{len(train_loader):3} batches provided")
        print(f"{'dev:':6} {len(devset):5} sentences in total, "
              f"{len(dev_loader):3} batches provided")
        print(f"{'test:':6} {len(testset):5} sentences in total, "
              f"{len(test_loader):3} batches provided")

        print("Create the model")
        parser = BiaffineParser(config, vocab.n_rels, vocab.bertmodel)

        print("number of pars:{}".format(sum(p.numel() for p in parser.parameters()
                                             if p.requires_grad)))
        if torch.cuda.is_available():
            print('device:cuda')
            device = torch.device('cuda')
            parser = parser.to(device)
        # print(f"{parser}\n")

        model = Model(vocab, parser, config, vocab.n_rels)
        total_time = timedelta()
        best_e, best_metric = 1, Metric()

        num_train_optimization_steps = int(config.num_iter_encoder * config.epochs * len(train_loader))
        warmup_steps = int(config.warmupproportion * num_train_optimization_steps)

        if config.use_two_opts:
            model_nonbert = []
            model_bert = []
            layernorm_params = ['layernorm_key_layer', 'layernorm_value_layer', 'dp_relation_k', 'dp_relation_v']
            for name, param in parser.named_parameters():
                if 'bert' in name and not any(nd in name for nd in layernorm_params):
                    model_bert.append((name, param))
                else:
                    model_nonbert.append((name, param))

            # Prepare optimizer and schedule (linear warmup and decay) for Non-bert parameters
            no_decay = ['bias', 'LayerNorm.weight']
            optimizer_grouped_parameters_nonbert = [
                {'params': [p for n, p in model_nonbert if not any(nd in n for nd in no_decay)],
                 'weight_decay': config.weight_decay},
                {'params': [p for n, p in model_nonbert if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
            ]
            model.optimizer_nonbert = AdamW(optimizer_grouped_parameters_nonbert, lr=config.lr2)

            model.scheduler_nonbert = get_linear_schedule_with_warmup(model.optimizer_nonbert,
                                                                      num_warmup_steps=warmup_steps,
                                                                      num_training_steps=num_train_optimization_steps)

            # Prepare optimizer and schedule (linear warmup and decay) for Bert parameters
            optimizer_grouped_parameters_bert = [
                {'params': [p for n, p in model_bert if not any(nd in n for nd in no_decay)],
                 'weight_decay': config.weight_decay},
                {'params': [p for n, p in model_bert if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
            ]

            model.optimizer_bert = AdamW(optimizer_grouped_parameters_bert, lr=config.lr1)
            model.scheduler_bert = get_linear_schedule_with_warmup(
                model.optimizer_bert, num_warmup_steps=warmup_steps, num_training_steps=num_train_optimization_steps
            )

        else:
            # Prepare optimizer and schedule (linear warmup and decay)
            no_decay = ['bias', 'LayerNorm.weight']
            optimizer_grouped_parameters = [
                {'params': [p for n, p in parser.named_parameters() if not any(nd in n for nd in no_decay)],
                 'weight_decay': config.weight_decay},
                {'params': [p for n, p in parser.named_parameters() if any(nd in n for nd in no_decay)],
                 'weight_decay': 0.0}
            ]
            model.optimizer = AdamW(optimizer_grouped_parameters, lr=config.lr1)
            model.scheduler = get_linear_schedule_with_warmup(
                model.optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_train_optimization_steps
            )

        for epoch in range(1, config.epochs + 1):
            start = datetime.now()
            # train one epoch and update the parameters
            if config.use_predicted:
                model.train_predicted(train_loader)
            else:
                model.train(train_loader)
            print(f"Epoch {epoch} / {config.epochs}:")

            if config.use_predicted:
                loss, dev_metric = model.evaluate_predicted(dev_loader, config.punct)
            else:
                loss, dev_metric = model.evaluate(dev_loader, config.punct)

            print(f"{'dev:':6} Loss: {loss:.4f} {dev_metric}")
            if config.use_predicted:
                loss, test_metric = model.evaluate_predicted(test_loader, config.punct)
            else:
                loss, test_metric = model.evaluate(test_loader, config.punct)
            print(f"{'test:':6} Loss: {loss:.4f} {test_metric}")

            t = datetime.now() - start
            # save the model if it is the best so far
            if dev_metric > best_metric:
                best_e, best_metric = epoch, dev_metric
                print(config.model + config.modelname + "/model_weights")
                model.parser.save(config.main_path + config.model + config.modelname + "/model_weights")
                print(f"{t}s elapsed (saved)\n")
            else:
                print(f"{t}s elapsed\n")
            total_time += t
            if epoch - best_e >= config.patience:
                break
        model.parser = BiaffineParser.load(config.main_path + config.model + config.modelname + "/model_weights")
        if config.use_predicted:
            loss, metric = model.evaluate_predicted(test_loader, config.punct)
        else:
            loss, metric = model.evaluate(test_loader, config.punct)
        print(metric)
        print(f"max score of dev is {best_metric.score:.2%} at epoch {best_e}")
        print(f"the score of test at epoch {best_e} is {metric.score:.2%}")
        print(f"average time of each epoch is {total_time / epoch}s")
        print(f"{total_time}s elapsed")
Example #16
    def __call__(self, config):
        if not os.path.exists(config.file):
            os.mkdir(config.file)
        if config.preprocess or not os.path.exists(config.vocab):
            print("Preprocess the corpus")
            pos_train = Corpus.load(config.fptrain, [1, 4], config.pos)
            dep_train = Corpus.load(config.ftrain)
            pos_dev = Corpus.load(config.fpdev, [1, 4])
            dep_dev = Corpus.load(config.fdev)
            pos_test = Corpus.load(config.fptest, [1, 4])
            dep_test = Corpus.load(config.ftest)
            print("Create the vocab")
            vocab = Vocab.from_corpora(pos_train, dep_train, 2)
            vocab.read_embeddings(Embedding.load(config.fembed))
            print("Load the dataset")
            pos_trainset = TextDataset(vocab.numericalize(pos_train, False),
                                       config.buckets)
            dep_trainset = TextDataset(vocab.numericalize(dep_train),
                                       config.buckets)
            pos_devset = TextDataset(vocab.numericalize(pos_dev, False),
                                     config.buckets)
            dep_devset = TextDataset(vocab.numericalize(dep_dev),
                                     config.buckets)
            pos_testset = TextDataset(vocab.numericalize(pos_test, False),
                                      config.buckets)
            dep_testset = TextDataset(vocab.numericalize(dep_test),
                                      config.buckets)
            torch.save(vocab, config.vocab)
            torch.save(pos_trainset, os.path.join(config.file, 'pos_trainset'))
            torch.save(dep_trainset, os.path.join(config.file, 'dep_trainset'))
            torch.save(pos_devset, os.path.join(config.file, 'pos_devset'))
            torch.save(dep_devset, os.path.join(config.file, 'dep_devset'))
            torch.save(pos_testset, os.path.join(config.file, 'pos_testset'))
            torch.save(dep_testset, os.path.join(config.file, 'dep_testset'))
        else:
            print("Load the vocab")
            vocab = torch.load(config.vocab)
            print("Load the datasets")
            pos_trainset = torch.load(os.path.join(config.file,
                                                   'pos_trainset'))
            dep_trainset = torch.load(os.path.join(config.file,
                                                   'dep_trainset'))
            pos_devset = torch.load(os.path.join(config.file, 'pos_devset'))
            dep_devset = torch.load(os.path.join(config.file, 'dep_devset'))
            pos_testset = torch.load(os.path.join(config.file, 'pos_testset'))
            dep_testset = torch.load(os.path.join(config.file, 'dep_testset'))
        config.update({
            'n_words': vocab.n_init,
            'n_chars': vocab.n_chars,
            'n_pos_tags': vocab.n_pos_tags,
            'n_dep_tags': vocab.n_dep_tags,
            'n_rels': vocab.n_rels,
            'pad_index': vocab.pad_index,
            'unk_index': vocab.unk_index
        })
        # set the data loaders
        pos_train_loader = batchify(
            pos_trainset, config.pos_batch_size // config.update_steps, True)
        dep_train_loader = batchify(dep_trainset,
                                    config.batch_size // config.update_steps,
                                    True)
        pos_dev_loader = batchify(pos_devset, config.pos_batch_size)
        dep_dev_loader = batchify(dep_devset, config.batch_size)
        pos_test_loader = batchify(pos_testset, config.pos_batch_size)
        dep_test_loader = batchify(dep_testset, config.batch_size)

        print(vocab)
        print(f"{'pos_train:':10} {len(pos_trainset):7} sentences in total, "
              f"{len(pos_train_loader):4} batches provided")
        print(f"{'dep_train:':10} {len(dep_trainset):7} sentences in total, "
              f"{len(dep_train_loader):4} batches provided")
        print(f"{'pos_dev:':10} {len(pos_devset):7} sentences in total, "
              f"{len(pos_dev_loader):4} batches provided")
        print(f"{'dep_dev:':10} {len(dep_devset):7} sentences in total, "
              f"{len(dep_dev_loader):4} batches provided")
        print(f"{'pos_test:':10} {len(pos_testset):7} sentences in total, "
              f"{len(pos_test_loader):4} batches provided")
        print(f"{'dep_test:':10} {len(dep_testset):7} sentences in total, "
              f"{len(dep_test_loader):4} batches provided")

        print("Create the model")
        parser = BiaffineParser(config, vocab.embed).to(config.device)
        print(f"{parser}\n")

        model = Model(config, vocab, parser)

        total_time = timedelta()
        best_e, best_metric = 1, AttachmentMethod()
        model.optimizer = Adam(model.parser.parameters(), config.lr,
                               (config.mu, config.nu), config.epsilon)
        model.scheduler = ExponentialLR(model.optimizer,
                                        config.decay**(1 / config.decay_steps))

        for epoch in range(1, config.epochs + 1):
            start = datetime.now()
            # train one epoch and update the parameters
            model.train(pos_train_loader, dep_train_loader)
            print(f"Epoch {epoch} / {config.epochs}:")
            lp, ld, mp, mdt, mdp = model.evaluate(None, dep_train_loader)
            print(f"{'train:':6} LP: {lp:.4f} LD: {ld:.4f} {mp} {mdt} {mdp}")
            lp, ld, mp, mdt, dev_m = model.evaluate(pos_dev_loader,
                                                    dep_dev_loader)
            print(f"{'dev:':6} LP: {lp:.4f} LD: {ld:.4f} {mp} {mdt} {dev_m}")
            lp, ld, mp, mdt, mdp = model.evaluate(pos_test_loader,
                                                  dep_test_loader)
            print(f"{'test:':6} LP: {lp:.4f} LD: {ld:.4f} {mp} {mdt} {mdp}")

            t = datetime.now() - start
            # save the model if it is the best so far
            if dev_m > best_metric and epoch > config.patience:
                best_e, best_metric = epoch, dev_m
                model.parser.save(config.model)
                print(f"{t}s elapsed (saved)\n")
            else:
                print(f"{t}s elapsed\n")
            total_time += t
            if epoch - best_e >= config.patience:
                break
        model.parser = BiaffineParser.load(config.model)
        lp, ld, mp, mdt, mdp = model.evaluate(pos_test_loader, dep_test_loader)

        print(f"max score of dev is {best_metric.score:.2%} at epoch {best_e}")
        print(f"the score of test at epoch {best_e} is {mdp.score:.2%}")
        print(f"average time of each epoch is {total_time / epoch}s")
        print(f"{total_time}s elapsed")