Example #1
    def init_data_loader(self, batch_size):

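        # random order for training batches; deterministic (sequential) order for dev/test evaluation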
        self.train_data_loader = DataSetIter(self.train_set,
                                             batch_size,
                                             sampler=RandomSampler())
        self.dev_data_loader = DataSetIter(self.dev_set,
                                           batch_size,
                                           sampler=SequentialSampler())
        self.test_data_loader = DataSetIter(self.test_set,
                                            batch_size,
                                            sampler=SequentialSampler())
Example #2
 def train_dataloader(self):
     # Random
     args = self.hparams.train.training
     train_sampler = RandomSampler()
     train_loader = DataSetIter(batch_size=args.batch_size,
                                dataset=self.train_dataset, sampler=train_sampler, drop_last=False)
     return train_loader
Example #3
 def test_dataloader(self):
     args = self.hparams.test
     test_sampler = ConstantTokenNumSampler(seq_len=self.test_dataset.get_field('seq_len').content,
                                            max_token=args.max_tokens, num_bucket=args.bucket)
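     # with a batch_sampler supplied, batch_size/sampler are presumably ignored, hence batch_size=1 and sampler=None below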
     test_loader = DataSetIter(self.test_dataset, batch_size=1, sampler=None, as_numpy=False, num_workers=4,
                               pin_memory=False, drop_last=False, timeout=0, worker_init_fn=None,
                               batch_sampler=test_sampler)
     return test_loader
Example #4
 def train_init_dataloder_for_model2(self):
     assert self.hparams.joint_training
     init_sampler = RandomSampler()
     args = self.hparams.train.init.model2
     init_loader = DataSetIter(batch_size=args.batch_size,
                               dataset=self.train_dataset_init_for_model2,
                               sampler=init_sampler,
                               drop_last=False)
     return init_loader
Example #5
    def train_init_dataloader(self):
        if self.train_dataset_init is None:
            return None

        init_sampler = RandomSampler()
        args = self.hparams.train.init if not self.hparams.joint_training else self.hparams.train.init.model1
        init_loader = DataSetIter(batch_size=args.batch_size,
                                   dataset=self.train_dataset_init,
                                   sampler=init_sampler,
                                   drop_last=False)
        return init_loader
Example #6
    def test_fastnlp_10min_tutorial(self):
        # load data from a csv file into a DataSet
        sample_path = "test/data_for_tests/tutorial_sample_dataset.csv"
        dataset = CSVLoader(headers=['raw_sentence', 'label'], sep='\t')._load(sample_path)
        print(len(dataset))
        print(dataset[0])
        print(dataset[-3])

        dataset.append(Instance(raw_sentence='fake data', label='0'))
        # lowercase the raw sentence
        dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')
        # convert label to int
        dataset.apply(lambda x: int(x['label']), new_field_name='label')

        # split sentences on whitespace
        def split_sent(ins):
            return ins['raw_sentence'].split()

        dataset.apply(split_sent, new_field_name='words')

        # add sequence length information
        dataset.apply(lambda x: len(x['words']), new_field_name='seq_len')
        print(len(dataset))
        print(dataset[0])

        # filter out instances with DataSet.drop(func)
        dataset.drop(lambda x: x['seq_len'] <= 3, inplace=True)
        print(len(dataset))

        # mark which fields of the DataSet should be converted to tensors
        # set target: the golden labels used when computing the loss and evaluating the model
        dataset.set_target("label")
        # set input: fields fed to the model's forward
        dataset.set_input("words", "seq_len")

        # split into a test set and a train set
        test_data, train_data = dataset.split(0.5)
        print(len(test_data))
        print(len(train_data))

        # build the vocabulary with Vocabulary.add(word)
        vocab = Vocabulary(min_freq=2)
        train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
        vocab.build_vocab()

        # index the sentences with Vocabulary.to_index(word)
        train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='words')
        test_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='words')
        print(test_data[0])

        # these data preprocessing utilities can also be used for projects such as reinforcement learning or GANs
        from fastNLP.core.batch import DataSetIter
        from fastNLP.core.sampler import RandomSampler

        batch_iterator = DataSetIter(dataset=train_data, batch_size=2, sampler=RandomSampler())
        for batch_x, batch_y in batch_iterator:
            print("batch_x has: ", batch_x)
            print("batch_y has: ", batch_y)
            break

        from fastNLP.models import CNNText
        model = CNNText((len(vocab), 50), num_classes=5, dropout=0.1)

        from fastNLP import Trainer
        from copy import deepcopy

        # rename the corresponding fields in the DataSet so they match the parameter names of the model's forward
        train_data.rename_field('label', 'label_seq')
        test_data.rename_field('label', 'label_seq')

        loss = CrossEntropyLoss(target="label_seq")
        metric = AccuracyMetric(target="label_seq")

        # instantiate a Trainer with the model and data, then train
        # first overfit on test_data (to make sure the model implementation is correct)
        copy_model = deepcopy(model)
        overfit_trainer = Trainer(train_data=test_data, model=copy_model, loss=loss, batch_size=32, n_epochs=5,
                                  dev_data=test_data, metrics=metric, save_path=None)
        overfit_trainer.train()

        # train on train_data, validate on test_data
        trainer = Trainer(model=model, train_data=train_data, dev_data=test_data,
                          loss=CrossEntropyLoss(target="label_seq"),
                          metrics=AccuracyMetric(target="label_seq"),
                          save_path=None,
                          batch_size=32,
                          n_epochs=5)
        trainer.train()
        print('Train finished!')

        # use Tester to evaluate on test_data
        from fastNLP import Tester

        tester = Tester(data=test_data, model=model, metrics=AccuracyMetric(target="label_seq"),
                        batch_size=4)
        acc = tester.test()
        print(acc)
Example #7
def main():
    parser = argparse.ArgumentParser(description='Transformer Model')

    # Where to find data
    parser.add_argument(
        '--data_path',
        type=str,
        default='/remote-home/dqwang/Datasets/CNNDM/train.label.jsonl',
        help='Path expression to pickle datafiles.')
    parser.add_argument(
        '--valid_path',
        type=str,
        default='/remote-home/dqwang/Datasets/CNNDM/val.label.jsonl',
        help='Path expression to pickle valid datafiles.')
    parser.add_argument('--vocab_path',
                        type=str,
                        default='/remote-home/dqwang/Datasets/CNNDM/vocab',
                        help='Path expression to text vocabulary file.')
    parser.add_argument('--embedding_path',
                        type=str,
                        default='/remote-home/dqwang/Glove/glove.42B.300d.txt',
                        help='Path expression to external word embedding.')

    # Important settings
    parser.add_argument('--mode',
                        type=str,
                        default='train',
                        help='must be one of train/test')
    parser.add_argument(
        '--restore_model',
        type=str,
        default='None',
        help=
        'Restore model for further training. [bestmodel/bestFmodel/earlystop/None]'
    )
    parser.add_argument(
        '--test_model',
        type=str,
        default='evalbestmodel',
        help=
        'choose different model to test [evalbestmodel/evalbestFmodel/trainbestmodel/trainbestFmodel/earlystop]'
    )
    parser.add_argument('--use_pyrouge',
                        action='store_true',
                        default=False,
                        help='use_pyrouge')

    # Where to save output
    parser.add_argument('--save_root',
                        type=str,
                        default='save/',
                        help='Root directory for all model.')
    parser.add_argument('--log_root',
                        type=str,
                        default='log/',
                        help='Root directory for all logging.')

    # Hyperparameters
    parser.add_argument('--gpu',
                        type=str,
                        default='0',
                        help='GPU ID to use. For cpu, set -1 [default: 0]')
    parser.add_argument('--cuda',
                        action='store_true',
                        default=False,
                        help='use cuda')
    parser.add_argument(
        '--vocab_size',
        type=int,
        default=100000,
        help=
        'Size of vocabulary. These will be read from the vocabulary file in order. If the vocabulary file contains fewer words than this number, or if this number is set to 0, will take all words in the vocabulary file.'
    )
    parser.add_argument('--n_epochs',
                        type=int,
                        default=20,
                        help='Number of epochs [default: 20]')
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='Mini batch size [default: 32]')

    parser.add_argument('--word_embedding',
                        action='store_true',
                        default=True,
                        help='whether to use Word embedding')
    parser.add_argument('--word_emb_dim',
                        type=int,
                        default=300,
                        help='Word embedding size [default: 300]')
    parser.add_argument(
        '--embed_train',
        action='store_true',
        default=False,
        help='whether to train Word embedding [default: False]')
    parser.add_argument('--min_kernel_size',
                        type=int,
                        default=1,
                        help='kernel min length for CNN [default:1]')
    parser.add_argument('--max_kernel_size',
                        type=int,
                        default=7,
                        help='kernel max length for CNN [default:7]')
    parser.add_argument('--output_channel',
                        type=int,
                        default=50,
                        help='output channel: repeated times for one kernel')
    parser.add_argument('--n_layers',
                        type=int,
                        default=12,
                        help='Number of deeplstm layers')
    parser.add_argument('--hidden_size',
                        type=int,
                        default=512,
                        help='hidden size [default: 512]')
    parser.add_argument(
        '--ffn_inner_hidden_size',
        type=int,
        default=2048,
        help='PositionwiseFeedForward inner hidden size [default: 2048]')
    parser.add_argument('--n_head',
                        type=int,
                        default=8,
                        help='multihead attention number [default: 8]')
    parser.add_argument('--recurrent_dropout_prob',
                        type=float,
                        default=0.1,
                        help='recurrent dropout prob [default: 0.1]')
    parser.add_argument('--atten_dropout_prob',
                        type=float,
                        default=0.1,
                        help='attention dropout prob [default: 0.1]')
    parser.add_argument(
        '--ffn_dropout_prob',
        type=float,
        default=0.1,
        help='PositionwiseFeedForward dropout prob [default: 0.1]')
    parser.add_argument('--use_orthnormal_init',
                        action='store_true',
                        default=True,
                        help='use orthonormal init for LSTM [default: True]')
    parser.add_argument(
        '--sent_max_len',
        type=int,
        default=100,
        help='max length of sentences (max source text sentence tokens)')
    parser.add_argument(
        '--doc_max_timesteps',
        type=int,
        default=50,
        help='max length of documents (max timesteps of documents)')
    parser.add_argument('--save_label',
                        action='store_true',
                        default=False,
                        help='whether to save predicted labels')

    # Training
    parser.add_argument('--lr',
                        type=float,
                        default=0.0001,
                        help='learning rate')
    parser.add_argument('--lr_descent',
                        action='store_true',
                        default=False,
                        help='learning rate descent')
    parser.add_argument('--warmup_steps',
                        type=int,
                        default=4000,
                        help='warmup_steps')
    parser.add_argument('--grad_clip',
                        action='store_true',
                        default=False,
                        help='for gradient clipping')
    parser.add_argument(
        '--max_grad_norm',
        type=float,
        default=1.0,
        help='for gradient clipping max gradient normalization')

    parser.add_argument('-m',
                        type=int,
                        default=3,
                        help='decode summary length')
    parser.add_argument('--limited',
                        action='store_true',
                        default=False,
                        help='limited decode summary length')

    args = parser.parse_args()

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    torch.set_printoptions(threshold=50000)

    hps = args

    # File paths
    DATA_FILE = args.data_path
    VALID_FILE = args.valid_path
    VOCAL_FILE = args.vocab_path
    LOG_PATH = args.log_root

    # train_log setting
    if not os.path.exists(LOG_PATH):
        if hps.mode == "train":
            os.makedirs(LOG_PATH)
        else:
            logger.exception(
                "[Error] Logdir %s doesn't exist. Run in train mode to create it.",
                LOG_PATH)
            raise Exception(
                "[Error] Logdir %s doesn't exist. Run in train mode to create it."
                % (LOG_PATH))
    nowTime = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    log_path = os.path.join(LOG_PATH, hps.mode + "_" + nowTime)
    file_handler = logging.FileHandler(log_path)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    logger.info("Pytorch %s", torch.__version__)
    logger.info(args)

    sum_loader = SummarizationLoader()

    if hps.mode == 'test':
        paths = {"test": DATA_FILE}
        hps.recurrent_dropout_prob = 0.0
        hps.atten_dropout_prob = 0.0
        hps.ffn_dropout_prob = 0.0
        logger.info(hps)
    else:
        paths = {"train": DATA_FILE, "valid": VALID_FILE}

    dataInfo = sum_loader.process(paths=paths,
                                  vocab_size=hps.vocab_size,
                                  vocab_path=VOCAL_FILE,
                                  sent_max_len=hps.sent_max_len,
                                  doc_max_timesteps=hps.doc_max_timesteps,
                                  load_vocab=os.path.exists(VOCAL_FILE))

    vocab = dataInfo.vocabs["vocab"]
    model = TransformerModel(hps, vocab)

    if len(hps.gpu) > 1:
        gpuid = hps.gpu.split(',')
        gpuid = [int(s) for s in gpuid]
        model = nn.DataParallel(model, device_ids=gpuid)
        logger.info("[INFO] Use Multi-gpu: %s", hps.gpu)
    if hps.cuda:
        model = model.cuda()
        logger.info("[INFO] Use cuda")

    if hps.mode == 'train':
        trainset = dataInfo.datasets["train"]
        train_sampler = BucketSampler(batch_size=hps.batch_size,
                                      seq_len_field_name=Const.INPUT)
        train_batch = DataSetIter(batch_size=hps.batch_size,
                                  dataset=trainset,
                                  sampler=train_sampler)
        validset = dataInfo.datasets["valid"]
        validset.set_input("text", "summary")
        valid_batch = DataSetIter(batch_size=hps.batch_size, dataset=validset)
        setup_training(model, train_batch, valid_batch, hps)
    elif hps.mode == 'test':
        logger.info("[INFO] Decoding...")
        testset = dataInfo.datasets["test"]
        testset.set_input("text", "summary")
        test_batch = DataSetIter(batch_size=hps.batch_size, dataset=testset)
        run_test(model, test_batch, hps, limited=hps.limited)
    else:
        logger.error("The 'mode' flag must be one of train/test")
        raise ValueError("The 'mode' flag must be one of train/test")
Example #8
    def _train_epoch(self):
        total_loss = 0
        corrects, samples = 0, 0

        n_tasks = len(self.task_lst)
        task_seq = list(np.random.permutation(n_tasks))
        empty_task = copy.deepcopy(self.empty_tasks)
        self.model.train()
        self.model.zero_grad()

        for cur_step in range(self.n_steps_per_epoch):
            for task_id in task_seq:
                if task_id in empty_task:
                    continue
                task = find_task(task_id, self.task_lst)
                batch = next(task.train_data_loader, None)
                if batch is None:
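                    # this task's loader is exhausted for the epoch: rebuild it, wrap it in iter(), and move on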
                    # empty_task.add(task_id)
                    task.train_data_loader = DataSetIter(
                        task.train_set,
                        self.batch_size,
                        sampler=RandomSampler())
                    task.train_data_loader = iter(task.train_data_loader)
                    continue
                x, y = batch
                batch_task_id = x["task_id"].cuda()
                batch_x = x["x"].cuda()
                batch_y = y["y"].cuda()

                self.masker.before_forward(batch_task_id[0].item())
                if "seq_len" in x:
                    seq_len = x["seq_len"].cuda()
                    out = self.model(batch_task_id, batch_x, batch_y, seq_len)
                else:
                    seq_len = None
                    out = self.model(batch_task_id, batch_x, batch_y)
                loss, pred = out["loss"], out["pred"]
                self.steps += 1

                total_loss += loss.item()
                loss = loss / self.accumulation_steps
                loss.backward()
                self.masker.after_forward(batch_task_id[0].item())
                self.metrics[task_id].evaluate(pred, batch_y, seq_len)

                if self.steps % self.accumulation_steps == 0:
                    nn.utils.clip_grad_value_(self.model.parameters(), 5)

                    if self.scheduler is not None:
                        self.scheduler.step()
                    self.optim.step()
                    self.optim.zero_grad()

                if self.steps % self.print_every == 0:
                    self.summary_writer.add_scalar(
                        "train_loss", total_loss / self.print_every,
                        self.steps)
                    score = self.metrics[task_id].get_metric()
                    metric_name = "acc" if "acc" in score else "f1"
                    score = score["acc"] if "acc" in score else score["f"]
                    self.summary_writer.add_scalar("train_acc", score,
                                                   self.steps)
                    self.logger.info(" - Step {}: loss {}\t{}\t{}: {}".format(
                        self.steps,
                        total_loss / self.print_every,
                        task.task_name,
                        metric_name,
                        score,
                    ))
                    total_loss = 0
        if self.epoch_scheduler is not None:
            self.epoch_scheduler.step()
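The epoch loop above divides each loss by accumulation_steps and only steps the optimizer (after value clipping) every accumulation_steps batches. A stripped-down sketch of that gradient-accumulation pattern, using placeholder names (model, loader, optimizer) rather than the trainer's real attributes:

import torch.nn as nn

def train_with_accumulation(model, loader, optimizer, accumulation_steps=4):
    # gradient accumulation: sum gradients over several small batches
    # before taking one optimizer step, emulating a larger batch size
    model.train()
    optimizer.zero_grad()
    for step, (batch_x, batch_y) in enumerate(loader, start=1):
        loss = model(batch_x, batch_y)["loss"]          # model returns a dict, as in Example #8
        (loss / accumulation_steps).backward()           # scale so accumulated grads average out
        if step % accumulation_steps == 0:
            nn.utils.clip_grad_value_(model.parameters(), 5)  # same clipping value as Example #8
            optimizer.step()
            optimizer.zero_grad()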