def get_test_predictions(self, test_filename, save_filename):
        test_dataset = DialogueDataset(
            test_filename,
            self.config.sentence_len,
            self.vocab,
            False)

        test_data_loader = torch.utils.data.DataLoader(
            test_dataset, self.config.val_batch_size, shuffle=True)

        with open(test_filename, 'r') as f:
            data = json.load(f)

        start = time.perf_counter()
        phase_metrics = dict()
        epoch_loss = list()
        epoch_metrics = list()
        results = {"accuracy": list(), "precision": list(), "recall": list(),
                   "F1": list()}
        average_epoch_loss = None
        for i, batch in enumerate(tqdm(test_data_loader,
                                       mininterval=2, desc='test', leave=False)):
            # prepare data
            src_seq, src_pos, src_seg, tgt = map(
                lambda x: x.to(self.device), batch[:4])

            ids = batch[4]
            start_end_idx = batch[5]

            # forward
            pred = self.model(src_seq, src_pos, src_seg, tgt)

            loss = F.cross_entropy(self.prepare_pred(pred).view(-1, 2),
                                   tgt.view(-1))

            average_loss = float(loss)
            epoch_loss.append(average_loss)
            average_epoch_loss = np.mean(epoch_loss)

            output = torch.argmax(self.prepare_pred(pred), 3)

            record_predictions(output, data, ids, start_end_idx)

            get_results(tgt.view(-1).cpu(), output.view(-1).cpu(), results)

        phase_metrics["avg_results"] = {key: np.mean(value) for key, value in
                                        results.items()}
        phase_metrics["loss"] = average_epoch_loss

        phase_metrics["time_taken"] = time.clock() - start
        string = ' {} loss: {:.3f} '.format('test', average_epoch_loss)
        print(string, end='\n')

        data["results"] = phase_metrics

        with open(save_filename, 'w') as f:
            json.dump(data, f)

        return phase_metrics
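
# get_results is not defined in this snippet; a minimal sketch of what it
# presumably does (the name and call signature are taken from the loop above,
# the internals are an assumption): accumulate per-batch metrics into the
# results dict with scikit-learn.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def get_results_sketch(targets, outputs, results):
    # append this batch's scores; the caller averages them over the epoch
    results["accuracy"].append(accuracy_score(targets, outputs))
    results["precision"].append(precision_score(targets, outputs, zero_division=0))
    results["recall"].append(recall_score(targets, outputs, zero_division=0))
    results["F1"].append(f1_score(targets, outputs, zero_division=0))
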
    def __init__(self, args):
        # get the directory containing the pre-trained model

        load_dir = os.path.join(args.experiment_dir, args.old_model_dir)

        # initialize, and load vocab
        self.vocab = Vocab()
        vocab_filename = os.path.join(load_dir, "vocab.json")
        self.vocab.load_from_dict(vocab_filename)

        # load configuration
        with open(os.path.join(load_dir, "config.json"), "r") as f:
            config = json.load(f)

        args.response_len = config["response_len"]
        args.history_len = config["history_len"]

        # initialize an empty dataset, used to extract input features
        self.dataset = DialogueDataset(None,
                                       history_len=config["history_len"],
                                       response_len=config["response_len"],
                                       vocab=self.vocab,
                                       update_vocab=False)

        # set device
        self.device = torch.device(args.device)

        # initialize model
        model = Transformer(config["vocab_size"],
                            config["vocab_size"],
                            config["history_len"],
                            config["response_len"],
                            d_word_vec=config["embedding_dim"],
                            d_model=config["model_dim"],
                            d_inner=config["inner_dim"],
                            n_layers=config["num_layers"],
                            n_head=config["num_heads"],
                            d_k=config["dim_k"],
                            d_v=config["dim_v"],
                            dropout=config["dropout"],
                            pretrained_embeddings=None).to(self.device)

        # load checkpoint
        checkpoint = torch.load(os.path.join(load_dir, args.old_model_name),
                                map_location=self.device)
        model.load_state_dict(checkpoint['model'])

        # create chatbot
        self.chatbot = Chatbot(args, model)

        self.args = args
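
# A hedged example of the args namespace the loader __init__ above expects.
# Attribute names come from the code; the values and the class name below are
# placeholders, not part of the original snippet.
from argparse import Namespace

example_args = Namespace(
    experiment_dir="experiments",   # root directory of saved runs (placeholder)
    old_model_dir="run_01",         # sub-directory holding vocab.json and config.json
    old_model_name="model.bin",     # checkpoint file containing a 'model' state dict
    device="cpu",                   # or "cuda"
)
# loader = PretrainedChatbotLoader(example_args)  # class name is hypothetical
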
    def __init__(self, args):

        # set up output directory
        self.output_dir = os.path.join(args.experiment_dir, args.run_name)
        if not os.path.exists(args.experiment_dir):
            os.mkdir(args.experiment_dir)
        if not os.path.exists(self.output_dir):
            os.mkdir(self.output_dir)
        if not os.path.exists(os.path.join(args.experiment_dir, "runs/")):
            os.mkdir(os.path.join(args.experiment_dir, "runs/"))

        # initialize tensorboard writer
        self.runs_dir = os.path.join(args.experiment_dir, "runs/", args.run_name)
        self.writer = SummaryWriter(self.runs_dir)

        # initialize global steps
        self.train_gs = 0
        self.val_gs = 0

        # initialize model config
        self.config = ModelConfig(args)

        # check if there is a model to load
        if args.old_model_dir is not None:
            self.use_old_model = True
            self.load_dir = args.old_model_dir
            self.config.load_from_file(
                os.path.join(self.load_dir, "config.json"))

            # create vocab
            self.vocab = Vocab()
            self.vocab.load_from_dict(os.path.join(self.load_dir, "vocab.json"))
            self.update_vocab = False
            self.config.min_count = 1
        else:
            self.use_old_model = False

            self.vocab = None
            self.update_vocab = True

        # create data sets
        self.dataset_filename = args.dataset_filename

        # train
        self.train_dataset = DialogueDataset(
            os.path.join(self.dataset_filename, "train_data.json"),
            self.config.sentence_len,
            self.vocab,
            self.update_vocab)
        self.data_loader_train = torch.utils.data.DataLoader(
            self.train_dataset, self.config.train_batch_size, shuffle=True)
        self.config.train_len = len(self.train_dataset)

        self.vocab = self.train_dataset.vocab

        # eval
        self.val_dataset = DialogueDataset(
            os.path.join(self.dataset_filename, "val_data.json"),
            self.config.sentence_len,
            self.vocab,
            self.update_vocab)
        self.data_loader_val = torch.utils.data.DataLoader(
            self.val_dataset, self.config.val_batch_size, shuffle=True)
        self.config.val_len = len(self.val_dataset)

        # update, and save vocab
        self.vocab = self.val_dataset.vocab
        self.train_dataset.vocab = self.vocab
        if self.config.min_count > 1:
            self.config.old_vocab_size = len(self.vocab)
            self.vocab.prune_vocab(self.config.min_count)
        self.vocab.save_to_dict(os.path.join(self.output_dir, "vocab.json"))
        self.vocab_size = len(self.vocab)
        self.config.vocab_size = self.vocab_size

        # load embeddings
        if self.config.pretrained_embeddings_dir is not None:
            pretrained_embeddings = get_pretrained_embeddings(
                self.config.pretrained_embeddings_dir, self.vocab)
        else:
            pretrained_embeddings = None

        # print and save the config file
        self.config.print_config(self.writer)
        self.config.save_config(os.path.join(self.output_dir, "config.json"))

        # set device
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # create model
        self.model = Transformer(
            self.config.vocab_size,
            self.config.label_len,
            self.config.sentence_len,
            d_word_vec=self.config.embedding_dim,
            d_model=self.config.model_dim,
            d_inner=self.config.inner_dim,
            n_layers=self.config.num_layers,
            n_head=self.config.num_heads,
            d_k=self.config.dim_k,
            d_v=self.config.dim_v,
            dropout=self.config.dropout,
            pretrained_embeddings=pretrained_embeddings
        ).to(self.device)

        # create optimizer
        self.optimizer = torch.optim.Adam(
            filter(lambda x: x.requires_grad, self.model.parameters()),
            betas=(0.9, 0.98), eps=1e-09)

        # load old model, optimizer if there is one
        if self.use_old_model:
            self.model, self.optimizer = load_checkpoint(
                os.path.join(self.load_dir, "model.bin"),
                self.model, self.optimizer, self.device)


        # create a scheduled optimizer object
        self.optimizer = ScheduledOptim(
            self.optimizer, self.config.model_dim, self.config.warmup_steps)
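
# ScheduledOptim is not shown in this snippet. In Transformer training code it
# usually wraps the optimizer with the inverse-square-root warmup schedule from
# "Attention Is All You Need"; a minimal sketch under that assumption:
class ScheduledOptimSketch:
    def __init__(self, optimizer, d_model, warmup_steps):
        self.optimizer = optimizer
        self.init_lr = d_model ** -0.5
        self.warmup_steps = warmup_steps
        self.step_num = 0

    def step_and_update_lr(self):
        # lr = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)
        self.step_num += 1
        lr = self.init_lr * min(self.step_num ** -0.5,
                                self.step_num * self.warmup_steps ** -1.5)
        for group in self.optimizer.param_groups:
            group['lr'] = lr
        self.optimizer.step()

    def zero_grad(self):
        self.optimizer.zero_grad()
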
# Example 4
def run():
    logger.info("using device: {}".format(config.DEVICE))
    train_data = process_raw_data()
    train_list, test_list = train_test_split(train_data,
                                             test_size=0.2,
                                             random_state=34)

    # load the GPT-2 model
    model, n_ctx = create_model(True)
    model.to(config.DEVICE)
    # whether to use multiple GPUs in parallel; config.DEVICE_NUM selects which cards to train on
    multi_gpu = False
    if torch.cuda.is_available() and torch.cuda.device_count() > 1:
        logger.info("Using more than one GPUs to train...")
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = config.DEVICE_NUM
        model = DataParallel(
            model, device_ids=[int(i) for i in config.DEVICE_NUM.split(",")])
        multi_gpu = True

    # log the number of model parameters
    num_parameters = sum(
        [parameter.numel() for parameter in model.parameters()])
    logger.info("number of model parameters: {}".format(num_parameters))

    # load the data
    logger.info("loading training data")
    train_dataset = DialogueDataset(train_list, n_ctx)
    batch_num = len(train_dataset) // config.BATCH_SIZE
    test_dataset = DialogueDataset(test_list, n_ctx)
    test_batch_num = len(test_dataset) // config.BATCH_SIZE

    train_data_loader = DataLoader(train_dataset,
                                   batch_size=config.BATCH_SIZE,
                                   shuffle=True,
                                   num_workers=4,
                                   collate_fn=collate_fn)

    test_data_loader = DataLoader(test_dataset,
                                  batch_size=config.BATCH_SIZE,
                                  shuffle=True,
                                  num_workers=1,
                                  collate_fn=collate_fn)

    # compute total_steps, the total number of optimizer updates over all epochs
    # (len(train_data_loader) already counts batches, not samples)
    total_steps = int(
        len(train_data_loader) * config.EPOCHS / config.GRADIENT_ACCUMULATION)
    logger.info('total training steps = {}'.format(total_steps))

    # set up the optimizer and use a warmup schedule at the start of training
    optimizer = AdamW(model.parameters(),
                      lr=config.LEARNING_RATE,
                      correct_bias=True)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=config.WARM_STEPS,
        num_training_steps=total_steps)

    logger.info("start training...")
    best_loss = float('inf')
    best_accuracy = 0
    for epoch in range(config.EPOCHS):
        train_fn(model, train_data_loader, optimizer, scheduler, epoch,
                 batch_num, multi_gpu)
        loss, accuracy = eval_fn(model, test_data_loader, test_batch_num,
                                 multi_gpu)
        if loss < best_loss or accuracy > best_accuracy:
            logger.info('saving model for epoch {}, best loss: {}'.format(
                epoch + 1, loss))
            model_to_save = model.module if hasattr(model, 'module') else model
            model_to_save.save_pretrained(config.MODEL_PATH)
            best_loss = loss
            best_accuracy = accuracy
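
# collate_fn is referenced above but not defined in this snippet. For GPT-2
# dialogue training it typically pads every sample in a batch to the length of
# the longest one; a minimal sketch, assuming the dataset yields lists of token
# ids and that the pad id is 0:
import torch

def collate_fn_sketch(batch):
    max_len = max(len(ids) for ids in batch)
    padded = [ids + [0] * (max_len - len(ids)) for ids in batch]
    return torch.tensor(padded, dtype=torch.long)
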
# Example 5
def run():
    logger.info("using device: {}".format(config.DEVICE))

    if config.TRAIN_MMI:
        train_data = process_mmi_raw_data()
    else:
        train_data = process_raw_data()
    train_list, test_list = train_test_split(train_data,
                                             test_size=0.2,
                                             random_state=34)

    # load the GPT-2 model
    model, n_ctx = create_model(mmi=config.TRAIN_MMI)
    model.to(config.DEVICE)

    # whether to use multiple GPUs in parallel; config.DEVICE_NUM selects which cards to train on
    multi_gpu = False
    if torch.cuda.is_available() and torch.cuda.device_count() > 1:
        logger.info("Using GPU to train...")
        model = DataParallel(
            model, device_ids=[int(i) for i in config.DEVICE_NUM.split(",")])
        multi_gpu = True
    else:
        logger.info("Using a single GPU or CPU to train...")

    # log the number of model parameters
    num_parameters = sum(
        [parameter.numel() for parameter in model.parameters()])
    logger.info("number of model parameters: {}".format(num_parameters))

    # load the data
    logger.info("loading training data")
    train_dataset = DialogueDataset(train_list, n_ctx)
    batch_num = len(train_dataset) // config.BATCH_SIZE
    test_dataset = DialogueDataset(test_list, n_ctx)
    test_batch_num = len(test_dataset) // config.BATCH_SIZE

    train_data_loader = DataLoader(train_dataset,
                                   batch_size=config.BATCH_SIZE,
                                   shuffle=True,
                                   num_workers=config.TRAIN_NUM_WORKERS,
                                   collate_fn=collate_fn)

    test_data_loader = DataLoader(test_dataset,
                                  batch_size=config.BATCH_SIZE,
                                  shuffle=True,
                                  num_workers=config.TEST_NUM_WORKERS,
                                  collate_fn=collate_fn)

    # compute total_steps, the total number of optimizer updates over all epochs
    # (len(train_data_loader) already counts batches, not samples)
    total_steps = int(
        len(train_data_loader) * config.EPOCHS / config.GRADIENT_ACCUMULATION)
    logger.info('total training steps = {}'.format(total_steps))

    # set up the optimizer and use a warmup schedule at the start of training
    optimizer = AdamW(model.parameters(),
                      lr=config.LEARNING_RATE,
                      correct_bias=True)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=config.WARM_STEPS,
        num_training_steps=total_steps)

    logger.info("start training...")
    best_accuracy = 0
    best_loss = float('inf')
    for epoch in range(config.EPOCHS):
        epoch_start_time = datetime.now()
        train_fn(model, train_data_loader, optimizer, scheduler, epoch,
                 batch_num, multi_gpu)
        logger.info("time for epoch {}: {}".format(
            epoch + 1,
            datetime.now() - epoch_start_time))
        loss, accuracy = eval_fn(model, test_data_loader, test_batch_num,
                                 multi_gpu)
        if accuracy > best_accuracy or loss < best_loss:
            logger.info(
                'saving model for epoch {}, best accuracy is {}'.format(
                    epoch + 1, accuracy))
            if config.TRAIN_MMI:  # currently training the MMI model
                model_path = config.MMI_MODEL_PATH
            else:
                model_path = config.DIALOGUE_MODEL_PATH
            if not os.path.exists(model_path):
                os.mkdir(model_path)
            model_to_save = model.module if hasattr(model, 'module') else model
            model_to_save.save_pretrained(model_path)
            best_accuracy = accuracy
            best_loss = loss
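
# The checkpoints written with save_pretrained above can be reloaded later with
# the standard transformers API; a minimal sketch for inference (the path and
# model class are assumptions based on the GPT-2 setup described above):
from transformers import GPT2LMHeadModel

reloaded = GPT2LMHeadModel.from_pretrained("path/to/dialogue_model")  # placeholder path
reloaded.eval()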