Example #1
def run():
    lu.setup_logging()
    # Show the config
    logger.info("Config:\n{}".format(cfg))
    # Seed everything for reproducibility
    common.seed_everything(cfg.RNG_SEED)
    # Configure the CUDNN backend
    torch.backends.cudnn.benchmark = cfg.CUDNN.BENCHMARK
    # train the model
    train_model()
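
A note on seed_everything: every example on this page delegates seeding to a project-level seed_everything helper (Example #1 reaches it via common.seed_everything). A minimal sketch of what such a helper typically contains is shown below; it is an assumption about the usual RNGs, not the exact implementation these projects import.

import os
import random

import numpy as np
import torch


def seed_everything(seed: int = 42):
    # Hypothetical sketch: seed every RNG the training code may touch.
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Optionally trade speed for determinism in cuDNN
    # (Example #1 instead sets the benchmark flag from its config).
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
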
Example #2
def main():
    args = get_argparse().parse_args()
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)
    time_ = time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime())
    log_file = os.path.join(args.output_dir, 'tener-{}.log'.format(time_))
    init_logger(log_file=log_file)
    if args.gpu and args.use_cuda:
        device = torch.device("cuda:%s" % args.gpu)
        args.n_gpu = 1
    elif args.local_rank == -1 or args.use_cuda:
        device = torch.device(
            "cuda" if torch.cuda.is_available() and args.use_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)  # device that torch.Tensor objects will be allocated to
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1))
    seed_everything(args.seed)

    ner_processor = CluenerProcessor(args.data_path)
    model = NER_model(ner_processor, args)
    model.to(args.device)
    if args.model_path is not None:
        model.load_state_dict(
            torch.load(os.path.join(args.model_path, 'pytorch_model.bin')))

    if args.do_train:
        train(args, ner_processor, model)

    if args.do_eval:
        evaluate(args, ner_processor, model, show_entity_info=True)

    if args.do_predict:
        predict(args, ner_processor, model)
Example #3
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--gpt_config_file",
        default="CDial-GPT2_LCCC-base/config.json",
        type=str,
        help="The config json file corresponding to the pre-trained GPT model."
    )
    parser.add_argument(
        "--vocab_file",
        default="CDial-GPT2_LCCC-base/vocab.txt",
        type=str,
        help="The vocabulary file that the GPT model was trained on.")
    parser.add_argument(
        "--init_checkpoint",
        default="CDial-GPT2_LCCC-base/pytorch_model.bin",
        type=str,
        help="Initial checkpoint (usually from a pre-trained GPT model).")

    # Required parameters
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the predictions will be written.")
    parser.add_argument(
        "--model_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )
    parser.add_argument(
        "--cache_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the data cache will be written.")

    # Other parameters
    parser.add_argument("--name",
                        type=str,
                        default="GPT2Gen",
                        help="name of the model")
    parser.add_argument("--dataset",
                        type=str,
                        default="GPT2Gen",
                        help="Dataloader class.")
    parser.add_argument("--datapath",
                        type=str,
                        default="resources://OpenSubtitles",
                        help="Directory for data set.")
    parser.add_argument("--max_sent_length",
                        default=192,
                        type=int,
                        help="The max length of the sentence pair.")
    parser.add_argument("--num_turns",
                        default=8,
                        type=int,
                        help="The max turn length of the post field.")
    parser.add_argument(
        "--is_relative",
        action="store_true",
        help=
        "If True, use relative turn embedding, else use absolute turn embedding."
    )

    parser.add_argument("--do_train",
                        action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_predict",
                        action="store_true",
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--cache",
                        action="store_true",
                        help="Whether to save the data result.")

    parser.add_argument("--train_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size",
                        default=16,
                        type=int,
                        help="Total batch size for prediction.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for the optimizer.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")

    # ----------------- Parameters used during the inference stage -------------------
    parser.add_argument("--no_sample",
                        action="store_true",
                        help="Set to use greedy decoding instead of sampling.")
    parser.add_argument("--min_decoder_length",
                        type=int,
                        default=3,
                        help="The minimum length of the generated response.")
    parser.add_argument("--max_decoder_length",
                        type=int,
                        default=30,
                        help="The maximum length of the generated response.")
    parser.add_argument("--temperature",
                        type=float,
                        default=1,
                        help="Sampling softmax temperature.")
    parser.add_argument(
        "--top_k",
        type=int,
        default=0,
        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument(
        "--top_p",
        type=float,
        default=0.0,
        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    # --------------------------------------------------------

    # Fraction of training steps used for warm-up
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% "
        "of training.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action="store_true",
                        help="Whether not to use CUDA when available.")
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization.")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        "--do_lower_case",
        default=True,
        action="store_true",
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )

    args = parser.parse_args()

    if not args.do_train and not args.do_predict:
        raise ValueError(
            "At least one of `do_train` or `do_predict` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir, exist_ok=True)
    if not os.path.exists(args.model_dir):
        os.makedirs(args.model_dir, exist_ok=True)

    data_class = GPTGen
    config_class = GPT2Config
    model_class = GPT2LMHeadModel
    tokenizer_class = BertTokenizer

    # Load the dataset
    def load_dataset(file_id, vocab_name, do_lower_case, max_sent_length,
                     num_turns, is_relative):
        dm = data_class(file_id=file_id,
                        vocab_name=vocab_name,
                        do_lower_case=do_lower_case,
                        max_sent_length=max_sent_length,
                        num_turns=num_turns,
                        is_relative=is_relative)
        return dm

    logger.info("Loading data for model training")
    if args.cache:
        dataManager = try_cache(
            load_dataset, {
                "file_id": args.datapath,
                "vocab_name": args.vocab_file,
                "do_lower_case": args.do_lower_case,
                "max_sent_length": args.max_sent_length,
                "num_turns": args.num_turns,
                "is_relative": args.is_relative
            }, args.cache_dir, data_class.__name__)
    else:
        dataManager = load_dataset(file_id=args.datapath,
                                   vocab_name=args.vocab_file,
                                   do_lower_case=args.do_lower_case,
                                   max_sent_length=args.max_sent_length,
                                   num_turns=args.num_turns,
                                   is_relative=args.is_relative)

    if args.do_train:
        if not args.no_cuda:
            if not "CUDA_VISIBLE_DEVICES" in os.environ:
                os.environ["CUDA_VISIBLE_DEVICES"] = '0'

        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
        logger.info(f"device: {device} | n_gpu: {n_gpu}")

        if args.gradient_accumulation_steps < 1:
            raise ValueError(
                f"Invalid gradient_accumulation_steps parameter: {args.gradient_accumulation_steps}, should be >= 1"
            )

        args.train_batch_size = int(args.train_batch_size /
                                    args.gradient_accumulation_steps)

        seed_everything(args.seed)

        logger.info("train examples {}".format(
            len(dataManager.data["train"]["resp"])))
        num_train_steps = int(len(dataManager.data["train"]["resp"]) \
                              / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)

        # Load the pre-trained model
        config = config_class.from_json_file(args.gpt_config_file)
        config.num_turns = args.num_turns
        model = model_class(config)
        if args.init_checkpoint is not None:
            logger.info("Loading pre-trained GPT weights")
            state_dict = torch.load(args.init_checkpoint, map_location="cpu")
            missing_keys = []
            unexpected_keys = []
            error_msgs = []
            # Copy state_dict so that _load_from_state_dict below can modify it
            metadata = getattr(state_dict, "_metadata", None)
            state_dict = state_dict.copy()
            if metadata is not None:
                state_dict._metadata = metadata

            def load(module, prefix=""):
                local_metadata = {} if metadata is None else metadata.get(
                    prefix[:-1], {})
                module._load_from_state_dict(state_dict, prefix,
                                             local_metadata, True,
                                             missing_keys, unexpected_keys,
                                             error_msgs)
                for name, child in module._modules.items():
                    if child is not None:
                        load(child, prefix + name + ".")

            load(
                model,
                prefix="" if hasattr(model, "transformer") else "transformer.")
            logger.info("missing keys: {}".format(missing_keys))
            logger.info("unexpected keys: {}".format(unexpected_keys))
            logger.info("error msgs: {}".format(error_msgs))

        model.to(device)
        model = torch.nn.DataParallel(model)

        # Prepare the optimizer and its parameter groups
        param_optimizer = list(model.named_parameters())

        # Remove the pooler layer, which produces None gradients
        # and breaks apex
        param_optimizer = [n for n in param_optimizer if "pooler" not in n[0]]
        no_decay = [
            "bias", "ln_1.bias", "ln_1.weight", "ln_2.bias", "ln_2.weight"
        ]
        optimizer_grouped_parameters = [{
            "params": [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.01
        }, {
            "params":
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            "weight_decay":
            0.0
        }]

        t_total = num_train_steps
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
        global_step = 0

        logger.info("***** Running training *****")
        logger.info("  Num training_examples = %d",
                    len(dataManager.data['train']['resp']))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        model.train()
        losses = []
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            model.zero_grad()
            # Initialize the training data iterator
            dataManager.restart(key="train",
                                batch_size=args.train_batch_size,
                                shuffle=True)
            # Fetch the next batch of data
            data = dataManager.get_next_batch(key="train")
            step = 0
            loss_value = 0
            while data is not None:
                if n_gpu == 1:
                    preprocess_batch(data, device)
                else:
                    preprocess_batch(data)

                outputs = model(input_ids=data["input_ids"],
                                attention_mask=data["input_mask"],
                                token_type_ids=data["token_type_ids"],
                                turn_ids=data["turn_ids"],
                                labels=data["lm_labels"])
                # loss: a scalar tensor
                # lm_logits: [batch, seq_length, vocab_size]
                loss, lm_logits = outputs[0], outputs[1]
                if n_gpu > 1:
                    loss = loss.mean()
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss_value += loss.cpu().item(
                ) * args.gradient_accumulation_steps
                loss.backward()
                # Step the optimizer once enough gradients have been accumulated
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # modify the learning rate with the special warm-up GPT uses
                    lr_this_step = args.learning_rate * warmup_linear(
                        global_step / t_total, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
                    losses.append(loss.detach().cpu().item())

                # Periodically log the running loss
                if (step + 1) % 1000 == 0:
                    logger.info(
                        f"step: {step + 1} | loss: {(loss_value / 1000):.4f} | ppl: {(np.exp(loss_value / 1000)):.4f}"
                    )
                    loss_value = 0

                step += 1
                data = dataManager.get_next_batch(key="train")

            logger.info(
                f"Saving model pytorch_model.{int(args.num_train_epochs)}.{epoch+1}.bin"
            )
            output_model_file = os.path.join(
                args.model_dir,
                f"pytorch_model.{int(args.num_train_epochs)}.{int(epoch+1)}.bin"
            )

            # Save the trained model
            model_to_save = model.module if hasattr(model, "module") else model
            torch.save(model_to_save.state_dict(), output_model_file)

        # Save the losses
        logger.info("Saving the losses recorded during training")
        save_losses(args.model_dir, losses={"loss": losses})
        logger.info("Training finished")

    # The procedure run on the test set
    if args.do_predict:

        total_epoch = int(args.num_train_epochs)
        chosen_epoch = 10

        if not args.no_cuda:
            if not "CUDA_VISIBLE_DEVICES" in os.environ:
                os.environ["CUDA_VISIBLE_DEVICES"] = '0'

        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()

        seed_everything(args.seed)

        output_model_file = os.path.join(
            args.model_dir,
            "pytorch_model.%d.%d.bin" % (total_epoch, chosen_epoch))
        model_state_dict = torch.load(output_model_file)

        tokenizer = tokenizer_class(vocab_file=args.vocab_file,
                                    do_lower_case=args.do_lower_case)
        config = config_class.from_json_file(args.gpt_config_file)
        config.num_turns = args.num_turns
        model = model_class(config)
        model.load_state_dict(model_state_dict)
        model.to(device)

        logger.info(f"transform special tokens {SPECIAL_TOKENS} to ids")
        special_tokens_ids = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS)

        if n_gpu > 1:
            model = torch.nn.DataParallel(model)

        # MyMetrics here computes BLEU and distinct
        # BLEU and distinct are computed over Chinese words, not characters
        metric1 = MyMetrics()
        metric2 = MyPerplexity(unk_id=dataManager.bert_unk_id)

        logger.info("***** Running testing *****")
        logger.info("  Num post-response pairs = %d",
                    len(dataManager.data['test']['resp']))
        logger.info("  Batch size = %d", args.predict_batch_size)

        model.eval()
        logger.info("Start evaluating")
        dataManager.restart(key='test',
                            batch_size=args.predict_batch_size,
                            shuffle=False)
        data = dataManager.get_next_batch(key='test')

        # Collect the gold and generated responses
        gold_strings = []
        gen_strings = []

        while data is not None:
            cur_batch_size = int(len(data["input_ids"]))
            for i in range(cur_batch_size):
                input_ids = data["input_ids"][i]
                token_type_ids = data["token_type_ids"][i]
                turn_ids = data["turn_ids"][i]
                # posts_len = data["posts_len"][i]
                resp_list = data["resp"][i]  # a list of tokens from the tokenizer, not yet converted to ids
                resp_length = data["resp_lens"][i]

                # Obtain all of the generated output ids
                with torch.no_grad():
                    # pred_logits has already been passed through log_softmax
                    # pred_ids: [seq_len], a Python list
                    # pred_logits: [seq_len, vocab_size], a torch.Tensor
                    pred_ids, pred_logits = sample_sequence(
                        history=input_ids,
                        model=model,
                        args=args,
                        device=device,
                        special_tokens_ids=special_tokens_ids,
                        token_type_ids=token_type_ids,
                        turn_ids=turn_ids,
                        current_output=None)
                # Convert the output ids back to tokens
                # decode returns a string with tokens joined by spaces
                pred_text = tokenizer.decode(pred_ids,
                                             skip_special_tokens=False)
                # Compute the BLEU and distinct metrics
                pred_text_string = "".join(pred_text.split())
                resp_text_string = "".join(resp_list)
                metric1.forward(ref=resp_text_string, hyp=pred_text_string)
                # Compute perplexity
                # Convert resp_list into token ids as a torch.Tensor
                resp_ids = torch.tensor(
                    tokenizer.convert_tokens_to_ids(resp_list),
                    dtype=torch.long,
                    device=pred_logits.device)
                metric2.forward(resp_length=resp_length,
                                resp_ids=resp_ids,
                                gen_log_prob=pred_logits)
                gold_strings.append(resp_text_string)
                gen_strings.append(pred_text_string)
            data = dataManager.get_next_batch(key="test")

        hits = test_process_hits(dataManager, model, args)
        result = metric1.close()
        result.update(metric2.close())
        result.update(hits)

        # Save the prediction results
        output_prediction_file = args.output_dir + f"/{args.name}_test.{total_epoch}.{chosen_epoch}.txt"
        logger.info(f"Saving prediction metrics to {output_prediction_file}")
        with open(output_prediction_file, "w", encoding="utf-8") as f:
            print("Test Result: ")
            res_print = list(result.items())
            res_print.sort(key=lambda x: x[0])
            for key, value in res_print:
                if isinstance(value, float):
                    print(f"\t{key}:\t{value}")
                    f.write(f"{key}:\t{value}\n")
            f.write("\n")

            for gold, gen in zip(gold_strings, gen_strings):
                f.write(f"resp:\t{gold}\n")
                f.write(f"gen:\t{gen}\n\n")
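
Example #3 rescales the learning rate by hand with warmup_linear instead of using a scheduler object. A sketch of such a function is shown below; it follows the common linear warm-up-then-decay shape (as in older pytorch-pretrained-bert code) and is assumed here rather than taken from the example's own imports.

def warmup_linear(x, warmup=0.002):
    # x is the fraction of total training completed (global_step / t_total).
    # Ramp the multiplier linearly from 0 to 1 during warm-up,
    # then decay it linearly back towards 0.
    if x < warmup:
        return x / warmup
    return max((1.0 - x) / (1.0 - warmup), 0.0)
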
Example #4
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--bert_config_file", default="chinese_wwm_pytorch/bert_config.json",
                        type=str, help="The config json file corresponding to the pre-trained BERT model. "
                                       "This specifies the model architecture.")
    parser.add_argument("--vocab_file", default="chinese_wwm_pytorch/vocab.txt", type=str,
                        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument("--init_checkpoint", default="chinese_wwm_pytorch/pytorch_model.bin",
                        type=str, help="Initial checkpoint (usually from a pre-trained BERT model).")

    ## Required parameters
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints and predictions will be written.")
    parser.add_argument("--model_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints and predictions will be written.")
    parser.add_argument("--cache_dir", default=None, type=str, required=True,
                        help="The output directory where the data cache will be written.")

    ## Other parameters
    parser.add_argument('--name', type=str, default='BERTRetrieval', help='name of model')

    parser.add_argument('--dataset', type=str, default='ChDialogMemBERTRetrieval', help='Dataloader class. Default: ChDialogMemBERTRetrieval')
    parser.add_argument('--datapath', type=str, default='resources://OpenSubtitles',
                        help='Directory for data set. Default: resources://OpenSubtitles')
    parser.add_argument('--wv_class', type=str, default='TencentChinese',
                        help="Wordvector class, none for not using pretrained wordvec. Default: TencentChinese")
    parser.add_argument('--wv_path', type=str, default='/home/zhengchujie/wordvector/chinese',
                        help="Directory for pretrained wordvector. Default: /home/zhengchujie/wordvector/chinese")
    parser.add_argument('--embedding_size', type=int, default=200,
                        help="The embed dim of the pretrained word vector.")

    parser.add_argument("--num_choices", default=10, type=int,
                        help="the number of retrieval options")
    parser.add_argument("--max_sent_length", default=192, type=int,
                        help="The max length of the sentence pair.")
    parser.add_argument("--max_know_length", default=100, type=int,
                        help="The max length of the knowledge triplets")
    parser.add_argument("--num_turns", default=8, type=int,
                        help="The max turn length of the post field.")

    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_predict", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--cache", action='store_true', help="Whether to cache the processed data.")

    parser.add_argument("--train_batch_size", default=8, type=int, help="Total batch size for training.")
    parser.add_argument("--predict_batch_size", default=16, type=int, help="Total batch size for predictions.")
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% "
                             "of training.")
    parser.add_argument("--lamb", default=0.6, type=float,
                        help="The factor of the attention loss.")

    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--do_lower_case",
                        default=True,
                        action='store_true',
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")

    args = parser.parse_args()

    if not args.do_train and not args.do_predict:
        raise ValueError("At least one of `do_train` or `do_predict` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir, exist_ok=True)
    if not os.path.exists(args.model_dir):
        os.makedirs(args.model_dir, exist_ok=True)

    data_class = MyMemBERTRetrieval
    wordvec_class = TencentChinese

    # Load the dataset
    def load_dataset(file_id, bert_vocab_name, do_lower_case, num_choices, max_sent_length, max_know_length, num_turns):
        dm = data_class(file_id=file_id, bert_vocab_name=bert_vocab_name, do_lower_case=do_lower_case, num_choices=num_choices,
                        max_sent_length=max_sent_length, max_know_length=max_know_length, num_turns=num_turns)
        return dm

    logger.info("Loading data for model training")
    if args.cache:
        if not os.path.isdir(args.cache_dir):
            os.mkdir(args.cache_dir)
        logger.info("Loading cached data")
        dataManager = try_cache(load_dataset,
                                {"file_id": args.datapath, "bert_vocab_name": args.vocab_file,
                                 "do_lower_case": args.do_lower_case, "num_choices": args.num_choices,
                                 "max_sent_length": args.max_sent_length, "max_know_length": args.max_know_length,
                                 "num_turns": args.num_turns},
                                args.cache_dir,
                                data_class.__name__)
        vocab = dataManager.id2know_word
        logger.info("Loading the word-vector file")
        embed = try_cache(lambda wv, ez, vl: wordvec_class(wv).load_matrix(ez, vl), (args.wv_path, args.embedding_size, vocab),
                          args.cache_dir, wordvec_class.__name__)
    else:
        dataManager = load_dataset(file_id=args.datapath, bert_vocab_name=args.vocab_file, do_lower_case=args.do_lower_case,
                                   num_choices=args.num_choices, max_sent_length=args.max_sent_length,
                                   max_know_length=args.max_know_length, num_turns=args.num_turns)
        logger.info("Building and loading the word-vector file")
        wv = wordvec_class(args.wv_path)
        vocab = dataManager.id2know_word
        embed = wv.load_matrix(args.embedding_size, vocab)

    #dataManager._max_know_length = 100

    if args.do_train:
        if not args.no_cuda:
            if not "CUDA_VISIBLE_DEVICES" in os.environ:
                os.environ["CUDA_VISIBLE_DEVICES"] = '0'

        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
        logger.info("device: {} n_gpu: {}".format(device, n_gpu))

        if args.gradient_accumulation_steps < 1:
            raise ValueError(
                "Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(args.gradient_accumulation_steps))

        args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

        seed_everything(args.seed)

        logger.info("train examples {}".format(len(dataManager.data['train']['resp'])))
        num_train_steps = int(len(dataManager.data['train'][
                                      'resp']) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)

        # Prepare model
        '''
        if os.path.exists(output_model_file):
            model_state_dict = torch.load(output_model_file)
            model = BERTRetrieval(num_choices=args.num_choices, bert_config_file=args.bert_config_file)
            model.load_state_dict(model_state_dict)
        '''
        model = BERTRetrieval(num_choices=args.num_choices, bert_config_file=args.bert_config_file, init_embeddings=embed)
        if args.init_checkpoint is not None:
            logger.info('Loading pre-trained BERT weights')
            state_dict = torch.load(args.init_checkpoint, map_location='cpu')
            missing_keys = []
            unexpected_keys = []
            error_msgs = []
            # copy state_dict so _load_from_state_dict can modify it
            metadata = getattr(state_dict, '_metadata', None)
            state_dict = state_dict.copy()
            if metadata is not None:
                state_dict._metadata = metadata

            def load(module, prefix=''):
                local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})

                module._load_from_state_dict(state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys,
                                             error_msgs)
                for name, child in module._modules.items():
                    # logger.info("name {} child {}".format(name, child))
                    if child is not None:
                        load(child, prefix + name + '.')

            load(model, prefix='' if hasattr(model, 'bert') else 'bert.')
            logger.info("missing keys:{}".format(missing_keys))
            logger.info('unexpected keys:{}'.format(unexpected_keys))
            logger.info('error msgs:{}'.format(error_msgs))

        model.to(device)
        model = torch.nn.DataParallel(model)

        # Prepare optimizer
        param_optimizer = list(model.named_parameters())
        # hack to remove the pooler, which is not used
        # and thus produces None grads that break apex
        param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
        # Discriminative (layer-wise) learning rates
        bert_param_optimizer = [(n, p) for n, p in param_optimizer if ("bert" in n)]
        other_param_optimizer = [(n, p) for n, p in param_optimizer if ("bert" not in n)]



        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        # optimizer_grouped_parameters = [
        #     {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        #     {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]

        optimizer_grouped_parameters = [
            {"params": [p for n, p in bert_param_optimizer if not any(nd in n for nd in no_decay)],
             "weight_decay": 0.01, "lr": 2e-5},
            {"params": [p for n, p in bert_param_optimizer if any(nd in n for nd in no_decay)],
             "weight_decay": 0.0, "lr": 2e-5},
            {"params": [p for n, p in other_param_optimizer if not any(nd in n for nd in no_decay)],
             "weight_decay": 0.01, "lr": args.learning_rate},
            {"params": [p for n, p in other_param_optimizer if any(nd in n for nd in no_decay)],
             "weight_decay": 0.0, "lr": args.learning_rate}
        ]

        t_total = num_train_steps
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(t_total * args.warmup_proportion),
                                                    num_training_steps=t_total)
        global_step = 0

        logger.info("***** Running training *****")
        logger.info("  Num post-response pairs = %d", len(dataManager.data['train']['resp']))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        model.train()
        losses = []
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            model.zero_grad()
            dataManager.restart(key='train', batch_size=args.train_batch_size)
            data = dataManager.get_next_batch(key='train')
            step = 0
            loss_value = 0
            kg_loss_value = 0
            kg_acc_value = 0
            while data is not None:
                if n_gpu == 1:
                    preprocess_batch(data, device)  # multi-GPU does the scattering itself
                else:
                    preprocess_batch(data)
                loss, kg_loss, kg_acc = model(data, data['labels'])
                loss = loss + args.lamb * kg_loss
                if n_gpu > 1:
                    loss = loss.mean() # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss_value += loss.cpu().item() * args.gradient_accumulation_steps
                kg_loss_value += kg_loss.cpu().item()
                kg_acc_value += kg_acc.cpu().item()
                loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # modify learning rate with special warm up BERT uses
                    # lr_this_step = args.learning_rate * warmup_linear(global_step / t_total, args.warmup_proportion)
                    # for param_group in optimizer.param_groups:
                    #     param_group['lr'] = lr_this_step
                    optimizer.step()
                    scheduler.step()
                    model.zero_grad()
                    global_step += 1

                    # Record the current loss at each parameter update
                    losses.append(loss.detach().cpu().item())

                if (step + 1) % 1000 == 0:
                    logger.info("step:{} | loss@{} | kg_loss@{} | kg_acc@{}".format(step + 1,
                                                                                    loss_value / 1000,
                                                                                    kg_loss_value / 1000,
                                                                                    kg_acc_value / 1000))
                    loss_value = 0
                    kg_loss_value = 0
                    kg_acc_value = 0

                step += 1
                data = dataManager.get_next_batch(key='train')

            logger.info(f"Saving model pytorch_model.{int(args.num_train_epochs)}.{epoch+1}.bin")
            output_model_file = os.path.join(args.model_dir,
                                             "pytorch_model.%d.%d.bin" % (int(args.num_train_epochs), epoch + 1))

            # Save a trained model
            model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
            torch.save(model_to_save.state_dict(), output_model_file)

        # Save all the loss values
        logger.info("Saving the losses recorded during training")
        save_losses(args.model_dir, losses={"loss": losses})
        logger.info("Training finished")
    # Load a trained model that you have fine-tuned

    if args.do_predict:
        total_epoch = int(args.num_train_epochs)
        chosen_epoch = 10

        if not args.no_cuda:
            if not "CUDA_VISIBLE_DEVICES" in os.environ:
                os.environ["CUDA_VISIBLE_DEVICES"] = '0'

        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()

        seed_everything(args.seed)

        output_model_file = os.path.join(args.model_dir, "pytorch_model.%d.%d.bin" %
                                         (total_epoch,
                                          chosen_epoch))

        model_state_dict = torch.load(output_model_file)
        model = BERTRetrieval(num_choices=args.num_choices, bert_config_file=args.bert_config_file, init_embeddings=embed)
        model.load_state_dict(model_state_dict)
        model.to(device)

        if n_gpu > 1:
            model = torch.nn.DataParallel(model)

        metric = MyMetrics()

        logger.info("***** Running testing *****")
        logger.info("  Num post-response pairs = %d", len(dataManager.data['test']['resp']))
        logger.info("  Batch size = %d", args.predict_batch_size)

        model.eval()
        logger.info("Start evaluating")
        dataManager.restart(key='test', batch_size=args.predict_batch_size, shuffle=False)
        data = dataManager.get_next_batch(key='test')

        gens = []
        gold = []
        choices = []

        hits = {1: [0, 0], 3:[0, 0], 5: [0, 0]}
        while data is not None:
            preprocess_batch(data, device)
            truth_response, can_responses = data['resp'], data['can_resps']

            with torch.no_grad():
                prob, pred = model(data)

            assert len(pred) == len(truth_response)
            assert len(pred) == len(can_responses)
            assert len(can_responses[0]) == args.num_choices

            for truth, pd, cans, prb in zip(truth_response, pred, can_responses, prob):
                metric.forword(truth, cans[pd])

                gold.append(truth)
                gens.append(cans[pd])
                choices.append(cans)

                idx = cans.index(truth)
                p_sort = np.argsort(prb)
                for key, count in hits.items():
                    if idx in p_sort[-key:]:
                        count[0] += 1
                    count[1] += 1

            data = dataManager.get_next_batch(key='test')

        result = metric.close()
        result.update({'hits@%d' % key: value[0] / value[1] for key, value in hits.items()})

        output_prediction_file = args.output_dir + f"/{args.name}_test.{total_epoch}.{chosen_epoch}.txt"
        with open(output_prediction_file, "w", encoding="utf-8") as f:
            print("Test Result:")
            res_print = list(result.items())
            res_print.sort(key=lambda x: x[0])
            for key, value in res_print:
                if isinstance(value, float):
                    print("\t%s:\t%f" % (key, value))
                    f.write("%s:\t%f\n" % (key, value))
            f.write('\n')

            for resp, gen, options in zip(gold, gens, choices):
                f.write("resp:\t%s\n" % resp)
                f.write("gen:\t%s\n\n" % gen)
                for i, option in enumerate(options):
                    f.write("candidate %d:\t%s\n" % (i, option))
                f.write("\n")
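
The hits@k bookkeeping in the retrieval loop above checks whether the index of the ground-truth response lands among the k highest-scored candidates, using the ascending order returned by np.argsort. A standalone illustration of that check (the function name here is hypothetical):

import numpy as np


def hits_at_k(scores, true_idx, k):
    # True if the ground-truth candidate is among the k highest-scored ones.
    topk = np.argsort(scores)[-k:]
    return true_idx in topk


scores = np.array([0.10, 0.70, 0.05, 0.15])
print(hits_at_k(scores, true_idx=1, k=1))  # True: index 1 has the highest score
print(hits_at_k(scores, true_idx=2, k=3))  # False: index 2 is ranked last
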
Example #5
                        help="number of arenas")
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--print_interval", type=int, default=20)
    parser.add_argument("--gpu",
                        action="store_true",
                        default=False,
                        help="Use GPU if available (default device)")
    parser.add_argument("--tag",
                        action="append",
                        nargs="+",
                        default=[],
                        help="add user tags to run")

    args = parser.parse_args()

    seed_everything(args.seed)

    token = os.environ.get('TELEGRAM_BOT_TOKEN', None)
    channel = os.environ.get('TELEGRAM_CHANNEL', None)
    if args.notg:
        token = None
        channel = None

    commit_hash = subprocess.check_output(
        ['git', 'rev-parse', '--short', 'HEAD']).decode('ascii').strip('\n')
    experiment_tags = ["dqn_v1", f"commit_{str(commit_hash)}"]
    tags = list(np.asarray(args.tag).flatten())
    experiment_tags.extend(tags)

    tgwriter = TelegramWriter(token, channel)
    with tgwriter.post() as f:
Example #6
def train(args, processor, model):
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_dataset = NER_dataset(
        load_and_cache_examples(args, processor, data_type='train'),
        args.train_max_seq_len)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  collate_fn=collate_fn)
    t_total = len(train_dataloader) * args.epoch

    transformer_param_optimizer = list(model.transformer.parameters())
    crf_param_optimizer = list(model.crf.parameters())
    linear_param_optimizer = list(model.out_fc.parameters())

    optimizer_grouped_parameters = [
        {
            'params': transformer_param_optimizer,
            'lr': args.learning_rate
        },
        {
            'params': crf_param_optimizer,
            'lr': args.crf_learning_rate
        },
        {
            'params': linear_param_optimizer,
            'lr': args.crf_learning_rate
        },
    ]
    args.warmup_steps = int(t_total * args.warmup_rate)
    if args.optim == 'sgd':
        optimizer = optim.SGD(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              momentum=args.momentum_rate)
    elif args.optim == 'adam':
        optimizer = optim.Adam(optimizer_grouped_parameters,
                               lr=args.learning_rate)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)
    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.epoch)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed) = %d",
        args.train_batch_size *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    steps_trained_in_current_epoch = 0

    best_f1 = 0.0
    tr_loss = 0.0
    model.zero_grad()
    # Added here for reproducibility (even between Python 2 and 3)
    seed_everything(args.seed)
    for index in range(int(args.epoch)):
        for step, batch in enumerate(train_dataloader):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                "input_ids": batch[0],
                "input_mask": batch[1],
                "labels": batch[2],
                'input_lens': batch[3]
            }
            outputs = model(**inputs)
            # model outputs are always tuple in pytorch-transformers (see doc)
            loss = outputs[0]
            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            loss.backward()
            tr_loss += loss.item()
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           args.max_grad_norm)
            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            model.zero_grad()
            if global_step % args.log_steps == 0:
                logger.info(
                    "training process —— epoch:%d —— global_step-%d —— loss-%.4f"
                    % (index + 1, global_step + 1, loss.item()))
            global_step += 1
        if args.local_rank in [-1, 0]:
            # Log metrics
            print(" ")
            if args.local_rank == -1:
                # Only evaluate when single GPU otherwise metrics may not average well
                eval_results = evaluate(args, processor, model)
                if eval_results['f1'] > best_f1:
                    logger.info(
                        f"\nEpoch {index+1}: eval_f1 improved from {best_f1} to {eval_results['f1']}"
                    )
                    output_dir = os.path.join(args.output_dir, "best_model")
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    torch.save(model_to_save.state_dict(),
                               os.path.join(output_dir, "pytorch_model.bin"))
                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)
                    best_f1 = eval_results['f1']
        if 'cuda' in str(args.device):
            torch.cuda.empty_cache()
    return global_step, tr_loss / global_step
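
The DataLoader in this example depends on a custom collate_fn to pad variable-length NER sequences into batch tensors. A hypothetical sketch of such a function, assuming each dataset item is an (input_ids, input_mask, labels, input_len) tuple whose first three fields are 1-D LongTensors:

import torch


def collate_fn(batch):
    # Hypothetical sketch: pad each field to the longest sequence in the batch.
    input_ids, input_mask, labels, input_lens = zip(*batch)
    max_len = max(int(length) for length in input_lens)

    def pad(seqs, pad_value=0):
        out = torch.full((len(seqs), max_len), pad_value, dtype=torch.long)
        for i, seq in enumerate(seqs):
            out[i, :len(seq)] = seq
        return out

    return (pad(input_ids), pad(input_mask), pad(labels),
            torch.tensor(input_lens, dtype=torch.long))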