def main():
    # Set up model training arguments
    args = set_args()

    # Configure CUDA
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device

    # Get the device used for training
    device = torch.device(
        "cuda" if torch.cuda.is_available() and int(args.device) >= 0 else "cpu"
    )

    # Set random seeds for reproducibility (checking "is not None" so that seed 0 also works)
    if args.seed is not None:
        torch.manual_seed(args.seed)
        random.seed(args.seed)
        np.random.seed(args.seed)
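        # Also seed CUDA for reproducibility on GPU; per the PyTorch docs this
        # is silently ignored when CUDA is unavailable.
        torch.cuda.manual_seed_all(args.seed)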

    # Load the model config
    model_config = GPT2Config.from_json_file(args.config_path)

    if args.pretrained_model_path:
        model = GPT2LMHeadModel.from_pretrained(args.pretrained_model_path)
    else:
        # If no pretrained model is given, initialize the model from scratch
        model = GPT2LMHeadModel(config=model_config)

    # Instantiate the tokenizer
    tokenizer = BertTokenizer.from_pretrained(args.vocab_path, do_lower_case=True)

    # Treat "[Space]" as a single unit. For example, on "我爱[Space]中国。" the raw
    # tokenizer yields "['我', '爱', '[', 'Space', ']', '中', '国', '。']";
    # after adding the token it yields "['我', '爱', '[Space]', '中', '国', '。']".
    tokenizer.add_tokens("[Space]", special_tokens=True)
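    # Optional sanity check (an added suggestion): the marker should now be a
    # single entry in the tokenizer's vocabulary.
    assert "[Space]" in tokenizer.get_vocab()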
    # Create the model output directory
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    # Load the training and test data
    train_data = GPT2NewsTitleDataSet(
        tokenizer,
        args.max_len,
        args.title_max_len,
        args.data_dir,
        "train",
        args.train_file_path,
    )
    test_data = GPT2NewsTitleDataSet(
        tokenizer,
        args.max_len,
        args.title_max_len,
        args.data_dir,
        "test",
        args.test_file_path,
    )
    # Start training
    train(model, device, train_data, test_data, args)
Example 2
def start_server():
    args = set_args()
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICE"] = args.device
    device = torch.device("cuda" if torch.cuda.is_available()
                          and int(args.device) >= 0 else "cpu")
    # Instantiate the tokenizer and model
    tokenizer = BertTokenizer.from_pretrained(args.vocab_path,
                                              do_lower_case=True)
    model = GPT2LMHeadModel.from_pretrained(args.output_dir)
    model.to(device)
    model.eval()
    print("load model ending!")
    app = Flask(__name__)

    @app.route('/')
    def index():
        return "This is News Title Generate Model Server"

    @app.route('/news-title-generate', methods=['GET', 'POST'])
    def response_request():
        if request.method == 'POST':
            content = request.form.get('content')
            titles = predict_one_sample(model, tokenizer, device, args,
                                        content)
            title_str = ""
            for i, t in enumerate(titles):
                title_str += "生成的第{}个标题为:{}\n".format(i + 1, t)
            return render_template("index_ok.html",
                                   content=content,
                                   titles=title_str)
        return render_template("index.html")

    server = wsgi.WSGIServer((str(args.http_id), args.port), app)
    server.serve_forever()
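A minimal client for the endpoint above might look like the following sketch
(host and port are placeholders that must match args.http_id and args.port):

import requests

resp = requests.post("http://127.0.0.1:8080/news-title-generate",
                     data={"content": "..."})  # form field read by request.form.get('content')
print(resp.text)  # rendered HTML containing the generated titles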
Example 3
def setup(data_folder):
    np.random.seed(0)
    torch.cuda.manual_seed_all(0)
    torch.manual_seed(0)

    codec = get_encoder()

    dataset = NewsDataset(path=data_folder,
                          ctx_length=128,
                          codec=codec,
                          start_from_zero=True)

    config = GPT2Config()
    model = GPT2LMHeadModel(config)
    if not os.path.exists('gpt2-pytorch_model.bin'):
        print("Downloading GPT-2 checkpoint...")
        url = 'https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin'
        r = requests.get(url, allow_redirects=True)
        r.raise_for_status()  # fail loudly on a bad download
        with open('gpt2-pytorch_model.bin', 'wb') as f:
            f.write(r.content)

    # NOTE: 'device' is not defined in this function; the original source
    # presumably defines it at module level.
    model = load_weight(
        model, torch.load('gpt2-pytorch_model.bin', map_location=device))
    model = model.to(device)
    model.eval()
    return codec, model, dataset, config
Example 4
def main():
    """主函数"""
    # Set prediction arguments
    args = set_args()
    # Get device info
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    device = torch.device("cuda" if torch.cuda.is_available()
                          and int(args.device) >= 0 else "cpu")
    # Instantiate the tokenizer and model
    tokenizer = BertTokenizer.from_pretrained(args.vocab_path,
                                              do_lower_case=True)
    model = GPT2LMHeadModel.from_pretrained(args.model_path)
    model.to(device)
    model.eval()
    print('Generating titles for news articles; press Ctrl+Z (EOF) to exit')
    try:
        while True:
            content = input("输入的新闻正文为:")
            titles = predict_one_sample(model, tokenizer, device, args,
                                        content)
            for i, title in enumerate(titles):
                print("生成的第{}个标题为:{}".format(i + 1, title))
    except (KeyboardInterrupt, EOFError):
        pass
Example 5
def main():
    # Set up model training arguments
    args = set_args()

    # Set random seeds for reproducibility (checking "is not None" so that seed 0 also works)
    if args.seed is not None:
        torch.manual_seed(args.seed)
        random.seed(args.seed)
        np.random.seed(args.seed)
    # Load the model config
    model_config = GPT2Config.from_json_file(args.config_path)

    # Instantiate GPT2LMHeadModel: load pretrained weights if a path is given, otherwise train from scratch.
    if args.pretrained_model_path:
        model = GPT2LMHeadModel.from_pretrained(args.pretrained_model_path)
    else:
        # If no pretrained model is given, initialize the model from scratch
        model = GPT2LMHeadModel(config=model_config)

    tokenizer = BertTokenizer.from_pretrained(args.vocab_path,
                                              do_lower_case=True)

    # Treat "[Space]" as a single unit. For example, on "我爱[Space]中国。" the raw
    # tokenizer yields "['我', '爱', '[', 'Space', ']', '中', '国', '。']";
    # after adding the token it yields "['我', '爱', '[Space]', '中', '国', '。']".
    tokenizer.add_tokens("[Space]", special_tokens=True)

    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    # Load the training and test data
    train_data = GPT2NewsTitleDataSet(tokenizer, args.max_len,
                                      args.title_max_len, args.data_dir,
                                      "train", args.train_file_path)
    test_data = GPT2NewsTitleDataSet(tokenizer, args.max_len,
                                     args.title_max_len, args.data_dir, "test",
                                     args.test_file_path)
    # Start training
    train(model, train_data, test_data, args)
Example 6
def setup(n_enc_layer=1):
    np.random.seed(0)
    torch.cuda.manual_seed_all(0)
    torch.manual_seed(0)

    codec = get_encoder()
    config = GPT2Config(n_enc_layer=n_enc_layer)
    model = GPT2LMHeadModel(config)
    if not os.path.exists('../gpt2-pytorch_model.bin'):
        print("Downloading GPT-2 checkpoint...")
        url = 'https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin'
        r = requests.get(url, allow_redirects=True)
        r.raise_for_status()  # fail loudly on a bad download
        with open('../gpt2-pytorch_model.bin', 'wb') as f:
            f.write(r.content)

    model = load_weight(
        model, torch.load('../gpt2-pytorch_model.bin', map_location=device))
    model = model.to(device)
    return codec, model, config
Example 7
def main():
    args = set_args()

    # Instantiate the tokenizer and model
    tokenizer = BertTokenizer.from_pretrained(args.vocab_path,
                                              do_lower_case=True)
    model = GPT2LMHeadModel.from_pretrained(args.model_path)

    if torch.cuda.is_available():
        model.cuda()

    model.eval()
    print('Generating titles for news articles; press Ctrl+Z (EOF) to exit')
    try:
        while True:
            content = input("输入的新闻正文为:")
            titles = predict_one_sample(model, tokenizer, args, content)
            for i, title in enumerate(titles):
                print("生成的第{}个标题为:{}".format(i + 1, title))
    except (KeyboardInterrupt, EOFError):
        pass
Example 8
def main():

    parser = argparse.ArgumentParser()

    parser.add_argument("--train_dataset",
                        type=str,
                        default="data/corpus.small",
                        help="train dataset")
    parser.add_argument(
        "--test_dataset",
        type=str,
        default="data/corpus.small",
        help="test set for evaluation",
    )
    parser.add_argument("--vocab_file", default="gpt2-vocab.json", type=str)
    parser.add_argument("--merges_file", default="gpt2-merges.txt", type=str)
    parser.add_argument("--output_path",
                        default="output/",
                        type=str,
                        help="save path")
    parser.add_argument("--restore_file",
                        default=None,
                        type=str,
                        help="the path for pretrained model")

    parser.add_argument("--seq_len",
                        type=int,
                        default=128,
                        help="maximum sequence len")

    parser.add_argument("--batch_size",
                        type=int,
                        default=8,
                        help="number of batch_size")
    parser.add_argument("--epochs",
                        type=int,
                        default=5,
                        help="number of epochs")
    parser.add_argument("--num_workers",
                        type=int,
                        default=0,
                        help="dataloader worker size")

    parser.add_argument("--lr",
                        type=float,
                        default=3e-4,
                        help="learning rate of adam")
    parser.add_argument("--adam_weight_decay",
                        type=float,
                        default=0.01,
                        help="weight_decay of adam")
    parser.add_argument("--adam_beta1",
                        type=float,
                        default=0.98,
                        help="adam first beta value")
    parser.add_argument("--adam_beta2",
                        type=float,
                        default=0.999,
                        help="adam first beta value")
    parser.add_argument("--warmup_steps",
                        type=int,
                        default=1000,
                        help="warmup steps")
    parser.add_argument(
        "--accumulate_gradient_steps",
        type=int,
        default=1,
        help="accumulate gradient steps",
    )

    args = parser.parse_args()

    print("building tokenizer")
    tokenizer = build_tokenizer(
        vocab_file=args.vocab_file,
        merges_file=args.merges_file,
        tokenizer_type="GPT2BPETokenizer",
    )

    print("building train dataset")
    train_dataset = GPTDataset(args.train_dataset, tokenizer, args.seq_len)

    print("building test dataset")
    test_dataset = GPTDataset(args.test_dataset, tokenizer, args.seq_len)

    print("building train dataloader")
    train_data_loader = DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   num_workers=args.num_workers)

    print("building test dataloader")
    test_data_loader = DataLoader(
        test_dataset,
        shuffle=False,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
    )

    print("building model")
    config = GPT2Config()
    model = GPT2LMHeadModel(config)

    if args.restore_file is not None:
        model.load_state_dict(flow.load(args.restore_file))
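    # Tie the LM head to the input token embedding (standard GPT-2 weight sharing).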
    model.lm_head.weight = model.transformer.wte.weight

    trainer = Trainer(
        model,
        train_dataloader=train_data_loader,
        test_dataloader=test_data_loader,
        epoch=args.epochs,
        lr=args.lr,
        betas=(args.adam_beta1, args.adam_beta2),
        weight_decay=args.adam_weight_decay,
        warmup_steps=args.warmup_steps,
        accumulate_gradient_steps=args.accumulate_gradient_steps,
        output_path=args.output_path,
    )

    print("begin training")
    trainer.train()
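A typical invocation of this script (the file name here is hypothetical) might be:

    python train_gpt2.py --train_dataset data/corpus.small --batch_size 8 --epochs 5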
Example 9
def main():

    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--train_dataset",
        required=False,
        type=str,
        default="data/corpus.small",
        help="train dataset",
    )
    parser.add_argument(
        "--test_dataset",
        type=str,
        default="data/corpus.small",
        help="test set for evaluation",
    )
    parser.add_argument("--vocab_file",
                        required=False,
                        default="vocab.json",
                        type=str)
    parser.add_argument("--merges_file",
                        required=False,
                        default="merge.txt",
                        type=str)
    parser.add_argument(
        "--output_path",
        required=False,
        default="output/model",
        type=str,
        help="save path",
    )

    parser.add_argument("--seq_len",
                        type=int,
                        default=128,
                        help="maximum sequence len")

    parser.add_argument("--batch_size",
                        type=int,
                        default=4,
                        help="number of batch_size")
    parser.add_argument("--epochs",
                        type=int,
                        default=50,
                        help="number of epochs")
    parser.add_argument("--num_workers",
                        type=int,
                        default=0,
                        help="dataloader worker size")

    # NOTE: argparse's type=bool treats any non-empty string (including "false")
    # as True; prefer action="store_true"/"store_false" in new code.
    parser.add_argument(
        "--with_cuda",
        type=bool,
        default=True,
        help="training with CUDA: true, or false",
    )

    parser.add_argument("--lr",
                        type=float,
                        default=1e-4,
                        help="learning rate of adam")
    parser.add_argument("--adam_weight_decay",
                        type=float,
                        default=0.01,
                        help="weight_decay of adam")
    parser.add_argument("--adam_beta1",
                        type=float,
                        default=0.9,
                        help="adam first beta value")
    parser.add_argument("--adam_beta2",
                        type=float,
                        default=0.999,
                        help="adam first beta value")

    args = parser.parse_args()

    print("building tokenizer")
    tokenizer = build_tokenizer(
        vocab_file=args.vocab_file,
        merges_file=args.merges_file,
        tokenizer_type="GPT2BPETokenizer",
    )

    print("building train dataset")
    train_dataset = GPTDataset(args.train_dataset, tokenizer, args.seq_len)

    print("building train dataloader")
    train_data_loader = DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   num_workers=args.num_workers)

    # Grab one fixed batch (the third) so both frameworks are benchmarked on identical data.
    for i, b in enumerate(train_data_loader):
        if i == 2:
            batch = b
            break

    of_batch = batch.cuda()

    print("building model")
    config = GPT2Config()

    # The same tokens as of_batch, converted to a PyTorch CUDA tensor for the PyTorch run.
    pt_batch = torch.from_numpy(batch.numpy()).long().cuda()

    model = pt_GPT2LMHeadModel(config)

    model.load_state_dict(torch.load("gpt2_model.pt"))
    model.lm_head.weight = model.transformer.wte.weight

    model.cuda()
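    # eval() stays on during the timing loop, presumably so dropout is disabled
    # and both frameworks compute the same deterministic loss.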
    model.eval()

    pt_optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=0.0001,
        betas=(args.adam_beta1, args.adam_beta2),
        weight_decay=args.adam_weight_decay,
    )

    for_time = 0.0
    bp_time = 0.0
    update_time = 0.0
    pt_loss = list()
    loss = None
    print("start pytorch training loop....")
    start_t = time.time()
    for epoch in range(args.epochs):
        s_t = time.time()
        loss = model(pt_batch, labels=pt_batch)[0]
        for_time += time.time() - s_t

        pt_loss.append(loss.item())

        s_t = time.time()
        loss.backward()
        bp_time += time.time() - s_t

        s_t = time.time()
        pt_optimizer.step()
        pt_optimizer.zero_grad()
        update_time += time.time() - s_t

    end_t = time.time()

    print("pytorch traning loop avg time : {}".format(
        (end_t - start_t) / args.epochs))
    print("forward avg time : {}".format(for_time / args.epochs))
    print("backward avg time : {}".format(bp_time / args.epochs))
    print("update parameters avg time : {}".format(update_time / args.epochs))

    pt_parameters_names = []
    pt_parameters_value = []
    for name, param in model.named_parameters():
        pt_parameters_names.append(name)
        pt_parameters_value.append(param.cpu().detach().numpy())

    model = GPT2LMHeadModel(config)

    model.load_state_dict(flow.load("gpt2_oneflow_model"))
    model.lm_head.weight = model.transformer.wte.weight

    model.cuda()
    model.eval()

    optimizer = flow.optim.AdamW(
        model.parameters(),
        lr=0.0001,
        betas=(args.adam_beta1, args.adam_beta2),
        weight_decay=args.adam_weight_decay,
    )

    for_time = 0.0
    bp_time = 0.0
    update_time = 0.0
    of_loss = list()

    print("start oneflow training loop....")
    start_t = time.time()
    for epoch in range(args.epochs):
        s_t = time.time()
        loss = model(of_batch, labels=of_batch)[0]
        for_time += time.time() - s_t

        of_loss.append(loss.numpy())

        s_t = time.time()
        loss.backward()
        bp_time += time.time() - s_t

        s_t = time.time()
        optimizer.step()
        optimizer.zero_grad()
        update_time += time.time() - s_t

    end_t = time.time()

    print("oneflow traning loop avg time : {}".format(
        (end_t - start_t) / args.epochs))
    print("forward avg time : {}".format(for_time / args.epochs))
    print("backward avg time : {}".format(bp_time / args.epochs))
    print("update parameters avg time : {}".format(update_time / args.epochs))

    for i in range(args.epochs):
        print(i, of_loss[i], pt_loss[i])

    import matplotlib.pyplot as plt

    plt.switch_backend("agg")
    epochs = np.arange(1, args.epochs + 1)

    plt.plot(epochs, of_loss, label="oneflow")
    plt.plot(epochs, pt_loss, label="pytorch")
    plt.legend()
    plt.savefig("./1.jpg")
    plt.show()
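    # Quick numerical check (a sketch): with identical weights and identical
    # data, the two loss curves should track each other closely.
    max_dev = float(np.max(np.abs(np.array(of_loss).ravel() - np.array(pt_loss))))
    print("max loss deviation between frameworks:", max_dev)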
Example 10
def text_generator(state_dict):
    parser = argparse.ArgumentParser()
    parser.add_argument("--text", type=str, required=True)
    parser.add_argument("--quiet", type=bool, default=False)
    parser.add_argument("--nsamples", type=int, default=1)
    parser.add_argument('--unconditional',
                        action='store_true',
                        help='If true, unconditional generation.')
    parser.add_argument("--batch_size", type=int, default=-1)
    parser.add_argument("--length", type=int, default=-1)
    parser.add_argument("--temperature", type=float, default=0.7)
    parser.add_argument("--top_k", type=int, default=40)
    args = parser.parse_args()

    if args.quiet is False:
        print(args)

    if args.batch_size == -1:
        args.batch_size = 1
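    # Samples are generated one full batch at a time, so nsamples must divide evenly.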
    assert args.nsamples % args.batch_size == 0

    seed = random.randint(0, 2147483647)
    np.random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load Model
    enc = get_encoder()
    config = GPT2Config()
    model = GPT2LMHeadModel(config)
    model = load_weight(model, state_dict)
    model.to(device)
    model.eval()

    if args.length == -1:
        args.length = config.n_ctx // 2
    elif args.length > config.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" %
                         config.n_ctx)

    print(args.text)
    context_tokens = enc.encode(args.text)

    generated = 0
    for _ in range(args.nsamples // args.batch_size):
        out = sample_sequence(
            model=model,
            length=args.length,
            context=context_tokens if not args.unconditional else None,
            start_token=enc.encoder['<|endoftext|>']
            if args.unconditional else None,
            batch_size=args.batch_size,
            temperature=args.temperature,
            top_k=args.top_k,
            device=device)
        out = out[:, len(context_tokens):].tolist()
        for i in range(args.batch_size):
            generated += 1
            text = enc.decode(out[i])
            if args.quiet is False:
                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
            print(text)
Example 11
def text_generator():
    parser = argparse.ArgumentParser()
    parser.add_argument("--text", type=str, required=True)
    parser.add_argument("--nsamples", type=int, default=1)
    parser.add_argument(
        "--unconditional",
        action="store_true",
        help="If true, unconditional generation.",
    )
    parser.add_argument("--batch_size", type=int, default=-1)
    parser.add_argument("--length", type=int, default=-1)
    parser.add_argument("--temperature", type=float, default=0.7)
    parser.add_argument("--top_k", type=int, default=40)
    parser.add_argument("--seed", type=int, default=1234)
    args = parser.parse_args()

    print(args)

    if args.batch_size == -1:
        args.batch_size = 1
    assert args.nsamples % args.batch_size == 0

    random.seed(args.seed)
    np.random.seed(args.seed)
    flow.manual_seed(args.seed)

    device = flow.device("cuda")

    tokenizer = build_tokenizer(vocab_file="vocab.json",
                                merges_file="merge.txt")
    config = GPT2Config()
    model = GPT2LMHeadModel(config)

    # convert_pt_checkpoint_to_of(model, pt_checkpoint_path="gpt2-pytorch_model.bin", of_checkpoint_path="gpt2_oneflow_model")

    state_dict = flow.load("gpt2_oneflow_model")
    model.load_state_dict(state_dict)
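    # tie_embeddings() presumably shares the LM head weights with the token
    # embedding, mirroring the manual lm_head/wte tying in the other examples.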
    model.tie_embeddings()

    model.to(device)
    model.eval()

    if args.length == -1:
        args.length = config.n_ctx // 2
    elif args.length > config.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" %
                         config.n_ctx)

    text = args.text
    print(text)

    context_tokens = tokenizer.tokenize(text)
    generated = 0
    for _ in range(args.nsamples // args.batch_size):
        out = sample_sequence(
            model=model,
            length=args.length,
            context=context_tokens if not args.unconditional else None,
            start_token=tokenizer.vocab["<|endoftext|>"]
            if args.unconditional else None,
            batch_size=args.batch_size,
            temperature=args.temperature,
            top_k=args.top_k,
            device=device,
        )
        out = out[:, len(context_tokens):].tolist()
        for i in range(args.batch_size):
            generated += 1
            text = tokenizer.detokenize(out[i])
            print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
            print(text)
Example 12
def get_model(device, vocab_path, model_path):
    tokenizer = BertTokenizer.from_pretrained(vocab_path, do_lower_case=True)
    model = GPT2LMHeadModel.from_pretrained(model_path)
    model.to(device)
    model.eval()
    return tokenizer, model
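Typical usage of this helper (the paths below are placeholders):

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer, model = get_model(device, "vocab/vocab.txt", "trained_model/")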
Example 13
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("--vocab_file", default="gpt2-vocab.json", type=str)
    parser.add_argument("--merges_file", default="gpt2-merges.txt", type=str)
    parser.add_argument(
        "--restore_file",
        default="gpt2_oneflow_model",
        type=str,
        help="Path to pre-trained model",
    )
    parser.add_argument("--prompt", type=str, default="")
    parser.add_argument("--length", type=int, default=20)
    parser.add_argument("--temperature", type=float, default=1.0)
    parser.add_argument("--top_k", type=int, default=1)
    parser.add_argument("--top_p", type=float, default=0.9)
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Avoid using CUDA when available")
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")
    args = parser.parse_args()

    args.device = flow.device("cuda" if not args.no_cuda else "cpu")

    set_seed(args)

    tokenizer = build_tokenizer(
        vocab_file=args.vocab_file,
        merges_file=args.merges_file,
        tokenizer_type="GPT2BPETokenizer",
    )
    config = GPT2Config()
    model = GPT2LMHeadModel(config)
    if args.restore_file is not None:
        model.load_state_dict(flow.load(args.restore_file))
    model.lm_head.weight = model.transformer.wte.weight
    model.to(args.device)
    model.eval()

    if args.length < 0 and config.max_position_embeddings > 0:
        args.length = config.max_position_embeddings
    elif 0 < config.max_position_embeddings < args.length:
        args.length = config.max_position_embeddings  # no generation longer than the model window
    elif args.length < 0:
        args.length = MAX_LENGTH  # avoid infinite loop

    print(args)
    while True:
        raw_text = args.prompt if args.prompt else input("Model prompt >>> ")
        context_tokens = tokenizer.tokenize(raw_text)
        out = sample_sequence(
            model=model,
            context=context_tokens,
            length=args.length,
            temperature=args.temperature,
            top_k=args.top_k,
            top_p=args.top_p,
            device=args.device,
        )
        out = out[0, len(context_tokens):].tolist()
        text = tokenizer.detokenize(out)
        print(text)
        if args.prompt:
            break
    return text