Example #1
import os

import torch
from torch.nn import DataParallel
from transformers import GPT2Config, GPT2LMHeadModel

# `config` is the project-level settings module referenced below (checkpoint_path,
# model_config, cuda, device); its import path is an assumption of this snippet.
import config


def create_model(vocab_size, device):
    """

    :param vocab_size:字典大小
    :param device: 设备CPU/GPU
    :return:
    """
    if os.path.exists(config.checkpoint_path):  # 如果上次训练文件存在
        model = GPT2LMHeadModel.from_pretrained(config.checkpoint_path)
    else:  # no pretrained checkpoint was given, so initialize a fresh model from the config file
        model_config = GPT2Config.from_json_file(config.model_config)
        model = GPT2LMHeadModel(config=model_config)
    # resize the GPT-2 token embeddings to match the tokenizer's vocabulary size
    model.resize_token_embeddings(vocab_size)
    n_ctx = model.config.to_dict().get("n_ctx")  # context window size from the model config
    # print('model config:\n{}'.format(model.config.to_json_string()))
    model = model.to(device)
    # use multiple GPUs in parallel when requested and available
    if config.cuda and torch.cuda.device_count() > 1 and ',' in config.device:
        print("开始使用多GPU进行训练")
        model = DataParallel(model, device_ids=[int(i.strip()) for i in config.device.split(',')])
        multi_gpu = True
    elif config.cuda:
        print('Training on a single GPU')
        multi_gpu = False
    else:
        print('Training on CPU')
        multi_gpu = False
    # count the number of model parameters
    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of model parameters: {}'.format(num_parameters))
    return model, n_ctx, multi_gpu
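
A minimal usage sketch for the helper above, assuming the project-level config module from the example and a Hugging Face tokenizer built elsewhere (neither is defined here):

# Usage sketch (assumptions: `config` module and `tokenizer` are provided by the project).
import torch

device = torch.device("cuda" if config.cuda and torch.cuda.is_available() else "cpu")
model, n_ctx, multi_gpu = create_model(vocab_size=len(tokenizer), device=device)
print("context window: {}, multi-GPU: {}".format(n_ctx, multi_gpu))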
Example #2
    def get_pipeline(self):
        # When
        config = GPT2Config(
            vocab_size=263,
            n_ctx=128,
            max_length=128,
            n_embd=64,
            n_layer=1,
            n_head=8,
            bos_token_id=256,
            eos_token_id=257,
        )
        model = GPT2LMHeadModel(config)
        # Force the model to always output token id 76, which decodes to "L"
        V, D = model.lm_head.weight.shape
        bias = torch.zeros(V)
        bias[76] = 1
        weight = torch.zeros((V, D), requires_grad=True)

        model.lm_head.bias = torch.nn.Parameter(bias)
        model.lm_head.weight = torch.nn.Parameter(weight)

        # # Created with:
        # import tempfile

        # from tokenizers import Tokenizer, models
        # from transformers.tokenization_utils_fast import PreTrainedTokenizerFast

        # vocab = [(chr(i), i) for i in range(256)]
        # tokenizer = Tokenizer(models.Unigram(vocab))
        # with tempfile.NamedTemporaryFile() as f:
        #     tokenizer.save(f.name)
        #     real_tokenizer = PreTrainedTokenizerFast(tokenizer_file=f.name, eos_token="<eos>", bos_token="<bos>")

        # real_tokenizer._tokenizer.save("dummy.json")
        # Special tokens are automatically added at load time.
        tokenizer = AutoTokenizer.from_pretrained(
            "Narsil/small_conversational_test")
        conversation_agent = pipeline(task="conversational",
                                      device=DEFAULT_DEVICE_NUM,
                                      model=model,
                                      tokenizer=tokenizer)
        return conversation_agent
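
A brief usage sketch of the returned pipeline, assuming a transformers release that still ships the Conversation class and the "conversational" task (later releases removed them) and that the call happens inside the same test class:

# Usage sketch (assumption: transformers still provides Conversation and the
# "conversational" pipeline task; `self` refers to the test class above).
from transformers import Conversation

conversation_agent = self.get_pipeline()
conversation = Conversation("Hello there")   # any prompt works
result = conversation_agent(conversation)
# The zeroed lm_head weight plus bias[76] = 1 makes greedy decoding pick token
# id 76 ("L") at every step, so the reply is a run of "L" characters.
print(result.generated_responses[-1])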