Example No. 1
def create_model(vocab_size, device):
    """

    :param vocab_size:字典大小
    :param device: 设备CPU/GPU
    :return:
    """
    if os.path.exists(config.checkpoint_path):  # a checkpoint from a previous run exists
        model = GPT2LMHeadModel.from_pretrained(config.checkpoint_path)
    else:  # no previous checkpoint; initialize the model from the config file
        model_config = GPT2Config.from_json_file(config.model_config)
        model = GPT2LMHeadModel(config=model_config)
    # resize the GPT-2 token embeddings to match the tokenizer vocabulary
    model.resize_token_embeddings(vocab_size)
    n_ctx = model.config.to_dict().get("n_ctx")
    # print('model config:\n{}'.format(model.config.to_json_string()))
    model = model.to(device)
    # use multiple GPUs in parallel if requested
    if config.cuda and torch.cuda.device_count() > 1 and ',' in config.device:
        print("Training with multiple GPUs")
        model = DataParallel(model, device_ids=[int(i.strip()) for i in config.device.split(',')])
        multi_gpu = True
    elif config.cuda:
        print('Training on a single GPU')
        multi_gpu = False
    else:
        print('Training on CPU')
        multi_gpu = False
    # count the number of model parameters
    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of model parameters: {}'.format(num_parameters))
    return model, n_ctx, multi_gpu
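A hedged usage sketch for this helper: the vocab.txt path below is an illustrative assumption, while BertTokenizer with a vocab_file mirrors how the other examples build their tokenizer; only the global config module and the return signature above come from the snippet.

import torch
from transformers import BertTokenizer

tokenizer = BertTokenizer(vocab_file="vocab/vocab.txt")  # hypothetical vocabulary file
device = torch.device("cuda:0" if config.cuda else "cpu")
model, n_ctx, multi_gpu = create_model(len(tokenizer), device)
print(n_ctx)  # context window size reported by the model config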
Example No. 2
    def get_pipeline(self):
        # When
        config = GPT2Config(
            vocab_size=263,
            n_ctx=128,
            max_length=128,
            n_embd=64,
            n_layer=1,
            n_head=8,
            bos_token_id=256,
            eos_token_id=257,
        )
        model = GPT2LMHeadModel(config)
        # Force the model to always output token id 76, which decodes to "L"
        V, D = model.lm_head.weight.shape
        bias = torch.zeros(V)
        bias[76] = 1
        weight = torch.zeros((V, D), requires_grad=True)

        model.lm_head.bias = torch.nn.Parameter(bias)
        model.lm_head.weight = torch.nn.Parameter(weight)

        # # Created with:
        # import tempfile

        # from tokenizers import Tokenizer, models
        # from transformers.tokenization_utils_fast import PreTrainedTokenizerFast

        # vocab = [(chr(i), i) for i in range(256)]
        # tokenizer = Tokenizer(models.Unigram(vocab))
        # with tempfile.NamedTemporaryFile() as f:
        #     tokenizer.save(f.name)
        #     real_tokenizer = PreTrainedTokenizerFast(tokenizer_file=f.name, eos_token="<eos>", bos_token="<bos>")

        # real_tokenizer._tokenizer.save("dummy.json")
        # Special tokens are automatically added at load time.
        tokenizer = AutoTokenizer.from_pretrained(
            "Narsil/small_conversational_test")
        conversation_agent = pipeline(task="conversational",
                                      device=DEFAULT_DEVICE_NUM,
                                      model=model,
                                      tokenizer=tokenizer)
        return conversation_agent
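One way to exercise this pipeline, assuming a transformers version that still ships the conversational task and the Conversation class (the prompt text is arbitrary); because the lm_head above is biased toward token id 76, the reply should decode to a run of "L" characters.

from transformers import Conversation

agent = self.get_pipeline()            # called from inside the same test class
conversation = Conversation("Hello")
conversation = agent(conversation)     # the pipeline returns the updated Conversation
print(conversation.generated_responses[-1])  # expected: a string of "L" tokens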
Example No. 3
def main():
    args = set_interact_args()
    logger = create_logger(args)
    # use the GPU when requested and available
    args.cuda = torch.cuda.is_available() and not args.no_cuda
    device = 'cuda' if args.cuda else 'cpu'
    logger.info('using device:{}'.format(device))
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    tokenizer = BertTokenizer(vocab_file=args.voca_path)
    # dialogue model
    dialogue_model = GPT2LMHeadModel.from_pretrained(args.dialogue_model_path)
    dialogue_model.to(device)
    dialogue_model.eval()
    # mutual-information (MMI) scoring model
    mmi_model = GPT2LMHeadModel.from_pretrained(args.mmi_model_path)
    mmi_model.to(device)
    mmi_model.eval()
    if args.save_samples_path:
        if not os.path.exists(args.save_samples_path):
            os.makedirs(args.save_samples_path)
        samples_file = open(args.save_samples_path + '/mmi_samples.txt',
                            'a',
                            encoding='utf8')
        samples_file.write("聊天记录{}:\n".format(datetime.now()))
        # 存储聊天记录,每个utterance以token的id的形式进行存储
    history = []
    print('开始和chatbot聊天,输入CTRL + Z以退出')

    while True:
        try:
            text = input("user:"******"user:{}\n".format(text))
            history.append(tokenizer.encode(text))
            input_ids = [tokenizer.cls_token_id]  # 每个input以[CLS]为开头
            for history_id, history_utr in enumerate(
                    history[-args.max_history_len:]):
                input_ids.extend(history_utr)
                input_ids.append(tokenizer.sep_token_id)
            # replicate the input for batched response generation, shape (batch_size, token_len)
            input_ids = [
                copy.deepcopy(input_ids) for _ in range(args.batch_size)
            ]

            curr_input_tensors = torch.tensor(input_ids).long().to(device)
            # generated[i][j] is the id of the i-th token of the j-th response,
            # i.e. a 2D list of shape (max generated length, batch_size)
            generated = []
            # indices of responses that have already finished, i.e. produced sep_token_id
            finish_set = set()
            # generate at most max_len tokens
            for _ in range(args.max_len):
                outputs = dialogue_model(input_ids=curr_input_tensors)
                next_token_logits = outputs[0][:, -1, :]
                # apply a repetition penalty to every token already in generated, lowering its probability
                for index in range(args.batch_size):
                    for token_id in set(
                        [token_ids[index] for token_ids in generated]):
                        next_token_logits[index][
                            token_id] /= args.repetition_penalty
                next_token_logits = next_token_logits / args.temperature
                # set the logit of [UNK] to -inf so the model can never predict it
                for next_token_logit in next_token_logits:
                    next_token_logit[tokenizer.convert_tokens_to_ids(
                        '[UNK]')] = -float('Inf')
                filtered_logits = top_k_top_p_filtering(next_token_logits,
                                                        top_k=args.topk,
                                                        top_p=args.topp)
                # torch.multinomial draws num_samples elements from the candidates without replacement;
                # higher weights are more likely to be picked; it returns the indices of the drawn elements
                next_token = torch.multinomial(F.softmax(filtered_logits,
                                                         dim=-1),
                                               num_samples=1)
                # mark every response in the batch that has just generated [SEP]
                for index, token_id in enumerate(next_token[:, 0]):
                    if token_id == tokenizer.sep_token_id:
                        finish_set.add(index)
                # check whether every response has produced [SEP]
                finish_flag = True  # True once all responses have generated the [SEP] token
                for index in range(args.batch_size):
                    if index not in finish_set:  # this response is not finished yet
                        finish_flag = False
                        break
                if finish_flag:
                    break
                generated.append([token.item() for token in next_token[:, 0]])
                # append the newly generated tokens to the current inputs
                curr_input_tensors = torch.cat(
                    (curr_input_tensors, next_token), dim=-1)
            candidate_responses = []  # all generated candidate responses
            for batch_index in range(args.batch_size):
                response = []
                for token_index in range(len(generated)):
                    if generated[token_index][
                            batch_index] != tokenizer.sep_token_id:
                        response.append(generated[token_index][batch_index])
                    else:
                        break
                candidate_responses.append(response)

            # score each candidate with the MMI model
            if args.debug:
                print("candidate response:")
            if args.save_samples_path:
                samples_file.write("candidate response:\n")
            min_loss = float('Inf')
            best_response = ""
            for response in candidate_responses:
                mmi_input_id = [tokenizer.cls_token_id]  # every input starts with [CLS]
                mmi_input_id.extend(response)
                mmi_input_id.append(tokenizer.sep_token_id)
                for history_utr in reversed(history[-args.max_history_len:]):
                    mmi_input_id.extend(history_utr)
                    mmi_input_id.append(tokenizer.sep_token_id)
                mmi_input_tensor = torch.tensor(mmi_input_id).long().to(device)
                out = mmi_model(input_ids=mmi_input_tensor,
                                labels=mmi_input_tensor)
                loss = out[0].item()
                text = tokenizer.convert_ids_to_tokens(response)
                if args.debug:
                    print("{} loss:{}".format("".join(text), loss))
                if args.save_samples_path:
                    samples_file.write("{} loss:{}\n".format("".join(text), loss))
                if loss < min_loss:
                    best_response = response
                    min_loss = loss
            history.append(best_response)
            text = tokenizer.convert_ids_to_tokens(best_response)
            print("chatbot:" + "".join(text))
            if args.save_samples_path:
                samples_file.write("chatbot:{}\n".format("".join(text)))
        except KeyboardInterrupt:
            if args.save_samples_path:
                samples_file.close()
            break
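Examples No. 3 and No. 6 both call top_k_top_p_filtering without defining it. Below is a minimal sketch of that helper, following the widely used top-k / nucleus-sampling filter from the Hugging Face generation example; this batched variant is an assumption about the missing function, not code taken from the source.

import torch
import torch.nn.functional as F

def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    """Filter a distribution of logits using top-k and/or nucleus (top-p) filtering."""
    top_k = min(top_k, logits.size(-1))
    if top_k > 0:
        # drop every token whose logit is below the k-th largest logit
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value
    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        # drop tokens whose cumulative probability exceeds top_p,
        # always keeping the single most probable token
        sorted_indices_to_remove = cumulative_probs > top_p
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0
        indices_to_remove = sorted_indices_to_remove.scatter(-1, sorted_indices, sorted_indices_to_remove)
        logits[indices_to_remove] = filter_value
    return logits

This works both for the 1-D logits of Example No. 6 and the (batch_size, vocab_size) logits of Example No. 3.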
Example No. 4
# model_url = os.environ.get('MODEL_URL')
# config_url = os.environ.get('CONFIG_URL')

model_path = Path('model/pytorch_model.bin')
config_path = Path('model/config.json')
vocab_path = Path('model/encoder.json')
merges_path = Path('model/vocab.bpe')

# if not model_path.exists():
#     gdown.download(model_url, str(model_path.resolve()), quiet=False)

# if not config_path.exists():
#     gdown.download(config_url, str(config_path.resolve()), quiet=False)

print('Loading model...')
model = GPT2LMHeadModel.from_pretrained(str(model_path),
                                        config=str(config_path))
print('Model loaded.')

tokenizer = GPT2TokenizerFast(vocab_file=str(vocab_path),
                              merges_file=str(merges_path))


class ModelOut(BaseModel):
    prompt: str
    output: str


def generate(
    prompt: str = '',
    top_k: int = 40,
    top_p: float = 0.95,
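The snippet is cut off in the middle of the generate signature. Purely as an illustration of how such an endpoint could be finished with model.generate, here is a hedged sketch; everything past the three parameters shown above is an assumption, not the original body.

def generate(prompt: str = '',
             top_k: int = 40,
             top_p: float = 0.95) -> ModelOut:
    # hypothetical continuation of the truncated function
    input_ids = tokenizer(prompt, return_tensors='pt').input_ids
    output_ids = model.generate(input_ids,
                                do_sample=True,
                                top_k=top_k,
                                top_p=top_p,
                                max_length=256,
                                pad_token_id=tokenizer.eos_token_id)
    text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return ModelOut(prompt=prompt, output=text)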
Example No. 5
def create_model(model_dir):
    return GPT2LMHeadModel.from_pretrained(model_dir, use_cache=False)
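use_cache=False turns off the key/value cache, which is only needed for incremental decoding, so disabling it saves memory during training. A hypothetical training-side usage; the Trainer arguments and train_dataset below are illustrative, not from the source.

from transformers import Trainer, TrainingArguments

model = create_model("gpt2")  # any local checkpoint directory or Hub id
model.gradient_checkpointing_enable()  # pairs well with use_cache=False
training_args = TrainingArguments(output_dir="out", per_device_train_batch_size=2, num_train_epochs=1)
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)  # train_dataset assumed defined
trainer.train()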
Example No. 6
def main():
    logger = create_logger(config)
    # use the GPU when requested and available
    config.cuda = torch.cuda.is_available() and not config.no_cuda
    device = 'cuda' if config.cuda else 'cpu'
    logger.info('using device:{}'.format(device))
    os.environ["CUDA_VISIBLE_DEVICES"] = config.device
    tokenizer = BertTokenizer(vocab_file=config.vocab_path)
    model = GPT2LMHeadModel.from_pretrained(config.best_checkpoint_path)
    model.to(device)
    model.eval()
    if bool(config.save_samples_path):
        if not os.path.exists(config.save_samples_path):
            os.makedirs(config.save_samples_path)
        samples_file = open(config.save_samples_path + '/samples.txt',
                            'a',
                            encoding='utf8')
        samples_file.write("聊天记录{}:\n".format(datetime.now()))
        # 存储聊天记录,每个utterance以token的id的形式进行存储
    history = []
    print('开始和chatbot聊天,输入CTRL + Z以退出')

    while True:
        try:
            text = input("请输入广告词:")
            if config.save_samples_path:
                samples_file.write("请输入广告词:{}\n".format(text))
            history.append(tokenizer.encode(text))
            input_ids = [tokenizer.cls_token_id]  # 每个input以[CLS]为开头

            for history_id, history_utr in enumerate(
                    history[-config.max_history_len:]):
                input_ids.extend(history_utr)
                input_ids.append(tokenizer.sep_token_id)
            curr_input_tensor = torch.tensor(input_ids).long().to(device)
            generated = []
            # generate at most max_len tokens
            for _ in range(config.max_len):
                outputs = model(input_ids=curr_input_tensor)
                next_token_logits = outputs[0][-1, :]
                # apply a repetition penalty to every token already in generated, lowering its probability
                for token_id in set(generated):
                    next_token_logits[token_id] /= config.repetition_penalty
                next_token_logits = next_token_logits / config.temperature
                # set the logit of [UNK] to -inf so the model can never predict it
                next_token_logits[tokenizer.convert_tokens_to_ids(
                    '[UNK]')] = -float('Inf')
                filtered_logits = top_k_top_p_filtering(next_token_logits,
                                                        top_k=config.top_k,
                                                        top_p=config.top_prob)
                # torch.multinomial draws num_samples elements from the candidates without replacement;
                # higher weights are more likely to be picked; it returns the indices of the drawn elements
                next_token = torch.multinomial(F.softmax(filtered_logits,
                                                         dim=-1),
                                               num_samples=1)
                if next_token == tokenizer.sep_token_id:  # [SEP] marks the end of the response
                    break
                generated.append(next_token.item())
                curr_input_tensor = torch.cat((curr_input_tensor, next_token),
                                              dim=0)
                # his_text = tokenizer.convert_ids_to_tokens(curr_input_tensor.tolist())
                # print("his_text:{}".format(his_text))
            history.append(generated)
            text = tokenizer.convert_ids_to_tokens(generated)
            print("chatbot:" + "".join(text))
            if config.save_samples_path:
                samples_file.write("chatbot:{}\n".format("".join(text)))
        except KeyboardInterrupt:
            if config.save_samples_path:
                samples_file.close()
            break