Example no. 1
    def __init__(self):
        try:
            self.device = 'cuda' if config_model.use_cuda else 'cpu'
            LOGGER.info('using device: {}'.format(self.device))
            if self.device == 'cuda':
                os.environ["CUDA_VISIBLE_DEVICES"] = config_model.device_nums
            self.tokenizer = BertTokenizer(config_model.vocab_path)

            # dialogue model
            self.dialogue_model = GPT2LMHeadModel.from_pretrained(config_model.dialogue_model_path)
            self.dialogue_model.to(self.device)
            self.dialogue_model.eval()

            # mmi model
            self.mmi_model = GPT2LMHeadModel.from_pretrained(config_model.mmi_model_path)
            self.mmi_model.to(self.device)
            self.mmi_model.eval()

            self.max_sequence_len = config_model.max_len
            self.batch_size = config_model.batch_size
            self.repetition_penalty = config_model.repetition_penalty
            self.temperature = config_model.temperature
            self.debug = config_model.debug
            self.topk = config_model.topk
            self.topp = config_model.topp


        except Exception as e:
            LOGGER.error("FAIL INIT: {}".format(str(e)))
            traceback.print_exc()
            sys.exit(-1)
Example no. 2
def create_model(pre_trained=False):
    if pre_trained:
        model = GPT2LMHeadModel.from_pretrained(config.MODEL_PATH)
    else:
        model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(
            config.CONFIG_JSON_FILE)
        model = GPT2LMHeadModel(config=model_config)
    # model.resize_token_embeddings(vocab_size)
    n_ctx = model.config.to_dict().get("n_ctx")
    return model, n_ctx
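A minimal usage sketch for the factory above (hypothetical, not part of the original snippet); it assumes the same project-level config module, with config.MODEL_PATH pointing at a saved GPT-2 checkpoint.

# Hypothetical usage of create_model(); `config.MODEL_PATH` is carried over
# from the snippet above and must point at a saved GPT-2 checkpoint.
model, n_ctx = create_model(pre_trained=True)
model.eval()
print("context window (n_ctx):", n_ctx)  # 1024 for the stock GPT-2 configuration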
Example no. 3
def create_model(hparams, vocab_size):
    if hparams.pretrained_model:  # if a pretrained GPT-2 model is specified, load it
        model = GPT2LMHeadModel.from_pretrained(hparams.pretrained_model)
    else:  # otherwise initialize the model from the config file
        model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(
            hparams.model_config)
        model = GPT2LMHeadModel(config=model_config)
    # resize the GPT-2 token embeddings to match the tokenizer's vocabulary
    model.resize_token_embeddings(vocab_size)
    logger.info('model config:\n{}'.format(model.config.to_json_string()))
    return model, model.config.to_dict().get("n_ctx")
Example no. 4
    def build_model(self):
        """

        :param args:
        :param vocab_size: vocabulary size
        :return:
        """
        if self.args.pretrained_model:
            # if a pretrained GPT-2 model is specified, load it
            self.model = GPT2LMHeadModel.from_pretrained(
                self.args.pretrained_model)
        else:
            # otherwise initialize the model from the config file
            model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(
                self.args.model_config)
            self.model = GPT2LMHeadModel(config=model_config)

        # resize the GPT-2 token embeddings to match the tokenizer's vocabulary
        self.model.resize_token_embeddings(self.vocab_size)

        if self.use_cuda:
            self.model.to(self.device)

        self.logger.info('model config:\n{}'.format(
            self.model.config.to_json_string()))

        self.n_ctx = self.model.config.to_dict().get("n_ctx")

        # create the model output directory
        if self.args.is_model_output and not os.path.exists(
                self.args.dialogue_model_output_path):
            os.mkdir(self.args.dialogue_model_output_path)

        # log the number of model parameters
        num_parameters = 0
        parameters = self.model.parameters()
        for parameter in parameters:
            num_parameters += parameter.numel()
        self.logger.info(
            'number of model parameters: {}'.format(num_parameters))

        # optionally use multiple GPUs with DataParallel
        if self.args.use_multi_gpu:
            if self.args.use_cuda and torch.cuda.device_count() > 1:
                self.logger.info("Let's use GPUs to train")
                self.model = DataParallel(
                    self.model,
                    device_ids=[int(i) for i in self.args.device.split(',')])
            else:
                self.args.use_multi_gpu = False
Example no. 5
def main():
    model_args, training_args = parse_args()

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # create the model
    model_config = GPT2Config.from_json_file(model_args.model_config_file)
    if not model_args.pretrained_model_path:
        model = GPT2LMHeadModel(config=model_config)
    else:
        model = GPT2LMHeadModel.from_pretrained(
            model_args.pretrained_model_path)

    # count the number of parameters
    num_parameters = 0
    for parameter in model.parameters():
        num_parameters += parameter.numel()
    logger.info('number of parameters: {}'.format(num_parameters))

    full_tokenizer = get_tokenizer(vocab_file=model_args.vocab_file)

    # training dataset
    train_dataset = GPT2Dataset(model_config.n_ctx,
                                stride=model_args.stride,
                                tokenized_file_path=model_args.data_dir,
                                tokenizer=full_tokenizer)

    trainer = MyTrainer(model=model,
                        args=training_args,
                        train_dataset=train_dataset)

    # start training
    trainer.train(model_path=model_args.pretrained_model_path)
    trainer.save_model()
Example no. 6
def main():
    args = set_interact_args()
    logger = create_logger(args)
    # use the GPU when requested and available
    args.cuda = torch.cuda.is_available() and not args.no_cuda
    device = 'cuda' if args.cuda else 'cpu'
    logger.info('using device:{}'.format(device))
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    tokenizer = BertTokenizer(vocab_file=args.voca_path)
    # dialogue model
    dialogue_model = GPT2LMHeadModel.from_pretrained(args.dialogue_model_path)
    dialogue_model.to(device)
    dialogue_model.eval()
    # mutual-information (MMI) model
    mmi_model = GPT2LMHeadModel.from_pretrained(args.mmi_model_path)
    mmi_model.to(device)
    mmi_model.eval()
    if args.save_samples_path:
        if not os.path.exists(args.save_samples_path):
            os.makedirs(args.save_samples_path)
        samples_file = open(args.save_samples_path + '/mmi_samples.txt',
                            'a',
                            encoding='utf8')
        samples_file.write("聊天记录{}:\n".format(datetime.now()))
        # 存储聊天记录,每个utterance以token的id的形式进行存储
    history = []
    print('开始和chatbot聊天,输入CTRL + Z以退出')

    @app.route('/message', methods=['POST'])
    def reply():
        # time.sleep(2)
        input_msg = request.form.get('msg', None)
        fromGroup = request.form.get('group', None)
        fromQQ = request.form.get('qq', None)

        if not input_msg:
            return RepeaterResult(-1).toJSON()

        print(fromQQ + " (group " + fromGroup + "): ", input_msg)

        output = generate_reply("这是测试话", args, device, dialogue_model, history,
                                mmi_model, samples_file, tokenizer)
        output = ''.join(map(str, output))

        return RepeaterResult(0, output, "0").toJSON()

    # =================== main loop ====================
    app.run(port=7777, debug=True)
Example no. 7
    def __init__(self, discrim: str, seed=0, **kwargs):
        # Set random seed
        torch.manual_seed(seed)
        np.random.seed(seed)

        pretrained_model = DISCRIMINATOR_MODELS_PARAMS[discrim][
            "pretrained_model"]
        print("discrim = {}, pretrained_model set "
              "to discriminator's = {}".format(discrim, pretrained_model))

        # load pretrained model
        model = GPT2LMHeadModel.from_pretrained(pretrained_model,
                                                output_hidden_states=True)
        model.eval()

        # load tokenizer
        tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)

        # Freeze GPT-2 weights
        for param in model.parameters():
            param.requires_grad = False

        super().__init__(model=model, tokenizer=tokenizer, **kwargs)

        # Additional setup after creating model and tokenizer
        self.discrim = discrim
        classifier = get_classifier(self.discrim, self.device)
        self.classifier = classifier
Example no. 8
def gpt2LMHeadModel(*args, **kwargs):
    """
    gpt2LMHeadModel is the OpenAI GPT-2 Transformer model with the
    tied (pre-trained) language modeling head on top.

    Example:
        # Load the tokenizer
        >>> import torch
        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')

        #  Prepare tokenized input
        >>> text_1 = "Who was Jim Henson ?"
        >>> text_2 = "Jim Henson was a puppeteer"
        >>> indexed_tokens_1 = tokenizer.encode(text_1)
        >>> indexed_tokens_2 = tokenizer.encode(text_2)
        >>> tokens_tensor_1 = torch.tensor([indexed_tokens_1])
        >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])

        # Load gpt2LMHeadModel
        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2LMHeadModel', 'gpt2')
        >>> model.eval()

        # Predict hidden states features for each layer
        # past can be used to reuse precomputed hidden state in a subsequent predictions
        >>> with torch.no_grad():
                predictions_1, past = model(tokens_tensor_1)
                predictions_2, past = model(tokens_tensor_2, past=past)

        # Get the predicted last token
        >>> predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
        >>> predicted_token = tokenizer.decode([predicted_index])
        >>> assert predicted_token == ' who'
    """
    model = GPT2LMHeadModel.from_pretrained(*args, **kwargs)
    return model
Example no. 9
    def __init__(self, max_output_length=25, max_input_length=300, device='cpu', tokenizer_type='gpt2', bpe_model="", starter_model=None):
        if tokenizer_type == "gpt2":
            self.tokenizer = utils_tokenizer.GPT2Tokenizer()
            config = GPT2Config.from_pretrained("gpt2")

        elif tokenizer_type == "bpecap":
            self.tokenizer = utils_tokenizer.BPETokenizer(bpe_model)
            config = GPT2Config.from_dict({"finetuning_task": None, "initializer_range": 0.02,
                            "layer_norm_epsilon": 1e-05, "n_ctx": 1024, "n_embd": 768, "n_head": 12, "n_layer": 12, "n_positions": 1024, "num_labels": 1,
                            "resid_pdrop": 0.1, "use_bfloat16": False, "vocab_size": self.tokenizer.vocab_size})
        else:
            print("Tokenizer unrecognized. Should be gpt2 or bpecap.")
            exit()

        self.model = GPT2LMHeadModel(config)

        self.model.to(device)
        self.device = device
        if starter_model is not None:
            self.reload(starter_model)

        self.max_output_length = max_output_length
        self.max_input_length = max_input_length

        self.model.train()
        self.mode = "train"
Example no. 10
def create_model(args, vocab_size):
    """

    :param args:
    :param vocab_size: vocabulary size
    :return:
    """
    if args.pretrained_model:  # if a pretrained GPT-2 model is specified, load it
        model = GPT2LMHeadModel.from_pretrained(args.pretrained_model)
    else:  # otherwise initialize the model from the config file
        model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(args.model_config)
        model = GPT2LMHeadModel(config=model_config)
    # resize the GPT-2 token embeddings to match the tokenizer's vocabulary
    model.resize_token_embeddings(vocab_size)
    logger.info('model config:\n{}'.format(model.config.to_json_string()))
    return model, model_config.n_ctx
Example no. 11
def create_model(vocab_size):
    '''
    Create the model.
    :return:
    '''
    if Config.pretrained_model:
        # load the pretrained model if one is given
        model = GPT2LMHeadModel.from_pretrained(Config.pretrained_model)
    else:
        # otherwise build the model from a config file and train from scratch
        model_config = modeling_gpt2.GPT2Config.from_json_file(
            Config.gpt2_config)
        model = GPT2LMHeadModel(config=model_config)

    # resize the GPT-2 token embeddings to match the tokenizer's vocabulary
    model.resize_token_embeddings(vocab_size)
    return model, model.config.to_dict().get("n_ctx")  # maximum input length
Example no. 12
def main():
    args = set_interact_args()
    logger = create_logger(args)
    # use the GPU when requested and available
    args.cuda = torch.cuda.is_available() and not args.no_cuda
    # args.cuda = False
    device = 'cuda' if args.cuda else 'cpu'
    logger.info('using device:{}'.format(device))
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    tokenizer = BertTokenizer(vocab_file=args.voca_path)
    model = GPT2LMHeadModel.from_pretrained(args.dialogue_model_path)
    model.to(device)
    model.eval()

    print('***********************Summary model start************************')

    while True:
        try:

            text = input()
            for i in range(5):
                if len(text): text = text[:1000]
                input_ids = [tokenizer.cls_token_id]  # each input starts with [CLS]
                input_ids.extend(tokenizer.encode(text))
                input_ids.append(tokenizer.sep_token_id)
                curr_input_tensor = torch.tensor(input_ids).long().to(device)

                generated = []
                # generate at most max_len tokens
                for _ in range(args.max_len):
                    outputs = model(input_ids=curr_input_tensor)
                    next_token_logits = outputs[0][-1, :]
                    # apply a repetition penalty to every token already generated, lowering its probability
                    for id in set(generated):
                        next_token_logits[id] /= args.repetition_penalty
                    next_token_logits = next_token_logits / args.temperature
                    # set the [UNK] logit to -inf so the model can never predict [UNK]
                    next_token_logits[tokenizer.convert_tokens_to_ids(
                        '[UNK]')] = -float('Inf')
                    filtered_logits = top_k_top_p_filtering(next_token_logits,
                                                            top_k=args.topk,
                                                            top_p=args.topp)
                    # sample the next token id with torch.multinomial, weighted by probability
                    next_token = torch.multinomial(F.softmax(filtered_logits,
                                                             dim=-1),
                                                   num_samples=1)
                    if next_token == tokenizer.sep_token_id:  # [SEP] marks the end of the generated response
                        break
                    generated.append(next_token.item())
                    curr_input_tensor = torch.cat(
                        (curr_input_tensor, next_token), dim=0)

                text = tokenizer.convert_ids_to_tokens(generated)
                print("summary:" + "".join(text))

        except KeyboardInterrupt:
            break
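The top_k_top_p_filtering helper used here and in several later examples is never shown. Below is a minimal single-sequence sketch following the widely circulated Hugging Face reference implementation; it assumes 1-D logits as in the loop above (the batched call in the MMI example would need a scatter-based variant).

import torch
import torch.nn.functional as F

def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    """Filter a 1-D logits tensor with top-k and/or nucleus (top-p) filtering."""
    assert logits.dim() == 1  # single sequence, as in the sampling loop above
    top_k = min(top_k, logits.size(-1))
    if top_k > 0:
        # drop every token whose logit is below the k-th largest logit
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value
    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        # remove tokens once the cumulative probability exceeds top_p,
        # shifting right so the first token above the threshold is kept
        sorted_indices_to_remove = cumulative_probs > top_p
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0
        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        logits[indices_to_remove] = filter_value
    return logits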
Example no. 13
def get_model(name):
    tokenizer = GPT2Tokenizer.from_pretrained(name)
    model = GPT2LMHeadModel.from_pretrained(name, output_hidden_states=True)
    for param in model.parameters():
        param.requires_grad = False

    model.to(device)
    model.eval()
    return model, tokenizer
Example no. 14
    def __init__(self, config_path):
        super(transformer_gpt2, self).__init__()
        self.tokenzier = BertTokenizer(vocab_file='config/vocab_en.txt')
        self.vocab_size = len(self.tokenzier)

        self.model_config = transformers.modeling_gpt2.GPT2Config.from_json_file(
            config_path)
        self.model = GPT2LMHeadModel(config=self.model_config)
        self.model.resize_token_embeddings(self.vocab_size)

        self.n_ctx = self.model.config.to_dict().get('n_ctx')
Example no. 15
 def __init__(self, device='cuda', history_len=1, batch_size=5, max_len=25,
              penalty=1.0, temperature=1):
     super(ConversationHandler, self).__init__()
     self.device = device
     self.max_len = max_len  # max length of each utterance
     self.history_len = history_len
     self.config = ConfigParser.config_dict['conversation']
     self.tokenizer = BertTokenizer(vocab_file=self.config['voca_path'])
     self.hanlp = HanlpWrapper()  # not initialized twice since it's a Singleton
     self.model = GPT2LMHeadModel.from_pretrained(self.config['dialogue_model'])
     self.mmi_model = GPT2LMHeadModel.from_pretrained(self.config['mmi_model'])
     # move both models to specific device
     self.model.to(device)
     self.model.eval()
     self.mmi_model.to(device)
     self.mmi_model.eval()
     self.history = []  # for future multi-conversation usage
     self.batch_size = batch_size  # how many response generated for MMI filter
     self.penalty = penalty
     self.temperature = temperature
Example no. 16
    def _get_dialogue_model(self):
        logging.info("Start getting dialogue model.")

        args = self.args
        dialogue_model = GPT2LMHeadModel.from_pretrained(
            args.dialogue_model_path)
        dialogue_model.to(self.device)
        dialogue_model.eval()

        logging.info("Finish reading dialogue model.")

        return dialogue_model
Example no. 17
 def _get_mmi_model(self):
     args = self.args
     try:
         logging.info("Start getting mmi model.")
         mmi_model = GPT2LMHeadModel.from_pretrained(args.mmi_model_path)
         mmi_model.to(self.device)
         mmi_model.eval()
         logging.info("Finish getting mmi model.")
         return mmi_model
     except:
         logging.info(
             'Cannot find mmi model in directory, we will choose response randomly.'
         )
         return None
Example no. 18
    def load_lang_model(self):
        print(f"Loading language model {self.pretrained_model}")
        # load pretrained model
        self.lang_model = GPT2LMHeadModel.from_pretrained(
            self.pretrained_model,
            output_hidden_states=True
        )
        self.lang_model.to(self.device)
        self.lang_model.eval()

        # load tokenizer
        self.tokenizer = GPT2Tokenizer.from_pretrained(self.pretrained_model)

        # Freeze GPT-2 weights
        for param in self.lang_model.parameters():
            param.requires_grad = False
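A small, hypothetical usage sketch for the frozen model loaded above; lang_model, tokenizer, and device stand in for the instance attributes, and the tuple layout (logits first, hidden states last when output_hidden_states=True) follows the tuple-style transformers API used throughout these examples.

import torch

# Hypothetical usage; `lang_model`, `tokenizer`, `device` are assumed to come
# from load_lang_model() above.
input_ids = torch.tensor([tokenizer.encode("The quick brown fox")]).to(device)
with torch.no_grad():
    outputs = lang_model(input_ids)
logits = outputs[0]          # shape (1, seq_len, vocab_size)
hidden_states = outputs[-1]  # one tensor per layer plus the embedding output
next_token_id = int(torch.argmax(logits[0, -1]))
print(tokenizer.decode([next_token_id]))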
Example no. 19
def initialize(np,torch):
    from transformers import GPT2Tokenizer
    from transformers.modeling_gpt2 import GPT2LMHeadModel
    import PPLM.run_pplm as PPLM

    torch.manual_seed(0)
    np.random.seed(0)

    model = GPT2LMHeadModel.from_pretrained(
        "gpt2-medium",
        output_hidden_states=True
    )
    model.to("cpu")
    model.eval()
    for param in model.parameters():
        param.requires_grad = False

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
    return tokenizer,model,PPLM
Example no. 20
def sumarize(content):
    tokenizer = BertTokenizer.from_pretrained('./vocab')
    model = GPT2LMHeadModel.from_pretrained('./model.pt')
    model.to(device)
    model.eval()
    
    for i in range(3):
        # preprocess the article body and truncate it if it is too long
        content_tokens = tokenizer.tokenize(content)
        if len(content_tokens) > max_len - 3 - generate_max_len:
            content_tokens = content_tokens[:max_len - 3 - generate_max_len]
        
        # convert tokens to ids in the format the model expects
        content_tokens = ["[CLS]"] + content_tokens + ["[SEP]"]
        input_ids = tokenizer.convert_tokens_to_ids(content_tokens)
        
        # turn input_ids into a tensor
        curr_input_tensor = torch.tensor(input_ids).long().to(device)

        generated = []
        # generate at most generate_max_len tokens
        for _ in range(generate_max_len):
            outputs = model(input_ids=curr_input_tensor)
            next_token_logits = outputs[0][-1, :] #size:[vocab size]
            # apply a repetition penalty to every token already generated, lowering its probability
            for id_ in set(generated):
                next_token_logits[id_] /= repetition_penalty
            # set the [UNK] logit to -inf so the model can never predict [UNK]
            next_token_logits[tokenizer.convert_tokens_to_ids('[UNK]')] = -float('Inf')
            filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=topk, top_p=topp)
            # sample the next token id with torch.multinomial, weighted by probability
            next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
            if next_token.item() == tokenizer.sep_token_id:  # [SEP] marks the end of the generated response
                break
            generated.append(next_token.item())
            curr_input_tensor = torch.cat((curr_input_tensor, next_token), dim=0)

        text = tokenizer.convert_ids_to_tokens(generated)
        print("summary:" + "".join(text))
Example no. 21
def main():
    args = set_args()
    # select which GPUs to use for training
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu

    global logger
    logger = create_logger(args)
    logger.info('using device:{}'.format(args.device))
    logger.info('Initializing tokenizer ...')
    tokenizer = BertTokenizer.from_pretrained(args.vocab_path)
    vocab_size = len(tokenizer)

    logger.info('Loading pretrained model ...')
    model = GPT2LMHeadModel.from_pretrained(args.vocab_path)
    n_ctx = model.config.to_dict().get('n_ctx')  # read the model's maximum sequence length
    model.resize_token_embeddings(vocab_size)  # adjust the pretrained model's vocab size
    model.to(args.device)

    logger.info('Loading data for training and evaluation ...')
    dataset = get_dataset(args.raw_data_path, tokenizer, n_ctx,
                          args.token_data_path, args.train_mmi)
    train(model, dataset['test'], args)
Example no. 22
def main():
    args = set_interact_args()
    logger = create_logger(args)
    # use the GPU when requested and available
    args.cuda = torch.cuda.is_available() and not args.no_cuda
    device = 'cuda' if args.cuda else 'cpu'
    logger.info('using device:{}'.format(device))
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    tokenizer = BertTokenizer(vocab_file=args.voca_path)
    # 对话model
    dialogue_model = GPT2LMHeadModel.from_pretrained(args.dialogue_model_path)
    dialogue_model.to(device)
    dialogue_model.eval()
    # 互信息mmi model
    mmi_model = GPT2LMHeadModel.from_pretrained(args.mmi_model_path)
    mmi_model.to(device)
    mmi_model.eval()
    if args.save_samples_path:
        if not os.path.exists(args.save_samples_path):
            os.makedirs(args.save_samples_path)
        samples_file = open(args.save_samples_path + '/mmi_samples.txt',
                            'a',
                            encoding='utf8')
        samples_file.write("聊天记录{}:\n".format(datetime.now()))
        # 存储聊天记录,每个utterance以token的id的形式进行存储
    history = []
    print('开始和Liam的叛逆机器人Tyrion聊天,输入CTRL + Z以退出')
    import readline
    while True:
        try:
            text = input("Liam:")
            if args.save_samples_path:
                samples_file.write("Liam:{}\n".format(text))
            history.append(tokenizer.encode(text))
            input_ids = [tokenizer.cls_token_id]  # each input starts with [CLS]
            for history_id, history_utr in enumerate(
                    history[-args.max_history_len:]):
                input_ids.extend(history_utr)
                input_ids.append(tokenizer.sep_token_id)
            # replicate the input for batched response generation, shape (batch_size, token_len)
            input_ids = [
                copy.deepcopy(input_ids) for _ in range(args.batch_size)
            ]

            curr_input_tensors = torch.tensor(input_ids).long().to(device)
            # 2-D list of shape (max response length, batch_size); generated[i][j] is the id of the i-th token of the j-th response
            generated = []
            # indices of responses that have finished, i.e. already produced sep_token_id
            finish_set = set()
            # generate at most max_len tokens
            for _ in range(args.max_len):
                outputs = dialogue_model(input_ids=curr_input_tensors)
                next_token_logits = outputs[0][:, -1, :]
                # apply a repetition penalty to every token already generated, lowering its probability
                for index in range(args.batch_size):
                    for token_id in set(
                        [token_ids[index] for token_ids in generated]):
                        next_token_logits[index][
                            token_id] /= args.repetition_penalty
                next_token_logits = next_token_logits / args.temperature
                # set the [UNK] logit to -inf so the model can never predict [UNK]
                for next_token_logit in next_token_logits:
                    next_token_logit[tokenizer.convert_tokens_to_ids(
                        '[UNK]')] = -float('Inf')
                filtered_logits = top_k_top_p_filtering(next_token_logits,
                                                        top_k=args.topk,
                                                        top_p=args.topp)
                # sample the next token ids with torch.multinomial, weighted by probability
                next_token = torch.multinomial(F.softmax(filtered_logits,
                                                         dim=-1),
                                               num_samples=1)
                # mark any response that has just generated [SEP]
                for index, token_id in enumerate(next_token[:, 0]):
                    if token_id == tokenizer.sep_token_id:
                        finish_set.add(index)
                # check whether every response has produced [SEP]
                finish_flag = True  # True once every response has produced [SEP]
                for index in range(args.batch_size):
                    if index not in finish_set:  # this response has not finished yet
                        finish_flag = False
                        break
                if finish_flag:
                    break
                generated.append([token.item() for token in next_token[:, 0]])
                # append the newly generated tokens to the current inputs
                curr_input_tensors = torch.cat(
                    (curr_input_tensors, next_token), dim=-1)
            candidate_responses = []  # all generated candidate responses
            for batch_index in range(args.batch_size):
                response = []
                for token_index in range(len(generated)):
                    if generated[token_index][
                            batch_index] != tokenizer.sep_token_id:
                        response.append(generated[token_index][batch_index])
                    else:
                        break
                candidate_responses.append(response)

            # score the candidates with the MMI model
            if args.debug:
                print("candidate response:")
            if args.save_samples_path:
                samples_file.write("candidate response:\n")
            min_loss = float('Inf')
            best_response = ""
            for response in candidate_responses:
                mmi_input_id = [tokenizer.cls_token_id]  # each input starts with [CLS]
                mmi_input_id.extend(response)
                mmi_input_id.append(tokenizer.sep_token_id)
                for history_utr in reversed(history[-args.max_history_len:]):
                    mmi_input_id.extend(history_utr)
                    mmi_input_id.append(tokenizer.sep_token_id)
                mmi_input_tensor = torch.tensor(mmi_input_id).long().to(device)
                out = mmi_model(input_ids=mmi_input_tensor,
                                labels=mmi_input_tensor)
                loss = out[0].item()
                text = tokenizer.convert_ids_to_tokens(response)
                if args.debug:
                    print("{} loss:{}".format("".join(text), loss))
                if args.save_samples_path:
                    samples_file.write("{} loss:{}\n".format("".join(text), loss))
                if loss < min_loss:
                    best_response = response
                    min_loss = loss
            history.append(best_response)
            text = tokenizer.convert_ids_to_tokens(best_response)
            print("Tyrion:" + "".join(text))
            if args.save_samples_path:
                samples_file.write("Tyrion:{}\n".format("".join(text)))
        except KeyboardInterrupt:
            if args.save_samples_path:
                samples_file.close()
            break
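The candidate selection done inline above can be read as a single helper. This is a condensed sketch, not part of the original code; the argument names mirror the example, and it assumes the backward (MMI) model returns the language-modeling loss first when called with labels.

import torch

def rerank_with_mmi(candidate_responses, history, mmi_model, tokenizer, device, max_history_len=3):
    """Return the candidate with the lowest loss under the backward (MMI) model,
    scoring each as [CLS] response [SEP] reversed-history [SEP] ... as in the loop above."""
    best_response, min_loss = None, float('Inf')
    for response in candidate_responses:
        mmi_input_id = [tokenizer.cls_token_id] + response + [tokenizer.sep_token_id]
        for history_utr in reversed(history[-max_history_len:]):
            mmi_input_id.extend(history_utr)
            mmi_input_id.append(tokenizer.sep_token_id)
        mmi_input_tensor = torch.tensor(mmi_input_id).long().to(device)
        with torch.no_grad():
            loss = mmi_model(input_ids=mmi_input_tensor, labels=mmi_input_tensor)[0].item()
        if loss < min_loss:
            best_response, min_loss = response, loss
    return best_response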
Example no. 23
from flask import Flask, render_template, url_for, request
from flask_bootstrap import Bootstrap
from transformers.modeling_gpt2 import GPT2LMHeadModel
# This downloads GPT-2 Medium, it takes a little while
_ = GPT2LMHeadModel.from_pretrained("gpt2-medium")
from run_pplm import run_pplm_example
app = Flask(__name__)
Bootstrap(app)

# add a rule for the index page.
@app.route('/')
def index():
    return render_template('index.html')

@app.route('/get_data', methods=['POST'])
def get_data():
    if request.method == 'POST':
        text = request.form['nlg']
        drop = request.form['personality']
        x = run_pplm_example(cond_text=text, num_samples=1, bag_of_words=drop, length=50,
                             stepsize=0.03, sample=True, num_iterations=3, window_length=5,
                             gamma=1.5, gm_scale=0.95, kl_scale=0.01, verbosity='regular')

    return render_template('result.html', prediction=[text, type(x)])
Example no. 24
def main():
    args = set_interact_args()
    logger = create_logger(args)
    # use the GPU when requested and available
    args.cuda = torch.cuda.is_available() and not args.no_cuda
    device = 'cuda' if args.cuda else 'cpu'
    logger.info('using device:{}'.format(device))
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    tokenizer = BertTokenizer(vocab_file=args.voca_path)
    model = GPT2LMHeadModel.from_pretrained(args.dialogue_model_path)
    model.to(device)
    model.eval()
    if args.save_samples_path:
        if not os.path.exists(args.save_samples_path):
            os.makedirs(args.save_samples_path)
        samples_file = open(args.save_samples_path + '/samples.txt',
                            'a',
                            encoding='utf8')
        samples_file.write("聊天记录{}:\n".format(datetime.now()))
        # 存储聊天记录,每个utterance以token的id的形式进行存储
    history = []
    print('开始和chatbot聊天,输入CTRL + Z以退出')

    while True:
        try:
            text = input("user:"******"user:{}\n".format(text))
            history.append(tokenizer.encode(text))
            input_ids = [tokenizer.cls_token_id]  # each input starts with [CLS]

            for history_id, history_utr in enumerate(
                    history[-args.max_history_len:]):
                input_ids.extend(history_utr)
                input_ids.append(tokenizer.sep_token_id)
            curr_input_tensor = torch.tensor(input_ids).long().to(device)
            generated = []
            # generate at most max_len tokens
            for _ in range(args.max_len):
                outputs = model(input_ids=curr_input_tensor)
                next_token_logits = outputs[0][-1, :]
                # apply a repetition penalty to every token already generated, lowering its probability
                for id in set(generated):
                    next_token_logits[id] /= args.repetition_penalty
                next_token_logits = next_token_logits / args.temperature
                # set the [UNK] logit to -inf so the model can never predict [UNK]
                next_token_logits[tokenizer.convert_tokens_to_ids(
                    '[UNK]')] = -float('Inf')
                filtered_logits = top_k_top_p_filtering(next_token_logits,
                                                        top_k=args.topk,
                                                        top_p=args.topp)
                # sample the next token id with torch.multinomial, weighted by probability
                next_token = torch.multinomial(F.softmax(filtered_logits,
                                                         dim=-1),
                                               num_samples=1)
                if next_token == tokenizer.sep_token_id:  # [SEP] marks the end of the generated response
                    break
                generated.append(next_token.item())
                curr_input_tensor = torch.cat((curr_input_tensor, next_token),
                                              dim=0)
                # his_text = tokenizer.convert_ids_to_tokens(curr_input_tensor.tolist())
                # print("his_text:{}".format(his_text))
            history.append(generated)
            text = tokenizer.convert_ids_to_tokens(generated)
            print("chatbot:" + "".join(text))
            if args.save_samples_path:
                samples_file.write("chatbot:{}\n".format("".join(text)))
        except KeyboardInterrupt:
            if args.save_samples_path:
                samples_file.close()
            break
Example no. 25
def run_pplm_example(pretrained_model="gpt2-medium",
                     cond_text="",
                     uncond=False,
                     num_samples=1,
                     bag_of_words=None,
                     discrim=None,
                     discrim_weights=None,
                     discrim_meta=None,
                     class_label=-1,
                     length=100,
                     stepsize=0.02,
                     temperature=1.0,
                     top_k=10,
                     sample=True,
                     num_iterations=3,
                     grad_length=10000,
                     horizon_length=1,
                     window_length=0,
                     decay=False,
                     gamma=1.5,
                     gm_scale=0.9,
                     kl_scale=0.01,
                     seed=0,
                     no_cuda=False,
                     colorama=False,
                     verbosity='regular'):
    # set Random seed
    torch.manual_seed(seed)
    np.random.seed(seed)

    # set verbosity
    verbosity_level = VERBOSITY_LEVELS.get(verbosity.lower(), REGULAR)

    # set the device
    device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu"

    if discrim == 'generic':
        set_generic_model_params(discrim_weights, discrim_meta)

    if discrim is not None:
        discriminator_pretrained_model = DISCRIMINATOR_MODELS_PARAMS[discrim][
            "pretrained_model"]
        if pretrained_model != discriminator_pretrained_model:
            pretrained_model = discriminator_pretrained_model
            if verbosity_level >= REGULAR:
                print("discrim = {}, pretrained_model set "
                      "to discriminator's = {}".format(discrim,
                                                       pretrained_model))

    # load pretrained model
    model = GPT2LMHeadModel.from_pretrained(pretrained_model,
                                            output_hidden_states=True)
    model.to(device)
    model.eval()

    # load tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)

    # Freeze GPT-2 weights
    for param in model.parameters():
        param.requires_grad = False

    # figure out conditioning text
    if uncond:
        tokenized_cond_text = tokenizer.encode([tokenizer.bos_token],
                                               add_special_tokens=False)
    else:
        raw_text = cond_text
        while not raw_text:
            print("Did you forget to add `--cond_text`? ")
            raw_text = input("Model prompt >>> ")
        tokenized_cond_text = tokenizer.encode(tokenizer.bos_token + raw_text,
                                               add_special_tokens=False)

    print("= Prefix of sentence =")
    print(tokenizer.decode(tokenized_cond_text))
    print()

    # generate unperturbed and perturbed texts

    # full_text_generation returns:
    # unpert_gen_tok_text, pert_gen_tok_texts, discrim_losses, losses_in_time
    unpert_gen_tok_text, pert_gen_tok_texts, _, _ = full_text_generation(
        model=model,
        tokenizer=tokenizer,
        context=tokenized_cond_text,
        device=device,
        num_samples=num_samples,
        bag_of_words=bag_of_words,
        discrim=discrim,
        class_label=class_label,
        length=length,
        stepsize=stepsize,
        temperature=temperature,
        top_k=top_k,
        sample=sample,
        num_iterations=num_iterations,
        grad_length=grad_length,
        horizon_length=horizon_length,
        window_length=window_length,
        decay=decay,
        gamma=gamma,
        gm_scale=gm_scale,
        kl_scale=kl_scale,
        verbosity_level=verbosity_level)

    # untokenize unperturbed text
    unpert_gen_text = tokenizer.decode(unpert_gen_tok_text.tolist()[0])

    if verbosity_level >= REGULAR:
        print("=" * 80)
    print("= Unperturbed generated text =")
    print(unpert_gen_text)
    print()

    generated_texts = []

    bow_word_ids = set()
    if bag_of_words and colorama:
        bow_indices = get_bag_of_words_indices(bag_of_words.split(";"),
                                               tokenizer)
        for single_bow_list in bow_indices:
            # filtering all words in the list composed of more than 1 token
            filtered = list(filter(lambda x: len(x) <= 1, single_bow_list))
            # w[0] because w is guaranteed to have only one item by the previous filter
            bow_word_ids.update(w[0] for w in filtered)

    # iterate through the perturbed texts
    for i, pert_gen_tok_text in enumerate(pert_gen_tok_texts):
        try:
            # untokenize unperturbed text
            if colorama:
                import colorama

                pert_gen_text = ''
                for word_id in pert_gen_tok_text.tolist()[0]:
                    if word_id in bow_word_ids:
                        pert_gen_text += '{}{}{}'.format(
                            colorama.Fore.RED, tokenizer.decode([word_id]),
                            colorama.Style.RESET_ALL)
                    else:
                        pert_gen_text += tokenizer.decode([word_id])
            else:
                pert_gen_text = tokenizer.decode(pert_gen_tok_text.tolist()[0])

            print("= Perturbed generated text {} =".format(i + 1))
            print(pert_gen_text)
            print()

            # saves output to text file: 'samples.txt'
            load_words_in_text_file(pert_gen_text)

        except:
            pass

        # keep the prefix, perturbed seq, original seq for each index
        generated_texts.append(
            (tokenized_cond_text, pert_gen_tok_text, unpert_gen_tok_text))

    # add inputs to input.txt
    input_text = tokenizer.decode(tokenized_cond_text)
    words = input_text.split("<|endoftext|>")
    if '<|endoftext|>' in words:
        words.remove('<|endoftext|>')
    s = ""
    input_text = s.join(words)

    emotion = get_emotion(tokenizer.decode(class_label))

    #0: no emotion, 1: anger, 2: disgust, 3: fear, 4: happiness, 5: sadness, 6: surprise

    f = open("input.txt", "w+")
    f.write(input_text)
    f.write('\n')
    f.write(emotion)
    f.close()

    return
Example no. 26
def run_pplm_example(
    pretrained_model="gpt2-medium",
    cond_text="",
    uncond=False,
    num_samples=1,
    bag_of_words=None,
    discrim=None,
    discrim_weights=None,
    discrim_meta=None,
    class_label=-1,
    length=100,
    stepsize=0.02,
    temperature=1.0,
    top_k=10,
    sample=False,
    num_iterations=3,
    grad_length=10000,
    horizon_length=1,
    window_length=0,
    decay=False,
    gamma=1.5,
    gm_scale=0.9,
    kl_scale=0.01,
    seed=0,
    no_cuda=False,
    colorama=False,
    repetition_penalty=1.0,
):
    # set Random seed
    torch.manual_seed(seed)
    np.random.seed(seed)

    # set the device
    device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu"

    if discrim == "generic":
        set_generic_model_params(discrim_weights, discrim_meta)

    if discrim is not None:
        pretrained_model = DISCRIMINATOR_MODELS_PARAMS[discrim][
            "pretrained_model"]
        print("discrim = {}, pretrained_model set to discriminator's = {}".
              format(discrim, pretrained_model))

    # load pretrained model
    model = GPT2LMHeadModel.from_pretrained(pretrained_model,
                                            output_hidden_states=True)
    model.to(device)
    model.eval()

    # load tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)

    # Freeze GPT-2 weights
    for param in model.parameters():
        param.requires_grad = False

    # figure out conditioning text
    if uncond:
        tokenized_cond_text = tokenizer.encode([tokenizer.bos_token])
    else:
        raw_text = cond_text
        while not raw_text:
            print("Did you forget to add `--cond_text`? ")
            raw_text = input("Model prompt >>> ")
        tokenized_cond_text = tokenizer.encode(tokenizer.bos_token + raw_text)

    print("= Prefix of sentence =")
    print(tokenizer.decode(tokenized_cond_text))
    print()

    # generate unperturbed and perturbed texts

    # full_text_generation returns:
    # unpert_gen_tok_text, pert_gen_tok_texts, discrim_losses, losses_in_time
    unpert_gen_tok_text, pert_gen_tok_texts, _, _ = full_text_generation(
        model=model,
        tokenizer=tokenizer,
        context=tokenized_cond_text,
        device=device,
        num_samples=num_samples,
        bag_of_words=bag_of_words,
        discrim=discrim,
        class_label=class_label,
        length=length,
        stepsize=stepsize,
        temperature=temperature,
        top_k=top_k,
        sample=sample,
        num_iterations=num_iterations,
        grad_length=grad_length,
        horizon_length=horizon_length,
        window_length=window_length,
        decay=decay,
        gamma=gamma,
        gm_scale=gm_scale,
        kl_scale=kl_scale,
        repetition_penalty=repetition_penalty,
    )

    # untokenize unperturbed text
    unpert_gen_text = tokenizer.decode(unpert_gen_tok_text.tolist()[0])

    print("=" * 80)
    print("= Unperturbed generated text =")
    print(unpert_gen_text)
    print()

    generated_texts = []

    bow_word_ids = set()
    if bag_of_words and colorama:
        bow_indices = get_bag_of_words_indices(bag_of_words.split(";"),
                                               tokenizer)
        for single_bow_list in bow_indices:
            # filtering all words in the list composed of more than 1 token
            filtered = list(filter(lambda x: len(x) <= 1, single_bow_list))
            # w[0] because w is guaranteed to have only one item by the previous filter
            bow_word_ids.update(w[0] for w in filtered)

    # iterate through the perturbed texts
    for i, pert_gen_tok_text in enumerate(pert_gen_tok_texts):
        try:
            # untokenize unperturbed text
            if colorama:
                import colorama

                pert_gen_text = ""
                for word_id in pert_gen_tok_text.tolist()[0]:
                    if word_id in bow_word_ids:
                        pert_gen_text += "{}{}{}".format(
                            colorama.Fore.RED,
                            tokenizer.decode([word_id]),
                            colorama.Style.RESET_ALL,
                        )
                    else:
                        pert_gen_text += tokenizer.decode([word_id])
            else:
                pert_gen_text = tokenizer.decode(pert_gen_tok_text.tolist()[0])

            print("= Perturbed generated text {} =".format(i + 1))
            print(pert_gen_text)
            print()
        except Exception as exc:
            print("Ignoring error while generating perturbed text:", exc)

        # keep the prefix, perturbed seq, original seq for each index
        generated_texts.append(
            (tokenized_cond_text, pert_gen_tok_text, unpert_gen_tok_text))

    return
Example no. 27
def generate_with_bow_feedback(
        pretrained_model="gpt2-medium",
        cond_text="",
        num_samples=1,
        length=100,
        stepsize=0.02,
        temperature=1.0,
        top_k=10,
        sample=True,
        num_iterations=3,
        grad_length=10000,
        horizon_length=1,
        window_length=0,
        decay=False,
        gamma=1.5,
        gm_scale=0.9,
        kl_scale=0.01,
        seed=0,
        no_cuda=False,
        colorama=False,
        verbosity='regular',
        strategy='base',
        cache_dir=None,
):
    if strategy == 'exp' and num_samples > 1:
        raise NotImplementedError(
            "num_samples > 1 is not yet implemented for 'exp' strategy.")

    # set Random seed
    torch.manual_seed(seed)
    np.random.seed(seed)

    # set verbosity
    verbosity_level = VERBOSITY_LEVELS.get(verbosity.lower(), REGULAR)

    # set the device
    device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu"

    # load pretrained model
    if cache_dir:
        model = GPT2LMHeadModel.from_pretrained(
            pretrained_model,
            cache_dir=cache_dir,
            output_hidden_states=True
        )
    else:
        model = GPT2LMHeadModel.from_pretrained(
            pretrained_model,
            output_hidden_states=True
        )
    model.to(device)
    model.eval()

    # load tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)

    # Freeze GPT-2 weights
    for param in model.parameters():
        param.requires_grad = False

    # figure out conditioning text
    raw_text = cond_text
    while not raw_text:
        print("Did you forget to add `--cond_text`? ")
        raw_text = input("Model prompt >>> ")
    tokenized_cond_text = tokenizer.encode(
        tokenizer.bos_token + raw_text,
        add_special_tokens=False
    )

    logger.info("= Prefix of sentence =")
    logger.info(tokenizer.decode(tokenized_cond_text))
    logger.info("\n")

    # generate unperturbed and perturbed texts
    # full_text_generation returns:
    # unpert_gen_tok_text, pert_gen_tok_texts, losses_in_time
    if strategy == 'base':
        pert_gen_tok_texts, _ = full_text_generation(
            model=model,
            cond_text=cond_text,
            tokenizer=tokenizer,
            context=tokenized_cond_text,
            device=device,
            num_samples=num_samples,
            length=length,
            stepsize=stepsize,
            temperature=temperature,
            top_k=top_k,
            sample=sample,
            num_iterations=num_iterations,
            grad_length=grad_length,
            horizon_length=horizon_length,
            window_length=window_length,
            decay=decay,
            gamma=gamma,
            gm_scale=gm_scale,
            kl_scale=kl_scale,
            verbosity_level=verbosity_level,
            generate_unpert=False
        )
    elif strategy == 'exp':
        prev_length = len(tokenized_cond_text)
        current_length = len(tokenized_cond_text) + 1
        current_cond_text = cond_text
        current_tokenized_cond_text = tokenized_cond_text
        while current_length < length:
            pert_gen_tok_texts, _ = full_text_generation(
                model=model,
                cond_text=current_cond_text,
                tokenizer=tokenizer,
                context=current_tokenized_cond_text,
                device=device,
                num_samples=1,
                length=current_length - prev_length,
                stepsize=stepsize,
                temperature=temperature,
                top_k=top_k,
                sample=sample,
                num_iterations=num_iterations,
                grad_length=grad_length,
                horizon_length=horizon_length,
                window_length=window_length,
                decay=decay,
                gamma=gamma,
                gm_scale=gm_scale,
                kl_scale=kl_scale,
                verbosity_level=verbosity_level,
                generate_unpert=False,
            )
            prev_length = current_length
            current_length *= 2
            current_length = min(current_length, length)
            current_cond_text = tokenizer.decode(pert_gen_tok_texts[0].tolist()[0])
            current_tokenized_cond_text = tokenizer.encode(
                current_cond_text,
                add_special_tokens=False
            )

    generated_texts = []

    bow_word_ids = set()
    if colorama:
        bow_indices = get_bag_of_words_indices(cond_text,
                                               tokenizer)
        for single_bow_list in bow_indices:
            # filtering all words in the list composed of more than 1 token
            filtered = list(filter(lambda x: len(x) <= 1, single_bow_list))
            # w[0] because w is guaranteed to have only one item by the previous filter
            bow_word_ids.update(w[0] for w in filtered)

    # iterate through the perturbed texts
    for i, pert_gen_tok_text in enumerate(pert_gen_tok_texts):
        try:
            # untokenize unperturbed text
            if colorama:
                import colorama

                pert_gen_text = ''
                for word_id in pert_gen_tok_text.tolist()[0]:
                    if word_id in bow_word_ids:
                        pert_gen_text += '{}{}{}'.format(
                            colorama.Fore.RED,
                            tokenizer.decode([word_id]),
                            colorama.Style.RESET_ALL
                        )
                    else:
                        pert_gen_text += tokenizer.decode([word_id])
            else:
                pert_gen_text = tokenizer.decode(pert_gen_tok_text.tolist()[0])

            logger.info("= Perturbed generated text {} =".format(i + 1))
            logger.info(pert_gen_text)
            logger.info("\n")
        except:
            pass

        generated_texts.append(
            tokenizer.decode(pert_gen_tok_text.tolist()[0])
        )

    return generated_texts
Example no. 28
def run_pplm_example(
    pretrained_model="gpt2-medium",
    cond_text="",
    uncond=False,
    num_samples=1,
    bag_of_words=None,
    discrim=None,
    discrim_weights=None,
    discrim_meta=None,
    class_label=-1,
    length=100,
    stepsize=0.02,
    temperature=1.0,
    top_k=10,
    sample=True,
    num_iterations=3,
    grad_length=10000,
    horizon_length=1,
    window_length=0,
    decay=False,
    gamma=1.5,
    gm_scale=0.9,
    kl_scale=0.01,
    seed=0,
    no_cuda=False,
    colorama=False,
    verbosity='regular',
    file=None,
    sample_method=PERTURBED,
    vad_loss_params=None,
    vad_threshold=0.01,
):
    # set Random seed
    torch.manual_seed(seed)
    np.random.seed(seed)

    # set verbosity
    verbosity_level = VERBOSITY_LEVELS.get(verbosity.lower(), REGULAR)

    # set generation method
    generation_method = GENERATION_METHODS.get(sample_method, PERTURBED)

    # set the device
    device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu"

    if discrim == 'generic':
        set_generic_model_params(discrim_weights, discrim_meta)

    if discrim is not None:
        discriminator_pretrained_model = DISCRIMINATOR_MODELS_PARAMS[discrim][
            "pretrained_model"]
        if pretrained_model != discriminator_pretrained_model:
            pretrained_model = discriminator_pretrained_model
            if verbosity_level >= REGULAR:
                print("discrim = {}, pretrained_model set "
                      "to discriminator's = {}".format(discrim,
                                                       pretrained_model))

    import logging
    logging.basicConfig(level=logging.INFO)

    # load pretrained model
    model = GPT2LMHeadModel.from_pretrained(
        pretrained_model,
        output_hidden_states=True  # passed to model's __init__ method
    )
    model.to(device)
    model.eval()

    # load tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)

    # Freeze GPT-2 weights
    for param in model.parameters():
        param.requires_grad = False

    # figure out conditioning text
    if uncond:
        tokenized_cond_text = tokenizer.encode([tokenizer.bos_token],
                                               add_special_tokens=False)
    else:
        raw_text = cond_text
        while not raw_text:
            print("Did you forget to add `--cond_text`? ")
            raw_text = input("Model prompt >>> ")
        tokenized_cond_text = tokenizer.encode(tokenizer.bos_token + raw_text,
                                               add_special_tokens=False)

    print("= Prefix of sentence =")
    print(tokenizer.decode(tokenized_cond_text))
    print()

    # generate unperturbed and perturbed texts

    # full_text_generation returns:
    # unpert_gen_tok_text, pert_gen_tok_texts, discrim_losses, losses_in_time
    # generate one GPT-2 sample and multiple PPLM samples
    unpert_gen_tok_text, pert_gen_tok_texts, _, _, num_changes_list = full_text_generation(
        model=model,
        tokenizer=tokenizer,
        context=tokenized_cond_text,
        device=device,
        num_samples=num_samples,
        bag_of_words=bag_of_words,
        discrim=discrim,
        class_label=class_label,
        length=length,
        stepsize=stepsize,
        temperature=temperature,
        top_k=top_k,
        sample=sample,
        num_iterations=num_iterations,
        grad_length=grad_length,
        horizon_length=horizon_length,
        window_length=window_length,
        decay=decay,
        gamma=gamma,
        gm_scale=gm_scale,
        kl_scale=kl_scale,
        verbosity_level=verbosity_level,
        file=file,
        generation_method=generation_method,
        vad_loss_params=vad_loss_params,
        vad_threshold=vad_threshold,
    )

    # untokenize unperturbed text
    unpert_gen_text = tokenizer.decode(unpert_gen_tok_text.tolist()[0])

    if verbosity_level >= REGULAR:
        print("=" * 80)
        print("= Unperturbed generated text =")
        print(unpert_gen_text)
        print()

    generated_texts = []

    bow_word_ids = set()
    if bag_of_words and colorama:
        bow_indices = get_bag_of_words_indices(bag_of_words.split(";"),
                                               tokenizer)
        for single_bow_list in bow_indices:
            # filtering all words in the list composed of more than 1 token
            filtered = list(filter(lambda x: len(x) <= 1, single_bow_list))
            # w[0] because w is guaranteed to have only one item by the previous filter
            bow_word_ids.update(w[0] for w in filtered)

    # iterate through the perturbed texts
    for i, pert_gen_tok_text in enumerate(pert_gen_tok_texts):
        try:
            # untokenize unperturbed text
            if colorama:
                import colorama

                pert_gen_text = ''
                for word_id in pert_gen_tok_text.tolist()[0]:
                    if word_id in bow_word_ids:
                        pert_gen_text += '{}{}{}'.format(
                            colorama.Fore.RED, tokenizer.decode([word_id]),
                            colorama.Style.RESET_ALL)
                    else:
                        pert_gen_text += tokenizer.decode([word_id])
            else:
                pert_gen_text = tokenizer.decode(pert_gen_tok_text.tolist()[0])

            print("= Perturbed generated text {} =".format(i + 1))
            print(pert_gen_text)
            print()

            # log sample
            if verbosity_level >= QUIET:
                if verbosity_level >= REGULAR:
                    pert_gen_text += '【{} words changed】'.format(
                        num_changes_list[i])
                file.write(pert_gen_text)

        except:
            pass

        # keep the prefix, perturbed seq, original seq for each index
        generated_texts.append(
            (tokenized_cond_text, pert_gen_tok_text, unpert_gen_tok_text))

    # log average changes
    changes_mean = stat.mean(num_changes_list)
    if verbosity_level >= QUIET:
        print('========{} words changed(mean)========'.format(changes_mean))
        if verbosity_level >= REGULAR:
            file.write('\n========{} words changed(mean)========'.format(
                changes_mean))

    return changes_mean
Example no. 29
def main():
    global logger
    args = set_interact_args()
    logger = create_logger(args)
    # use the GPU when requested and available
    args.cuda = torch.cuda.is_available() and not args.no_cuda
    device = 'cuda' if args.cuda else 'cpu'
    logger.info('using device:{}'.format(device))
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    tokenizer = BertTokenizer(vocab_file=args.voca_path)
    model = GPT2LMHeadModel.from_pretrained(args.dialogue_model_path)
    model.to(device)
    model.eval()

    n_ctx = model.config.to_dict().get("n_ctx")
    print(f'dialogue model loaded from: {args.dialogue_model_path}')

    # generate the test file
    # read the file and open the writable file
    corpus = []
    print(f'========== n_ctx of the model and datasets: {n_ctx} ==========')
    with open(args.test_data_path) as f:
        for line in f.readlines():
            line = line.lower()
            line = line.strip().replace('<user0>',
                                        '').replace('<user1>', '').replace(
                                            '__eou__', '[SEP]')
            corpus.append(line)
    fw = open(args.save_samples_path, 'w')
    for line in tqdm(corpus):
        input_ids = [tokenizer.cls_token_id
                     ] + tokenizer.encode(line) + [tokenizer.sep_token_id]
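        # truncate to the context window: keep [CLS] plus the last n_ctx - 1 tokens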
        if len(input_ids) > n_ctx:
            curr_input_tensor = torch.tensor(
                [tokenizer.cls_token_id] +
                input_ids[-(n_ctx - 1):]).long().to(device)
        else:
            curr_input_tensor = torch.tensor(input_ids).long().to(device)

        generated = []
        for _ in range(args.max_len):
            outputs = model(input_ids=curr_input_tensor)
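            # outputs[0] holds the LM logits; keep only the last position's vocabulary distribution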
            next_token_logits = outputs[0][-1, :]

            # for id in set(generated):
            #     next_token_logits[id] /= args.repetition_penalty
            next_token_logits = next_token_logits / args.temperature
            next_token_logits[tokenizer.convert_tokens_to_ids(
                '[UNK]')] = -float('Inf')
            filtered_logits = top_k_top_p_filtering(next_token_logits,
                                                    top_k=args.topk,
                                                    top_p=args.topp)
            # sample the next token from the filtered distribution
            next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1),
                                           num_samples=1)
            if next_token == tokenizer.sep_token_id:  # [SEP] marks the end of the generated response
                break
            generated.append(next_token.item())
            curr_input_tensor = torch.cat((curr_input_tensor, next_token),
                                          dim=0)

        text = tokenizer.convert_ids_to_tokens(generated)
        # ipdb.set_trace()
        text = ' '.join(text)
        fw.write(f'{text}\n')
        fw.flush()
    fw.close()
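
# A minimal sketch of the `top_k_top_p_filtering` helper assumed above (standard
# top-k / nucleus filtering over a 1-D logits tensor). It reuses the `torch` and
# `F` (torch.nn.functional) imports this example already relies on; the helper
# actually used by the example may differ, e.g. in batch handling.
def top_k_top_p_filtering_sketch(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    if top_k > 0:
        # drop everything below the k-th largest logit
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value
    if top_p > 0.0:
        # keep the smallest prefix of tokens whose cumulative probability exceeds top_p
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        sorted_indices_to_remove = cumulative_probs > top_p
        # shift right so the first token crossing the threshold is still kept
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0
        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        logits[indices_to_remove] = filter_value
    return logits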
Esempio n. 30
0
def run_pplm_example(
        pretrained_model="gpt2-medium",
        cond_text="",
        uncond=False,
        num_samples=1,
        bag_of_words=None,
        discrim=None,
        discrim_weights=None,
        discrim_meta=None,
        class_label=-1,
        length=100,
        stepsize=0.02,
        temperature=1.0,
        top_k=10,
        sample=True,
        num_iterations=3,
        grad_length=10000,
        horizon_length=1,
        window_length=0,
        decay=False,
        gamma=1.5,
        gm_scale=0.9,
        kl_scale=0.01,
        seed=0,
        no_cuda=False,
        colorama=False,
        verbosity='regular'
):
    # set the random seed
    torch.manual_seed(seed)
    np.random.seed(seed)

    # set verbosity: map the string (e.g. 'regular') to its level constant, falling back to REGULAR
    verbosity_level = VERBOSITY_LEVELS.get(verbosity.lower(), REGULAR)

    # set the device: use the GPU when available unless no_cuda is set
    device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu"

    # configure the discriminator model; the sentiment example uses discrim = 'sentiment'
    if discrim == 'generic':
        set_generic_model_params(discrim_weights, discrim_meta)

    if discrim is not None:
        discriminator_pretrained_model = DISCRIMINATOR_MODELS_PARAMS[discrim][
            "pretrained_model"
        ]
        if pretrained_model != discriminator_pretrained_model:
            pretrained_model = discriminator_pretrained_model
            if verbosity_level >= REGULAR:
                print("discrim = {}, pretrained_model set "
                "to discriminator's = {}".format(discrim, pretrained_model))

    # load the pretrained model; for the bag-of-words case this is just the base GPT-2 model
    model = GPT2LMHeadModel.from_pretrained(
        pretrained_model,
        output_hidden_states=True
    )
    model.to(device)
    model.eval()

    # load tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)

    # Freeze GPT-2 weights
    for param in model.parameters():
        param.requires_grad = False
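    # the LM weights stay frozen; PPLM only optimizes perturbations of the
    # past activations during generation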

    # figure out the conditioning text (uncond defaults to False)
    if uncond:
        tokenized_cond_text = tokenizer.encode(
            [tokenizer.bos_token],
            add_special_tokens=False
        )
    else:
        # BoW example uses cond_text="The potato"
        raw_text = cond_text
        # if cond_text is empty, prompt the user for it interactively
        while not raw_text:
            print("Did you forget to add `--cond_text`? ")
            raw_text = input("Model prompt >>> ")
        # then prepend the BOS token and tokenize
        tokenized_cond_text = tokenizer.encode(
            tokenizer.bos_token + raw_text,
            add_special_tokens=False
        )

    # print the conditioning prefix
    print("= Prefix of sentence =")
    print(tokenizer.decode(tokenized_cond_text))
    print()

    # generate unperturbed and perturbed texts

    # full_text_generation returns:
    # unpert_gen_tok_text, pert_gen_tok_texts, discrim_losses, losses_in_time
    unpert_gen_tok_text, pert_gen_tok_texts, _, _ = full_text_generation(
        model=model,
        tokenizer=tokenizer,
        context=tokenized_cond_text,
        device=device,
        num_samples=num_samples,
        bag_of_words=bag_of_words,
        discrim=discrim,
        class_label=class_label,
        length=length,
        stepsize=stepsize,
        temperature=temperature,
        top_k=top_k,
        sample=sample,
        num_iterations=num_iterations,
        grad_length=grad_length,
        horizon_length=horizon_length,
        window_length=window_length,
        decay=decay,
        gamma=gamma,
        gm_scale=gm_scale,
        kl_scale=kl_scale,
        verbosity_level=verbosity_level
    )
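    # unpert_gen_tok_text: one sample from the unmodified LM;
    # pert_gen_tok_texts: num_samples generations steered by the BoW / discriminator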

    # untokenize unperturbed text
    unpert_gen_text = tokenizer.decode(unpert_gen_tok_text.tolist()[0])

    if verbosity_level >= REGULAR:
        print("=" * 80)
    print("= Unperturbed generated text =")
    print(unpert_gen_text)
    print()

    generated_texts = []

    bow_word_ids = set()
    if bag_of_words and colorama:
        bow_indices = get_bag_of_words_indices(bag_of_words.split(";"),
                                               tokenizer)
        for single_bow_list in bow_indices:
            # keep only the BoW words that map to a single token
            filtered = list(filter(lambda x: len(x) <= 1, single_bow_list))
            # w[0] is safe: the filter above guarantees each entry has exactly one token
            bow_word_ids.update(w[0] for w in filtered)

    # iterate through the perturbed texts
    for i, pert_gen_tok_text in enumerate(pert_gen_tok_texts):
        try:
            # detokenize the perturbed text
            if colorama:
                import colorama

                pert_gen_text = ''
                for word_id in pert_gen_tok_text.tolist()[0]:
                    if word_id in bow_word_ids:
                        pert_gen_text += '{}{}{}'.format(
                            colorama.Fore.RED,
                            tokenizer.decode([word_id]),
                            colorama.Style.RESET_ALL
                        )
                    else:
                        pert_gen_text += tokenizer.decode([word_id])
            else:
                pert_gen_text = tokenizer.decode(pert_gen_tok_text.tolist()[0])

            print("= Perturbed generated text {} =".format(i + 1))
            print(pert_gen_text)
            print()
        except:
            pass

        # keep the prefix, perturbed seq, original seq for each index
        generated_texts.append(
            (tokenized_cond_text, pert_gen_tok_text, unpert_gen_tok_text)
        )

    return
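
# Hedged usage sketch (not part of the original example): a hypothetical
# bag-of-words invocation matching the comments above ("The potato" prompt).
# The bag_of_words value and the hyperparameters are illustrative assumptions only.
if __name__ == "__main__":
    run_pplm_example(
        pretrained_model="gpt2-medium",
        cond_text="The potato",
        bag_of_words="military",   # assumed ';'-separated BoW name(s)/path(s)
        length=50,
        stepsize=0.03,
        sample=True,
        num_iterations=3,
        colorama=True,
        verbosity="regular",
    )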