def InitModel(self):
    """This takes care of loading the model/dataset/tokenizing. Can be called
    async or in a separate thread so as to avoid a long waiting time."""
    logger.info(f"Starting conv model with gpu: {torch.cuda.is_available()}")
    # Start with the model and download a pretrained one if necessary
    if self.args["model_checkpoint"] == "":
        logger.debug("Downloading pretrained model...")
        self.args["model_checkpoint"] = download_pretrained_model()
    # Do model setup and tokenize the vocabulary
    tokenizer_class = (GPT2Tokenizer if self.args["model"] == "gpt2"
                       else OpenAIGPTTokenizer)
    logger.debug("Opening tokenizer class from pretrained model...")
    self.tokenizer = tokenizer_class.from_pretrained(
        self.args["model_checkpoint"])
    model_class = (GPT2LMHeadModel if self.args["model"] == "gpt2"
                   else OpenAIGPTLMHeadModel)
    logger.debug("Opening model class from pretrained model...")
    self.model = model_class.from_pretrained(self.args["model_checkpoint"])
    self.model.to(self.args["device"])
    self.model.eval()
    logger.debug("Getting dataset personalities...")
    personalities = get_dataset_personalities(self.tokenizer,
                                              self.args["dataset_path"],
                                              self.args["dataset_cache"])
    logger.debug("Selecting a random personality...")
    self.personality = random.choice(personalities)
    logger.info("Selected personality: "
                f"{self.tokenizer.decode(chain(*self.personality))}")
    self.is_ready = True
    logger.info("⭐ Model initialized and ready to go! ⭐")
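# A minimal sketch (not part of the original file) of calling InitModel on a
# background thread, as its docstring suggests; `conv_model` stands in for an
# instance of whatever class owns InitModel, and `is_ready` is the flag set
# at the end of InitModel.
import threading

def start_model_async(conv_model):
    """Load the model without blocking the caller; poll conv_model.is_ready
    to know when the model can serve requests."""
    t = threading.Thread(target=conv_model.InitModel, daemon=True)
    t.start()
    return t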
def run_interactive(tokenizer, model, args):
    logger.info("Sample a personality")
    personalities = get_dataset_personalities(tokenizer, args.dataset_path,
                                              args.dataset_cache)
    personality = list(chain(*random.choice(personalities)))
    history_encoded = []
    history_types_encoded = []
    logger.info("Selected personality: %s", tokenizer.decode(personality))
    while True:
        raw_text = input(">>> ")
        while not raw_text:
            print('Prompt should not be empty!')
            raw_text = input(">>> ")
        history_encoded.append(tokenizer.encode(raw_text))
        history_types_encoded.append(
            tokenizer.convert_tokens_to_ids(TYPE_USER))
        with torch.no_grad():
            out_ids, _ = sample_sequence(personality=personality,
                                         utterances=history_encoded,
                                         utterance_types=history_types_encoded,
                                         tokenizer=tokenizer,
                                         model=model,
                                         args=args)
        history_encoded.append(out_ids)
        history_types_encoded.append(tokenizer.convert_tokens_to_ids(TYPE_BOT))
        history_encoded = history_encoded[-(2 * args.max_history + 1):]
        history_types_encoded = history_types_encoded[-(2 * args.max_history + 1):]
        out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
        print(out_text)
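# Why the window is 2 * max_history + 1: each exchange contributes two
# utterances (user + bot), and the slice is taken right after appending the
# latest bot reply, so it keeps max_history full exchanges plus one extra
# utterance of leading context. A quick illustration with max_history = 2:
#
#   history = [u1, b1, u2, b2, u3, b3]
#   history[-(2 * 2 + 1):]  ->  [b1, u2, b2, u3, b3]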
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="",
                        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model_checkpoint", type=str, default="",
                        help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2,
                        help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--no_sample", action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=20,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--temperature", type=float, default=0.7,
                        help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0,
                        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9,
                        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        args.model_checkpoint = download_pretrained_model()

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_checkpoint)
    model = OpenAIGPTLMHeadModel.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    model.eval()

    logger.info("Sample a personality")
    personalities = get_dataset_personalities(tokenizer, args.dataset_path,
                                              args.dataset_cache)
    personality = random.choice(personalities)
    logger.info("Selected personality: %s",
                tokenizer.decode(chain(*personality)))

    history = []
    while True:
        raw_text = input(">>> ")
        while not raw_text:
            print('Prompt should not be empty!')
            raw_text = input(">>> ")
        history.append(tokenizer.encode(raw_text))
        with torch.no_grad():
            out_ids = sample_sequence(personality, history, tokenizer, model, args)
        history.append(out_ids)
        history = history[-(2 * args.max_history + 1):]
        out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
        print(out_text)
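# A minimal sketch of what --top_k / --top_p control inside sample_sequence,
# modeled on the usual top-k / nucleus filtering helper for a single decoding
# step; the function name and defaults here are assumptions, not necessarily
# this repo's exact implementation. Assumes a 1-D logits tensor.
import torch
import torch.nn.functional as F

def top_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('inf')):
    """Mask (set to -inf) logits outside the top-k set and outside the
    smallest set of tokens whose cumulative probability exceeds top_p."""
    if top_k > 0:
        # Remove every token whose logit is below the k-th largest logit
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value
    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        # Remove tokens once the cumulative probability passes the threshold,
        # but always keep the first (most probable) token
        sorted_indices_to_remove = cumulative_probs > top_p
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = False
        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        logits[indices_to_remove] = filter_value
    return logits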
def WriteAvailablePersonalities(self, filename="/tmp/personalities.txt"):
    """Lists and decodes all personalities and writes them to filename."""
    personalities = get_dataset_personalities(self.tokenizer,
                                              self.args["dataset_path"],
                                              self.args["dataset_cache"])
    with open(filename, "w") as out:
        maxFailures = 5
        failures = 0
        successes = 0
        for p in personalities:
            if failures > maxFailures:
                logger.error("Too many failures. Aborting personality write")
                break
            try:
                out.write(self.tokenizer.decode(chain(*p)))
                out.write("\n" + ("-" * 50) + "\n")
                successes += 1
            except Exception:
                logger.warning(f"Couldn't write personality: {p}")
                failures += 1
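# Hypothetical usage of the two methods above; the owning class name and
# constructor signature are assumptions:
#
#   bot = ConvModel(args)
#   bot.InitModel()                      # or start_model_async(bot)
#   bot.WriteAvailablePersonalities()    # dumps to /tmp/personalities.txt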
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="",
                        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model", type=str, default="gpt2",
                        help="Model type (gpt or gpt2)")
    parser.add_argument("--model_checkpoint", "-mc", type=str, default="",
                        help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2,
                        help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--no_sample", action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=100,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--temperature", type=float, default=0.7,
                        help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0,
                        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9,
                        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    # Add option to not use a personality. Caveat: argparse's type=bool treats
    # any non-empty string (including "False") as True, so with default=True
    # this flag can effectively only be changed by editing the default.
    parser.add_argument("--no_personality", type=bool, default=True,
                        help="Set to not sample a personality.")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        if os.path.isdir("./huggingface_s3/"):
            args.model_checkpoint = "./huggingface_s3/"
            logger.info("Loading from pre-downloaded temp path: {}".format(
                args.model_checkpoint))
        else:
            args.model_checkpoint = download_pretrained_model()

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class, model_class = (
        (GPT2Tokenizer, GPT2LMHeadModel) if "gpt2" == args.model
        else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel))
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)
    model.eval()

    # Added the option to opt out of using a personality
    if args.no_personality:
        logger.info("No personality is sampled for this chatbot.")
        personality = ""
        # personality = ["My name is Isabelle Hawkins.",
        #                "I am five years old.",
        #                "My phone number is 959-100-9300.",
        #                "Here is a link I would like you to check out: google.com.",
        #                "I would like to know more about you."]
        # personality = [tokenizer.encode(p) for p in personality]
        # logger.info("Selected custom personality: %s",
        #             tokenizer.decode(chain(*personality)))
    else:
        logger.info("Sample a personality")
        personalities = get_dataset_personalities(tokenizer, args.dataset_path,
                                                  args.dataset_cache)
        personality = random.choice(personalities)
        # import pdb; pdb.set_trace()
        logger.info("Selected personality: %s",
                    tokenizer.decode(chain(*personality)))

    # while True:
    #     custom_history = input("Press 0 to end\n\tAdd history: ")
    #     if custom_history == '0':
    #         break
    #     else:
    #         history.append(tokenizer.encode(custom_history))

    # Self-chat loop: seeded with one prompt, the model alternates between the
    # two speaker roles for ten turns per temperature setting.
    while True:
        history = []
        args.temperature = float(input("Set temperature (> 0 and <= 1): "))
        prompt = input("Speaker 1 >>> ")
        while not prompt:
            print('Prompt should not be empty!')
            prompt = input("Speaker 1 >>> ")
        history.append(tokenizer.encode(prompt))
        i = 0
        while True:
            with torch.no_grad():
                out_ids = sample_sequence(personality, history, tokenizer,
                                          model, args)
            history.append(out_ids)
            history = history[-(2 * args.max_history + 1):]
            out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
            i += 1
            speaker = "Speaker 2" if i % 2 else "Speaker 1"
            print(f"{speaker}: {out_text}")
            if i == 10:
                break
def main(args):
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    logger.info("Import %s", args.train_mod)
    train_mod = __import__(args.train_mod, globals(), locals(), [
        'GPT2BPETokenizer_CN', 'SPECIAL_TOKENS', 'build_input_from_segments',
        'add_special_tokens_'
    ], 0)
    global SPECIAL_TOKENS, build_input_from_segments, add_special_tokens_
    GPT2BPETokenizer_CN = train_mod.GPT2BPETokenizer_CN
    SPECIAL_TOKENS = train_mod.SPECIAL_TOKENS
    build_input_from_segments = train_mod.build_input_from_segments
    add_special_tokens_ = train_mod.add_special_tokens_

    logger.info("Get pretrained model and tokenizer")
    model_class, tokenizer_class = GPT2DoubleHeadsModel, GPT2BPETokenizer_CN
    logger.info("load tokenizer....")
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    logger.info("load model....")
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)
    n_positions = len(model.transformer.wpe.weight)

    logger.info("Sample a personality")
    personalities = get_dataset_personalities(tokenizer, args.dataset_path,
                                              args.dataset_cache)
    personality = random.choice(personalities)
    logger.info("Selected personality: %s",
                tokenizer.decode(chain(*personality)))

    # Clear the history when a HUP signal is received
    def sighup_fn(signum, frame):
        logger.info('Signal %s!', signum)
        nonlocal history
        history = []

    signal.signal(signal.SIGHUP, sighup_fn)

    history = []
    try:
        while True:
            raw_text = input(">>> ").strip()
            while not raw_text:
                print('Prompt should not be empty!')
                raw_text = input(">>> ").strip()
            history.append(tokenizer.encode(raw_text))
            with torch.no_grad():
                # Stream tokens as they are generated
                out_ids = []
                for out_id in sample_generate(personality, history, tokenizer,
                                              model, args):
                    out_ids.append(out_id)
                    out_text = tokenizer.decode([out_id],
                                                skip_special_tokens=True)
                    print(out_text.strip(), end='')
                print()
            history.append(out_ids)
            history = history[-(2 * args.max_history + 1):]
            history_length = sum(len(ids) for ids in history)
            if history_length >= n_positions:
                # warnings.warn takes a single message, not logger-style args
                warnings.warn(
                    f'History is {history_length} tokens, which is >= the '
                    f'model limit of {n_positions}; the program will crash!')
    except KeyboardInterrupt:
        logger.warning('KeyboardInterrupt')
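# The SIGHUP handler above allows clearing the chat history from outside the
# process. A minimal sketch of triggering it from another Python process
# (Unix only; `pid` is the chat process id, e.g. taken from `ps`):
import os
import signal

def reset_chat_history(pid):
    # Delivers SIGHUP; sighup_fn in main() then empties `history`.
    os.kill(pid, signal.SIGHUP)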
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="",
                        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model", type=str, default="gpt2",
                        help="Model type (gpt or gpt2)")
    parser.add_argument("--model_checkpoint", "-mc", type=str, default="",
                        help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2,
                        help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--no_sample", action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=50,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--temperature", type=float, default=0.7,
                        help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0,
                        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9,
                        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    # Add option to not use a personality (same argparse type=bool caveat as above)
    parser.add_argument("--no_personality", type=bool, default=True,
                        help="Set to not sample a personality.")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        if os.path.isdir("./huggingface_s3/"):
            args.model_checkpoint = "./huggingface_s3/"
            logger.info("Loading from pre-downloaded temp path: {}".format(
                args.model_checkpoint))
        else:
            args.model_checkpoint = download_pretrained_model()

    # Build a fixed evaluation set of 1000 shuffled histories from DailyDialog
    with open("dailydialog_formatted.json", "r") as f:
        valid = json.load(f)
    inputs = [utterance['history'] for instance in valid
              for utterance in instance['utterances']]
    shuffled_inputs = shuffle(inputs, random_state=42, n_samples=1000)

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class = GPT2Tokenizer if "gpt2" == args.model else OpenAIGPTTokenizer
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model_class = GPT2LMHeadModel if "gpt2" == args.model else OpenAIGPTLMHeadModel
    model = model_class.from_pretrained(args.model_checkpoint)
    add_special_tokens_(model, tokenizer)
    model.to(args.device)
    model.eval()

    # Added the option to opt out of using a personality
    if args.no_personality:
        logger.info("No personality is sampled for this chatbot.")
        personality = [""]
        # personality = ["My name is Isabelle Hawkins.",
        #                "I am five years old.",
        #                "My phone number is 959-100-9300.",
        #                "Here is a link I would like you to check out: google.com.",
        #                "I would like to know more about you."]
        # personality = [tokenizer.encode(p) for p in personality]
        # logger.info("Selected custom personality: %s",
        #             tokenizer.decode(chain(*personality)))
    else:
        logger.info("Sample a personality")
        personalities = get_dataset_personalities(tokenizer, args.dataset_path,
                                                  args.dataset_cache)
        personality = random.choice(personalities)
        # import pdb; pdb.set_trace()
        logger.info("Selected personality: %s",
                    tokenizer.decode(chain(*personality)))

    device = 'gpu' if args.device == 'cuda' else 'cpu'
    time_path = Path(args.model_checkpoint).absolute().name + f"{device}_predict_time.txt"
    f = open(time_path, 'w')
    total_time = 0
    lines = []
    for history in tqdm(shuffled_inputs):
        start = time.time()
        tokenized_history = [tokenizer.encode(h) for h in history]
        with torch.no_grad():
            out_ids = sample_sequence(personality, tokenized_history,
                                      tokenizer, model, args)
        out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
        elapsed = time.time() - start
        rounded_ = round(elapsed, 2)
        total_time += elapsed
        input_text = 'Input: ' + ' --- '.join(history) + '\n'
        output = f'Output: {out_text}\n'
        elapsed_text = f'Elapsed time: {rounded_}s\n'
        lines.append(input_text + output + elapsed_text)

    if args.device == 'cpu':
        processor = get_processor_name()
        f.write(f"CPU predictions\nProcessor: {processor}\n")
    else:
        f.write("GPU predictions\n")
    avg_time = round((total_time / len(shuffled_inputs)), 2)
    f.write(f"Average time elapsed per prediction: {avg_time} seconds\n\n")
    f.writelines(lines)
    f.close()
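# get_processor_name() is called in the CPU branch above but not defined in
# this section; a plausible sketch, assuming it should return the CPU model
# string (reading /proc/cpuinfo on Linux, with a generic fallback):
import platform

def get_processor_name():
    if platform.system() == "Linux":
        with open("/proc/cpuinfo") as cpuinfo:
            for line in cpuinfo:
                if line.startswith("model name"):
                    return line.split(":", 1)[1].strip()
    return platform.processor()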
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="",
                        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model", type=str, default="gpt",
                        help="Model type (gpt or gpt2)")
    parser.add_argument("--model_checkpoint", type=str, default="",
                        help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2,
                        help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--no_sample", action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=20,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--temperature", type=float, default=0.7,
                        help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0,
                        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9,
                        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        args.model_checkpoint = download_pretrained_model()

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class = GPT2Tokenizer if "gpt2" == args.model else OpenAIGPTTokenizer
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model_class = GPT2LMHeadModel if "gpt2" == args.model else OpenAIGPTLMHeadModel
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    model.eval()

    logger.info("Sample a personality")
    personalities = get_dataset_personalities(tokenizer, args.dataset_path,
                                              args.dataset_cache)
    personality = random.choice(personalities)
    logger.info("Selected personality: %s",
                tokenizer.decode(chain(*personality)))

    history = []
    while True:
        print("MyData: Type 'convo' for an interactive chat OR type 'ask' "
              "for asking questions...")
        raw_text = input("You: ")
        raw_text = raw_text.lower()
        if raw_text.replace(' ', '') == 'ask':
            print('MyData: Please ask your question.')
            message = input('You: ')
            main(message)
        elif raw_text.replace(' ', '') == 'convo':
            print("MyData: Hello, what's up?")
            continue
        while not raw_text:
            print("Please, I did not get that")
            raw_text = input("You: ")
        history.append(tokenizer.encode(raw_text))
        with torch.no_grad():
            out_ids = sample_sequence(personality, history, tokenizer, model, args)
        history.append(out_ids)
        history = history[-(2 * args.max_history + 1):]
        out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
        print("MyData: {}".format(out_text))
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="",
                        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model", type=str, default="gpt",
                        help="Model type (gpt or gpt2)")
    parser.add_argument("--model_checkpoint", type=str, default="",
                        help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2,
                        help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--no_sample", action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=20,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--temperature", type=float, default=0.7,
                        help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0,
                        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9,
                        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        args.model_checkpoint = download_pretrained_model()

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class = GPT2Tokenizer if "gpt2" == args.model else OpenAIGPTTokenizer
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model_class = GPT2LMHeadModel if "gpt2" == args.model else OpenAIGPTLMHeadModel
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)
    model.eval()

    B_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    B_model = BertModel.from_pretrained('bert-base-uncased')
    B_model.eval()

    logger.info("Sample a personality")
    personalities = get_dataset_personalities(tokenizer, args.dataset_path,
                                              args.dataset_cache)
    personality = random.choice(personalities)
    logger.info("Selected personality: %s",
                tokenizer.decode(chain(*personality)))

    history = []

    class WindowClass(QMainWindow, form_class):

        def __init__(self):
            super().__init__()
            self.setupUi(self)
            self.enter.clicked.connect(self.enterPlainTextEdit)

        # Handler tied to the PlainTextEdit widget
        def enterPlainTextEdit(self):
            # `history` is rebound below, so it must be declared nonlocal to
            # keep referring to the list defined in run()
            nonlocal history
            raw_text = self.user.toPlainText()
            self.bot.append(raw_text)
            history.append(tokenizer.encode(raw_text))
            with torch.no_grad():
                out_ids = sample_sequence(personality, history, tokenizer,
                                          B_tokenizer, model, args, B_model)
            history.append(out_ids)
            history = history[-(2 * args.max_history + 1):]
            out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
            self.bot.append(out_text)

    app = QApplication(sys.argv)
    myWindow = WindowClass()
    myWindow.show()
    app.exec_()
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="",
                        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--model", type=str, default="bert",
                        help="Model type (bert or gpt2)")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model_checkpoint", type=str,
                        default="/home/rohola/codes/transfer-learning-conv-ai/logs/logs",
                        help="Path, url or short name of the model")
    # parser.add_argument("--model_checkpoint", type=str, default="",
    #                     help="Path, url or short name of the model")
    # parser.add_argument("--model_checkpoint", type=str,
    #                     default="/home/rohola/data/bert_checkpoint",
    #                     help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2,
                        help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--no_sample", action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=20,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1,
                        help="Minimum length of the output utterances")
    # args.length is consumed below but had no corresponding CLI flag
    parser.add_argument("--length", type=int, default=-1,
                        help="Length of the generated sequence (<0: use model maximum)")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--temperature", type=float, default=0.7,
                        help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0,
                        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9,
                        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    if args.model == "bert":
        tokenizer_class = BertTokenizer
        model_class = BertLMHeadModel
    elif args.model == "gpt2":
        tokenizer_class = GPT2Tokenizer
        model_class = GPT2LMHeadModel

    if args.model == "bert":
        SPECIAL_TOKENS = ["[BOS]", "[EOS]", "[SPEAKER1]", "[SPEAKER2]", "[PAD]"]
    else:
        SPECIAL_TOKENS = ["<bos>", "<eos>", "<speaker1>", "<speaker2>", "<pad>"]

    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    model.eval()

    if args.length < 0 and model.config.max_position_embeddings > 0:
        args.length = model.config.max_position_embeddings
    elif 0 < model.config.max_position_embeddings < args.length:
        # No generation bigger than model size
        args.length = model.config.max_position_embeddings
    elif args.length < 0:
        args.length = MAX_LENGTH  # avoid infinite loop

    logger.info("Sample a personality")
    personalities = get_dataset_personalities(tokenizer, args.dataset_path,
                                              args.dataset_cache)
    personality = random.choice(personalities)
    logger.info("Selected personality: %s",
                tokenizer.decode(chain(*personality)))

    history = []
    while True:
        raw_text = input(">>> ")
        while not raw_text:
            print('Prompt should not be empty!')
            raw_text = input(">>> ")
        history.append(tokenizer.encode(raw_text))
        with torch.no_grad():
            # out_ids = sample_sequence(personality, history, tokenizer, model, args, SPECIAL_TOKENS)
            out_ids = sample_sequence(
                args=args,
                model=model,
                personality=personality,
                history=history,
                length=args.length,
                temperature=args.temperature,
                top_k=args.top_k,
                top_p=args.top_p,
                device=args.device,
                is_xlnet=bool(args.model == "xlnet"),
            )
        history.append(out_ids)
        history = history[-(2 * args.max_history + 1):]
        out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
        print(out_text)
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="",
                        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model", type=str, default="gpt2",
                        help="Model type (gpt or gpt2)")
    parser.add_argument("--model_checkpoint", "-mc", type=str,
                        default="runs/gpt2_convai_yesand",
                        help="Path, url or short name of the model")
    parser.add_argument("--max_history", "-mh", type=int, default=5,
                        help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--no_sample", action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=20,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--temperature", type=float, default=0.7,
                        help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0,
                        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9,
                        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    # Set the number of top choices to show
    parser.add_argument("--top_c", type=int, default=10,
                        help="Determine how many top choices to be shown.")
    # Add option to not use a personality
    parser.add_argument("--no_personality", "-np", action='store_true',
                        help="Set to not sample a personality.")
    # Use a text file to deduce results
    parser.add_argument("--email_sequence", "-es", default=None,
                        help="Provide the text file for which to generate outputs for the chatbot.")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        if os.path.isdir("./huggingface_s3/"):
            args.model_checkpoint = "./huggingface_s3/"
            logger.info("Loading from pre-downloaded temp path: {}".format(
                args.model_checkpoint))
        else:
            args.model_checkpoint = download_pretrained_model()

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class = GPT2Tokenizer if "gpt2" == args.model else OpenAIGPTTokenizer
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model_class = GPT2LMHeadModel if "gpt2" == args.model else OpenAIGPTLMHeadModel
    model = model_class.from_pretrained(args.model_checkpoint)
    add_special_tokens_(model, tokenizer)
    model.to(args.device)
    model.eval()

    # Added the option to opt out of using a personality
    if args.no_personality:
        logger.info("No personality is sampled for this chatbot.")
        personality = ""
        # personality = ["My name is Isabelle Hawkins.",
        #                "I am five years old.",
        #                "My phone number is 959-100-9300.",
        #                "Here is a link I would like you to check out: google.com.",
        #                "I would like to know more about you."]
        # personality = [tokenizer.encode(p) for p in personality]
        # logger.info("Selected custom personality: %s",
        #             tokenizer.decode(chain(*personality)))
    else:
        logger.info("Sample a personality")
        personalities = get_dataset_personalities(tokenizer, args.dataset_path,
                                                  args.dataset_cache)
        personality = random.choice(personalities)
        # import pdb; pdb.set_trace()
        logger.info("Selected personality: %s",
                    tokenizer.decode(chain(*personality)))

    # Test case to see if the decoder works as expected. If not, check whether
    # the special tokens were appropriately added to the tokenizer.
    test_case = "Hello, my name is Justin. I'd like a strawberry cheesecake."
    test_encode = tokenizer.encode(test_case)
    test_decode = tokenizer.decode(test_encode)
    assert test_case == test_decode

    # Adapt code for efficient experimentation with existing email exchanges
    if args.email_sequence:
        logger.info(f"Running chatbot generations for "
                    f"{os.path.split(args.email_sequence)[-1]}")
        while True:
            with open(args.email_sequence, 'r') as f:
                email_sequence = f.readlines()
            his_length = int(input(
                "Indicate how many exchanges you want to refer back to - must "
                "be an integer. \n1 indicates only the most recent email from "
                "the scammer: "))
            # Remove any empty lines
            email_sequence = [e for e in email_sequence if re.sub('\n', '', e)]
            email_sequence = email_sequence[-((his_length - 1) * 2 + 1):]
            history = [tokenizer.encode(e) for e in email_sequence]
            logger.info("Used input:\n")
            for idx, e in enumerate(email_sequence):
                output = f"\tUser: {e}" if idx % 2 else f"\tScammer: {e}"
                print(output)
            with torch.no_grad():
                out_ids = sample_sequence(personality, history, tokenizer,
                                          model, args)
            out_texts = [tokenizer.decode(o, skip_special_tokens=True)
                         for o in out_ids]
            print(f"Top {args.top_c} choices of history length = {his_length}:")
            for idx, o in enumerate(out_texts):
                print(f"\t{idx}: {o}")
    # Manual generation
    else:
        history = []
        while True:
            custom_history = input("Press 0 to end\n\tAdd history: ")
            if custom_history == '0':
                break
            else:
                history.append(tokenizer.encode(custom_history))
        while True:
            raw_text = input("Scammer >>> ")
            while not raw_text:
                print('Prompt should not be empty!')
                raw_text = input("Scammer >>> ")
            history.append(tokenizer.encode(raw_text))
            with torch.no_grad():
                out_ids = sample_sequence(personality, history, tokenizer,
                                          model, args)
            # Multiple retries: the history is intentionally not extended with
            # the model output here.
            # history.append(out_ids)
            # history = history[-(2*args.max_history+1):]
            out_texts = [tokenizer.decode(o, skip_special_tokens=True)
                         for o in out_ids]
            print(f"Top {args.top_c} choices:")
            for idx, o in enumerate(out_texts):
                print(f"\t{idx}: {o}")