def model_tokenizer(args):
    if torch.cuda.is_available():
        args["device"] = "cuda"
    else:
        args["device"] = "cpu"

    if args["model_checkpoint"] == "":
        if args["model"] == 'gpt2':
            raise ValueError("Interacting with GPT2 requires passing a finetuned model_checkpoint")
        else:
            args["model_checkpoint"] = download_pretrained_model()

    if args["seed"] != 0:
        random.seed(args["seed"])
        torch.random.manual_seed(args["seed"])
        torch.cuda.manual_seed(args["seed"])

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class, model_class = (GPT2Tokenizer, GPT2LMHeadModel) if args["model"] == 'gpt2' else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel)
    tokenizer = tokenizer_class.from_pretrained(args["model_checkpoint"])
    model = model_class.from_pretrained(args["model_checkpoint"])
    model.to(args["device"])
    add_special_tokens_(model, tokenizer)

    logger.info("Get text to emote model")
    emote_clf = txtemote_model(args["txtemotion_dataset_path"])

    return model, emote_clf, tokenizer
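# Minimal usage sketch for model_tokenizer. The dataset path below is a hypothetical
# placeholder, not a value taken from this repo; "device" is overwritten inside the
# function based on CUDA availability.
# args = {
#     "model": "openai-gpt",
#     "model_checkpoint": "",  # empty string triggers download_pretrained_model()
#     "seed": 0,
#     "txtemotion_dataset_path": "./data/text_emotion.csv",
#     "device": "cpu",
# }
# model, emote_clf, tokenizer = model_tokenizer(args)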
def run():
    pretrained_model = utils.download_pretrained_model()
    tokenizer_class, model_class = (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel)
    tokenizer = tokenizer_class.from_pretrained(pretrained_model)
    model = model_class.from_pretrained(pretrained_model)
    model.to("cpu")
    add_special_tokens_(model, tokenizer)

    dataset = utils.get_dataset(tokenizer, "./dataset_cache")
    features = [dialog["feature"] for dataset in dataset.values() for dialog in dataset]
    feature = random.choice(features)
    print("Examples of selected feature:\n", tokenizer.decode(itertools.chain(*feature)))

    background = [tokenizer.encode("tell me about yourself")]
    generated_lyrics = []
    hist_size = 2
    for _ in range(5):  # number of lyric lines to generate - runtime grows with this value
        with torch.no_grad():
            out_ids = sample_sequence(feature, background, tokenizer, model)
        background.append(out_ids)
        background.append(random.choice(background))
        background = background[-5:]  # size of history to retain (needs to be an odd number since we're using a two-headed model)
        this_line = tokenizer.decode(out_ids, skip_special_tokens=True)
        generated_lyrics.append(this_line)

    print("\nGenerated lyrics:")
    print("\n".join(generated_lyrics))
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="", help="Path or url of the dataset.")
    parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")
    parser.add_argument("--gpt2_model_name", type=str, default="gpt2", help="Name of the model, e.g. openai-gpt")
    parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=40, help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    parser.add_argument("--temperature", type=float, default=1.0, help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    bert_model = BertModel.from_pretrained('bert-base-uncased')
    bert_model.to(args.device)
    bert_model.eval()

    tokenizer_class = GPT2Tokenizer if "gpt2" in args.gpt2_model_name else OpenAIGPTTokenizer  # can't use AutoTokenizer because the checkpoint could be a Path
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    config_class = GPT2Config if "gpt2" in args.gpt2_model_name else OpenAIGPTConfig
    gpt_config = config_class.from_pretrained(args.model_checkpoint)
    model_class = GPT2LMHeadModel if "gpt2" in args.gpt2_model_name else OpenAIGPTLMHeadModel
    model = model_class.from_pretrained(args.model_checkpoint, config=gpt_config)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)
    model.eval()

    sourceList, targetList = get_test_datasetEN2(bert_tokenizer, tokenizer, args.dataset_path)
    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    f1 = open(args.model_checkpoint + current_time + "_output.txt", 'w')
    for line in zip(sourceList, targetList):
        out_ids = sample_sequence(line[0], bert_tokenizer, model, bert_model, tokenizer, args)
        out_texts = tokenizer.decode(out_ids)
        for text in out_texts:
            f1.write(text.replace('▁', ' ').replace('</s>', ' '))
        """
        for id in out_ids:
            f1.write(str(id))
            f1.write(' ')
        """
        f1.write("\n")
    f1.close()
def load_model(model_checkpoint, model_name='openai-gpt', device='cuda:0'):
    tokenizer_class, model_class = (GPT2Tokenizer, GPT2LMHeadModel) if model_name == 'gpt2' else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel)
    tokenizer = tokenizer_class.from_pretrained(model_checkpoint)
    model = model_class.from_pretrained(model_checkpoint)
    model.to(device)
    add_special_tokens_(model, tokenizer)
    return model, tokenizer
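# Minimal usage sketch for load_model; the checkpoint path is a hypothetical placeholder:
# model, tokenizer = load_model("./runs/my_finetuned_gpt2", model_name="gpt2", device="cpu")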
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache")
    parser.add_argument("--model", type=str, default="openai-gpt", help="Model type (openai-gpt or gpt2)", choices=['openai-gpt', 'gpt2'])  # anything besides gpt2 will load openai-gpt
    parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2, help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")
    parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=20, help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    parser.add_argument("--temperature", type=float, default=0.7, help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        if args.model == 'gpt2':
            raise ValueError("Interacting with GPT2 requires passing a finetuned model_checkpoint")
        else:
            args.model_checkpoint = download_pretrained_model()

    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class, model_class = (GPT2Tokenizer, GPT2LMHeadModel) if args.model == 'gpt2' else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel)
    global tokenizer
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    global model
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    logger.info("Sample a personality")
    dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache)
    personalities = [dialog["personality"] for dataset in dataset.values() for dialog in dataset]
    personality = random.choice(personalities)
    logger.info("Selected personality: %s", tokenizer.decode(chain(*personality)))

    return model, tokenizer, args, personality
def __init__(self, opt, shared=None):
    super(TransformerAgent, self).__init__(opt, shared)

    args = AttrDict(opt)  # to keep most commands identical to the interact.py script
    self.args = args

    logging.basicConfig(level=logging.INFO)
    self.logger = logging.getLogger(__file__)
    self.logger.info(pformat(args))

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    if shared is None:
        self.logger.info("Get pretrained model and tokenizer")
        if args.model_checkpoint == "":
            args.model_checkpoint = download_pretrained_model()

        if 'gpt2' in args.model_checkpoint:
            self.tokenizer = GPT2Tokenizer.from_pretrained(args.model_checkpoint)
            model_class = GPT2DoubleHeadsModel if self.args.eval_type == "hits@1" else GPT2LMHeadModel
        else:
            self.tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_checkpoint)
            model_class = OpenAIGPTDoubleHeadsModel if self.args.eval_type == "hits@1" else OpenAIGPTLMHeadModel

        self.model_checkpoint = model_class.from_pretrained(args.model_checkpoint)
        self.model_checkpoint.to(args.device)

        self.logger.info("Build BPE prefix dictionary")
        convai_dict = build_dict()
        assert len(convai_dict) == 19304
        self.prefix2words = self.get_prefix2words(convai_dict)
    else:
        self.model_checkpoint = shared['model']
        self.tokenizer = shared['tokenizer']
        self.prefix2words = shared['prefix2words']

    add_special_tokens_(self.model_checkpoint, self.tokenizer)
    self.special_tokens_ids = self.tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS)

    self.persona = []
    self.history = []
    self.labels = []

    self.reset()
def get_model(
    dataset_path="",
    dataset_cache='./dataset_cache',
    model="openai-gpt",
    model_checkpoint="",
    device="cuda" if torch.cuda.is_available() else "cpu",
    seed=0,
):
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)

    if model_checkpoint == "":
        if model == 'gpt2':
            raise ValueError("Interacting with GPT2 requires passing a finetuned model_checkpoint")
        else:
            model_checkpoint = download_pretrained_model()

    if seed != 0:
        random.seed(seed)
        torch.random.manual_seed(seed)
        torch.cuda.manual_seed(seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class, model_class = (GPT2Tokenizer, GPT2LMHeadModel) if model == 'gpt2' else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel)
    tokenizer = tokenizer_class.from_pretrained(model_checkpoint)
    model = model_class.from_pretrained(model_checkpoint)
    model.to(device)
    add_special_tokens_(model, tokenizer)

    logger.info("Sample a personality")
    dataset = get_dataset(tokenizer, dataset_path, dataset_cache)
    personalities = [dialog["personality"] for dataset in dataset.values() for dialog in dataset]
    personality = random.choice(personalities)
    logger.info("Selected personality: %s", tokenizer.decode(chain(*personality)))

    return model, personality, tokenizer
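# Minimal usage sketch for get_model, keeping the defaults: an empty model_checkpoint
# downloads the pretrained openai-gpt checkpoint and get_dataset fills ./dataset_cache.
# model, personality, tokenizer = get_model(device="cpu")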
def init():
    args = {
        "dataset_path": "",
        "dataset_cache": "./dataset_cache_GPT2tokenizer",
        "model": "gpt2",
        "model_checkpoint": "../runs/Sep19_21-11-42_micah-HP-ENVY-x360-Convertible-15-ee0xxx_gpt2/",
        "max_history": 2,
        "device": "cpu",
        "max_length": 20,
        "min_length": 1,
        "seed": 0,
        "temperature": 0.7,
        "top_k": 0,
        "top_p": 0.9
    }

    if args.get("model_checkpoint") == "":
        if args.get("model") == 'gpt2':
            raise ValueError("Interacting with GPT2 requires passing a finetuned model_checkpoint")
        else:
            args["model_checkpoint"] = download_pretrained_model()

    if args.get("seed") != 0:
        random.seed(args.get("seed"))
        torch.random.manual_seed(args.get("seed"))
        torch.cuda.manual_seed(args.get("seed"))

    print("Get pretrained model and tokenizer")
    tokenizer_class, model_class = (GPT2Tokenizer, GPT2LMHeadModel) if args.get("model") == 'gpt2' else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel)
    tokenizer = tokenizer_class.from_pretrained(args.get("model_checkpoint"))
    model = model_class.from_pretrained(args.get("model_checkpoint"))
    model.to(args.get("device"))
    add_special_tokens_(model, tokenizer)

    print("Sample a personality")
    dataset = get_dataset(tokenizer, args.get("dataset_path"), args.get("dataset_cache"))
    personalities = [dialog["personality"] for dataset in dataset.values() for dialog in dataset]
    personality = random.choice(personalities)
    print(tokenizer.decode(chain(*personality)))

    return tokenizer, personality, model, args
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache")
    parser.add_argument("--model", type=str, default="openai-gpt", help="Model type (openai-gpt or gpt2)", choices=['openai-gpt', 'gpt2'])  # anything besides gpt2 will load openai-gpt
    parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2, help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")
    parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=20, help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    parser.add_argument("--temperature", type=float, default=0.7, help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        if args.model == 'gpt2':
            raise ValueError("Interacting with GPT2 requires passing a finetuned model_checkpoint")
        else:
            args.model_checkpoint = download_pretrained_model()

    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class, model_class = (GPT2Tokenizer, GPT2LMHeadModel) if args.model == 'gpt2' else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel)
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    logger.info("Sample a personality")
    dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache)
    personalities = [dialog["personality"] for dataset in dataset.values() for dialog in dataset]
    personality = random.choice(personalities)
    logger.info("Selected personality: %s", tokenizer.decode(chain(*personality)))
    tts = gTTS(tokenizer.decode(chain(*personality)))
    tts.save('out.mp3')
    playsound('out.mp3')

    # YASSIN
    @app.route('/<string:page_name>/')
    def render_static(page_name):
        return render_template('%s.html' % page_name)

    @app.route('/api/chat', methods=['POST', 'GET'])
    def result():
        if request.method == 'POST':
            message = request.form.to_dict()
            message = message["message"]
            print(message)

    history = []
    while True:
        # raw_text = input(">>> ")
        print("Say something!")
        with sr.Microphone() as source:
            audio = r.listen(source)
        GOOGLE_CLOUD_SPEECH_CREDENTIALS = r"""{json_google_credentials}"""
        raw_text = ""  # fall back to typed input below if recognition fails
        try:
            # for testing purposes, we're just using the default API key
            # to use another API key, use `r.recognize_google(audio, key="GOOGLE_SPEECH_RECOGNITION_API_KEY")`
            # instead of `r.recognize_google(audio)`
            raw_text = r.recognize_google_cloud(audio, credentials_json=GOOGLE_CLOUD_SPEECH_CREDENTIALS)
            print("Google Cloud Speech thinks you said " + raw_text)
        except sr.UnknownValueError:
            print("Google Speech Recognition could not understand audio")
        except sr.RequestError as e:
            print("Could not request results from Google Speech Recognition service; {0}".format(e))
        while not raw_text:
            print('Prompt should not be empty!')
            raw_text = input(">>> ")
        history.append(tokenizer.encode(raw_text))
        with torch.no_grad():
            out_ids = sample_sequence(personality, history, tokenizer, model, args)
        history.append(out_ids)
        history = history[-(2 * args.max_history + 1):]
        out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
        print(out_text)
        tts = gTTS(out_text)
        tts.save('out.mp3')
        playsound('out.mp3')
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache")
    parser.add_argument("--model", type=str, default="openai-gpt", help="Model type (openai-gpt or gpt2)", choices=['openai-gpt', 'gpt2'])  # anything besides gpt2 will load openai-gpt
    parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2, help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")
    parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=20, help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--temperature", type=float, default=0.7, help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    parser.add_argument("--n_samples", type=int, default=10)
    parser.add_argument("--sample_term", type=int, default=1)
    args = parser.parse_args()

    if args.model_checkpoint == "":
        if args.model == 'gpt2':
            raise ValueError("Interacting with GPT2 requires passing a finetuned model_checkpoint")
        else:
            args.model_checkpoint = download_pretrained_model()

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    tokenizer_class, model_class = (GPT2Tokenizer, GPT2LMHeadModel) if args.model == 'gpt2' else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel)
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache)
    # sample_idxs = range(args.n_samples)
    sample_idxs = [args.sample_term * i for i in range(1, args.n_samples + 1)]
    for i in sample_idxs:
        personality = dataset['valid'][i]['personality']
        history = dataset['valid'][i]['utterances'][4]['history']
        target = dataset['valid'][i]['utterances'][4]['candidates'][-1]
        with torch.no_grad():
            out_ids = sample_sequence(personality, history, tokenizer, model, args)
        out_text = tokenizer.decode(out_ids, skip_special_tokens=True)

        print('Persona info:')
        for persona in personality:
            print(tokenizer.decode(persona, skip_special_tokens=True), end=' ')
        print('\nDialog:')
        for his in history:
            print(tokenizer.decode(his, skip_special_tokens=True))
        print('Target:')
        print(tokenizer.decode(target, skip_special_tokens=True))
        print('Prediction:')
        print(out_text, end='\n\n')
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache")
    parser.add_argument("--model", type=str, default="gpt", help="Model type (gpt or gpt2)")
    parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2, help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")
    parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=20, help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--temperature", type=float, default=0.7, help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        args.model_checkpoint = download_pretrained_model()

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class = GPT2Tokenizer if "gpt2" == args.model else OpenAIGPTTokenizer
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model_class = GPT2LMHeadModel if "gpt2" == args.model else OpenAIGPTLMHeadModel
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    logger.info("Sample a personality")
    # personalities = get_dataset_personalities(tokenizer, args.dataset_path, args.dataset_cache)
    # personality = random.choice(personalities)
    # logger.info("Selected personality: %s", tokenizer.decode(chain(*personality)))

    wordfile = './data/truncate.txt'  # word vector file, can be downloaded from GloVe website
    weightfile = './auxiliary_data/enwiki_vocab_min200.txt'  # each line is a word and its frequency
    weightpara = 1e-3  # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]

    # load word vectors
    (words, We) = data_io.getWordmap(wordfile)
    # load word weights
    word2weight = data_io.getWordWeight(weightfile, weightpara)  # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(words, word2weight)  # weight4ind[i] is the weight for the i-th word

    p = 0
    start_time = time.time()
    with open('data_volunteers.json') as json_file:
        json_data = json.load(json_file)
        for i in json_data:
            p += 1
            # if p < 1100:
            #     continue
            history = []
            personality = []
            query_set = []
            json_dialog = i["dialog"]
            json_bot = i["bot_profile"]
            for j in json_bot:
                personality.append(tokenizer.encode(j))
            # logger.info("Selected personality: %s", tokenizer.decode(chain(*personality)))
            persona = tokenizer.decode(chain(*personality))
            row = {"Personality": persona}
            text = []
            for j in json_dialog:
                if j["sender_class"] == "Human":
                    json_text = j["text"]
                    raw_text = json_text
                    check = tokenizer.decode(tokenizer.encode(raw_text), skip_special_tokens=True)
                    if check == "":
                        history.append(tokenizer.encode(raw_text))
                        with torch.no_grad():
                            out_ids = normal_sample_sequence(personality, history, tokenizer, model, args)
                        # history.append(out_ids)
                        history = history[-(2 * args.max_history + 1):]
                        out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
                        text.append({
                            "evaluation_score": j["evaluation_score"],
                            "id": j["id"],
                            "sender": j["sender"],
                            "sender_class": j["sender_class"],
                            "text": raw_text,
                            "generated_text": out_text
                        })
                        continue
                    history.append(tokenizer.encode(raw_text))
                    with torch.no_grad():
                        out_ids = sample_sequence(personality, history, tokenizer, model, args, words, weight4ind, We)
                    # history.append(out_ids)
                    history = history[-(2 * args.max_history + 1):]
                    out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
                    text.append({
                        "evaluation_score": j["evaluation_score"],
                        "id": j["id"],
                        "sender": j["sender"],
                        "sender_class": j["sender_class"],
                        "text": raw_text,
                        "generated_text": out_text
                    })
                else:
                    json_text = j["text"]
                    raw_text = json_text
                    history.append(tokenizer.encode(raw_text))
                    text.append({
                        "evaluation_score": j["evaluation_score"],
                        "id": j["id"],
                        "sender": j["sender"],
                        "sender_class": j["sender_class"],
                        "text": raw_text
                    })
            row["dialog"] = text
            query_set.append(row)
            # print(query_set)
            with open('./sif_set/sif' + str(p) + '.json', 'w', encoding='utf-8') as make_file:
                json.dump(query_set, make_file)
            if not p % 10:
                print(str(p * 100 / 1111) + '%, ' + str(time.time() - start_time) + 'sec')
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache")
    parser.add_argument("--model", type=str, default="gpt2", help="Model type (gpt or gpt2)")
    parser.add_argument("--model_checkpoint", type=str, default="gpt2", help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2, help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")
    parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=150, help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--temperature", type=float, default=0.7, help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    parser.add_argument("--task", type=str, default="dialogue", help="one of task from [dialogue, qa, mt, nlg, summarization]")
    parser.add_argument("--self_copy", action='store_true', help="add self copy")
    parser.add_argument("--perturbation_layers", type=int, default=0, help="number of perturbation layers")
    parser.add_argument("--adapter_bottleneck", type=int, default=0, help="adapter layer bottleneck")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    # path_list = ["Oct31_20-10-36_black-rack-1_gpt2_mt_0.0005_0_False_epoch8adapter300", "Oct28_13-02-05_black-rack-1_gpt2_summarization_0.0005_0_False_epoch10adapter100", "Nov18_11-06-15_black-rack-1_gpt2_dialogue_0.001_0_False_epoch3adapter100randomFalse_distillation", "Nov03_21-09-43_black-rack-1_gpt2_qa_0.0005_0_False_epoch5adapter300", "Oct28_02-34-33_black-rack-1_gpt2_nlg_0.005_0_False_epoch10adapter10"]
    # path_list: Give the list of checkpoints of [mt, summarization, dialogue, qa, nlg] to combine all adapters into VLM
    path_list = [
        "Nov15_10-44-19_black-rack-1_gpt2_mt_0.0005_0_False_epoch8adapter300randomFalse_distillation",
        "Nov24_21-54-28_black-rack-1_gpt2_summarization_0.0005_0_False_epoch5adapter100randomFalse_distillation",
        "Nov18_11-06-15_black-rack-1_gpt2_dialogue_0.001_0_False_epoch3adapter100randomFalse_distillation",
        "Nov03_21-09-43_black-rack-1_gpt2_qa_0.0005_0_False_epoch5adapter300",
        "Oct28_02-34-33_black-rack-1_gpt2_nlg_0.005_0_False_epoch10adapter10"
    ]

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class = GPT2Tokenizer
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model_class = VLM
    vlm_model = model_class.from_pretrained(args.model_checkpoint, bottleneck_map=BOTTLEBECK_MAP)
    add_special_tokens_(vlm_model, tokenizer)
    print([tokenizer.encode(word) for word in ATTR_TO_SPECIAL_TOKEN['additional_special_tokens']])
    vlm_weights = deepcopy(vlm_model.state_dict())

    model_class_sub = GPT2LMHeadModel
    bottlenecklist = [300, 100, 100, 300, 10]
    emblist = [[50268, 50272], [50272, 50276], [50258, 50263], [50263, 50268], [50276, 50287]]
    for i, path in enumerate(path_list):
        sub_model = model_class_sub.from_pretrained("runs/" + path, perturbation_layers=0, self_copy=False, adapter_bottleneck=bottlenecklist[i])
        # print(vlm_model.transformer.wte.weight.data[emblist[i][0]:emblist[i][1], :].shape, sub_model.transformer.wte.weight.data[emblist[i][0]:emblist[i][1], :].shape)
        vlm_model.transformer.wte.weight.data[emblist[i][0]:emblist[i][1], :] = sub_model.transformer.wte.weight.data[emblist[i][0]:emblist[i][1], :]
        # vlm_model.transformer.wte.weight.data = sub_model.transformer.wte.weight.data
        # assert torch.equal(vlm_model.transformer.wte.weight.data[:50257, :], sub_model.transformer.wte.weight.data[:50257, :])
        # print(ID2COMMAND[i])
        # print(vlm_model.transformer.h[10].adapter_block.mixadapter[4].project_up.weight.data)
        # print(sub_model.transformer.h[10].mlp.c_proj.weight.data)
        # print([n for n, p in sub_model.named_parameters()])
        weights = deepcopy(sub_model.state_dict())
        check_str = "mixadapter.{}.".format(i)
        model_dict = vlm_model.state_dict()
        # for name in vlm_weights:
        #     if ("mixadapter" not in name) and ("wte" not in name):
        #         try:
        #             assert torch.equal(vlm_weights[name], weights[name])
        #         except:
        #             print(name)
        #             print(vlm_weights[name])
        #             print(weights[name])
        # print({name: name.replace(check_str, "") for name in vlm_weights if check_str in name})
        model_dict.update({name: weights[name.replace(check_str, "")] for name in vlm_weights if check_str in name})
        vlm_model.load_state_dict(model_dict)
        # print(vlm_model.transformer.h[10].adapter_block.mixadapter[4].project_up.weight.data)
        # print([n for n, p in sub_model.named_parameters() if "adapter" in n])
        # exit(0)

    # model.to(args.device)
    # add_special_tokens_(model, tokenizer)
    torch.save(vlm_model.state_dict(), "runs/VLM/pytorch_model.bin")
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache")
    parser.add_argument("--model", type=str, default="openai-gpt", help="Model type (openai-gpt or gpt2)", choices=['openai-gpt', 'gpt2'])  # anything besides gpt2 will load openai-gpt
    parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2, help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")
    parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=20, help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    parser.add_argument("--temperature", type=float, default=0.7, help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        if args.model == 'gpt2':
            raise ValueError("Interacting with GPT2 requires passing a finetuned model_checkpoint")
        else:
            args.model_checkpoint = download_pretrained_model()

    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class, model_class = (GPT2Tokenizer, GPT2LMHeadModel) if args.model == 'gpt2' else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel)
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    logger.info("Sample a personality")
    dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache)
    personalities = [dialog["personality"] for dataset in dataset.values() for dialog in dataset]
    personality = random.choice(personalities)
    logger.info("Selected personality: %s", tokenizer.decode(chain(*personality)))

    history = []
    while True:
        raw_text = input(">>> ")
        while not raw_text:
            print('Prompt should not be empty!')
            raw_text = input(">>> ")

        # classifier code starts
        # zz = ['I like to sleep', "that's cool other cultures are nice", "where is Geneva cats?", "What public figure defended New York in Januar"]
        zz = [raw_text]
        valDF = pd.DataFrame()
        valDF['question_text'] = zz

        # prediction part
        batch_size = 256

        def batch_gen(test_df):
            n_batches = math.ceil(len(test_df) / batch_size)
            for i in range(n_batches):
                texts = test_df.iloc[i * batch_size:(i + 1) * batch_size, 0]
                text_arr = np.array([text_to_array(text) for text in texts])
                yield text_arr

        # test_df = pd.read_csv("../input/quora-insincere-questions-classification/test.csv")
        test_df = valDF
        all_preds = []
        for x in tqdm(batch_gen(test_df)):
            all_preds.extend(classifier_model.predict(x).flatten())
        y_te = (np.array(all_preds) > 0.5).astype(int)
        print(y_te)
        print(valDF['question_text'])

        history.append(tokenizer.encode(raw_text))
        with torch.no_grad():
            out_ids = sample_sequence(personality, history, tokenizer, model, args)
        history.append(out_ids)
        history = history[-(2 * args.max_history + 1):]
        out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
        print(out_text)
def run():
    """
    Initialize the model. Loop over every new incoming message and return the chatbot's answer.
    :return:
    """
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache")
    parser.add_argument("--model", type=str, default="openai-gpt", help="Model type (openai-gpt or gpt2)", choices=['openai-gpt', 'gpt2'])  # anything besides gpt2 will load openai-gpt
    parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2, help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")
    parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=20, help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    parser.add_argument("--temperature", type=float, default=0.7, help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        if args.model == 'gpt2':
            raise ValueError("Interacting with GPT2 requires passing a finetuned model_checkpoint")
        else:
            args.model_checkpoint = download_pretrained_model()

    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class, model_class = (GPT2Tokenizer, GPT2LMHeadModel) if args.model == 'gpt2' else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel)
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    logger.info("Sample a personality")
    dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache)
    personalities = [dialog["personality"] for dataset in dataset.values() for dialog in dataset]
    personality = random.choice(personalities)
    logger.info("Selected personality: %s", tokenizer.decode(chain(*personality)))

    driver = login()
    time.sleep(4)
    weiter()
    contacts = prepare(driver)
    while True:
        new_chats = check_message(driver)
        print(new_chats)
        final_df = prepare_df(new_chats=new_chats)
        messages, names = prepare_final_format(final_df, driver, contacts)
        final_fucking_dict = final_model_input(messages, names)
        print(final_fucking_dict)
        for nam in names:
            history = []
            raw_text = final_fucking_dict.get(nam)
            while not raw_text:
                raw_text = final_fucking_dict.get(nam)
            history.append(tokenizer.encode(raw_text))
            with torch.no_grad():
                out_ids = sample_sequence(personality, history, tokenizer, model, args)
            history.append(out_ids)
            history = history[-(2 * args.max_history + 1):]
            out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
            driver.send_message_to_id(contacts.get(nam), f'{out_text}')
            print(out_text)
        time.sleep(100)
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache")
    parser.add_argument("--model", type=str, default="openai-gpt", help="Model type (openai-gpt or gpt2)", choices=['openai-gpt', 'gpt2'])  # anything besides gpt2 will load openai-gpt
    parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2, help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")
    parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=20, help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    parser.add_argument("--temperature", type=float, default=0.7, help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    parser.add_argument("--conv_limit", type=int, default=None, help="Length of conversation - number of times Speaker1 can respond")
    args = parser.parse_args()

    # logging.basicConfig(level=logging.INFO)
    # logger = logging.getLogger(__file__)
    # logger.info(pformat(args))

    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    print("Select type of chat:\n1. Counselling\n2. Task-Oriented")
    raw_text = input(">>> ")
    initial = [
        "Will you like to learn a new recipe?",
        "Do you want to learn a new recipe?",
        "Let us learn a new recipe."
    ]
    sents = ["To sum up, ", "Thus, as I understand, ", "So, to summarize, "]
    history = []

    if raw_text == "1":
        if args.model_checkpoint == "":
            if args.model == 'gpt2':
                raise ValueError("Interacting with GPT2 requires passing a finetuned model_checkpoint")
            else:
                args.model_checkpoint = download_pretrained_model()

        # logger.info("Get pretrained model and tokenizer")
        tokenizer_class, model_class = (GPT2Tokenizer, GPT2LMHeadModel) if args.model == 'gpt2' else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel)
        tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
        model = model_class.from_pretrained(args.model_checkpoint)
        model.to(args.device)
        add_special_tokens_(model, tokenizer)

        # logger.info("Sample a personality")
        dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache)
        personalities = [dialog["personality"] for dialog in dataset]
        personality = random.choice(personalities)
        print("Selected personality: ", tokenizer.decode(chain(*personality)))

        if args.conv_limit:
            conv_len = args.conv_limit
        else:
            conv_len = -1
        utt = 0
        text_summary = []
        while utt != conv_len:
            raw_text = input(">>> ")
            while not raw_text:
                print('Prompt should not be empty!')
                raw_text = input(">>> ")
            history.append(tokenizer.encode(raw_text))
            text_summary.append(raw_text)
            with torch.no_grad():
                out_ids = sample_sequence(personality, history, tokenizer, model, args)
            history.append(out_ids)
            history = history[-(2 * args.max_history + 1):]
            out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
            print(out_text)
            utt = utt + 1
            if utt == conv_len:
                if out_text.endswith("?"):
                    utt = utt - 1

        # generate emotion
        raw_text = 'exit chat'
        history.append(tokenizer.encode(raw_text))
        with torch.no_grad():
            out_ids = sample_sequence(personality, history, tokenizer, model, args)
        history.append(out_ids)
        history = history[-(2 * args.max_history + 1):]
        out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
        print("\n" + "Chat Emotion: " + out_text)

        # generate summary
        text = ".".join(text_summary)
        summary = summarizer(text, max_length=50)
        print("\n" + "Summary:\n" + random.choice(sents) + create_reflection(summary[0]['summary_text']))

        # generate a supporting response to the summary
        raw_text = 'summarize-chat'
        history.append(tokenizer.encode(raw_text))
        with torch.no_grad():
            out_ids = sample_sequence(personality, history, tokenizer, model, args)
        history.append(out_ids)
        history = history[-(2 * args.max_history + 1):]
        out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
        print("\n" + "Response:\n" + out_text)

    elif raw_text == "2":
        print(random.choice(initial))
        raw_text = input(">>> ")
        scores = sentiment.polarity_scores(raw_text)
        if scores['pos'] > scores['neg']:
            print("Great, here is a recipe for you ...")
            create_recipe()
            raw_text = input(">>> ")
        elif scores['neg'] > scores['pos']:
            print("ok, then maybe you will like to chat with the counsellor. Please choose option 1. Thank you.")
        else:
            print("I could not understand what you are asking.")

    else:
        print("Please select the correct choice.")
def run(chapter):
    args = easydict.EasyDict({
        "dataset_path": "data/en_book_conversational.json",
        "dataset_cache": './dataset_cache',
        "model": "gpt2",
        "model_checkpoint": "/home/ubuntu/GraduateProject/transfer-learning-conv-ai/runs/Jun04_18-39-17_ime-502_gpt2",
        "max_history": 4,
        "device": "cuda" if torch.cuda.is_available() else "cpu",
        "max_length": 20,
        "min_length": 1,
        "seed": 0,
        "top_p": 0.9
    })

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        if args.model == 'gpt2':
            raise ValueError("Interacting with GPT2 requires passing a finetuned model_checkpoint")
        else:
            args.model_checkpoint = download_pretrained_model()

    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class, model_class = (GPT2Tokenizer, GPT2LMHeadModel) if args.model == 'gpt2' else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel)
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache)
    personalities = [dialog["personality"] for dataset in dataset.values() for dialog in dataset]
    personality = random.choice(personalities)
    print("Selected personality: ", tokenizer.decode(chain(*personality)))
    while get_persona_label(chapter) not in tokenizer.decode(chain(*personality)):
        personality = random.choice(personalities)

    return personality
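# Minimal usage sketch for run(chapter); the chapter value is a hypothetical placeholder
# whose label (via get_persona_label) must appear in the sampled personality text:
# personality = run("chapter_1")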
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_cache", type=str, default='./dataset.bin', help="Path or url of the dataset cache")
    parser.add_argument("--model", type=str, default="openai-gpt", help="Model type (openai-gpt or gpt2)", choices=['openai-gpt', 'gpt2'])  # anything besides gpt2 will load openai-gpt
    parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2, help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")
    parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=20, help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    parser.add_argument("--temperature", type=float, default=0.7, help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        raise ValueError("Interacting with the model requires passing a finetuned model_checkpoint")

    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class, model_class = (GPT2Tokenizer, GPT2LMHeadModel)
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    logger.info("Sample a twitter user")
    dataset = get_dataset(tokenizer)
    personalities = [dialog["name"] for dataset in dataset.values() for dialog in dataset]
    personality = random.choice(personalities)
    logger.info("Selected personality: %s", tokenizer.decode(personality))

    previous_tweet = []
    while True:
        raw_text = input("Please enter a previous tweet: ")
        while not raw_text:
            print('Prompt should not be empty!')
            raw_text = input("Please enter a previous tweet: ")
        previous_tweet = tokenizer.encode(raw_text)
        with torch.no_grad():
            out_ids = sample_sequence(personality, previous_tweet, tokenizer, model, args)
        out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
        print('New Tweet:')
        print(out_text)
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache")
    parser.add_argument("--model", type=str, default="openai-gpt", help="Model type (openai-gpt or gpt2)", choices=['openai-gpt', 'gpt2'])  # anything besides gpt2 will load openai-gpt
    parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2, help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")
    parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=20, help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    parser.add_argument("--temperature", type=float, default=0.7, help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    parser.add_argument("--conv_limit", type=int, default=None, help="Length of conversation - number of times Speaker1 can respond")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        if args.model == 'gpt2':
            raise ValueError("Interacting with GPT2 requires passing a finetuned model_checkpoint")
        else:
            args.model_checkpoint = download_pretrained_model()

    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class, model_class = (GPT2Tokenizer, GPT2LMHeadModel) if args.model == 'gpt2' else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel)
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    logger.info("Sample a personality")
    dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache)
    personalities = [dialog["personality"] for dialog in dataset]
    personality = random.choice(personalities)
    logger.info("Selected personality: %s", tokenizer.decode(chain(*personality)))

    sents = ["To sum up, ", "Thus, as I understand, ", "So, to summarize, "]
    if args.conv_limit:
        conv_len = args.conv_limit
    else:
        conv_len = -1
    text_summary = []
    utt = 0
    history = []
    while utt != conv_len:
        raw_text = input(">>> ")
        while not raw_text:
            print('Prompt should not be empty!')
            raw_text = input(">>> ")
        history.append(tokenizer.encode(raw_text))
        text_summary.append(raw_text)
        with torch.no_grad():
            out_ids = sample_sequence(personality, history, tokenizer, model, args)
        history.append(out_ids)
        history = history[-(2 * args.max_history + 1):]
        out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
        print(out_text)
        utt = utt + 1
        if utt == conv_len:
            if out_text.endswith("?"):
                utt = utt - 1

    text = ".".join(text_summary)
    summary = summarizer(text, max_length=50)
    print("\n" + random.choice(sents) + create_reflection(summary[0]['summary_text']))
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache")
    parser.add_argument("--model", type=str, default="openai-gpt", help="Model type (openai-gpt or gpt2)", choices=['openai-gpt', 'gpt2'])  # anything besides gpt2 will load openai-gpt
    parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2, help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")
    parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=20, help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    parser.add_argument("--temperature", type=float, default=0.7, help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        if args.model == 'gpt2':
            raise ValueError("Interacting with GPT2 requires passing a finetuned model_checkpoint")
        else:
            args.model_checkpoint = download_pretrained_model()

    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class, model_class = (GPT2Tokenizer, GPT2LMHeadModel) if args.model == 'gpt2' else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel)
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    logger.info("Sample a personality")
    dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache)
    personalities = [dialog["personality"] for dataset in dataset.values() for dialog in dataset]
    personality = random.choice(personalities)
    logger.info("Selected personality: %s", tokenizer.decode(chain(*personality)))

    history = []
    engine = pyttsx3.init()
    r = sr.Recognizer()
    while True:
        print("Talk:")
        with sr.Microphone() as source:
            audio = r.listen(source)
        raw_text = r.recognize_google(audio)
        print(raw_text)
        # raw_text = input(">>> ")
        while not raw_text:
            print('Prompt should not be empty!')
            raw_text = input(">>> ")
        history.append(tokenizer.encode(raw_text))
        with torch.no_grad():
            out_ids = sample_sequence(personality, history, tokenizer, model, args)
        history.append(out_ids)
        history = history[-(2 * args.max_history + 1):]
        out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
        print(out_text)
        engine.say(out_text)
        engine.runAndWait()
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="data/en_book_conversational.json", help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache")
    parser.add_argument("--model", type=str, default="openai-gpt", help="Model type (openai-gpt or gpt2)", choices=['openai-gpt', 'gpt2'])  # anything besides gpt2 will load openai-gpt
    parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2, help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")
    parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=20, help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    parser.add_argument("--temperature", type=float, default=0.7, help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        if args.model == 'gpt2':
            raise ValueError("Interacting with GPT2 requires passing a finetuned model_checkpoint")
        else:
            args.model_checkpoint = download_pretrained_model()

    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class, model_class = (GPT2Tokenizer, GPT2LMHeadModel) if args.model == 'gpt2' else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel)
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    logger.info("Sample a personality")
    dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache)
    personalities = [dialog["personality"] for dataset in dataset.values() for dialog in dataset]
    personality = random.choice(personalities)
    # persona = get_persona_label(tokenizer)
    # tokenizer.decode(personalities)

    def matching_personality(persona_text):
        personality = random.choice(personalities)
        while 'i have got a headache and a fever.' not in tokenizer.decode(chain(*personality)):
            personality = random.choice(personalities)
        return personality

    """'immigration checkpoint', 'in a taxi', 'hotel check-in', 'at a restaurant', 'getting a dessert', 'asking for directions', 'at a shopping mall', 'hotel check-out', 'checking in at the airport', 'in flight', 'currency exchange', 'renting a car', 'making a hotel reservation', 'room service', 'buying a camera', 'at a supermarket', 'in a hospital', 'getting a souvenir', 'asking someone to take a photo'"""

    logger.info("Selected personality: %s", tokenizer.decode(chain(*personality)))

    history = []
    while True:
        raw_text = input(">>> ")
        while not raw_text:
            print('Prompt should not be empty!')
            raw_text = input(">>> ")
        history.append(tokenizer.encode(raw_text))
        with torch.no_grad():
            out_ids = sample_sequence(personality, history, tokenizer, model, args)
        history.append(out_ids)
        history = history[-(2 * args.max_history + 1):]
        out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
        print(out_text)
def run(): parser = ArgumentParser() parser.add_argument( "--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.") parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache") parser.add_argument("--model", type=str, default="gpt2", help="Model type (gpt or gpt2)") parser.add_argument("--model_checkpoint", "-mc", type=str, default="", help="Path, url or short name of the model") parser.add_argument( "--max_history", type=int, default=2, help="Number of previous utterances to keep in history") parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)") parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling") parser.add_argument("--max_length", type=int, default=100, help="Maximum length of the output utterances") parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances") parser.add_argument("--seed", type=int, default=42, help="Seed") parser.add_argument("--temperature", type=float, default=0.7, help="Sampling softmax temperature") parser.add_argument( "--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)") parser.add_argument( "--top_p", type=float, default=0.9, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)") # add option to not use personality parser.add_argument("--no_personality", type=bool, default=True, help="Set to not sample a personality.") args = parser.parse_args() logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__file__) logger.info(pformat(args)) if args.model_checkpoint == "": if os.path.isdir("./huggingface_s3/"): args.model_checkpoint = "./huggingface_s3/" logger.info("Loading from pre-downloaded temp path: {}".format( args.model_checkpoint)) else: args.model_checkpoint = download_pretrained_model() random.seed(args.seed) torch.random.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) logger.info("Get pretrained model and tokenizer") tokenizer_class, model_class = ( GPT2Tokenizer, GPT2LMHeadModel) if "gpt2" == args.model else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel) tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint) model = model_class.from_pretrained(args.model_checkpoint) model.to(args.device) add_special_tokens_(model, tokenizer) model.eval() # added the option to opt out of using a personality if args.no_personality: logger.info("No personality is sampled for this chatbot.") personality = "" # personality = ["My name is Isabelle Hawkins.", # "I am five years old.", # "My phone number is 959-100-9300.", # "Here is a link I would like you to check out: google.com.", # "I would like to know more about you."] # personality = [tokenizer.encode(p) for p in personality] # logger.info("Selected custom personality: %s",tokenizer.decode(chain(*personality))) else: logger.info("Sample a personality") personalities = get_dataset_personalities(tokenizer, args.dataset_path, args.dataset_cache) personality = random.choice(personalities) # import pdb; pdb.set_trace() logger.info("Selected personality: %s", tokenizer.decode(chain(*personality))) history = [] # while True: # custom_history = input("Press 0 to end\n\tAdd history: ") # if custom_history == '0': # break # else: # history.append(tokenizer.encode(custom_history)) while True: history = [] args.temperature = float(input("Set temperature: > 0 and <= 1")) prompt 
= input("Speaker 1 >>> ") while not prompt: print('Prompt should not be empty!') prompt = input("Speaker 1 >>> ") history.append(tokenizer.encode(prompt)) i = 0 while True: with torch.no_grad(): out_ids = sample_sequence(personality, history, tokenizer, model, args) history.append(out_ids) history = history[-(2 * args.max_history + 1):] out_text = tokenizer.decode(out_ids, skip_special_tokens=True) i += 1 speaker = "Speaker 2" if i % 2 else "Speaker 1" print(f"{speaker}: {out_text}") if i == 10: break
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='persona_comet', help="Path or url of the dataset cache")
    parser.add_argument("--model", type=str, default="openai-gpt", help="Model type (openai-gpt or gpt2)", choices=['openai-gpt', 'gpt2'])  # anything besides gpt2 will load openai-gpt
    parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2, help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")
    parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=20, help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    parser.add_argument("--temperature", type=float, default=0.7, help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    parser.add_argument("--comet_greedy", action='store_true', help="Use top-most comet expansion")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        if args.model == 'gpt2':
            raise ValueError("Interacting with GPT2 requires passing a finetuned model_checkpoint")
        else:
            args.model_checkpoint = download_pretrained_model()

    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class, model_class = (GPT2Tokenizer, GPT2LMHeadModel) if args.model == 'gpt2' else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel)
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    logger.info("Sample a personality")
    dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache)
    # personalities = [dialog["personality"] for dataset in dataset.values() for dialog in dataset]
    dialogs = [dialog for dataset in dataset.values() for dialog in dataset]
    dialog = random.choice(dialogs)
    # personality = random.choice(personalities)
    personality = dialog['personality']
    comet_annotations = dialog["coment_annotation"]
    for sent in comet_annotations:
        sent_beams = []
        for effect in sent['comet'].items():
            # not sure whether ' .' or '.' should be appended;
            # the tokenizer produces different tokens for each of the two options
            # beams = [x + ' .' for x in effect[1]['beams']]
            if args.comet_greedy:
                sent_beams += [effect[1]['beams'][0]]
            else:
                sent_beams += effect[1]['beams']
        # the beams are raw strings, so encode them before extending the tokenized personality
        personality += [tokenizer.encode(beam) for beam in sent_beams]
    print(personality)
    logger.info("Selected personality: %s", tokenizer.decode(chain(*personality)))
    history = []
    while True:
        raw_text = input(">>> ")
        while not raw_text:
            print('Prompt should not be empty!')
            raw_text = input(">>> ")
        history.append(tokenizer.encode(raw_text))
        with torch.no_grad():
            out_ids = sample_sequence(personality, history, tokenizer, model, args)
        history.append(out_ids)
        history = history[-(2 * args.max_history + 1):]
        out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
        print(out_text)
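The nested loop above assumes each element of dialog["coment_annotation"] is a per-sentence dict whose 'comet' entry maps relation names to a dict carrying a ranked 'beams' list of expansions. A minimal illustration of that assumed structure follows; the relation names are placeholders, not taken from the dataset.

# Assumed shape of one element of dialog["coment_annotation"] (illustrative only):
sent_annotation = {
    "comet": {
        "xEffect": {"beams": ["i feel tired .", "i take medicine ."]},
        "xWant":   {"beams": ["to see a doctor ."]},
    }
}
# In the loop, effect[0] is the relation name and effect[1]["beams"] the ranked expansions;
# --comet_greedy keeps only the top beam per relation.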
def run(): parser = ArgumentParser() parser.add_argument( "--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.") parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache") parser.add_argument( "--model", type=str, default="openai-gpt", help="Model type (openai-gpt or gpt2)", choices=['openai-gpt', 'gpt2']) # anything besides gpt2 will load openai-gpt parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model") parser.add_argument( "--max_history", type=int, default=2, help="Number of previous utterances to keep in history") parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)") parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling") parser.add_argument("--max_length", type=int, default=200, help="Maximum length of the output utterances") parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances") parser.add_argument("--seed", type=int, default=0, help="Seed") parser.add_argument("--temperature", type=int, default=0.7, help="Sampling softmax temperature") parser.add_argument( "--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)") parser.add_argument( "--top_p", type=float, default=0.9, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)") args = parser.parse_args() logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__file__) logger.info(pformat(args)) if args.model_checkpoint == "": if args.model == 'gpt2': raise ValueError( "Interacting with GPT2 requires passing a finetuned model_checkpoint" ) else: args.model_checkpoint = download_pretrained_model() if args.seed != 0: random.seed(args.seed) torch.random.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) logger.info("Get pretrained model and tokenizer") tokenizer_class, model_class = ( GPT2Tokenizer, GPT2LMHeadModel) if args.model == 'gpt2' else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel) tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint) model = model_class.from_pretrained(args.model_checkpoint) model.to(args.device) add_special_tokens_(model, tokenizer) logger.info("Sample a personality") dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache) personalities = [ dialog["personality"] for dataset in dataset.values() for dialog in dataset ] # personality = random.choice(personalities) personality = [string_transformer('my name is WabiSabi', tokenizer, False)] quotes = [ 'do not be afraid to ask for yourself', 'to escape fear , you must go through it', 'I am timeless, incomplete and imperfect. No age. No sense of time.', ' failure is another steppingstone to greatness . ', 'think like a queen . queen is not afraid to fail . failure is another steppingstone to greatness . ', 'be thankful for what you have ; you will end up having more . 
if you concentrate on what you do not have, you will never, ever have enough .', 'surround yourself with only people who are going to lift you higher .', 'the biggest adventure you can ever take is to live the life of your dreams .', 'doing the best at this moment puts you in the best place for the next moment .', 'real integrity is doing the right thing , knowing that nobody is going to know whether you did it or not .', 'the more you praise and celebrate your life , the more there is in life to celebrate .', 'passion is energy . feel the power that comes from focusing on what excites you .', 'lots of people want to ride with you in the limo , but what you want is someone who will take the bus with you when the limo breaks down .', 'turn your wounds into wisdom . ', 'you can have it all . just not all at once . ', 'one of the hardest things in life to learn are which bridges to cross and which bridges to burn . ', 'challenges are gifts that force us to search for a new center of gravity .', 'the thing you fear most has no power . your fear of it is what has the power . facing the truth really will set you free .', 'surround yourself only with people who are going to take you higher .', 'you get in life what you have the courage to ask for .', 'i trust that everything happens for a reason , even when we are not wise enough to see it .', 'everybody has a calling . and your real job in life is to figure out as soon as possible what that is , who you were meant to be , and to begin to honor that in the best way possible for yourself .', 'the key to realizing a dream is to focus not on success but on significance , and then even the small steps and little victories along your path will take on greater meaning .', 'the biggest adventure you can ever take is to live the life of your dreams .', 'self-esteem comes from being able to define the world in your own terms and refusing to abide by the judgments of others .', 'forgiveness is giving up the hope that the past could have been any different .', 'luck is a matter of preparation meeting opportunity .', 'the whole point of being alive is to evolve into the complete person you were intended to be .', 'wisdom equals knowledge plus courage . you have to not only know what to do and when to do it , but you have to also be brave enough to follow through .', 'surround yourself with great people .', 'i alone cannot change the world , but i can cast a stone across the water to create many ripples .', 'whatever the mind of man can conceive and believe, it can achieve .', 'whenever you see a successful person, you only see the public glories, never the private sacrifices to reach them .', 'at some point you are bound to stumble because if you are constantly doing what we do , raising the bar . if you are constantly pushing yourself higher, higher the law of averages not to mention the myth of icarus predicts that you will at some point fall . And when you do i want you to know this , remember this : there is no such thing as failure . failure is just life trying to move us in another direction . now when you are down there in the hole , it looks like failure .', 'and when you are down in the hole when that moment comes , it is really okay to feel bad for a little while . give yourself time to mourn what you think you may have lost but then here is the key , learn from every mistake because every experience , encounter , and particularly your mistakes are there to teach you and force you into being more who you are . 
and then figure out what is the next right move .', 'because when you inevitably stumble and find yourself stuck in a hole that is the story that will get you out : what is your true calling ? what is your dharma ? what is your purpose ?', 'i know that you all might have a little anxiety now but no matter what challenges or setbacks or disappointments you may encounter along the way , you will find true success and happiness if you have only one goal , there really is only one , and that is this : to fulfill the highest most truthful expression of yourself as a human being . you want to max out your humanity by using your energy to lift yourself up , your family and the people around you .', 'from time to time you may stumble , fall , you will for sure , you will have questions and you will have doubts about your path . but i know this , if you are willing to be guided by , that still small voice that is the gps within yourself , to find out what makes you come alive , you will be more than okay . you will be happy , you will be successful , and you will make a difference in the world .' ] random.shuffle(quotes) quotes = quotes[:24] [personality.append(string_transformer(s, tokenizer)) for s in quotes] # print(personality) logger.info("Selected personality: %s", tokenizer.decode(chain(*personality))) history = [] while True: raw_text = input(">>> ") while not raw_text: print('Prompt should not be empty!') raw_text = input(">>> ") history.append(string_transformer(raw_text, tokenizer)) with torch.no_grad(): out_ids = sample_sequence(personality, history, tokenizer, model, args) history.append(out_ids) history = history[-(2 * args.max_history + 1):] out_text = tokenizer.decode(out_ids, skip_special_tokens=True) print(out_text)
def run(): parser = ArgumentParser() parser.add_argument("--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.") parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache") parser.add_argument("--model", type=str, default="gpt2", help="Model type (gpt or gpt2)") parser.add_argument("--model_checkpoint", type=str, default="gpt2", help="Path, url or short name of the model") parser.add_argument("--max_history", type=int, default=2, help="Number of previous utterances to keep in history") parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)") parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling") parser.add_argument("--max_length", type=int, default=150, help="Maximum length of the output utterances") parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances") parser.add_argument("--seed", type=int, default=42, help="Seed") parser.add_argument("--temperature", type=int, default=0.7, help="Sampling softmax temperature") parser.add_argument("--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)") parser.add_argument("--top_p", type=float, default=0.9, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)") parser.add_argument("--task", type=str, default="dialogue", help="one of task from [dialogue, qa, mt, nlg, summarization]") parser.add_argument("--self_copy", action='store_true', help="add self copy") parser.add_argument("--perturbation_layers", type=int, default=0, help="number of perturbation layers") parser.add_argument("--adapter_bottleneck", type=int, default=0, help="adapter layer bottleneck") args = parser.parse_args() logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__file__) logger.info(pformat(args)) random.seed(args.seed) torch.random.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) logger.info("Get pretrained model and tokenizer") tokenizer_class = GPT2Tokenizer tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint) #model_class = GPT2LMHeadModel #model = model_class.from_pretrained(args.model_checkpoint, perturbation_layers=0, self_copy=False, adapter_bottleneck=BOTTLEBECK_MAP[args.task]) model_class = VLM model = model_class.from_pretrained(args.model_checkpoint, bottleneck_map=BOTTLEBECK_MAP) model.to(args.device) add_special_tokens_(model, tokenizer) if not os.path.exists("results/VLM_result/"): os.makedirs("results/VLM_result/") if (args.task=="mt" or args.task=="summarization"): output_text = [] ref_text = [] loaded_dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache, args.task) for pair in loaded_dataset["test"]: source = pair["src"][:MAXLEN_MAP[args.task]['src']] target = pair["tgt"]#[:MAXLEN_MAP[args.task]['tgt']] with torch.no_grad(): out_ids = sample_sequence( tokenizer, model, args, source=source, target=target, task_id=COMMAND2ID[args.task]) out_text = tokenizer.decode(out_ids, skip_special_tokens=True) output_text.append(out_text) ref_text.append(tokenizer.decode(pair["tgt"], skip_special_tokens=True)) BLEU = moses_multi_bleu(np.array(output_text),np.array(ref_text)) r_1, r_2, r_l, r_m = rouge(output_text, ref_text) print("BLEU:{}".format(BLEU)) print("ROUGE_1:{}, ROUGE_2:{}, ROUGE_L:{}, ROUGE_mean:{}".format(r_1, r_2, r_l, r_m)) with 
open("results/VLM_result/"+args.task+"_output.txt", 'w', encoding='utf-8') as f: for line in output_text: f.write(line) f.write('\n') with open("results/VLM_result/"+args.task+"_ref.txt", 'w', encoding='utf-8') as f: for line in ref_text: f.write(line) f.write('\n') # nlg interact if args.task=="nlg": output_text = [] ref_text = [] loaded_dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache, args.task) for pair in loaded_dataset["test"]: source = pair["src"] target = pair["tgt"] with torch.no_grad(): out_ids = sample_sequence(tokenizer, model, args, source=source, target=target, task_id=COMMAND2ID[args.task] ) out_text = tokenizer.decode(out_ids, skip_special_tokens=True) # print("input: ") # print([{k: tokenizer.decode(v, skip_special_tokens=True)} for k, v in pair["src"].items()]) # print("model output: ") # print(out_text) # print("ref: ") # print(tokenizer.decode(pair["tgt"], skip_special_tokens=True)) # print("======================================") output_text.append(out_text) ref_text.append(tokenizer.decode(pair["tgt"], skip_special_tokens=True)) with open("results/VLM_result/nlg_output.txt", 'w', encoding='utf-8') as f: for line in output_text: f.write(line) f.write('\n') with open("results/VLM_result/nlg_ref.txt", 'w', encoding='utf-8') as f: for line in ref_text: f.write(line) f.write('\n') if args.task=="dialogue": output_text = [] ref_text = [] loaded_dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache, args.task) persona_text = [] for pair in loaded_dataset["valid"]: persona = pair["personality"].copy() for utterance in pair["utterances"]: history = utterance["history"][-(2*args.max_history+1):] with torch.no_grad(): out_ids = sample_sequence( tokenizer, model, args, personality=persona, history=history, task_id=COMMAND2ID[args.task]) out_text = tokenizer.decode(out_ids, skip_special_tokens=True) output_text.append(out_text) ref_text.append(tokenizer.decode(utterance["candidates"][-1], skip_special_tokens=True)) persona_text.append([tokenizer.decode(p, skip_special_tokens=True)for p in persona]) with open("results/VLM_result/"+args.task+"_output.txt", 'w', encoding='utf-8') as f: for line in output_text: f.write(line) f.write('\n') with open("results/VLM_result/"+args.task+"_ref.txt", 'w', encoding='utf-8') as f: for line in ref_text: f.write(line) f.write('\n') with open("results/VLM_result/"+args.task+"_persona.txt", 'w', encoding='utf-8') as f: for line in persona_text: f.write("\t".join(line)) f.write('\n') # print("Evaluate ENT score") # ent_ref_arr = [] # ent_pred_arr = [] # for ref, pred, per in zip(ref_text,output_text,persona_text): # ent_ref_arr.append(ent_score(ref,per)) # ent_pred_arr.append(ent_score(pred,per)) print("Evaluate BLEU") BLEU = moses_multi_bleu(np.array(output_text),np.array(ref_text)) print("BLEU:{}".format(BLEU)) # print("ENT REF:{}".format(np.mean(ent_ref_arr))) # print("ENT PRED:{}".format(np.mean(ent_pred_arr))) with open("results/VLM_result/"+args.task+"_output.txt", 'w', encoding='utf-8') as f: for line in output_text: f.write(line) f.write('\n') with open("results/VLM_result/"+args.task+"_ref.txt", 'w', encoding='utf-8') as f: for line in ref_text: f.write(line) f.write('\n') with open("results/VLM_result/"+args.task+"_persona.txt", 'w', encoding='utf-8') as f: for line in persona_text: f.write("\t".join(line)) f.write('\n') # qa interact if args.task=="qa": output_text = [] ref_text = [] loaded_dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache, args.task) ## load dev_set_with_ids 
here for pair in loaded_dataset["valid"]: evidence = pair["document"].copy() evidence = [evidence[0][:MAXLEN_MAP[args.task]['document']]] for utterance in pair["utterances"]: history = utterance["history"][-(2*args.max_history+1):] with torch.no_grad(): out_ids = sample_sequence( tokenizer, model, args, personality=evidence, history=history, task_id=COMMAND2ID[args.task]) out_text = tokenizer.decode(out_ids, skip_special_tokens=True) output_text.append({"id": pair['id'],"turn_id": utterance['turn_id'],"answer": out_text}) data = json.dumps(output_text) with open("results/VLM_result/"+args.task+"_output.txt", 'w', encoding='utf-8') as f: f.write(data)
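The evaluation branches above collect output_text and ref_text as decoded strings and score them with moses_multi_bleu (and rouge for mt/summarization). As an independent sanity check on the same lists, corpus BLEU can also be computed with NLTK; the helper below is only an illustrative cross-check under that assumption, not part of the script's own evaluation.

# Hedged sketch: cross-check corpus BLEU on the decoded strings with NLTK.
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

def nltk_corpus_bleu(hypotheses, references):
    # corpus_bleu expects token lists; a simple whitespace split is assumed here
    hyp_tokens = [h.split() for h in hypotheses]
    ref_tokens = [[r.split()] for r in references]  # one reference per hypothesis
    return corpus_bleu(ref_tokens, hyp_tokens,
                       smoothing_function=SmoothingFunction().method1)

# e.g. print(nltk_corpus_bleu(output_text, ref_text))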
else: args.model_checkpoint = download_pretrained_model() if args.seed != 0: random.seed(args.seed) torch.random.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) logger.info("Get pretrained model and tokenizer") tokenizer_class, model_class = (GPT2Tokenizer, GPT2LMHeadModel) if args.model == 'gpt2' else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel) tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint) model = model_class.from_pretrained(args.model_checkpoint) model.to(args.device) add_special_tokens_(model, tokenizer) logger.info("Sample a personality") dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache) personalities = [dialog["personality"] for dataset in dataset.values() for dialog in dataset] personality = random.choice(personalities) logger.info("Selected personality: %s", tokenizer.decode(chain(*personality))) history = [] @app.route('/') def index(): return "<h1>Welcome!</h1>" @app.route('/bot', methods=['POST','GET']) def bot():
def run(): parser = ArgumentParser() parser.add_argument( "--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.") parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache") parser.add_argument( "--model", type=str, default="gpt2", help="Model type (openai-gpt or gpt2)", choices=['openai-gpt', 'gpt2', 'gpt2-medium']) # anything besides gpt2 will load openai-gpt parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model") parser.add_argument( "--max_history", type=int, default=2, help="Number of previous utterances to keep in history") parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)") parser.add_argument("--batch_size", type=int, default=1) parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling") parser.add_argument("--max_length", type=int, default=20, help="Maximum length of the output utterances") parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances") parser.add_argument("--num_candidates", type=int, default=1) parser.add_argument("--personality_permutations", type=int, default=1) parser.add_argument("--seed", type=int, default=0, help="Seed") parser.add_argument("--distributed", action='store_true') parser.add_argument("--temperature", type=int, default=1, help="Sampling softmax temperature") parser.add_argument( "--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)") parser.add_argument( "--top_p", type=float, default=0, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)") parser.add_argument("--entmax_alpha", type=float, default=1.5) parser.add_argument("--entmax_k", type=int, default=512) parser.add_argument("--entmax_bisect_iter", type=int, default=50) parser.add_argument("--loss", default="cross_entropy", type=str) parser.add_argument("--metric", default="bleu", type=str) parser.add_argument("--epsilon", default=0.000001, type=float) parser.add_argument("--name", default='', type=str) parser.add_argument("--temp", default=0, type=float) args = parser.parse_args() logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__file__) logger.info(pformat(args)) args.train_batch_size = args.batch_size args.valid_batch_size = args.batch_size generic_entmax_loss = partial(EntmaxBisectLoss, alpha=args.entmax_alpha, n_iter=args.entmax_bisect_iter) loss_funcs = { "cross_entropy": nn.CrossEntropyLoss, "sparsemax": partial(SparsemaxLoss, k=args.entmax_k), "entmax15": partial(Entmax15Loss, k=args.entmax_k), "entmax": generic_entmax_loss, "entmax_alpha": "entmax_alpha" } assert args.loss in loss_funcs loss_func = loss_funcs[args.loss] generic_entmax = partial(entmax_bisect, alpha=args.entmax_alpha, n_iter=args.entmax_bisect_iter) gen_funcs = { "softmax": torch.softmax, "sparsemax": partial(sparsemax, k=args.entmax_k), "entmax15": partial(entmax15, k=args.entmax_k), "entmax": generic_entmax, "entmax_alpha": "entmax_alpha" } if args.loss == "cross_entropy": gen_func = gen_funcs["softmax"] elif args.loss == "sparsemax": gen_func = gen_funcs["sparsemax"] elif args.loss == "entmax15": gen_func = gen_funcs["entmax15"] elif args.loss == "entmax": gen_func = gen_funcs["entmax"] elif args.loss == "entmax_alpha": gen_func = gen_funcs["entmax_alpha"] if args.model_checkpoint == "": if args.model == 'gpt2' or 
args.model == 'gpt2-medium': raise ValueError( "Interacting with GPT2 requires passing a finetuned model_checkpoint" ) else: args.model_checkpoint = download_pretrained_model() if args.seed != 0: random.seed(args.seed) torch.random.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) logger.info("Get pretrained model and tokenizer") tokenizer_class, model_class = ( GPT2Tokenizer, GPT2LMHeadModel ) if args.model == 'gpt2' or args.model == 'gpt2-medium' else ( OpenAIGPTTokenizer, OpenAIGPTLMHeadModel) tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint) model = model_class.from_pretrained(args.model_checkpoint) model.to(args.device) add_special_tokens_(model, tokenizer) personachat = get_dataset(tokenizer, args.dataset_path, args.dataset_cache) bos, eos, speaker1, speaker2, pad = tokenizer.convert_tokens_to_ids( SPECIAL_TOKENS) model.eval() datasets = {"train": defaultdict(list), "valid": defaultdict(list)} personalities = [ dialog["personality"] for dataset in personachat.values() for dialog in dataset ] special_tokens_ids = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS) lengths = [] f = open('./conv_sim/model_to_model_' + args.name, 'w') for dataset_name, dataset in personachat.items(): num_candidates = 1 if dataset_name != 'train': for dialog in dataset: persona_1 = random.choice(personalities) persona_2 = random.choice(personalities) utterance = dialog["utterances"][0] history = utterance["history"][0] instance_1 = build_input(persona_1, history, tokenizer, speaker=1) instance_2 = build_input(persona_2, history, tokenizer, speaker=2) v = 0 input_1 = instance_1["input_ids"] input_2 = instance_2["input_ids"] token_ids_1 = instance_1["token_type_ids"] token_ids_2 = instance_2["token_type_ids"] conversation = [] while True: if v % 2 == 0 or v == 0: inpu = torch.tensor(input_1).unsqueeze(0).cuda() token_ids = torch.tensor(token_ids_1).unsqueeze( 0).cuda() else: inpu = torch.tensor(input_2).unsqueeze(0).cuda() token_ids = torch.tensor(token_ids_2).unsqueeze( 0).cuda() current_output = [] for i in range(args.max_length): if i > 0: inpu = torch.cat([inpu, prev.unsqueeze(0)], 1) if token_ids[0][-1] == 50260: token_ids = torch.cat([ token_ids, torch.tensor([50260]).cuda().unsqueeze(0) ], 1) if v % 3 == 0 or v == 1: token_ids_1.append(50260) token_ids_2.append(50261) else: token_ids_1.append(50261) token_ids_2.append(50260) else: token_ids = torch.cat([ token_ids, torch.tensor([50261]).cuda().unsqueeze(0) ], 1) token_ids_1.append(50261) token_ids_2.append(50260) if v % 3 == 0 or v == 1: token_ids_1.append(50261) token_ids_2.append(50260) else: token_ids_1.append(50260) token_ids_2.append(50261) if token_ids.size(1) != inpu.size(1): break logits = model(inpu, token_type_ids=token_ids) if isinstance(logits, tuple): logits = logits[0] logits = logits[0, -1, :] if args.top_k > 0 or args.top_p > 0: logits = top_filtering(logits, top_k=args.top_k, top_p=args.top_p) if args.temp == 0: probs = gen_func(logits, dim=-1) else: probs = softmax_temperature(logits, temperature=args.temp, axis=1) prev = torch.multinomial(probs, 1) if prev.item() in special_tokens_ids: break current_output.append(prev.item()) input_1.extend(current_output) input_2.extend(current_output) if v % 2 == 0 or v == 0: input_1.append(50261) input_2.append(50260) token_ids_1.append(50261) token_ids_2.append(50260) else: input_1.append(50260) input_2.append(50261) token_ids_1.append(50260) token_ids_2.append(50261) v += 1 conversation.append(current_output) if v == 20 or len(current_output) == 0: print('empty') break c = 
0 if len(conversation) > 2: for word in current_output: if word in conversation[-3]: c += 1 if c / len(current_output) >= 0.8: break c = 0 if len(conversation) > 1: for word in current_output: if word in conversation[-2]: c += 1 if c / len(current_output) >= 0.8: break print('persona_1: ', tokenizer.decode(chain(*persona_1))) print('persona_2: ', tokenizer.decode(chain(*persona_2))) print('history: ', tokenizer.decode(history)) for utt in conversation: print('-----', tokenizer.decode(utt)) print('\n') f.write('persona_1: ' + str(tokenizer.decode(chain(*persona_1)))) f.write('persona_2: ' + str(tokenizer.decode(chain(*persona_2)))) f.write('history: ' + str(tokenizer.decode(history))) for utt in conversation: f.write( tokenizer.decode(utt, clean_up_tokenization_spaces=False)) f.write('\n') lengths.append(len(conversation)) print(len(lengths)) print('average number of turns:', np.array(lengths).mean())
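The self-talk loop above terminates early when at least 80% of the tokens in the latest utterance already appear in the utterance generated two turns (or one turn) earlier. The same check can be isolated into a small helper; this refactoring sketch mirrors the inline logic but the function name is made up and it is not part of the original script.

def mostly_repeated(current, previous, threshold=0.8):
    """Return True if at least `threshold` of the tokens in `current` also occur in `previous`.
    Illustrative helper mirroring the inline repetition check."""
    if not current:
        return False
    overlap = sum(1 for token in current if token in previous)
    return overlap / len(current) >= threshold

# usage sketch against the `conversation` list of token-id utterances:
# if len(conversation) > 2 and mostly_repeated(current_output, conversation[-3]):
#     break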
def run(): parser = ArgumentParser() parser.add_argument( "--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.") parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache") parser.add_argument("--model", type=str, default="gpt2", help="Model type (gpt or gpt2)") parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model") parser.add_argument( "--max_history", type=int, default=2, help="Number of previous utterances to keep in history") parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)") parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling") parser.add_argument("--max_length", type=int, default=150, help="Maximum length of the output utterances") parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances") parser.add_argument("--seed", type=int, default=42, help="Seed") parser.add_argument("--temperature", type=int, default=0.7, help="Sampling softmax temperature") parser.add_argument( "--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)") parser.add_argument( "--top_p", type=float, default=0.9, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)") parser.add_argument( "--task", type=str, default="dialogue", help="one of task from [dialogue, qa, mt, nlg, summarization]") parser.add_argument("--self_copy", action='store_true', help="add self copy") parser.add_argument("--perturbation_layers", type=int, default=0, help="number of perturbation layers") parser.add_argument("--adapter_bottleneck", type=int, default=0, help="adapter layer bottleneck") args = parser.parse_args() logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__file__) logger.info(pformat(args)) if args.model_checkpoint == "": args.model_checkpoint = download_pretrained_model() random.seed(args.seed) torch.random.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) logger.info("Get pretrained model and tokenizer") tokenizer_class = GPT2Tokenizer if "gpt2" == args.model else OpenAIGPTTokenizer tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint) model_class = GPT2LMHeadModel if "gpt2" == args.model else OpenAIGPTLMHeadModel model = model_class.from_pretrained( args.model_checkpoint, perturbation_layers=args.perturbation_layers, self_copy=args.self_copy, adapter_bottleneck=args.adapter_bottleneck) model.to(args.device) add_special_tokens_(model, tokenizer) if args.task == "dialogue": output_text = [] ref_text = [] loaded_dataset, cache_path = get_dataset(tokenizer, args.dataset_path, args.dataset_cache, args.task, return_cachepath=True) persona_text = [] distillated_dataset = loaded_dataset for i, pair in enumerate(tqdm(loaded_dataset["train"])): persona = pair["personality"].copy() for j, utterance in enumerate(pair["utterances"]): history = utterance["history"][-(2 * args.max_history + 1):] with torch.no_grad(): out_ids = sample_sequence(tokenizer, model, args, personality=persona, history=history) distillated_dataset["train"][i]["utterances"][j][ "distillated_candidates"] = [out_ids] torch.save(distillated_dataset, cache_path) # qa interact if args.task == "qa": output_text = [] ref_text = [] loaded_dataset, cache_path = get_dataset(tokenizer, args.dataset_path, args.dataset_cache, args.task, 
return_cachepath=True) distillated_dataset = loaded_dataset for i, pair in enumerate(tqdm(loaded_dataset["train"])): evidence = pair["document"].copy() evidence = [evidence[0][:MAXLEN_MAP[args.task]['document']]] for j, utterance in enumerate(pair["utterances"]): history = utterance["history"][-(2 * args.max_history + 1):] with torch.no_grad(): out_ids = sample_sequence(tokenizer, model, args, personality=evidence, history=history) distillated_dataset["train"][i]["utterances"][j][ "distillated_candidates"] = [out_ids] torch.save(distillated_dataset, cache_path) # nlg interact if args.task == "nlg": output_text = [] ref_text = [] loaded_dataset, cache_path = get_dataset(tokenizer, args.dataset_path, args.dataset_cache, args.task, return_cachepath=True) distillated_dataset = loaded_dataset for i, pair in enumerate(tqdm(loaded_dataset["train"])): source = pair["src"] target = pair["tgt"] with torch.no_grad(): out_ids = sample_sequence(tokenizer, model, args, source=source, target=target) distillated_dataset["train"][i]["distillated_tgt"] = out_ids torch.save(distillated_dataset, cache_path) if (args.task == "mt" or args.task == "summarization"): output_text = [] ref_text = [] loaded_dataset, cache_path = get_dataset(tokenizer, args.dataset_path, args.dataset_cache, args.task, return_cachepath=True) distillated_dataset = loaded_dataset for i, pair in enumerate(tqdm(loaded_dataset["train"])): source = pair["src"][:MAXLEN_MAP[args.task]['src']] target = pair["tgt"] #[:MAXLEN_MAP[args.task]['tgt']] with torch.no_grad(): out_ids = sample_sequence(tokenizer, model, args, source=source, target=target) distillated_dataset["train"][i]["distillated_tgt"] = out_ids torch.save(distillated_dataset, cache_path)
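Each branch above writes the generated candidates back into the cached dataset with torch.save(distillated_dataset, cache_path) — under "distillated_candidates" for the dialogue and qa tasks, and under "distillated_tgt" for nlg, mt and summarization. A quick way to spot-check the result is to reload the cache and decode one candidate; the snippet below is a hedged inspection sketch that assumes the same cache_path and tokenizer as the script.

# Hedged sketch: reload the distilled cache and decode one candidate for inspection.
import torch

cache = torch.load(cache_path)  # cache_path as returned by get_dataset(..., return_cachepath=True)
sample = cache["train"][0]["utterances"][0]
print(tokenizer.decode(sample["distillated_candidates"][0], skip_special_tokens=True))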
class run: parser = ArgumentParser() parser.add_argument( "--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.") parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache") parser.add_argument( "--model", type=str, default="openai-gpt", help="Model type (openai-gpt or gpt2)", choices=['openai-gpt', 'gpt2']) # anything besides gpt2 will load openai-gpt parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model") parser.add_argument( "--max_history", type=int, default=20, help="Number of previous utterances to keep in history") parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)") parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling") parser.add_argument("--max_length", type=int, default=20, help="Maximum length of the output utterances") parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances") parser.add_argument("--seed", type=int, default=0, help="Seed") parser.add_argument("--temperature", type=int, default=0.7, help="Sampling softmax temperature") parser.add_argument( "--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)") parser.add_argument( "--top_p", type=float, default=0.9, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)") args = parser.parse_args() logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__file__) logger.info(pformat(args)) if args.model_checkpoint == "": if args.model == 'gpt2': raise ValueError( "Interacting with GPT2 requires passing a finetuned model_checkpoint" ) else: args.model_checkpoint = download_pretrained_model() if args.seed != 0: random.seed(args.seed) torch.random.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) logger.info("Get pretrained model and tokenizer") tokenizer_class, model_class = ( GPT2Tokenizer, GPT2LMHeadModel) if args.model == 'gpt2' else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel) tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint) model = model_class.from_pretrained(args.model_checkpoint) model.to(args.device) add_special_tokens_(model, tokenizer) logger.info("Sample a personality") dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache) #personalities = [dialog["personality"] for dataset in dataset.values() for dialog in dataset] #logger.info("Selected personality: %s", tokenizer.decode(chain(*personality))) history = [] def process_text(self, raw_text): #personality = random.choice(self.personalities) personality = [ 'i am a robot.', 'my job is to give or deny permission.', 'i love my job.', 'josh is my favorite person.', 'my name is permissioner-bot.', 'i do not have a gender.' ] personality = [self.tokenizer.encode(line) for line in personality] self.history.append(self.tokenizer.encode(raw_text)) with torch.no_grad(): out_ids = sample_sequence(personality, self.history, self.tokenizer, self.model, self.args) self.history.append(out_ids) self.history = self.history[-(2 * self.args.max_history + 1):] out_text = self.tokenizer.decode(out_ids, skip_special_tokens=True) return out_text
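Unlike the other variants, this one exposes generation behind a process_text method so it can be embedded in another service; the argument parsing and model loading run once when the class body is executed. A hedged usage sketch follows, assuming the module can be imported and that the default arguments parse successfully.

# Hedged usage sketch for the class-based bot defined above.
bot = run()  # setup (argparse, model load) already ran when the class body was executed
print(bot.process_text("hi, may I deploy to production?"))
print(bot.process_text("why not?"))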
def run(): parser = ArgumentParser() parser.add_argument("--model_type", default="",type=str, required=True, help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) parser.add_argument("--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.") parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache") #parser.add_argument("--model", type=str, default="gpt", help="Model type (gpt or gpt2)") parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model") parser.add_argument("--max_history", type=int, default=2, help="Number of previous utterances to keep in history") parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)") parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling") parser.add_argument("--max_length", type=int, default=20, help="Maximum length of the output utterances") parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances") parser.add_argument("--seed", type=int, default=42, help="Seed") parser.add_argument("--temperature", type=float, default=0.7, help="Sampling softmax temperature") parser.add_argument("--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)") parser.add_argument("--top_p", type=float, default=0.9, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)") args = parser.parse_args() logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__file__) logger.info(pformat(args)) if args.model_checkpoint == "": args.model_checkpoint = download_pretrained_model() random.seed(args.seed) torch.random.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) logger.info("Get pretrained model and tokenizer") #tokenizer_class = GPT2Tokenizer if "gpt2" == args.model else OpenAIGPTTokenizer #tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint) #model_class = GPT2LMHeadModel if "gpt2" == args.model else OpenAIGPTLMHeadModel #model = model_class.from_pretrained(args.model_checkpoint) args.model_type = args.model_type.lower() model_class, tokenizer_class = MODEL_CLASSES[args.model_type] logger.info("load tokenizer....\n") tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint) logger.info("load model....\n") model = model_class.from_pretrained(args.model_checkpoint) model.to(args.device) add_special_tokens_(model, tokenizer) logger.info("Sample a personality") personalities = get_dataset_personalities(tokenizer, args.dataset_path, args.dataset_cache) personality = random.choice(personalities) logger.info("Selected personality: %s", tokenizer.decode(chain(*personality))) history = [] while True: raw_text = input(">>> ") while not raw_text: print('Prompt should not be empty!') raw_text = input(">>> ") history.append(tokenizer.encode(raw_text)) with torch.no_grad(): out_ids = sample_sequence(personality, history, tokenizer, model, args) history.append(out_ids) history = history[-(2*args.max_history+1):] out_text = tokenizer.decode(out_ids, skip_special_tokens=True) print(out_text)
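This variant resolves the tokenizer and model through a MODEL_CLASSES registry keyed by --model_type rather than hard-coding the GPT/GPT-2 pair. The registry itself is defined elsewhere in the repository; a plausible shape, shown only as an assumption consistent with how it is indexed here, would be:

# Assumed shape of MODEL_CLASSES (defined elsewhere); each entry maps a
# lower-cased model type to (model class, tokenizer class).
MODEL_CLASSES = {
    "gpt2": (GPT2LMHeadModel, GPT2Tokenizer),
    "openai-gpt": (OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
}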
def init(quotes, quotes_num=16):
    global history
    global personality
    global tokenizer
    global model
    global args
    global parser
    global logger
    # new conversation
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache")
    parser.add_argument("--model", type=str, default="openai-gpt", help="Model type (openai-gpt or gpt2)", choices=['openai-gpt', 'gpt2'])  # anything besides gpt2 will load openai-gpt
    parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2, help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")
    parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=200, help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    parser.add_argument("--temperature", type=float, default=0.7, help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        if args.model == 'gpt2':
            raise ValueError("Interacting with GPT2 requires passing a finetuned model_checkpoint")
        else:
            args.model_checkpoint = download_pretrained_model()

    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class, model_class = (GPT2Tokenizer, GPT2LMHeadModel) if args.model == 'gpt2' else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel)
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    logger.info("Get personality")
    personality = [string_transformer('my name is WabiSabi', tokenizer, False)]
    random.shuffle(quotes)
    # quotes = quotes[:16]
    # quotes = [q for _, q in zip(range(quotes_num), quotes)]
    concatenated = " ".join(quotes)[0:1600]
    quotes = concatenated.split('.')
    print(quotes)
    for s in quotes:
        personality.append(string_transformer(s, tokenizer))
    # print(personality)
    logger.info("Selected personality: %s", tokenizer.decode(chain(*personality)))
    history = []