def load_vgg19(input_image):
    """ Load VGG into a TensorFlow model.
    Use a dictionary to hold the model instead of using a Python class
    """
    # VGG-19 parameters file
    VGG19_DOWNLOAD_LINK = 'http://www.vlfeat.org/matconvnet/models/imagenet-vgg-verydeep-19.mat'
    VGG19_MODEL = 'imagenet-vgg-verydeep-19.mat'
    VGG19_EXPECTED_BYTES = 534904783
    download_pretrained_model(VGG19_DOWNLOAD_LINK, VGG19_MODEL, VGG19_EXPECTED_BYTES)
    vgg = scipy.io.loadmat(VGG19_MODEL)
    vgg_layers = vgg['layers']

    graph = {}
    graph['conv1_1'] = _conv2d_relu(vgg_layers, input_image, 0, 'conv1_1')
    graph['conv1_2'] = _conv2d_relu(vgg_layers, graph['conv1_1'], 2, 'conv1_2')
    graph['avgpool1'] = _avgpool(graph['conv1_2'])
    graph['conv2_1'] = _conv2d_relu(vgg_layers, graph['avgpool1'], 5, 'conv2_1')
    graph['conv2_2'] = _conv2d_relu(vgg_layers, graph['conv2_1'], 7, 'conv2_2')
    graph['avgpool2'] = _avgpool(graph['conv2_2'])
    graph['conv3_1'] = _conv2d_relu(vgg_layers, graph['avgpool2'], 10, 'conv3_1')
    graph['conv3_2'] = _conv2d_relu(vgg_layers, graph['conv3_1'], 12, 'conv3_2')
    graph['conv3_3'] = _conv2d_relu(vgg_layers, graph['conv3_2'], 14, 'conv3_3')
    graph['conv3_4'] = _conv2d_relu(vgg_layers, graph['conv3_3'], 16, 'conv3_4')
    graph['avgpool3'] = _avgpool(graph['conv3_4'])
    graph['conv4_1'] = _conv2d_relu(vgg_layers, graph['avgpool3'], 19, 'conv4_1')
    graph['conv4_2'] = _conv2d_relu(vgg_layers, graph['conv4_1'], 21, 'conv4_2')
    graph['conv4_3'] = _conv2d_relu(vgg_layers, graph['conv4_2'], 23, 'conv4_3')
    graph['conv4_4'] = _conv2d_relu(vgg_layers, graph['conv4_3'], 25, 'conv4_4')
    graph['avgpool4'] = _avgpool(graph['conv4_4'])
    graph['conv5_1'] = _conv2d_relu(vgg_layers, graph['avgpool4'], 28, 'conv5_1')
    graph['conv5_2'] = _conv2d_relu(vgg_layers, graph['conv5_1'], 30, 'conv5_2')
    graph['conv5_3'] = _conv2d_relu(vgg_layers, graph['conv5_2'], 32, 'conv5_3')
    graph['conv5_4'] = _conv2d_relu(vgg_layers, graph['conv5_3'], 34, 'conv5_4')
    graph['avgpool5'] = _avgpool(graph['conv5_4'])
    return graph
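# Note: load_vgg19 relies on helpers (_conv2d_relu, _avgpool) that are not shown above.
# The sketch below is only an illustration of what they could look like, assuming the
# TF1-style graph API and the usual MatConvNet .mat layout (weights/biases reachable at
# vgg_layers[0][layer][0][0][2][0][...]); it is not necessarily the project's own code.
import tensorflow as tf

def _weights(vgg_layers, layer, expected_layer_name):
    """Return the pretrained VGG weights and biases for a given layer index."""
    W = vgg_layers[0][layer][0][0][2][0][0]
    b = vgg_layers[0][layer][0][0][2][0][1]
    layer_name = vgg_layers[0][layer][0][0][0][0]
    assert layer_name == expected_layer_name
    return W, b.reshape(b.size)

def _conv2d_relu(vgg_layers, prev_layer, layer, layer_name):
    """Convolution with the frozen VGG weights for `layer`, followed by ReLU."""
    W, b = _weights(vgg_layers, layer, layer_name)
    W = tf.constant(W)
    b = tf.constant(b)
    conv2d = tf.nn.conv2d(prev_layer, W, strides=[1, 1, 1, 1], padding='SAME')
    return tf.nn.relu(conv2d + b)

def _avgpool(prev_layer):
    """2x2 average pooling (style-transfer setups often prefer this over max pooling)."""
    return tf.nn.avg_pool(prev_layer, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')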
def model_tokenizer(args): if torch.cuda.is_available(): args["device"] = "cuda" else: args["device"] = "cpu" if args["model_checkpoint"] == "": if args["model"] == 'gpt2': raise ValueError( "Interacting with GPT2 requires passing a finetuned model_checkpoint" ) else: args["model_checkpoint"] = download_pretrained_model() if args["seed"] != 0: random.seed(args["seed"]) torch.random.manual_seed(args["seed"]) torch.cuda.manual_seed(args["seed"]) logger.info("Get pretrained model and tokenizer") tokenizer_class, model_class = ( GPT2Tokenizer, GPT2LMHeadModel) if args["model"] == 'gpt2' else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel) tokenizer = tokenizer_class.from_pretrained(args["model_checkpoint"]) model = model_class.from_pretrained(args["model_checkpoint"]) model.to(args["device"]) add_special_tokens_(model, tokenizer) logger.info("Get text to emote model") emote_clf = txtemote_model(args["txtemotion_dataset_path"]) return model, emote_clf, tokenizer
def model_pretrained(path_dir=None, model=None):
    if model is not None:
        path_dir = download_pretrained_model(model)
    config = load_dict(path_dir)
    model = Property_Prediction(**config)
    model.load_pretrained(path_dir + '/model.pt')
    return model
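# Illustrative calls only (the model name and checkpoint directory below are made up;
# the real ones depend on the download_pretrained_model registry used by this project):
# model = model_pretrained(model='some_pretrained_property_predictor')   # by registry name
# model = model_pretrained(path_dir='./saved_models/my_run')             # from a local dir with config + model.pt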
def run(): pretrained_model = utils.download_pretrained_model() tokenizer_class, model_class = (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel) tokenizer = tokenizer_class.from_pretrained(pretrained_model) model = model_class.from_pretrained(pretrained_model) model.to("cpu") add_special_tokens_(model, tokenizer) dataset = utils.get_dataset(tokenizer, "./dataset_cache") features = [ dialog["feature"] for dataset in dataset.values() for dialog in dataset ] feature = random.choice(features) print("Examples of selected feature:\n", tokenizer.decode(itertools.chain(*feature))) background = [tokenizer.encode("tell me about yourself")] generated_lyrics = [] hist_size = 2 for _ in range( 5 ): # how many lines of lyrics to generate - time grows exponentially with this value with torch.no_grad(): out_ids = sample_sequence(feature, background, tokenizer, model) background.append(out_ids) background.append(random.choice(background)) background = background[ -5:] # size of history to retain (needs to be odd number since we're using two headed model) this_line = tokenizer.decode(out_ids, skip_special_tokens=True) generated_lyrics.append(this_line) print("\nGenerated lyrics:") print("\n".join(generated_lyrics))
def InitModel(self):
    """This takes care of loading model/dataset/tokenizing.
    Can be called async or in a separate thread so as to avoid long waiting times."""
    logger.info(
        f"Starting conv model with gpu: {torch.cuda.is_available()}")
    # Start with model and download pretrained if necessary
    if self.args["model_checkpoint"] == "":
        logger.debug("Downloading pretrained model...")
        self.args["model_checkpoint"] = download_pretrained_model()
    # do model setup and tokenize vocabulary
    tokenizer_class = (GPT2Tokenizer if self.args["model"] == "gpt2" else OpenAIGPTTokenizer)
    logger.debug("Opening tokenizer class from pretrained model...")
    self.tokenizer = tokenizer_class.from_pretrained(self.args["model_checkpoint"])
    model_class = (GPT2LMHeadModel if self.args["model"] == "gpt2" else OpenAIGPTLMHeadModel)
    logger.debug("Opening model class from pretrained model...")
    self.model = model_class.from_pretrained(self.args["model_checkpoint"])
    self.model.to(self.args["device"])
    self.model.eval()
    logger.debug("Getting dataset personalities...")
    personalities = get_dataset_personalities(self.tokenizer,
                                              self.args["dataset_path"],
                                              self.args["dataset_cache"])
    logger.debug("Selecting a random personality...")
    self.personality = random.choice(personalities)
    logger.info(f"Selected personality: " +
                f"{self.tokenizer.decode(chain(*self.personality))}")
    self.is_ready = True
    logger.info("⭐Model initialized and ready to go! ⭐")
def run(): config_file = "configs/interact_config.json" config = InteractConfig.from_json_file(config_file) logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__file__) logger.info(pformat(config)) if config.model_checkpoint == "": config.model_checkpoint = download_pretrained_model() random.seed(config.seed) torch.random.manual_seed(config.seed) torch.cuda.manual_seed(config.seed) logger.info("Get pretrained model and tokenizer") tokenizer_class = GPT2Tokenizer if "gpt2" == config.model else OpenAIGPTTokenizer tokenizer = tokenizer_class.from_pretrained(config.model_checkpoint) model_class = GPT2LMHeadModel if "gpt2" == config.model else OpenAIGPTLMHeadModel model = model_class.from_pretrained(config.model_checkpoint) model.to(config.device) model.eval() dataset = get_dataset(tokenizer, config.dataset_path, config.dataset_cache) special_tokens = ["<bos>", "<eos>", "<speaker1>", "<speaker2>", "<pad>"] calculate_metrics(config, model, tokenizer, dataset, special_tokens)
def run(): parser = ArgumentParser() parser.add_argument("--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.") parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache") parser.add_argument("--model", type=str, default="openai-gpt", help="Model type (openai-gpt or gpt2)", choices=['openai-gpt', 'gpt2']) # anything besides gpt2 will load openai-gpt parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model") parser.add_argument("--max_history", type=int, default=2, help="Number of previous utterances to keep in history") parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)") parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling") parser.add_argument("--max_length", type=int, default=20, help="Maximum length of the output utterances") parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances") parser.add_argument("--seed", type=int, default=0, help="Seed") parser.add_argument("--temperature", type=float, default=0.7, help="Sampling softmax temperature") parser.add_argument("--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)") parser.add_argument("--top_p", type=float, default=0.9, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)") args = parser.parse_args() logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__file__) logger.info(pformat(args)) if args.model_checkpoint == "": if args.model == 'gpt2': raise ValueError("Interacting with GPT2 requires passing a finetuned model_checkpoint") else: args.model_checkpoint = download_pretrained_model() if args.seed != 0: random.seed(args.seed) torch.random.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) logger.info("Get pretrained model and tokenizer") tokenizer_class, model_class = (GPT2Tokenizer, GPT2LMHeadModel) if args.model == 'gpt2' else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel) global tokenizer tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint) global model model = model_class.from_pretrained(args.model_checkpoint) model.to(args.device) add_special_tokens_(model, tokenizer) logger.info("Sample a personality") dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache) personalities = [dialog["personality"] for dataset in dataset.values() for dialog in dataset] personality = random.choice(personalities) logger.info("Selected personality: %s", tokenizer.decode(chain(*personality))) return model, tokenizer, args, personality
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="",
                        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model_checkpoint", type=str, default="",
                        help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2,
                        help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--no_sample", action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=20,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--temperature", type=float, default=0.7,
                        help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0,
                        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9,
                        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        args.model_checkpoint = download_pretrained_model()

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_checkpoint)
    model = OpenAIGPTLMHeadModel.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    model.eval()

    logger.info("Sample a personality")
    personalities = get_dataset_personalities(tokenizer, args.dataset_path, args.dataset_cache)
    personality = random.choice(personalities)
    logger.info("Selected personality: %s", tokenizer.decode(chain(*personality)))

    history = []
    while True:
        raw_text = input(">>> ")
        while not raw_text:
            print('Prompt should not be empty!')
            raw_text = input(">>> ")
        history.append(tokenizer.encode(raw_text))
        with torch.no_grad():
            out_ids = sample_sequence(personality, history, tokenizer, model, args)
        history.append(out_ids)
        history = history[-(2 * args.max_history + 1):]
        out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
        print(out_text)
def __init__(self, opt, shared=None): super(TransformerAgent, self).__init__(opt, shared) args = AttrDict( opt) # to keep most commands identical to the interact.py script self.args = args logging.basicConfig(level=logging.INFO) self.logger = logging.getLogger(__file__) self.logger.info(pformat(args)) random.seed(args.seed) torch.random.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) if shared is None: self.logger.info("Get pretrained model and tokenizer") if args.model_checkpoint == "": args.model_checkpoint = download_pretrained_model() if 'gpt2' in args.model_checkpoint: self.tokenizer = GPT2Tokenizer.from_pretrained( args.model_checkpoint) model_class = GPT2DoubleHeadsModel if self.args.eval_type == "hits@1" else GPT2LMHeadModel else: self.tokenizer = OpenAIGPTTokenizer.from_pretrained( args.model_checkpoint) model_class = OpenAIGPTDoubleHeadsModel if self.args.eval_type == "hits@1" else OpenAIGPTLMHeadModel self.model_checkpoint = model_class.from_pretrained( args.model_checkpoint) self.model_checkpoint.to(args.device) self.logger.info("Build BPE prefix dictionary") convai_dict = build_dict() assert len(convai_dict) == 19304 self.prefix2words = self.get_prefix2words(convai_dict) else: self.model_checkpoint = shared['model'] self.tokenizer = shared['tokenizer'] self.prefix2words = shared['prefix2words'] add_special_tokens_(self.model_checkpoint, self.tokenizer) self.special_tokens_ids = self.tokenizer.convert_tokens_to_ids( SPECIAL_TOKENS) self.persona = [] self.history = [] self.labels = [] self.reset()
def run(): config_file = "configs/interact_config.json" config = InteractConfig.from_json_file(config_file) logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__file__) logger.info(pformat(config)) if config.model_checkpoint == "": config.model_checkpoint = download_pretrained_model() torch.random.manual_seed(config.seed) torch.cuda.manual_seed(config.seed) logger.info("Get pretrained model and tokenizer") if config.model == "bert": tokenizer_class = BertTokenizer model_class = BertLMHeadModel elif config.model == "gpt2": tokenizer_class = GPT2Tokenizer model_class = GPT2LMHeadModel else: tokenizer_class = OpenAIGPTTokenizer model_class = OpenAIGPTLMHeadModel SPECIAL_TOKENS = ["<bos>", "<eos>", "<speaker1>", "<speaker2>", "<pad>"] tokenizer = tokenizer_class.from_pretrained(config.model_checkpoint) model = model_class.from_pretrained(config.model_checkpoint) model.to(config.device) model.eval() history = [] while True: raw_text = input(">>> ") while not raw_text: print('Prompt should not be empty!') raw_text = input(">>> ") history.append(tokenizer.encode(raw_text)) with torch.no_grad(): out_ids = sample_sequence(history, tokenizer, model, config, SPECIAL_TOKENS) history.append(out_ids) history = history[-(2 * config.max_history + 1):] out_text = tokenizer.decode(out_ids, skip_special_tokens=True) print(out_text)
def get_model( dataset_path="", dataset_cache='./dataset_cache', model="openai-gpt", model_checkpoint="", device="cuda" if torch.cuda.is_available() else "cpu", seed=0, ): logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__file__) if model_checkpoint == "": if model == 'gpt2': raise ValueError( "Interacting with GPT2 requires passing a finetuned model_checkpoint" ) else: model_checkpoint = download_pretrained_model() if seed != 0: random.seed(seed) torch.random.manual_seed(seed) torch.cuda.manual_seed(seed) logger.info("Get pretrained model and tokenizer") tokenizer_class, model_class = (GPT2Tokenizer, GPT2LMHeadModel) if model == 'gpt2' else ( OpenAIGPTTokenizer, OpenAIGPTLMHeadModel) tokenizer = tokenizer_class.from_pretrained(model_checkpoint) model = model_class.from_pretrained(model_checkpoint) model.to(device) add_special_tokens_(model, tokenizer) logger.info("Sample a personality") dataset = get_dataset(tokenizer, dataset_path, dataset_cache) personalities = [ dialog["personality"] for dataset in dataset.values() for dialog in dataset ] personality = random.choice(personalities) logger.info("Selected personality: %s", tokenizer.decode(chain(*personality))) return model, personality, tokenizer
def init():
    args = {
        "dataset_path": "",
        "dataset_cache": "./dataset_cache_GPT2tokenizer",
        "model": "gpt2",
        "model_checkpoint": "../runs/Sep19_21-11-42_micah-HP-ENVY-x360-Convertible-15-ee0xxx_gpt2/",
        "max_history": 2,
        "device": "cpu",
        "max_length": 20,
        "min_length": 1,
        "seed": 0,
        "temperature": 0.7,
        "top_k": 0,
        "top_p": 0.9
    }
    if args.get("model_checkpoint") == "":
        if args.get("model") == 'gpt2':
            raise ValueError("Interacting with GPT2 requires passing a finetuned model_checkpoint")
        else:
            args["model_checkpoint"] = download_pretrained_model()

    if args.get("seed") != 0:
        random.seed(args.get("seed"))
        torch.random.manual_seed(args.get("seed"))
        torch.cuda.manual_seed(args.get("seed"))

    print("Get pretrained model and tokenizer")
    tokenizer_class, model_class = (GPT2Tokenizer, GPT2LMHeadModel) if args.get("model") == 'gpt2' else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel)
    tokenizer = tokenizer_class.from_pretrained(args.get("model_checkpoint"))
    model = model_class.from_pretrained(args.get("model_checkpoint"))
    model.to(args.get("device"))
    add_special_tokens_(model, tokenizer)

    print("Sample a personality")
    dataset = get_dataset(tokenizer, args.get("dataset_path"), args.get("dataset_cache"))
    personalities = [dialog["personality"] for dataset in dataset.values() for dialog in dataset]
    personality = random.choice(personalities)
    print(tokenizer.decode(chain(*personality)))
    return tokenizer, personality, model, args
def load_model(model_checkpoint, model_type):
    if model_checkpoint == "":
        model_checkpoint = download_pretrained_model()
    else:
        assert os.path.exists(
            model_checkpoint), f'checkpoint directory not found: {model_checkpoint}'

    logger.info("Get pretrained model and tokenizer")
    if model_type not in MODELS:
        raise NotImplementedError('model "%s" not implemented. use one of %s' %
                                  (model_type, str(list(MODELS.keys()))))
    config_class, tokenizer_class, _, model_class = MODELS[model_type]
    _tokenizer = tokenizer_class.from_pretrained(model_checkpoint)
    _model = model_class.from_pretrained(model_checkpoint)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    _model.to(device)
    _model.eval()
    return _model, _tokenizer, os.path.basename(
        model_checkpoint) if model_checkpoint else model_checkpoint
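# load_model indexes a MODELS registry defined elsewhere; each entry unpacks to
# (config_class, tokenizer_class, <unused third class>, lm_model_class). A minimal
# sketch of such a registry is shown below under the assumption that the classes come
# from Hugging Face transformers and that the third slot holds the double-heads variant;
# the project's actual table may differ.
from transformers import (OpenAIGPTConfig, OpenAIGPTTokenizer, OpenAIGPTDoubleHeadsModel, OpenAIGPTLMHeadModel,
                          GPT2Config, GPT2Tokenizer, GPT2DoubleHeadsModel, GPT2LMHeadModel)

MODELS = {
    "openai-gpt": (OpenAIGPTConfig, OpenAIGPTTokenizer, OpenAIGPTDoubleHeadsModel, OpenAIGPTLMHeadModel),
    "gpt2": (GPT2Config, GPT2Tokenizer, GPT2DoubleHeadsModel, GPT2LMHeadModel),
}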
def run(): parser = ArgumentParser() parser.add_argument( "--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.") parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache") parser.add_argument("--model", type=str, default="gpt", help="Model type (gpt or gpt2)") parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model") parser.add_argument( "--max_history", type=int, default=2, help="Number of previous utterances to keep in history") parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)") parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling") parser.add_argument("--max_length", type=int, default=20, help="Maximum length of the output utterances") parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances") parser.add_argument("--seed", type=int, default=42, help="Seed") parser.add_argument("--temperature", type=int, default=0.7, help="Sampling softmax temperature") parser.add_argument( "--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)") parser.add_argument( "--top_p", type=float, default=0.9, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)") args = parser.parse_args() logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__file__) logger.info(pformat(args)) if args.model_checkpoint == "": args.model_checkpoint = download_pretrained_model() random.seed(args.seed) torch.random.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) logger.info("Get pretrained model and tokenizer") tokenizer_class = GPT2Tokenizer if "gpt2" == args.model else OpenAIGPTTokenizer tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint) model_class = GPT2LMHeadModel if "gpt2" == args.model else OpenAIGPTLMHeadModel model = model_class.from_pretrained(args.model_checkpoint) model.to(args.device) add_special_tokens_(model, tokenizer) logger.info("Sample a personality") #personalities = get_dataset_personalities(tokenizer, args.dataset_path, args.dataset_cache) #personality = random.choice(personalities) #logger.info("Selected personality: %s", tokenizer.decode(chain(*personality))) wordfile = './data/truncate.txt' # word vector file, can be downloaded from GloVe website weightfile = './auxiliary_data/enwiki_vocab_min200.txt' # each line is a word and its frequency weightpara = 1e-3 # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3] # load word vectors (words, We) = data_io.getWordmap(wordfile) # load word weights word2weight = data_io.getWordWeight( weightfile, weightpara) # word2weight['str'] is the weight for the word 'str' weight4ind = data_io.getWeight( words, word2weight) # weight4ind[i] is the weight for the i-th word p = 0 start_time = time.time() with open('data_volunteers.json') as json_file: json_data = json.load(json_file) for i in json_data: p += 1 #if p <1100: # continue history = [] personality = [] query_set = [] json_dialog = i["dialog"] json_bot = i["bot_profile"] for j in json_bot: personality.append(tokenizer.encode(j)) #logger.info("Selected personality: %s", tokenizer.decode(chain(*personality))) persona = tokenizer.decode(chain(*personality)) row = {"Personality": persona} text = [] for j in json_dialog: if j["sender_class"] == "Human": json_text = j["text"] 
raw_text = json_text check = tokenizer.decode(tokenizer.encode(raw_text), skip_special_tokens=True) if check == "": history.append(tokenizer.encode(raw_text)) with torch.no_grad(): out_ids = normal_sample_sequence( personality, history, tokenizer, model, args) # history.append(out_ids) history = history[-(2 * args.max_history + 1):] out_text = tokenizer.decode(out_ids, skip_special_tokens=True) text.append({ "evaluation_score": j["evaluation_score"], "id": j["id"], "sender": j["sender"], "sender_class": j["sender_class"], "text": raw_text, "generated_text": out_text }) continue history.append(tokenizer.encode(raw_text)) with torch.no_grad(): out_ids = sample_sequence(personality, history, tokenizer, model, args, words, weight4ind, We) # history.append(out_ids) history = history[-(2 * args.max_history + 1):] out_text = tokenizer.decode(out_ids, skip_special_tokens=True) text.append({ "evaluation_score": j["evaluation_score"], "id": j["id"], "sender": j["sender"], "sender_class": j["sender_class"], "text": raw_text, "generated_text": out_text }) else: json_text = j["text"] raw_text = json_text history.append(tokenizer.encode(raw_text)) text.append({ "evaluation_score": j["evaluation_score"], "id": j["id"], "sender": j["sender"], "sender_class": j["sender_class"], "text": raw_text }) row["dialog"] = text query_set.append(row) #print(query_set) with open('./sif_set/sif' + str(p) + '.json', 'w', encoding='utf-8') as make_file: json.dump(query_set, make_file) if not p % 10: print( str(p * 100 / 1111) + '%, ' + str(time.time() - start_time) + 'sec') '''
def run(): parser = ArgumentParser() parser.add_argument( "--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.") parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache") parser.add_argument( "--model", type=str, default="openai-gpt", help="Model type (openai-gpt or gpt2)", choices=['openai-gpt', 'gpt2']) # anything besides gpt2 will load openai-gpt parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model") parser.add_argument( "--max_history", type=int, default=2, help="Number of previous utterances to keep in history") parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)") parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling") parser.add_argument("--max_length", type=int, default=20, help="Maximum length of the output utterances") parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances") parser.add_argument("--seed", type=int, default=42, help="Seed") parser.add_argument("--temperature", type=int, default=0.7, help="Sampling softmax temperature") parser.add_argument( "--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)") parser.add_argument( "--top_p", type=float, default=0.9, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)") parser.add_argument("--n_samples", type=int, default=10) parser.add_argument("--sample_term", type=int, default=1) args = parser.parse_args() if args.model_checkpoint == "": if args.model == 'gpt2': raise ValueError( "Interacting with GPT2 requires passing a finetuned model_checkpoint" ) else: args.model_checkpoint = download_pretrained_model() random.seed(args.seed) torch.random.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) tokenizer_class, model_class = ( GPT2Tokenizer, GPT2LMHeadModel) if args.model == 'gpt2' else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel) tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint) model = model_class.from_pretrained(args.model_checkpoint) model.to(args.device) add_special_tokens_(model, tokenizer) dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache) #sample_idxs = range(args.n_samples) sample_idxs = [args.sample_term * i for i in range(1, args.n_samples + 1)] for i in sample_idxs: personality = dataset['valid'][i]['personality'] history = dataset['valid'][i]['utterances'][4]['history'] target = dataset['valid'][i]['utterances'][4]['candidates'][-1] with torch.no_grad(): out_ids = sample_sequence(personality, history, tokenizer, model, args) out_text = tokenizer.decode(out_ids, skip_special_tokens=True) print('Persona info:') for persona in personality: print(tokenizer.decode(persona, skip_special_tokens=True), end=' ') print('\nDialog:') for his in history: print(tokenizer.decode(his, skip_special_tokens=True)) print('Target:') print(tokenizer.decode(target, skip_special_tokens=True)) print('Prediction:') print(out_text, end='\n\n')
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="",
                        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model", type=str, default="openai-gpt",
                        help="Model type (openai-gpt or gpt2)",
                        choices=['openai-gpt', 'gpt2'])  # anything besides gpt2 will load openai-gpt
    parser.add_argument("--model_checkpoint", type=str, default="",
                        help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2,
                        help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--no_sample", action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=20,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    parser.add_argument("--temperature", type=float, default=0.7,
                        help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0,
                        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9,
                        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        if args.model == 'gpt2':
            raise ValueError("Interacting with GPT2 requires passing a finetuned model_checkpoint")
        else:
            args.model_checkpoint = download_pretrained_model()

    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class, model_class = (GPT2Tokenizer, GPT2LMHeadModel) if args.model == 'gpt2' else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel)
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    #add_special_tokens_(model, tokenizer)

    logger.info("Sample a personality")
    dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache)
    with open('test.txt', 'w') as file:
        for item in dataset['valid']:
            personality = item["personality"]
            file.write("Personality: ")
            file.write(tokenizer.decode(chain(*personality)))
            file.write("\n")
            for op in item["utterances"]:
                history = op["history"]
                history = history[-(2 * args.max_history + 1):]
                file.write("The other person said: ")
                file.write(tokenizer.decode(history[-1]))
                file.write("\n")
                with torch.no_grad():
                    out_ids = sample_sequence(personality, history, tokenizer, model, args)
                out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
                file.write("Generated reply: ")
                file.write(out_text)
                file.write("\n")
                file.write("Correct answer: ")
                file.write(tokenizer.decode(op["candidates"][-1]))
                file.write("\n")
                file.write("\n\n")
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="",
                        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model", type=str, default="openai-gpt",
                        help="Model type (openai-gpt or gpt2)",
                        choices=['openai-gpt', 'gpt2'])  # anything besides gpt2 will load openai-gpt
    parser.add_argument("--model_checkpoint", type=str, default="",
                        help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2,
                        help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--no_sample", action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=20,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    parser.add_argument("--temperature", type=float, default=0.7,
                        help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0,
                        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9,
                        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        if args.model == 'gpt2':
            raise ValueError("Interacting with GPT2 requires passing a finetuned model_checkpoint")
        else:
            args.model_checkpoint = download_pretrained_model()

    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class, model_class = (GPT2Tokenizer, GPT2LMHeadModel) if args.model == 'gpt2' else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel)
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    logger.info("Sample a personality")
    dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache)
    personalities = [dialog["personality"] for dataset in dataset.values() for dialog in dataset]
    personality = random.choice(personalities)
    logger.info("Selected personality: %s", tokenizer.decode(chain(*personality)))

    history = []
    while True:
        raw_text = input(">>> ")
        while not raw_text:
            print('Prompt should not be empty!')
            raw_text = input(">>> ")

        # classifier code starts
        # zz = ['I like to sleep', "that's cool other cultures are nice", "where is Geneva cats?", "What public figure defended New York in Januar"]
        zz = [raw_text]
        valDF = pd.DataFrame()
        valDF['question_text'] = zz

        # prediction part
        batch_size = 256

        def batch_gen(test_df):
            n_batches = math.ceil(len(test_df) / batch_size)
            for i in range(n_batches):
                texts = test_df.iloc[i * batch_size:(i + 1) * batch_size, 0]
                text_arr = np.array([text_to_array(text) for text in texts])
                yield text_arr

        # test_df = pd.read_csv("../input/quora-insincere-questions-classification/test.csv")
        test_df = valDF
        all_preds = []
        for x in tqdm(batch_gen(test_df)):
            all_preds.extend(classifier_model.predict(x).flatten())
        y_te = (np.array(all_preds) > 0.5).astype(int)
        print(y_te)
        print(valDF['question_text'])

        history.append(tokenizer.encode(raw_text))
        with torch.no_grad():
            out_ids = sample_sequence(personality, history, tokenizer, model, args)
        history.append(out_ids)
        history = history[-(2 * args.max_history + 1):]
        out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
        print(out_text)
def run(): parser = ArgumentParser() parser.add_argument( "--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.") parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache") parser.add_argument( "--model", type=str, default="openai-gpt", help="Model type (openai-gpt or gpt2)", choices=['openai-gpt', 'gpt2']) # anything besides gpt2 will load openai-gpt parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model") parser.add_argument( "--max_history", type=int, default=2, help="Number of previous utterances to keep in history") parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)") parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling") parser.add_argument("--max_length", type=int, default=20, help="Maximum length of the output utterances") parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances") parser.add_argument("--seed", type=int, default=0, help="Seed") parser.add_argument("--temperature", type=int, default=0.7, help="Sampling softmax temperature") parser.add_argument( "--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)") parser.add_argument( "--top_p", type=float, default=0.9, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)") parser.add_argument( "--conv_limit", type=int, default=None, help="Length of conversation - number of times Speaker1 can respond") args = parser.parse_args() #logging.basicConfig(level=logging.INFO) #logger = logging.getLogger(__file__) #logger.info(pformat(args)) if args.seed != 0: random.seed(args.seed) torch.random.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) print("Select type of chat:\n1. Counselling\n2. Task-Oriented") raw_text = input(">>> ") initial = [ "Will you like to learn a new recipe?", "Do you want to learn a new recipe?", "Let us learn a new recipe." 
] sents = ["To sum up, ", "Thus, as I understand, ", "So, to summarize, "] history = [] if raw_text == "1": if args.model_checkpoint == "": if args.model == 'gpt2': raise ValueError( "Interacting with GPT2 requires passing a finetuned model_checkpoint" ) else: args.model_checkpoint = download_pretrained_model() #logger.info("Get pretrained model and tokenizer") tokenizer_class, model_class = ( GPT2Tokenizer, GPT2LMHeadModel) if args.model == 'gpt2' else ( OpenAIGPTTokenizer, OpenAIGPTLMHeadModel) tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint) model = model_class.from_pretrained(args.model_checkpoint) model.to(args.device) add_special_tokens_(model, tokenizer) #logger.info("Sample a personality") dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache) personalities = [dialog["personality"] for dialog in dataset] personality = random.choice(personalities) print("Selected personality: ", tokenizer.decode(chain(*personality))) if args.conv_limit: conv_len = args.conv_limit else: conv_len = -1 utt = 0 text_summary = [] while utt != conv_len: raw_text = input(">>> ") while not raw_text: print('Prompt should not be empty!') raw_text = input(">>> ") history.append(tokenizer.encode(raw_text)) text_summary.append(raw_text) with torch.no_grad(): out_ids = sample_sequence(personality, history, tokenizer, model, args) history.append(out_ids) history = history[-(2 * args.max_history + 1):] out_text = tokenizer.decode(out_ids, skip_special_tokens=True) print(out_text) utt = utt + 1 if utt == conv_len: if out_text.endswith("?"): utt = utt - 1 # generate emotion raw_text = 'exit chat' history.append(tokenizer.encode(raw_text)) with torch.no_grad(): out_ids = sample_sequence(personality, history, tokenizer, model, args) history.append(out_ids) history = history[-(2 * args.max_history + 1):] out_text = tokenizer.decode(out_ids, skip_special_tokens=True) print("\n" + "Chat Emotion: " + out_text) # generate summary text = ".".join(text_summary) summary = summarizer(text, max_length=50) print("\n" + "Summary:\n" + random.choice(sents) + create_reflection(summary[0]['summary_text'])) # generate a supporting response to the summary raw_text = 'summarize-chat' history.append(tokenizer.encode(raw_text)) with torch.no_grad(): out_ids = sample_sequence(personality, history, tokenizer, model, args) history.append(out_ids) history = history[-(2 * args.max_history + 1):] out_text = tokenizer.decode(out_ids, skip_special_tokens=True) print("\n" + "Response:\n" + out_text) elif raw_text == "2": print(random.choice(initial)) raw_text = input(">>> ") scores = sentiment.polarity_scores(raw_text) if scores['pos'] > scores['neg']: print("Great, here is a recipe for you ...") create_recipe() raw_text = input(">>> ") elif scores['neg'] > scores['pos']: print( "ok, then maybe you will like to chat with the counsellor. Please choose option 1. Thank you." ) else: print("I could not understand what you are asking.") else: print("Please select the correct choice.")
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="data/en_book_conversational.json",
                        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model", type=str, default="openai-gpt",
                        help="Model type (openai-gpt or gpt2)",
                        choices=['openai-gpt', 'gpt2'])  # anything besides gpt2 will load openai-gpt
    parser.add_argument("--model_checkpoint", type=str, default="",
                        help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2,
                        help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--no_sample", action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=20,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    parser.add_argument("--temperature", type=float, default=0.7,
                        help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0,
                        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9,
                        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        if args.model == 'gpt2':
            raise ValueError("Interacting with GPT2 requires passing a finetuned model_checkpoint")
        else:
            args.model_checkpoint = download_pretrained_model()

    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class, model_class = (GPT2Tokenizer, GPT2LMHeadModel) if args.model == 'gpt2' else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel)
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    logger.info("Sample a personality")
    dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache)
    personalities = [dialog["personality"] for dataset in dataset.values() for dialog in dataset]
    personality = random.choice(personalities)
    # persona = get_persona_label(tokenizer)
    # tokenizer.decode(personalities)

    def matching_personality(persona_text):
        # Re-sample until the chosen personality contains the hard-coded persona line.
        matched = random.choice(personalities)
        while 'i have got a headache and a fever.' not in tokenizer.decode(chain(*matched)):
            matched = random.choice(personalities)
        return matched

    """'immigration checkpoint', 'in a taxi', 'hotel check-in', 'at a restaurant', 'getting a dessert',
    'asking for directions', 'at a shopping mall', 'hotel check-out', 'checking in at the airport',
    'in flight', 'currency exchange', 'renting a car', 'making a hotel reservation', 'room service',
    'buying a camera', 'at a supermarket', 'in a hospital', 'getting a souvenir',
    'asking someone to take a photo'"""

    logger.info("Selected personality: %s", tokenizer.decode(chain(*personality)))

    history = []
    while True:
        raw_text = input(">>> ")
        while not raw_text:
            print('Prompt should not be empty!')
            raw_text = input(">>> ")
        history.append(tokenizer.encode(raw_text))
        with torch.no_grad():
            out_ids = sample_sequence(personality, history, tokenizer, model, args)
        history.append(out_ids)
        history = history[-(2 * args.max_history + 1):]
        out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
        print(out_text)
parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances") parser.add_argument("--seed", type=int, default=0, help="Seed") parser.add_argument("--temperature", type=float, default=0.7, help="Sampling softmax temperature") parser.add_argument("--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)") parser.add_argument("--top_p", type=float, default=0.9, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)") args = parser.parse_args() logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__file__) logger.info(pformat(args)) if args.model_checkpoint == "": if args.model == 'gpt2': raise ValueError("Interacting with GPT2 requires passing a finetuned model_checkpoint") else: args.model_checkpoint = download_pretrained_model() if args.seed != 0: random.seed(args.seed) torch.random.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) logger.info("Get pretrained model and tokenizer") tokenizer_class, model_class = (GPT2Tokenizer, GPT2LMHeadModel) if args.model == 'gpt2' else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel) tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint) model = model_class.from_pretrained(args.model_checkpoint) model.to(args.device) add_special_tokens_(model, tokenizer)
def run(): parser = ArgumentParser() parser.add_argument( "--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.") parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache") parser.add_argument( "--model", type=str, default="openai-gpt", help="Model type (openai-gpt or gpt2)", choices=['openai-gpt', 'gpt2']) # anything besides gpt2 will load openai-gpt parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model") parser.add_argument( "--max_history", type=int, default=2, help="Number of previous utterances to keep in history") parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)") parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling") parser.add_argument("--max_length", type=int, default=20, help="Maximum length of the output utterances") parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances") parser.add_argument("--seed", type=int, default=0, help="Seed") parser.add_argument("--temperature", type=int, default=0.7, help="Sampling softmax temperature") parser.add_argument( "--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)") parser.add_argument( "--top_p", type=float, default=0.9, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)") parser.add_argument( "--conv_limit", type=int, default=None, help="Length of conversation - number of times Speaker1 can respond") args = parser.parse_args() logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__file__) logger.info(pformat(args)) if args.model_checkpoint == "": if args.model == 'gpt2': raise ValueError( "Interacting with GPT2 requires passing a finetuned model_checkpoint" ) else: args.model_checkpoint = download_pretrained_model() if args.seed != 0: random.seed(args.seed) torch.random.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) logger.info("Get pretrained model and tokenizer") tokenizer_class, model_class = ( GPT2Tokenizer, GPT2LMHeadModel) if args.model == 'gpt2' else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel) tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint) model = model_class.from_pretrained(args.model_checkpoint) model.to(args.device) add_special_tokens_(model, tokenizer) logger.info("Sample a personality") dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache) personalities = [dialog["personality"] for dialog in dataset] personality = random.choice(personalities) logger.info("Selected personality: %s", tokenizer.decode(chain(*personality))) sents = ["To sum up, ", "Thus, as I understand, ", "So, to summarize, "] if args.conv_limit: conv_len = args.conv_limit else: conv_len = -1 text_summary = [] utt = 0 history = [] while utt != conv_len: raw_text = input(">>> ") while not raw_text: print('Prompt should not be empty!') raw_text = input(">>> ") history.append(tokenizer.encode(raw_text)) text_summary.append(raw_text) with torch.no_grad(): out_ids = sample_sequence(personality, history, tokenizer, model, args) history.append(out_ids) history = history[-(2 * args.max_history + 1):] out_text = tokenizer.decode(out_ids, skip_special_tokens=True) print(out_text) utt = utt + 1 if utt == conv_len: if out_text.endswith("?"): utt = utt - 1 text = ".".join(text_summary) summary = summarizer(text, 
max_length=50) print("\n" + random.choice(sents) + create_reflection(summary[0]['summary_text']))
class run: parser = ArgumentParser() parser.add_argument( "--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.") parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache") parser.add_argument( "--model", type=str, default="openai-gpt", help="Model type (openai-gpt or gpt2)", choices=['openai-gpt', 'gpt2']) # anything besides gpt2 will load openai-gpt parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model") parser.add_argument( "--max_history", type=int, default=20, help="Number of previous utterances to keep in history") parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)") parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling") parser.add_argument("--max_length", type=int, default=20, help="Maximum length of the output utterances") parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances") parser.add_argument("--seed", type=int, default=0, help="Seed") parser.add_argument("--temperature", type=int, default=0.7, help="Sampling softmax temperature") parser.add_argument( "--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)") parser.add_argument( "--top_p", type=float, default=0.9, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)") args = parser.parse_args() logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__file__) logger.info(pformat(args)) if args.model_checkpoint == "": if args.model == 'gpt2': raise ValueError( "Interacting with GPT2 requires passing a finetuned model_checkpoint" ) else: args.model_checkpoint = download_pretrained_model() if args.seed != 0: random.seed(args.seed) torch.random.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) logger.info("Get pretrained model and tokenizer") tokenizer_class, model_class = ( GPT2Tokenizer, GPT2LMHeadModel) if args.model == 'gpt2' else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel) tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint) model = model_class.from_pretrained(args.model_checkpoint) model.to(args.device) add_special_tokens_(model, tokenizer) logger.info("Sample a personality") dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache) #personalities = [dialog["personality"] for dataset in dataset.values() for dialog in dataset] #logger.info("Selected personality: %s", tokenizer.decode(chain(*personality))) history = [] def process_text(self, raw_text): #personality = random.choice(self.personalities) personality = [ 'i am a robot.', 'my job is to give or deny permission.', 'i love my job.', 'josh is my favorite person.', 'my name is permissioner-bot.', 'i do not have a gender.' ] personality = [self.tokenizer.encode(line) for line in personality] self.history.append(self.tokenizer.encode(raw_text)) with torch.no_grad(): out_ids = sample_sequence(personality, self.history, self.tokenizer, self.model, self.args) self.history.append(out_ids) self.history = self.history[-(2 * self.args.max_history + 1):] out_text = self.tokenizer.decode(out_ids, skip_special_tokens=True) return out_text
def run(): parser = ArgumentParser() parser.add_argument( "--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.") parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache") parser.add_argument( "--model", type=str, default="openai-gpt", help="Model type (openai-gpt or gpt2)", choices=['openai-gpt', 'gpt2']) # anything besides gpt2 will load openai-gpt parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model") parser.add_argument( "--max_history", type=int, default=2, help="Number of previous utterances to keep in history") parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)") parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling") parser.add_argument("--max_length", type=int, default=20, help="Maximum length of the output utterances") parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances") parser.add_argument("--seed", type=int, default=0, help="Seed") parser.add_argument("--temperature", type=int, default=0.7, help="Sampling softmax temperature") parser.add_argument( "--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)") parser.add_argument( "--top_p", type=float, default=0.9, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)") args = parser.parse_args() logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__file__) logger.info(pformat(args)) if args.model_checkpoint == "": if args.model == 'gpt2': raise ValueError( "Interacting with GPT2 requires passing a finetuned model_checkpoint" ) else: args.model_checkpoint = download_pretrained_model() if args.seed != 0: random.seed(args.seed) torch.random.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) logger.info("Get pretrained model and tokenizer") tokenizer_class, model_class = ( GPT2Tokenizer, GPT2LMHeadModel) if args.model == 'gpt2' else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel) tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint) model = model_class.from_pretrained(args.model_checkpoint) model.to(args.device) add_special_tokens_(model, tokenizer) logger.info("Sample a personality") dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache) personalities = [ dialog["personality"] for dataset in dataset.values() for dialog in dataset ] personality = random.choice(personalities) logger.info("Selected personality: %s", tokenizer.decode(chain(*personality))) history = [] engine = pyttsx3.init() r = sr.Recognizer() while True: print("Talk:") with sr.Microphone() as source: audio = r.listen(source) raw_text = r.recognize_google(audio) print(raw_text) # raw_text = input(">>> ") while not raw_text: print('Prompt should not be empty!') raw_text = input(">>> ") history.append(tokenizer.encode(raw_text)) with torch.no_grad(): out_ids = sample_sequence(personality, history, tokenizer, model, args) history.append(out_ids) history = history[-(2 * args.max_history + 1):] out_text = tokenizer.decode(out_ids, skip_special_tokens=True) print(out_text) engine.say(out_text) engine.runAndWait()
def train():
    parser = ArgumentParser()
    parser.add_argument("--data_path", type=str, default=None,
                        help="Path to conversational data (by default will look for single file in ./data)")
    parser.add_argument("--run_name", type=str, default='run1',
                        help="The name of the run (subdirectory in ./runs)")
    parser.add_argument("--model", type=str, default="openai-gpt",
                        help="Initialize model from path to checkpoint or with model name (openai-gpt/openai-gpt2)")
    parser.add_argument("--save_every", type=int, default=100,
                        help="Save checkpoint every n updates steps.")
    parser.add_argument("--num_candidates", type=int, default=2,
                        help="Number of candidates for training")
    parser.add_argument("--max_history", type=int, default=2,
                        help="Number of previous exchanges to keep in history")
    parser.add_argument("--max_input_length", type=int, default=200,
                        help="Number of tokens which will be fed into the model (reduce this number if you have memory constraints)")
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
    parser.add_argument("--train_batch_size", type=int, default=4, help="Batch size for training")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=8,
                        help="Accumulate gradients on several steps")
    parser.add_argument("--lr", type=float, default=6.25e-5, help="Learning rate")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--lm_coef", type=float, default=1.0, help="LM loss coefficient")
    parser.add_argument("--mc_coef", type=float, default=1.0, help="Multiple-choice loss coefficient")
    parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm")
    parser.add_argument("--n_epochs", type=int, default=3, help="Number of training epochs")
    parser.add_argument("--device", type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
    parser.add_argument("--use_huggingface_model", action='store_true',
                        help="Start training from pre-trained model by Huggingface")
    args = parser.parse_args()

    # Set seed
    set_seed(args.seed)

    if args.use_huggingface_model:
        args.model = download_pretrained_model()
        logger.info(f'Using pre-trained Personachat model {args.model}')

    # Load tokenizer
    logger.info("Prepare tokenizer, pretrained model and optimizer.")
    tokenizer_class = GPT2Tokenizer if "gpt2" in args.model else OpenAIGPTTokenizer  # can't use AutoTokenizer because checkpoint could be a Path
    tokenizer = tokenizer_class.from_pretrained(args.model)

    # Load model
    model_class = GPT2DoubleHeadsModel if "gpt2" in args.model else OpenAIGPTDoubleHeadsModel
    model = model_class.from_pretrained(args.model)
    model.to(args.device)

    # Add special tokens if they are not already added
    add_special_tokens_(model, tokenizer)

    # Get data loaders
    logger.info("Prepare datasets")
    train_loader = get_data_loader(args, tokenizer, use_cache=True)

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.lr, eps=args.adam_epsilon)
    t_total = len(train_loader) // args.gradient_accumulation_steps * args.n_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)

    # Train!
    logger.info("***** Running training *****")
    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model):
        # set global_step to global_step of last saved checkpoint from model path
        try:
            global_step = int(args.model.split("-")[-1].split("/")[0])
        except ValueError:
            global_step = 0
        epochs_trained = global_step // (len(train_loader) // args.gradient_accumulation_steps)
        steps_trained_in_current_epoch = global_step % (len(train_loader) // args.gradient_accumulation_steps)
        logger.info("Continuing training from checkpoint, will skip to saved global_step")
        logger.info(f"Continuing training from epoch {epochs_trained}")
        logger.info(f"Continuing training from global step {global_step}")
        logger.info(f"Will skip the first {steps_trained_in_current_epoch} steps in the first epoch")

    # Training loop
    model.zero_grad()
    epoch_pbar = trange(epochs_trained, int(args.n_epochs))
    av_loss = 0
    for current_epoch in epoch_pbar:
        epoch_pbar.set_description(f"Epoch [{current_epoch + 1}/{args.n_epochs}]")
        pbar = tqdm(train_loader)
        for step, batch in enumerate(pbar):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue
            model.train()
            batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
            (lm_loss), (mc_loss), *_ = model(input_ids,
                                             token_type_ids=token_type_ids,
                                             mc_token_ids=mc_token_ids,
                                             mc_labels=mc_labels,
                                             lm_labels=lm_labels)
            loss = (lm_loss * args.lm_coef + mc_loss * args.mc_coef) / args.gradient_accumulation_steps
            loss.backward()
            tr_loss = loss.item()
            # running (cumulative) average of the loss for the progress bar
            av_loss = (step * av_loss + tr_loss) / (step + 1)
            pbar.set_description(f"Average loss: {av_loss:.4f}")
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1
                if global_step % args.save_every == 0 and global_step > 0:
                    checkpoint_prefix = "checkpoint"
                    output_dir = os.path.join('runs', args.run_name,
                                              "{}-{}".format(checkpoint_prefix, global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    logger.info(f"Saving model checkpoint to {output_dir}")
                    model.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    logger.info(f"Saving optimizer and scheduler states to {output_dir}")
                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))

    # save model
    output_dir = os.path.join('runs', args.run_name)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    logger.info(f"Saving model checkpoint to {output_dir}")
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    # Good practice: save your training arguments together with the trained model
    torch.save(args, os.path.join(output_dir, "training_args.bin"))
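# set_seed is invoked at the top of train() but is not defined in this excerpt.
# A minimal sketch of what such a helper usually does is given below; this is an
# assumption about its behavior, not the original implementation.
def set_seed(seed: int):
    """Seed the Python and PyTorch RNGs so training runs are reproducible (sketch)."""
    random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)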
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="",
                        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model", type=str, default="openai-gpt",
                        help="Model type (openai-gpt or gpt2)",
                        choices=['openai-gpt', 'gpt2'])  # anything besides gpt2 will load openai-gpt
    parser.add_argument("--model_checkpoint", type=str, default="",
                        help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2,
                        help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--no_sample", action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=200,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    # type must be float so that a CLI-supplied temperature like 0.5 parses correctly
    parser.add_argument("--temperature", type=float, default=0.7,
                        help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0,
                        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9,
                        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        if args.model == 'gpt2':
            raise ValueError("Interacting with GPT2 requires passing a finetuned model_checkpoint")
        else:
            args.model_checkpoint = download_pretrained_model()

    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class, model_class = (
        (GPT2Tokenizer, GPT2LMHeadModel) if args.model == 'gpt2'
        else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel))
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    logger.info("Sample a personality")
    dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache)
    personalities = [dialog["personality"] for dataset in dataset.values() for dialog in dataset]
    # personality = random.choice(personalities)
    personality = [string_transformer('my name is WabiSabi', tokenizer, False)]
    quotes = [
        'do not be afraid to ask for yourself',
        'to escape fear , you must go through it',
        'I am timeless, incomplete and imperfect. No age. No sense of time.',
        ' failure is another steppingstone to greatness . ',
        'think like a queen . queen is not afraid to fail . failure is another steppingstone to greatness . ',
        'be thankful for what you have ; you will end up having more . if you concentrate on what you do not have, you will never, ever have enough .',
        'surround yourself with only people who are going to lift you higher .',
        'the biggest adventure you can ever take is to live the life of your dreams .',
        'doing the best at this moment puts you in the best place for the next moment .',
        'real integrity is doing the right thing , knowing that nobody is going to know whether you did it or not .',
        'the more you praise and celebrate your life , the more there is in life to celebrate .',
        'passion is energy . feel the power that comes from focusing on what excites you .',
        'lots of people want to ride with you in the limo , but what you want is someone who will take the bus with you when the limo breaks down .',
        'turn your wounds into wisdom . ',
        'you can have it all . just not all at once . ',
        'one of the hardest things in life to learn are which bridges to cross and which bridges to burn . ',
        'challenges are gifts that force us to search for a new center of gravity .',
        'the thing you fear most has no power . your fear of it is what has the power . facing the truth really will set you free .',
        'surround yourself only with people who are going to take you higher .',
        'you get in life what you have the courage to ask for .',
        'i trust that everything happens for a reason , even when we are not wise enough to see it .',
        'everybody has a calling . and your real job in life is to figure out as soon as possible what that is , who you were meant to be , and to begin to honor that in the best way possible for yourself .',
        'the key to realizing a dream is to focus not on success but on significance , and then even the small steps and little victories along your path will take on greater meaning .',
        'the biggest adventure you can ever take is to live the life of your dreams .',
        'self-esteem comes from being able to define the world in your own terms and refusing to abide by the judgments of others .',
        'forgiveness is giving up the hope that the past could have been any different .',
        'luck is a matter of preparation meeting opportunity .',
        'the whole point of being alive is to evolve into the complete person you were intended to be .',
        'wisdom equals knowledge plus courage . you have to not only know what to do and when to do it , but you have to also be brave enough to follow through .',
        'surround yourself with great people .',
        'i alone cannot change the world , but i can cast a stone across the water to create many ripples .',
        'whatever the mind of man can conceive and believe, it can achieve .',
        'whenever you see a successful person, you only see the public glories, never the private sacrifices to reach them .',
        'at some point you are bound to stumble because if you are constantly doing what we do , raising the bar . if you are constantly pushing yourself higher, higher the law of averages not to mention the myth of icarus predicts that you will at some point fall . And when you do i want you to know this , remember this : there is no such thing as failure . failure is just life trying to move us in another direction . now when you are down there in the hole , it looks like failure .',
        'and when you are down in the hole when that moment comes , it is really okay to feel bad for a little while . give yourself time to mourn what you think you may have lost but then here is the key , learn from every mistake because every experience , encounter , and particularly your mistakes are there to teach you and force you into being more who you are . and then figure out what is the next right move .',
        'because when you inevitably stumble and find yourself stuck in a hole that is the story that will get you out : what is your true calling ? what is your dharma ? what is your purpose ?',
        'i know that you all might have a little anxiety now but no matter what challenges or setbacks or disappointments you may encounter along the way , you will find true success and happiness if you have only one goal , there really is only one , and that is this : to fulfill the highest most truthful expression of yourself as a human being . you want to max out your humanity by using your energy to lift yourself up , your family and the people around you .',
        'from time to time you may stumble , fall , you will for sure , you will have questions and you will have doubts about your path . but i know this , if you are willing to be guided by , that still small voice that is the gps within yourself , to find out what makes you come alive , you will be more than okay . you will be happy , you will be successful , and you will make a difference in the world .',
    ]
    # keep a random subset of 24 quotes as the bot's persona
    random.shuffle(quotes)
    quotes = quotes[:24]
    for s in quotes:
        personality.append(string_transformer(s, tokenizer))
    # print(personality)
    logger.info("Selected personality: %s", tokenizer.decode(chain(*personality)))

    history = []
    while True:
        raw_text = input(">>> ")
        while not raw_text:
            print('Prompt should not be empty!')
            raw_text = input(">>> ")
        history.append(string_transformer(raw_text, tokenizer))
        with torch.no_grad():
            out_ids = sample_sequence(personality, history, tokenizer, model, args)
        history.append(out_ids)
        history = history[-(2 * args.max_history + 1):]
        out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
        print(out_text)
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="",
                        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model", type=str, default="gpt2", help="Model type (gpt or gpt2)")
    parser.add_argument("--model_checkpoint", type=str, default="",
                        help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2,
                        help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--no_sample", action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=150,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    # type must be float so that a CLI-supplied temperature parses correctly
    parser.add_argument("--temperature", type=float, default=0.7, help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0,
                        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9,
                        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    parser.add_argument("--task", type=str, default="dialogue",
                        help="one of task from [dialogue, qa, mt, nlg, summarization]")
    parser.add_argument("--self_copy", action='store_true', help="add self copy")
    parser.add_argument("--perturbation_layers", type=int, default=0, help="number of perturbation layers")
    parser.add_argument("--adapter_bottleneck", type=int, default=0, help="adapter layer bottleneck")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        args.model_checkpoint = download_pretrained_model()

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class = GPT2Tokenizer if "gpt2" == args.model else OpenAIGPTTokenizer
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model_class = GPT2LMHeadModel if "gpt2" == args.model else OpenAIGPTLMHeadModel
    model = model_class.from_pretrained(args.model_checkpoint,
                                        perturbation_layers=args.perturbation_layers,
                                        self_copy=args.self_copy,
                                        adapter_bottleneck=args.adapter_bottleneck)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    # dialogue interact
    if args.task == "dialogue":
        output_text = []
        ref_text = []
        loaded_dataset, cache_path = get_dataset(tokenizer, args.dataset_path, args.dataset_cache,
                                                 args.task, return_cachepath=True)
        persona_text = []
        distillated_dataset = loaded_dataset
        for i, pair in enumerate(tqdm(loaded_dataset["train"])):
            persona = pair["personality"].copy()
            for j, utterance in enumerate(pair["utterances"]):
                history = utterance["history"][-(2 * args.max_history + 1):]
                with torch.no_grad():
                    out_ids = sample_sequence(tokenizer, model, args,
                                              personality=persona, history=history)
                distillated_dataset["train"][i]["utterances"][j]["distillated_candidates"] = [out_ids]
        torch.save(distillated_dataset, cache_path)

    # qa interact
    if args.task == "qa":
        output_text = []
        ref_text = []
        loaded_dataset, cache_path = get_dataset(tokenizer, args.dataset_path, args.dataset_cache,
                                                 args.task, return_cachepath=True)
        distillated_dataset = loaded_dataset
        for i, pair in enumerate(tqdm(loaded_dataset["train"])):
            evidence = pair["document"].copy()
            evidence = [evidence[0][:MAXLEN_MAP[args.task]['document']]]
            for j, utterance in enumerate(pair["utterances"]):
                history = utterance["history"][-(2 * args.max_history + 1):]
                with torch.no_grad():
                    out_ids = sample_sequence(tokenizer, model, args,
                                              personality=evidence, history=history)
                distillated_dataset["train"][i]["utterances"][j]["distillated_candidates"] = [out_ids]
        torch.save(distillated_dataset, cache_path)

    # nlg interact
    if args.task == "nlg":
        output_text = []
        ref_text = []
        loaded_dataset, cache_path = get_dataset(tokenizer, args.dataset_path, args.dataset_cache,
                                                 args.task, return_cachepath=True)
        distillated_dataset = loaded_dataset
        for i, pair in enumerate(tqdm(loaded_dataset["train"])):
            source = pair["src"]
            target = pair["tgt"]
            with torch.no_grad():
                out_ids = sample_sequence(tokenizer, model, args, source=source, target=target)
            distillated_dataset["train"][i]["distillated_tgt"] = out_ids
        torch.save(distillated_dataset, cache_path)

    # mt / summarization interact
    if args.task == "mt" or args.task == "summarization":
        output_text = []
        ref_text = []
        loaded_dataset, cache_path = get_dataset(tokenizer, args.dataset_path, args.dataset_cache,
                                                 args.task, return_cachepath=True)
        distillated_dataset = loaded_dataset
        for i, pair in enumerate(tqdm(loaded_dataset["train"])):
            source = pair["src"][:MAXLEN_MAP[args.task]['src']]
            target = pair["tgt"]  # [:MAXLEN_MAP[args.task]['tgt']]
            with torch.no_grad():
                out_ids = sample_sequence(tokenizer, model, args, source=source, target=target)
            distillated_dataset["train"][i]["distillated_tgt"] = out_ids
        torch.save(distillated_dataset, cache_path)
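# MAXLEN_MAP is used above to truncate per-task inputs but is defined elsewhere in the
# original project. It must be a nested dict keyed by task name; the entries and values
# below are purely illustrative placeholders, not the original configuration.
MAXLEN_MAP = {
    "qa": {"document": 400},            # max tokens of the evidence document
    "mt": {"src": 400, "tgt": 100},     # max source/target tokens for translation
    "summarization": {"src": 400, "tgt": 130},
}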
def run(chapter):
    args = easydict.EasyDict({
        "dataset_path": "data/en_book_conversational.json",
        "dataset_cache": './dataset_cache',
        "model": "gpt2",
        "model_checkpoint": "/home/ubuntu/GraduateProject/transfer-learning-conv-ai/runs/Jun04_18-39-17_ime-502_gpt2",
        "max_history": 4,
        "device": "cuda" if torch.cuda.is_available() else "cpu",
        "max_length": 20,
        "min_length": 1,
        "seed": 0,
        "top_p": 0.9,
    })

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        if args.model == 'gpt2':
            raise ValueError("Interacting with GPT2 requires passing a finetuned model_checkpoint")
        else:
            args.model_checkpoint = download_pretrained_model()

    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class, model_class = (
        (GPT2Tokenizer, GPT2LMHeadModel) if args.model == 'gpt2'
        else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel))
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache)
    personalities = [dialog["personality"] for dataset in dataset.values() for dialog in dataset]
    personality = random.choice(personalities)
    print("Selected personality: ", tokenizer.decode(chain(*personality)))
    while get_persona_label(chapter) not in tokenizer.decode(chain(*personality)):
        personality = random.choice(personalities)
    return personality
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="",
                        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model", type=str, default="gpt2", help="Model type (gpt or gpt2)")
    parser.add_argument("--model_checkpoint", "-mc", type=str, default="",
                        help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2,
                        help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--no_sample", action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=100,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--temperature", type=float, default=0.7, help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0,
                        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9,
                        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    # add option to not use personality
    # note: argparse's type=bool treats any non-empty string as True, so this flag effectively stays True
    parser.add_argument("--no_personality", type=bool, default=True,
                        help="Set to not sample a personality.")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        if os.path.isdir("./huggingface_s3/"):
            args.model_checkpoint = "./huggingface_s3/"
            logger.info("Loading from pre-downloaded temp path: {}".format(args.model_checkpoint))
        else:
            args.model_checkpoint = download_pretrained_model()

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class, model_class = (
        (GPT2Tokenizer, GPT2LMHeadModel) if "gpt2" == args.model
        else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel))
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)
    model.eval()

    # added the option to opt out of using a personality
    if args.no_personality:
        logger.info("No personality is sampled for this chatbot.")
        personality = ""
        # personality = ["My name is Isabelle Hawkins.",
        #                "I am five years old.",
        #                "My phone number is 959-100-9300.",
        #                "Here is a link I would like you to check out: google.com.",
        #                "I would like to know more about you."]
        # personality = [tokenizer.encode(p) for p in personality]
        # logger.info("Selected custom personality: %s", tokenizer.decode(chain(*personality)))
    else:
        logger.info("Sample a personality")
        personalities = get_dataset_personalities(tokenizer, args.dataset_path, args.dataset_cache)
        personality = random.choice(personalities)
        # import pdb; pdb.set_trace()
        logger.info("Selected personality: %s", tokenizer.decode(chain(*personality)))

    history = []
    # while True:
    #     custom_history = input("Press 0 to end\n\tAdd history: ")
    #     if custom_history == '0':
    #         break
    #     else:
    #         history.append(tokenizer.encode(custom_history))
    while True:
        history = []
        args.temperature = float(input("Set temperature: > 0 and <= 1"))
        prompt = input("Speaker 1 >>> ")
        while not prompt:
            print('Prompt should not be empty!')
            prompt = input("Speaker 1 >>> ")
        history.append(tokenizer.encode(prompt))
        i = 0
        # let the model continue the conversation with itself, alternating speakers, for 10 turns
        while True:
            with torch.no_grad():
                out_ids = sample_sequence(personality, history, tokenizer, model, args)
            history.append(out_ids)
            history = history[-(2 * args.max_history + 1):]
            out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
            i += 1
            speaker = "Speaker 2" if i % 2 else "Speaker 1"
            print(f"{speaker}: {out_text}")
            if i == 10:
                break
def init(quotes, quotes_num=16):
    global history
    global personality
    global tokenizer
    global model
    global args
    global parser
    global logger

    # new conversation
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="",
                        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model", type=str, default="openai-gpt",
                        help="Model type (openai-gpt or gpt2)",
                        choices=['openai-gpt', 'gpt2'])  # anything besides gpt2 will load openai-gpt
    parser.add_argument("--model_checkpoint", type=str, default="",
                        help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2,
                        help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--no_sample", action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=200,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    # type must be float so that a CLI-supplied temperature parses correctly
    parser.add_argument("--temperature", type=float, default=0.7, help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0,
                        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9,
                        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        if args.model == 'gpt2':
            raise ValueError("Interacting with GPT2 requires passing a finetuned model_checkpoint")
        else:
            args.model_checkpoint = download_pretrained_model()

    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class, model_class = (
        (GPT2Tokenizer, GPT2LMHeadModel) if args.model == 'gpt2'
        else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel))
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    logger.info("Get personality")
    personality = [string_transformer('my name is WabiSabi', tokenizer, False)]
    random.shuffle(quotes)
    # quotes = quotes[:16]
    # quotes = [q for _, q in zip(range(quotes_num), quotes)]
    # join the quotes, cap the persona at 1600 characters, then split back into sentences
    concatenated = " ".join(quotes)[0:1600]
    quotes = concatenated.split('.')
    print(quotes)
    for s in quotes:
        personality.append(string_transformer(s, tokenizer))
    # print(personality)
    logger.info("Selected personality: %s", tokenizer.decode(chain(*personality)))

    history = []
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="",
                        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--use_adapter", default=False, action='store_true', help="Use adapter or not")
    parser.add_argument("--keyword_module", type=str, default="", help="add, attention, ")
    parser.add_argument("--model", type=str, default="openai-gpt",
                        help="Model type (openai-gpt or gpt2)",
                        choices=['openai-gpt', 'gpt2'])  # anything besides gpt2 will load openai-gpt
    parser.add_argument("--model_checkpoint", type=str, default="",
                        help="Path, url or short name of the model")
    parser.add_argument("--device", type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--bert_model_path", default="./", type=str, help="Bert pre-trained model path")
    parser.add_argument("--vocab_file", default="./vocab.korean.rawtext.list", type=str,
                        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument("--no_sample", action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=50,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    # type must be float so that a CLI-supplied temperature parses correctly
    parser.add_argument("--temperature", type=float, default=0.7, help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=50,
                        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9,
                        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        if args.model == 'gpt2':
            raise ValueError("Interacting with GPT2 requires passing a finetuned model_checkpoint")
        else:
            args.model_checkpoint = download_pretrained_model()

    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    # Load KoBERT model and tokenizer
    bert_tokenizer = BertTokenizer.from_pretrained(args.vocab_file, do_lower_case=args.do_lower_case)
    bert_model = BertModel.from_pretrained(args.bert_model_path)
    bert_model.to(args.device)
    bert_model.eval()

    # Load KoGPT2 model and tokenizer
    tok_path = get_tokenizer()
    gpt_model, gpt_vocab = get_pytorch_conkogpt2_model2(use_adapter=args.use_adapter)
    gpt_tokenizer = SentencepieceTokenizer(tok_path)
    gpt_model.to(args.device)
    gpt_model.eval()

    model = Seq2Seq(bert_model, gpt_model, gpt_vocab, args)
    model.load_state_dict(torch.load(args.model_checkpoint), strict=False)
    model.to(args.device)
    model.eval()

    logger.info("Load test data")
    sourceList, targetList = get_test_dataset(bert_tokenizer, gpt_tokenizer, gpt_vocab, args.dataset_path)

    f1 = open((args.model_checkpoint + "_output.txt"), 'w')
    for line in zip(sourceList, targetList):
        out_ids = sample_sequence(line[0], bert_model, bert_tokenizer, gpt_model, gpt_vocab, args)
        out_texts = gpt_vocab.to_tokens(out_ids)
        # strip SentencePiece word boundaries and end-of-sequence markers before writing
        for text in out_texts:
            f1.write(text.replace('▁', ' ').replace('</s>', ' '))
        """
        for id in out_ids:
            f1.write(str(id))
            f1.write(' ')
        """
        f1.write("\n")
    f1.close()