def test_config_to_json_file(self):
    config_first = GPT2Config(vocab_size_or_config_json_file=99, n_embd=37)
    json_file_path = "/tmp/config.json"
    config_first.to_json_file(json_file_path)
    config_second = GPT2Config.from_json_file(json_file_path)
    os.remove(json_file_path)
    self.assertEqual(config_second.to_dict(), config_first.to_dict())
def run_model():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name_or_path', type=str, default='',
                        help='pretrained model name or path to local checkpoint')
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--load_checkpoint", '-c', type=str, default='')
    parser.add_argument("--fp16", type=boolean_string, default=False)
    parser.add_argument("--max_seq_length", type=int, default=128)
    parser.add_argument("--generation_length", type=int, default=20)
    parser.add_argument("--max_history", type=int, default=2)
    parser.add_argument("--temperature", type=float, default=1)
    parser.add_argument("--top_k", type=int, default=0)
    parser.add_argument("--top_p", type=float, default=0.9)
    parser.add_argument('--use_gpu', action='store_true')
    parser.add_argument("--gpu", type=int, default=0)
    args = parser.parse_args()

    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)
    device = torch.device("cuda" if torch.cuda.is_available() and args.use_gpu else "cpu")
    n_gpu = torch.cuda.device_count()
    args.device, args.n_gpu = device, n_gpu

    np.random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    #### load the GPT-2 model
    config = GPT2Config.from_json_file(os.path.join(args.model_name_or_path, 'config.json'))
    enc = GPT2Tokenizer.from_pretrained(args.model_name_or_path)
    model = load_model(GPT2LMHeadModel(config), args.load_checkpoint, args, verbose=True)
    model.to(device)
    model.eval()

    history = []
    while True:
        raw_text = input("USR >>> ")
        while not raw_text:
            print('Prompt should not be empty!')
            raw_text = input("USR >>> ")
        if raw_text.lower() == 'quit':
            print('SYS >>> Goodbye!')
            break
        history.append(raw_text)
        context_tokens = sum([enc.encode(h) + [EOS_ID] for h in history], [])  # + [EOS_ID]
        context_tokens = torch.tensor(context_tokens, device=device,
                                      dtype=torch.long).unsqueeze(0)
        position_ids = torch.arange(0, context_tokens.size(-1), dtype=torch.long,
                                    device=context_tokens.device)
        out = generate_sequence(model, context_tokens, position_ids=position_ids,
                                length=args.generation_length, temperature=args.temperature,
                                top_k=args.top_k, top_p=args.top_p)
        out = out.tolist()
        text = enc.decode(cut_seq_to_eos(out[0])).encode('ascii', 'ignore').decode('ascii')
        print("SYS >>> ", text)
        history.append(text)
        history = history[-(2 * args.max_history + 1):]
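# The interactive scripts above rely on two helpers that are not defined in this
# excerpt: boolean_string (an argparse type for --fp16 etc.) and cut_seq_to_eos
# (truncating a generated token list at EOS_ID). A minimal sketch of what they
# typically look like; treat this as illustrative, not the repo's exact code.
def boolean_string(s):
    # argparse-friendly bool: only the literal strings "True"/"False" are accepted.
    if s not in {'False', 'True'}:
        raise ValueError('Not a valid boolean string')
    return s == 'True'

def cut_seq_to_eos(sentence, remove_id=None):
    # Keep tokens up to (but not including) the first EOS_ID, skipping any ids in remove_id.
    if remove_id is None:
        remove_id = [-1]
    sent = []
    for s in sentence:
        if s in remove_id:
            continue
        if s == EOS_ID:
            break
        sent.append(s)
    return sent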
def prepare_config_and_inputs(self):
    input_ids = GPT2ModelTest.ids_tensor(
        [self.batch_size, self.n_choices, self.seq_length], self.vocab_size)

    position_ids = None
    if self.use_position_ids:
        position_ids = GPT2ModelTest.ids_tensor(
            [self.batch_size, self.n_choices, self.seq_length], self.n_positions)

    token_type_ids = None
    if self.use_token_type_ids:
        total_voc = self.vocab_size
        token_type_ids = GPT2ModelTest.ids_tensor(
            [self.batch_size, self.n_choices, self.seq_length], total_voc)

    mc_labels = None
    lm_labels = None
    mc_token_ids = None
    if self.use_labels:
        mc_labels = GPT2ModelTest.ids_tensor([self.batch_size], self.type_sequence_label_size)
        lm_labels = GPT2ModelTest.ids_tensor(
            [self.batch_size, self.n_choices, self.seq_length], self.num_labels)
        mc_token_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices], self.seq_length)

    config = GPT2Config(
        vocab_size_or_config_json_file=self.vocab_size,
        n_positions=self.n_positions,
        n_embd=self.n_embd,
        n_layer=self.n_layer,
        n_head=self.n_head,
        initializer_range=self.initializer_range)

    return (config, input_ids, token_type_ids, position_ids,
            mc_labels, lm_labels, mc_token_ids)
def get_model(args, device):
    if args.scratch:
        config = GPT2Config(n_ctx=args.context_length, n_positions=args.context_length)
        model = GPT2LMHeadModel(config)
    else:
        model = GPT2LMHeadModel.from_pretrained(args.model_name_or_path)
    # import torchsummary
    # torchsummary.summary(model, (args.context_length, vocab_size), args.train_batch_size)
    return model.to(device)
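# A minimal usage sketch for get_model, assuming an args namespace carrying the
# fields referenced above (scratch, context_length, model_name_or_path); the
# example values here are illustrative only.
example_args = argparse.Namespace(scratch=False, context_length=1024,
                                  model_name_or_path='gpt2')
example_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
example_model = get_model(example_args, example_device)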
def init(self, model_path, model_checkpoint):
    self.config = GPT2Config.from_json_file(os.path.join(model_path, "config.json"))
    self.tokenizer = GPT2Tokenizer.from_pretrained(model_path)
    self.model = GPT2LMHeadModel(self.config)

    model_state_dict = fix_state_dict_namespace(torch.load(model_checkpoint))

    start_model = self.model
    if (hasattr(self.model, "transformer")
            and all(not s.startswith('transformer.') for s in model_state_dict.keys())):
        print('loading transformer only')
        start_model = self.model.transformer
    start_model.load_state_dict(model_state_dict)

    if self.fp16:
        self.model.half()
    self.model.to(self.device)
    self.model.eval()
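# fix_state_dict_namespace is assumed to normalize checkpoint key names (for example,
# stripping a leading "module." left over from DataParallel training) so the state
# dict matches the fresh model. A minimal sketch of that behavior, not necessarily
# the repo's exact implementation:
def fix_state_dict_namespace(model_state_dict):
    old_keys, new_keys = [], []
    for key in model_state_dict.keys():
        new_key = key
        if key.startswith('module.'):
            new_key = key.replace('module.', '', 1)
        if new_key != key:
            old_keys.append(key)
            new_keys.append(new_key)
    for old_key, new_key in zip(old_keys, new_keys):
        model_state_dict[new_key] = model_state_dict.pop(old_key)
    return model_state_dict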
def test_config_to_json_string(self):
    config = GPT2Config(vocab_size_or_config_json_file=99, n_embd=37)
    obj = json.loads(config.to_json_string())
    self.assertEqual(obj["vocab_size"], 99)
    self.assertEqual(obj["n_embd"], 37)
def run_model():
    print(socket.gethostname())

    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name_or_path', type=str, default='',
                        help='pretrained model name or path to local checkpoint')
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--load_checkpoint", '-c', type=str, default='')
    parser.add_argument("--fp16", type=boolean_string, default=False)
    parser.add_argument("--test_file", '-t', type=str, default=None,
                        help='input file for testing')
    parser.add_argument("--output_file", '-o', type=str, default=None,
                        help='output file for testing')
    parser.add_argument("--normalize_data", type=boolean_string, default=True)
    parser.add_argument("--batch_size", '-b', type=int, default=256)
    parser.add_argument("--max_seq_length", type=int, default=512)
    parser.add_argument("--no_token_id", action='store_true')
    parser.add_argument("--no_attn_mask", action='store_true')
    parser.add_argument("--no_eos", action='store_true')
    parser.add_argument("--generation_length", type=int, default=20)
    parser.add_argument("--temperature", type=float, default=1)
    parser.add_argument("--top_k", type=int, default=0)
    parser.add_argument('--unconditional', action='store_true',
                        help='If true, unconditional generation.')
    parser.add_argument('--is_sampling', action='store_true',
                        help='If true, sampling for generation.')
    parser.add_argument('--output_ref', action='store_true', help='If true, output ref')
    # BEAM
    parser.add_argument("--beam", action='store_true', help='If true, beam search')
    parser.add_argument("--beam_width", type=int, default=1)
    parser.add_argument('--use_gpu', action='store_true')
    parser.add_argument("--gpu", type=int, default=0)
    parser.add_argument('--config', help='JSON config file')
    parser.add_argument('--eval', action='store_true')
    parser.add_argument('--cstr_decode', action='store_true')
    parser.add_argument("--bonus", type=float, default=0.0)
    args = parser.parse_args()

    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)

    if args.config is not None:
        # override argparse defaults by config JSON
        opts = json.load(open(args.config))
        for k, v in opts.items():
            if isinstance(v, str):
                # PHILLY ENV special cases
                if 'PHILLY_JOB_DIRECTORY' in v:
                    v = v.replace('PHILLY_JOB_DIRECTORY', os.environ['PHILLY_JOB_DIRECTORY'])
                elif 'PHILLY_LOG_DIRECTORY' in v:
                    v = v.replace('PHILLY_LOG_DIRECTORY', os.environ['PHILLY_LOG_DIRECTORY'])
            setattr(args, k, v)

        # command line should override config JSON
        argv = sys.argv[1:]
        overrides, _ = parser.parse_known_args(argv)
        for k, v in vars(overrides).items():
            if f'--{k}' in argv:
                setattr(args, k, v)
        # setattr(args, 'local_rank', overrides.local_rank)

    # do normal parsing
    device = torch.device("cuda" if torch.cuda.is_available() and args.use_gpu else "cpu")
    n_gpu = torch.cuda.device_count()
    args.device, args.n_gpu = device, n_gpu
    print(args)

    np.random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    config = GPT2Config.from_json_file(os.path.join(args.model_name_or_path, 'config.json'))
    enc = GPT2Tokenizer.from_pretrained(args.model_name_or_path)
    model = load_model(GPT2LMHeadModel(config), args.load_checkpoint, args, verbose=True)
    model.to(device)
    model.eval()

    if args.test_file:
        eval_dataloader = get_eval_list_same_length_with_order(
            args.test_file, enc, args.batch_size, True)
        model.eval()
        outs = []
        targets = []
        loss_all = []
        ppl_all = []
        sources = []
        conv_ids = []
        with torch.no_grad():
            with tqdm.tqdm(total=len(eval_dataloader), desc="Test") as pbar:
                for step, batch in enumerate(tqdm.tqdm(eval_dataloader, desc="Iteration")):
                    new_batch = []
                    for t in batch:
                        if isinstance(t, list):
                            new_batch.append(t)
                        else:
                            new_batch.append(t.to(device))
                    (input_ids, position_ids, token_ids, attn_masks,
                     label_ids, context_len, conv_id) = new_batch
                    if args.no_token_id:
                        token_ids = None
                    if args.no_eos:
                        input_ids = input_ids[:, :-1]
                    if args.no_attn_mask:
                        attn_masks = None
                    if args.beam:
                        out = beam_search_naive(
                            model, input_ids, position_ids=position_ids,
                            token_type_ids=token_ids, attn_masks=attn_masks,
                            length=args.generation_length, beam_width=args.beam_width,
                            device=args.device, use_bonus=args.cstr_decode,
                            bonus=args.bonus, enc=enc)
                    else:
                        out = generate_sequence(
                            model, input_ids, position_ids=position_ids,
                            token_type_ids=token_ids, attn_masks=attn_masks,
                            length=args.generation_length, start_token=None,
                            temperature=args.temperature, top_k=args.top_k,
                            sample=args.is_sampling, use_bonus=args.cstr_decode,
                            bonus=args.bonus, enc=enc)
                    sources.extend(input_ids.cpu().numpy())
                    out = out.tolist()
                    outs.extend(out)
                    targets.extend(label_ids)
                    conv_ids.extend(conv_id.cpu().numpy())

        conv_id_map = {conv_ids[i]: i for i in range(len(conv_ids))}
        val_src = [enc.decode(cut_seq_to_eos(s)).encode('utf-8').decode('utf-8')
                   for s in sources]
        # print(len(val_src), len(targets))
        val_set = [enc.decode(s).encode('utf-8').decode('utf-8') for s in targets]
        gen = [enc.decode(cut_seq_to_eos(s)).encode('utf-8').decode('utf-8')
               for s in outs]

        val_src_orders = [val_src[conv_id_map[i]] for i in sorted(conv_id_map)]
        val_set_orders = [val_set[conv_id_map[i]] for i in sorted(conv_id_map)]
        gen_orders = [gen[conv_id_map[i]] for i in sorted(conv_id_map)]

        print("=" * 40 + " SAMPLE " + "=" * 40)
        src = enc.decode([x for x in input_ids[-1].cpu().numpy()
                          if x != 0]).encode('utf-8').decode('utf-8')
        gt = val_set[-1]
        resp = gen[-1]
        print(f"Source: \t {src} \n Oracle: \t {gt} \n Resp: \t {resp}\n")

        if args.output_file:
            with open(args.output_file + '.resp.txt', "w") as resp_f:
                for i, r in enumerate(gen_orders):
                    r = re.sub("\n", "", r)
                    if args.output_ref:
                        # import pdb; pdb.set_trace()
                        resp_f.write(val_src_orders[i] + '\t' + val_set_orders[i]
                                     + '\t' + r + '\n')
                    else:
                        resp_f.write(r + '\n')
        print("=" * 80)
        sys.stdout.flush()
    else:
        generated = 0
        while True:
            raw_text = input("Model prompt >>> ")
            while not raw_text:
                print('Prompt should not be empty!')
                raw_text = input("Model prompt >>> ")
            context_tokens = enc.encode(raw_text) + [EOS_ID]
            context_tokens = torch.tensor(context_tokens, device=device,
                                          dtype=torch.long).unsqueeze(0)  # .repeat(batch_size, 1)
            generated += 1
            position_ids = torch.arange(0, context_tokens.size(-1),
                                        dtype=torch.long, device=context_tokens.device)
            token_ids = None if args.no_token_id else torch.zeros_like(
                context_tokens, dtype=torch.long, device=context_tokens.device)
            if args.beam:
                out = beam_search_naive(model, context_tokens, position_ids=None,
                                        token_type_ids=token_ids,
                                        length=args.generation_length,
                                        beam_width=args.beam_width, device=args.device)
            else:
                out = generate_sequence(model, context_tokens, position_ids=None,
                                        token_type_ids=token_ids,
                                        length=args.generation_length, start_token=None,
                                        temperature=args.temperature, top_k=args.top_k,
                                        sample=args.is_sampling)
            out = out.tolist()
            text = enc.decode(cut_seq_to_eos(out[0])).encode('utf-8').decode('utf-8')
            print("=" * 40 + " RESPONSE " + str(generated) + " " + "=" * 40)
            print(text)
            print("=" * 80)
    for a in args_dict:
        logger.info('%-28s %s' % (a, args_dict[a]))

    if args.fp16:
        config = join(abspath(PROJECT_FOLDER),
                      'config_file/SeqLen_vs_BatchSize_1GPU_fp16.csv')
    else:
        config = join(abspath(PROJECT_FOLDER),
                      'config_file/SeqLen_vs_BatchSize_1GPU_fp32.csv')
    seq_len_mapper = get_len_mapper(config)

    #########################################################################
    # Prepare Data Set
    #########################################################################
    enc = GPT2Tokenizer.from_pretrained(args.model_name_or_path)
    config = GPT2Config.from_json_file(join(args.model_name_or_path, 'config.json'))

    if args.local_rank == -1:
        train_dataloader = BucketingDataLoader(args.train_input_file,
                                               args.train_batch_size,
                                               args.max_seq_length)
    else:
        train_dataloader = DistributedBucketingDataLoader(
            get_rank(), get_world_size(),
            args.train_input_file, args.train_batch_size, args.max_seq_length)

    eval_dataloader_loss = DynamicBatchingLoader(
        args.eval_input_file, enc, args.normalize_data,
        args.eval_batch_size, args.max_seq_length, is_train=True)
def run_model():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name_or_path', type=str, default='',
                        help='pretrained model name or path to local checkpoint')
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--load_checkpoint", '-c', type=str, default='')
    parser.add_argument("--fp16", type=boolean_string, default=False)
    parser.add_argument("--max_seq_length", type=int, default=128)
    parser.add_argument("--generation_length", type=int, default=20)
    parser.add_argument("--max_history", type=int, default=2)
    parser.add_argument("--temperature", type=float, default=1)
    parser.add_argument("--top_k", type=int, default=0)
    parser.add_argument("--top_p", type=float, default=0.9)
    parser.add_argument('--use_gpu', action='store_true')
    parser.add_argument("--gpu", type=int, default=0)
    args = parser.parse_args()

    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)
    device = torch.device("cuda" if torch.cuda.is_available() and args.use_gpu else "cpu")
    n_gpu = torch.cuda.device_count()
    args.device, args.n_gpu = device, n_gpu

    np.random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    #### load the GPT-2 model
    config = GPT2Config.from_json_file(os.path.join(args.model_name_or_path, 'config.json'))
    enc = GPT2Tokenizer.from_pretrained(args.model_name_or_path)
    model = load_model(GPT2LMHeadModel(config), args.load_checkpoint, args, verbose=True)
    model.to(device)
    model.eval()

    # Run the IRC bot in a daemon thread; the loop below answers incoming questions.
    bot = DialogptIrcBot(CHANNEL, NICKNAME, REALNAME, SERVER, PORT)
    thread_dialog = threading.Thread(target=bot.start)
    thread_dialog.setDaemon(True)
    thread_dialog.start()

    history = []
    sleep(1)
    while bot.alive:
        a = 0
        num = bot.num
        if bot.quest_rep:
            if len(bot.quest_rep) == num + 1:
                if len(bot.quest_rep[num]) == 1:
                    a = 1
                    question = bot.quest_rep[num][0]
        if a == 1:
            try:
                history.append(question)
                context_tokens = sum([enc.encode(h) + [EOS_ID] for h in history], [])
                context_tokens = torch.tensor(context_tokens, device=device,
                                              dtype=torch.long).unsqueeze(0)
                position_ids = torch.arange(0, context_tokens.size(-1),
                                            dtype=torch.long, device=context_tokens.device)
                out = generate_sequence(model, context_tokens, position_ids=position_ids,
                                        length=args.generation_length,
                                        temperature=args.temperature,
                                        top_k=args.top_k, top_p=args.top_p)
                out = out.tolist()
                text = enc.decode(cut_seq_to_eos(out[0])).encode('ascii', 'ignore').decode('ascii')
                history.append(text)
                history = history[-(2 * args.max_history + 1):]
            except Exception:
                text = "Je ne comprends pas la question!"

            # Send the reply
            print("\nQuestion n°:", num)
            print("Question:", bot.quest_rep[num])
            print("Response:", text)
            bot.quest_rep[num].append(text)
def __init__(self, model_name, epochs=1, batch_size=64, base_batch_size=32, part=1.,
             half=1, last=True, seed=1234, debug_mode=False):
    self.device = torch.device('cuda')
    self.input_dir = "../input"
    self.work_dir = "../working/"
    self.debug_mode = debug_mode
    self.model_name = model_name
    self.half = half
    self.last = last
    self.seed = seed
    self.identity_list = ['male', 'female', 'homosexual_gay_or_lesbian', 'christian',
                          'jewish', 'muslim', 'black', 'white',
                          'psychiatric_or_mental_illness']
    self.toxicity_type_list = ['severe_toxicity', 'obscene', 'identity_attack',
                               'insult', 'threat']
    if part == 1.:
        self.weight_dict = {
            "severe_toxicity": 1000, "obscene": 235, "identity_attack": 236,
            "insult": 22, "threat": 646,
            "male": 45, "female": 35, "homosexual_gay_or_lesbian": 176,
            "christian": 50, "jewish": 249, "muslim": 91, "black": 130, "white": 75,
            "psychiatric_or_mental_illness": 442,
            "pp": 101, "np": 13, "pn": 20, "nn": 1,
            "pp_male": 431, "np_male": 50, "pn_male": 17, "nn_male": 1,
            "pp_female": 384, "np_female": 39, "pn_female": 17, "nn_female": 1,
            "pp_homosexual_gay_or_lesbian": 900, "np_homosexual_gay_or_lesbian": 219,
            "pn_homosexual_gay_or_lesbian": 17, "nn_homosexual_gay_or_lesbian": 1,
            "pp_christian": 859, "np_christian": 54, "pn_christian": 17, "nn_christian": 1,
            "pp_jewish": 2365, "np_jewish": 278, "pn_jewish": 17, "nn_jewish": 1,
            "pp_muslim": 606, "np_muslim": 108, "pn_muslim": 17, "nn_muslim": 1,
            "pp_black": 586, "np_black": 167, "pn_black": 17, "nn_black": 1,
            "pp_white": 387, "np_white": 94, "pn_white": 17, "nn_white": 1,
            "pp_psychiatric_or_mental_illness": 2874,
            "np_psychiatric_or_mental_illness": 523,
            "pn_psychiatric_or_mental_illness": 17,
            "nn_psychiatric_or_mental_illness": 1
        }
    else:
        self.weight_dict = {
            "severe_toxicity": 1000, "obscene": 196, "identity_attack": 278,
            "insult": 22, "threat": 609,
            "male": 45, "female": 33, "homosexual_gay_or_lesbian": 198,
            "christian": 48, "jewish": 243, "muslim": 133, "black": 131, "white": 90,
            "psychiatric_or_mental_illness": 369,
            "pp": 107, "np": 13, "pn": 19, "nn": 1,
            "pp_male": 434, "np_male": 51, "pn_male": 17, "nn_male": 1,
            "pp_female": 324, "np_female": 37, "pn_female": 17, "nn_female": 1,
            "pp_homosexual_gay_or_lesbian": 1055, "np_homosexual_gay_or_lesbian": 244,
            "pn_homosexual_gay_or_lesbian": 17, "nn_homosexual_gay_or_lesbian": 1,
            "pp_christian": 986, "np_christian": 50, "pn_christian": 17, "nn_christian": 1,
            "pp_jewish": 2680, "np_jewish": 268, "pn_jewish": 16, "nn_jewish": 1,
            "pp_muslim": 772, "np_muslim": 161, "pn_muslim": 17, "nn_muslim": 1,
            "pp_black": 633, "np_black": 165, "pn_black": 17, "nn_black": 1,
            "pp_white": 465, "np_white": 111, "pn_white": 17, "nn_white": 1,
            "pp_psychiatric_or_mental_illness": 2748,
            "np_psychiatric_or_mental_illness": 427,
            "pn_psychiatric_or_mental_illness": 16,
            "nn_psychiatric_or_mental_illness": 1
        }
    self.stopwords = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n“”’\'∞θ÷α•à−β∅³π‘₹´°£€\×™√²—'
    self.seed_everything()
    self.max_len = 220
    self.epochs = epochs
    self.base_batch_size = base_batch_size
    self.batch_size = batch_size
    self.split_ratio = 0.95
    self.sample_num = 1804874
    if not self.debug_mode:
        self.train_df = pd.read_csv(
            os.path.join("../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv")
        ).sample(int(self.sample_num * part), random_state=1234).fillna(0.)
        self.test_df = pd.read_csv(
            os.path.join("../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv"))
    else:
        self.train_df = pd.read_csv(
            os.path.join("../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv")
        ).head(1000).fillna(0.)
        self.test_df = pd.read_csv(
            os.path.join("../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv")
        ).head(1000)
    self.train_len = int(len(self.train_df) * self.split_ratio)
    self.evaluator = self.init_evaluator()
    self.gpt2_config = GPT2Config("../input/gpt2-models/config.json")
    self.gpt2_model_path = '../input/gpt2-models/'
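# seed_everything and init_evaluator are defined elsewhere in this class. A typical
# seed_everything used in kernels like this one looks roughly like the sketch below
# (an assumption, not the original implementation):
def seed_everything(self):
    random.seed(self.seed)
    os.environ['PYTHONHASHSEED'] = str(self.seed)
    np.random.seed(self.seed)
    torch.manual_seed(self.seed)
    torch.cuda.manual_seed(self.seed)
    torch.backends.cudnn.deterministic = True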
def question_generation(_input):
    metadata, output = _input
    args = DotMap()

    """
    parser = ArgumentParser()
    parser.add_argument("--model_type", type=str, default="gpt", help="gpt or gpt2")
    parser.add_argument("--model_checkpoint", type=str, default="",
                        help="Path, url or short name of the model")
    parser.add_argument("--device", type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--filename", type=str, default="data/instances_dev.pkl",
                        help="File to use for decoding")
    parser.add_argument("--no_sample", action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=50,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--temperature", type=int, default=0.7,
                        help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0,
                        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9,
                        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    # While using SQUASH in the pipeline mode, prefer using the --key flag
    parser.add_argument("--key", type=str, default=None,
                        help="Override the default settings if the key is set, used in pipeline mode")
    args = parser.parse_args()
    """

    """
    if args.key is not None:
        # Override the filename and top_p defaults when args.key is set.
        # This is done when the question generation module is used in SQUASH pipeline mode.
        args.filename = "squash/temp/%s/input.pkl" % args.key
        with open("squash/temp/%s/metadata.json" % args.key, "r") as f:
            metadata = json.loads(f.read())
        args.top_p = metadata["settings"]["top_p"]
    """

    args.top_p = metadata["settings"]["top_p"]

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    args.seed = 42
    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")

    # NEW BLOCK
    model_type = "gpt2"
    # model_checkpoint = "https://storage.cloud.google.com/ds-playground/squash/gpt2_qa.tar.gz"
    # model_checkpoint = "/home/gpt2_corefs_question_generation"
    SAVED_MODEL_DIR = "gpt2_corefs_question_generation"
    dir_path = os.path.dirname(os.path.realpath(__file__))
    model_checkpoint = os.path.join(dir_path, SAVED_MODEL_DIR)
    model_checkpoint = "question_generation/gpt2_corefs_question_generation"
    tokenizer = GPT2Tokenizer.from_pretrained(model_checkpoint)
    model = GPT2LMHeadModel.from_pretrained(model_checkpoint)

    """ OLD BLOCK
    if args.model_type == 'gpt2':
        tokenizer = GPT2Tokenizer.from_pretrained(args.model_checkpoint)
        model = GPT2LMHeadModel.from_pretrained(args.model_checkpoint)
    else:
        tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_checkpoint)
        model = OpenAIGPTLMHeadModel.from_pretrained(args.model_checkpoint)
    """

    # Earlier hard-coded Colab paths, superseded by the SparkFiles lookups below:
    # output_config_file = "/content/squash-generation/question_generation/gpt2_corefs_question_generation/config.json"
    # output_model_file = "/content/squash-generation/question_generation/gpt2_corefs_question_generation/pytorch_model.bin"
    # output_vocab_file = "/content/squash-generation/question_generation/gpt2_corefs_question_generation/vocab.json"
    # merges_file = "/content/squash-generation/question_generation/gpt2_corefs_question_generation/merges.txt"
    output_config_file = SparkFiles.get("config.json")
    output_model_file = SparkFiles.get("pytorch_model.bin")
    output_vocab_file = SparkFiles.get("vocab.json")
    merges_file = SparkFiles.get("merges.txt")

    config = GPT2Config.from_json_file(output_config_file)
    model = GPT2LMHeadModel(config)
    state_dict = torch.load(output_model_file, map_location=torch.device('cpu'))
    model.load_state_dict(state_dict)
    tokenizer = GPT2Tokenizer(output_vocab_file, merges_file=merges_file)

    args.device = "cpu"
    model.to(args.device)
    model.eval()

    return {"break": "point"}  # debug early return: everything below is currently unreachable

    # data = get_positional_dataset_from_file(tokenizer, args.filename)
    data = get_positional_dataset_from_file(tokenizer, output)
    final_output_dict = {"version": "squash-2.0", "data": [{"paragraphs": []}]}
    question_number = 0

    para_cache = {"index": None, "hidden_states": None}

    for inst in tqdm.tqdm(data):
        with torch.no_grad():
            para_index = inst["para_index"]
            # Questions from the same paragraph all appear together.
            # We can re-use the paragraph hidden representations for different
            # questions in the same paragraph.
            if para_index != para_cache["index"]:
                # Since we have moved to a new paragraph, generate its cache
                para_cache["hidden_states"] = None
                # Ignore the answer and question while building the input
                instance, _ = build_para_only_input_from_segments(inst, tokenizer)
                input_ids = torch.tensor(instance['input_ids'],
                                         device=args.device).unsqueeze(0)
                token_type_ids = torch.tensor(instance['token_type_ids'],
                                              device=args.device).unsqueeze(0)
                # Run a forward pass to generate the para caches
                _, para_cache["hidden_states"] = model(input_ids,
                                                       token_type_ids=token_type_ids)

            # Sample a question using the paragraph cache
            output = sample_sequence(inst, tokenizer, model, args, para_cache)

        original_paragraph = tokenizer.decode(output['paragraph'])
        generated_question = tokenizer.decode(output['question'], skip_special_tokens=True)
        original_answer = tokenizer.decode(output['answer'], skip_special_tokens=True)
        para_index = inst['para_index']
        para_cache["index"] = inst['para_index']

        # Verify whether the answer position is correct, since this will be utilized for filtering
        original_ans_position = output["answer_position"]
        if original_paragraph[output["answer_position"]:output["answer_position"]
                              + len(original_answer)] != original_answer:
            # This should never be executed, only used as a last resort
            logger.info("Answer mismatch!")
            original_ans_position = original_paragraph.index(original_answer)

        # Output in a SQUAD-like format with questions clumped together under their parent paragraph
        if len(final_output_dict["data"][0]["paragraphs"]) > para_index:
            # Verify whether the paragraph text is identical
            assert original_paragraph == \
                final_output_dict["data"][0]["paragraphs"][para_index]['context']
            # Append the question-answer pair
            final_output_dict["data"][0]["paragraphs"][para_index]['qas'].append({
                'id': 'question_%d' % question_number,
                'question': generated_question,
                'answers': [{
                    'text': original_answer,
                    'answer_start': original_ans_position,
                }],
                'class': output['class'],
                'algorithm': output['algorithm'],
                'is_impossible': False
            })
        else:
            # Add a new question to the list of QA pairs
            final_output_dict['data'][0]['paragraphs'].append({
                'context': original_paragraph,
                'qas': [{
                    'id': 'question_%d' % question_number,
                    'question': generated_question,
                    'answers': [{
                        'text': original_answer,
                        'answer_start': original_ans_position,
                    }],
                    'class': output['class'],
                    'algorithm': output['algorithm'],
                    'is_impossible': False
                }]
            })
        question_number += 1

    # with open("squash/temp/%s/generated_questions.json" % args.key, "w") as f:
    #     f.write(json.dumps(final_output_dict))

    return final_output_dict
gpu = 0
use_gpu = True
os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu)  # environment variables must be strings
device = torch.device("cuda" if torch.cuda.is_available() and use_gpu else "cpu")
n_gpu = torch.cuda.device_count()
fp16 = False

np.random.seed(42)
torch.random.manual_seed(42)
torch.cuda.manual_seed(42)

#### load the GPT-2 model
config = GPT2Config.from_json_file(os.path.join(model_name_or_path, 'config.json'))
enc = GPT2Tokenizer.from_pretrained(model_name_or_path)
model = load_model(GPT2LMHeadModel(config), load_checkpoint, n_gpu, device, fp16, verbose=True)
model.to(device)
model.eval()

history = []
while True:
    raw_text = input("USR >>> ")
    while not raw_text:
        print('Prompt should not be empty!')
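# load_model is used throughout these snippets but not defined in this excerpt. It is
# assumed to load an (optionally fp16, optionally multi-GPU) checkpoint into a fresh
# GPT2LMHeadModel. A rough sketch of that behavior, using the signature from the call
# directly above; the details here are illustrative, not the repo's exact code.
def load_model(model, checkpoint, n_gpu, device, fp16, verbose=False):
    if checkpoint:
        if verbose:
            print('loading finetuned model from %s' % checkpoint)
        state_dict = torch.load(checkpoint, map_location='cpu')
        # Strip a possible "module." prefix left by DataParallel before loading.
        state_dict = {(k.replace('module.', '', 1) if k.startswith('module.') else k): v
                      for k, v in state_dict.items()}
        model.load_state_dict(state_dict, strict=False)
    if fp16:
        model.half()
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    return model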
DATA_DIR = "../input/jigsaw-unintended-bias-in-toxicity-classification"
LOGGER_PATH = os.path.join(SAVE_DIR, "log.txt")
FOLD_PATH = "../input/toxic-folds/fold01.csv"
TRAIN_PATH = os.path.join(DATA_DIR, "train.csv")
TEST_PATH = os.path.join(DATA_DIR, "test.csv")
SUB_PATH = os.path.join(DATA_DIR, "sample_submission.csv")

identity_columns = ['male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
                    'muslim', 'black', 'white', 'psychiatric_or_mental_illness']
AUX_COLUMNS = ['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']

GPT2_MODEL_PATH = '../input/gpt2-models/'
# In the old pytorch_pretrained_bert API, passing a JSON path to GPT2Config loads it from file.
bert_config = GPT2Config(os.path.join(GPT2_MODEL_PATH, 'config.json'))

# ===============
# Settings
# ===============
fold_id = 0
seed = 0
device = "cuda:0"
epochs = 1
n_labels = len(AUX_COLUMNS) + 1
max_len = 220
head_len = 80
batch_size = 16
base_lr = 2e-5
gammas = [0.75, 0.5, 0.25]
accumulation_steps = 2