def test_config_to_json_file(self):
    config_first = GPT2Config(vocab_size_or_config_json_file=99, n_embd=37)
    json_file_path = "/tmp/config.json"
    config_first.to_json_file(json_file_path)
    config_second = GPT2Config.from_json_file(json_file_path)
    os.remove(json_file_path)
    self.assertEqual(config_second.to_dict(), config_first.to_dict())
Example #2
def run_model():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name_or_path', type=str, default='', help='pretrained model name or path to local checkpoint')
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--load_checkpoint", '-c', type=str, default='')
    parser.add_argument("--fp16", type=boolean_string, default=False)
    parser.add_argument("--max_seq_length", type=int, default=128)
    
    parser.add_argument("--generation_length", type=int, default=20)
    parser.add_argument("--max_history", type=int, default=2)

    parser.add_argument("--temperature", type=float, default=1)
    parser.add_argument("--top_k", type=int, default=0)
    parser.add_argument("--top_p", type=float, default=0.9)

    parser.add_argument('--use_gpu', action='store_true')
    parser.add_argument("--gpu", type=int, default=0)

    args = parser.parse_args()
    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)


    device = torch.device("cuda" if torch.cuda.is_available() and args.use_gpu else "cpu")
    n_gpu = torch.cuda.device_count()
    args.device, args.n_gpu = device, n_gpu

    np.random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    #### load the GPT-2 model 
    config = GPT2Config.from_json_file(os.path.join(args.model_name_or_path, 'config.json'))
    enc = GPT2Tokenizer.from_pretrained(args.model_name_or_path)
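    # build GPT-2 from the local config and load fine-tuned weights from --load_checkpoint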
    model = load_model(GPT2LMHeadModel(config), args.load_checkpoint, args, verbose=True)
    model.to(device)
    model.eval()

    history = []
    while True:
        raw_text = input("USR >>> ")
        while not raw_text:
            print('Prompt should not be empty!')
            raw_text = input("USR >>> ")
        if raw_text.lower() == 'quit':
            print('SYS >>> Goodbye!')
            break
        history.append(raw_text)
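        # flatten the whole dialogue history into one token sequence, separating turns with EOS_ID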
        context_tokens = sum([enc.encode(h) + [EOS_ID] for h in history],[]) #+ [EOS_ID]
        context_tokens = torch.tensor(context_tokens, device=device, dtype=torch.long).unsqueeze(0)
        position_ids = torch.arange(0, context_tokens.size(-1), dtype=torch.long, device=context_tokens.device)

        out = generate_sequence(model, context_tokens, position_ids=position_ids,
                                length=args.generation_length, temperature=args.temperature,
                                top_k=args.top_k, top_p=args.top_p)

        out = out.tolist()                        
        text = enc.decode(cut_seq_to_eos(out[0])).encode('ascii','ignore').decode('ascii')
        print("SYS >>> ", text)
        history.append(text)
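        # keep only the last 2*max_history+1 utterances of the conversation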
        history = history[-(2*args.max_history+1):]
Example #3
        def prepare_config_and_inputs(self):
            input_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.vocab_size)

            position_ids = None
            if self.use_position_ids:
                position_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.n_positions)

            token_type_ids = None
            if self.use_token_type_ids:
                total_voc = self.vocab_size
                token_type_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_voc)

            mc_labels = None
            lm_labels = None
            mc_token_ids = None
            if self.use_labels:
                mc_labels = GPT2ModelTest.ids_tensor([self.batch_size], self.type_sequence_label_size)
                lm_labels = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.num_labels)
                mc_token_ids = GPT2ModelTest.ids_tensor([self.batch_size, self.n_choices], self.seq_length)

            config = GPT2Config(
                vocab_size_or_config_json_file=self.vocab_size,
                n_positions=self.n_positions,
                n_embd=self.n_embd,
                n_layer=self.n_layer,
                n_head=self.n_head,
                initializer_range=self.initializer_range)

            return (config, input_ids, token_type_ids, position_ids,
                    mc_labels, lm_labels, mc_token_ids)
Example #4
def get_model(args, device):
    if args.scratch:
        config = GPT2Config(n_ctx=args.context_length,
                            n_positions=args.context_length)
        model = GPT2LMHeadModel(config)
    else:
        model = GPT2LMHeadModel.from_pretrained(args.model_name_or_path)
    #import torchsummary
    #torchsummary.summary(model, (args.context_length, vocab_size), args.train_batch_size)
    return model.to(device)
Example #5
    def init(self, model_path, model_checkpoint):
        self.config = GPT2Config.from_json_file(os.path.join(model_path, "config.json"))
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_path)
        self.model = GPT2LMHeadModel(self.config)

        model_state_dict = fix_state_dict_namespace(torch.load(model_checkpoint))

        start_model = self.model
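        # if the checkpoint holds only transformer weights (keys without a 'transformer.' prefix), load it into the transformer sub-module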
        if hasattr(self.model, "transformer") and all(not s.startswith('transformer.') for s in model_state_dict.keys()):
            print('loading transformer only')
            start_model = self.model.transformer
        start_model.load_state_dict(model_state_dict)

        if self.fp16:
            self.model.half()

        self.model.to(self.device)
        self.model.eval()
Example #6
def test_config_to_json_string(self):
    config = GPT2Config(vocab_size_or_config_json_file=99, n_embd=37)
    obj = json.loads(config.to_json_string())
    self.assertEqual(obj["vocab_size"], 99)
    self.assertEqual(obj["n_embd"], 37)
Example #7
def run_model():
    print(socket.gethostname())

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--model_name_or_path',
        type=str,
        default='',
        help='pretrained model name or path to local checkpoint')
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--load_checkpoint", '-c', type=str, default='')
    parser.add_argument("--fp16", type=boolean_string, default=False)
    parser.add_argument("--test_file",
                        '-t',
                        type=str,
                        default=None,
                        help='input file for testing')
    parser.add_argument("--output_file",
                        '-o',
                        type=str,
                        default=None,
                        help='output file for testing')
    parser.add_argument("--normalize_data", type=boolean_string, default=True)
    parser.add_argument("--batch_size", '-b', type=int, default=256)
    parser.add_argument("--max_seq_length", type=int, default=512)
    parser.add_argument("--no_token_id", action='store_true')
    parser.add_argument("--no_attn_mask", action='store_true')
    parser.add_argument("--no_eos", action='store_true')

    parser.add_argument("--generation_length", type=int, default=20)
    parser.add_argument("--temperature", type=float, default=1)
    parser.add_argument("--top_k", type=int, default=0)
    parser.add_argument('--unconditional',
                        action='store_true',
                        help='If true, unconditional generation.')
    parser.add_argument('--is_sampling',
                        action='store_true',
                        help='If true, sampling for generation.')
    parser.add_argument('--output_ref',
                        action='store_true',
                        help='If true, output ref')

    #BEAM
    parser.add_argument("--beam",
                        action='store_true',
                        help='If true, beam search')
    parser.add_argument("--beam_width", type=int, default=1)

    parser.add_argument('--use_gpu', action='store_true')
    parser.add_argument("--gpu", type=int, default=0)
    parser.add_argument('--config', help='JSON config file')
    parser.add_argument('--eval', action='store_true')
    parser.add_argument('--cstr_decode', action='store_true')
    parser.add_argument("--bonus", type=float, default=0.0)

    args = parser.parse_args()
    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)

    if args.config is not None:
        # override argparse defaults by config JSON
        opts = json.load(open(args.config))
        for k, v in opts.items():
            if isinstance(v, str):
                # PHILLY ENV special cases
                if 'PHILLY_JOB_DIRECTORY' in v:
                    v = v.replace('PHILLY_JOB_DIRECTORY',
                                  os.environ['PHILLY_JOB_DIRECTORY'])
                elif 'PHILLY_LOG_DIRECTORY' in v:
                    v = v.replace('PHILLY_LOG_DIRECTORY',
                                  os.environ['PHILLY_LOG_DIRECTORY'])
            setattr(args, k, v)

        # command line should override config JSON
        argv = sys.argv[1:]
        overrides, _ = parser.parse_known_args(argv)
        for k, v in vars(overrides).items():
            if f'--{k}' in argv:
                setattr(args, k, v)
        # setattr(args, 'local_rank', overrides.local_rank)


    # do normal parsing

    device = torch.device(
        "cuda" if torch.cuda.is_available() and args.use_gpu else "cpu")
    n_gpu = torch.cuda.device_count()
    args.device, args.n_gpu = device, n_gpu
    print(args)

    np.random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    config = GPT2Config.from_json_file(
        os.path.join(args.model_name_or_path, 'config.json'))
    enc = GPT2Tokenizer.from_pretrained(args.model_name_or_path)
    model = load_model(GPT2LMHeadModel(config),
                       args.load_checkpoint,
                       args,
                       verbose=True)
    model.to(device)
    model.eval()

    if args.test_file:
        eval_dataloader = get_eval_list_same_length_with_order(
            args.test_file, enc, args.batch_size, True)

        model.eval()
        outs = []
        targets = []
        loss_all = []
        ppl_all = []
        sources = []
        conv_ids = []
        with torch.no_grad():
            with tqdm.tqdm(total=len(eval_dataloader), desc=f"Test") as pbar:
                for step, batch in enumerate(
                        tqdm.tqdm(eval_dataloader, desc="Iteration")):

                    new_batch = []
                    for t in batch:
                        if isinstance(t, list):
                            new_batch.append(t)
                        else:
                            new_batch.append(t.to(device))

                    input_ids, position_ids, token_ids, attn_masks, label_ids, context_len, conv_id = new_batch

                    if args.no_token_id:
                        token_ids = None
                    if args.no_eos:
                        input_ids = input_ids[:, :-1]
                    if args.no_attn_mask:
                        attn_masks = None
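                    # decode with naive beam search, or with generate_sequence (greedy or sampling)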
                    if args.beam:
                        out = beam_search_naive(model,
                                                input_ids,
                                                position_ids=position_ids,
                                                token_type_ids=token_ids,
                                                attn_masks=attn_masks,
                                                length=args.generation_length,
                                                beam_width=args.beam_width,
                                                device=args.device,
                                                use_bonus=args.cstr_decode,
                                                bonus=args.bonus,
                                                enc=enc)
                    else:
                        out = generate_sequence(model,
                                                input_ids,
                                                position_ids=position_ids,
                                                token_type_ids=token_ids,
                                                attn_masks=attn_masks,
                                                length=args.generation_length,
                                                start_token=None,
                                                temperature=args.temperature,
                                                top_k=args.top_k,
                                                sample=args.is_sampling,
                                                use_bonus=args.cstr_decode,
                                                bonus=args.bonus,
                                                enc=enc)

                    sources.extend(input_ids.cpu().numpy())
                    out = out.tolist()
                    outs.extend(out)
                    targets.extend(label_ids)
                    conv_ids.extend(conv_id.cpu().numpy())

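                # remember each conversation id's batch position so outputs can be restored to conversation order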
                conv_id_map = {conv_ids[i]: i for i in range(len(conv_ids))}
                val_src = [
                    enc.decode(
                        cut_seq_to_eos(s)).encode('utf-8').decode('utf-8')
                    for s in sources
                ]
                #print(len(val_src),len(targets))

                val_set = [
                    enc.decode(s).encode('utf-8').decode('utf-8')
                    for s in targets
                ]
                gen = [
                    enc.decode(
                        cut_seq_to_eos(s)).encode('utf-8').decode('utf-8')
                    for s in outs
                ]

                val_src_orders = [
                    val_src[conv_id_map[i]] for i in sorted(conv_id_map)
                ]
                val_set_orders = [
                    val_set[conv_id_map[i]] for i in sorted(conv_id_map)
                ]
                gen_orders = [gen[conv_id_map[i]] for i in sorted(conv_id_map)]

                print("=" * 40 + " SAMPLE " + "=" * 40)
                src = enc.decode([
                    x for x in input_ids[-1].cpu().numpy() if x != 0
                ]).encode('utf-8').decode('utf-8')
                gt = val_set[-1]
                resp = gen[-1]
                print(
                    f"Source: \t {src} \n Oracle: \t {gt} \n Resp: \t {resp}\n"
                )
                if args.output_file:
                    with open(args.output_file + '.resp.txt', "w") as resp_f:
                        for i, r in enumerate(gen_orders):
                            r = re.sub("\n", "", r)
                            if args.output_ref:
                                # import pdb; pdb.set_trace()
                                resp_f.write(val_src_orders[i] + '\t' +
                                             val_set_orders[i] + '\t' + r +
                                             '\n')
                            else:
                                resp_f.write(r + '\n')
                print("=" * 80)

                sys.stdout.flush()

    else:
        generated = 0
        while True:
            raw_text = input("Model prompt >>> ")
            while not raw_text:
                print('Prompt should not be empty!')
                raw_text = input("Model prompt >>> ")
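            # encode the prompt and terminate it with the EOS token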
            context_tokens = enc.encode(raw_text) + [EOS_ID]
            context_tokens = torch.tensor(context_tokens,
                                          device=device,
                                          dtype=torch.long).unsqueeze(
                                              0)  #.repeat(batch_size, 1)
            generated += 1
            position_ids = torch.arange(0,
                                        context_tokens.size(-1),
                                        dtype=torch.long,
                                        device=context_tokens.device)
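            # token type ids default to all zeros unless --no_token_id is given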
            token_ids = None if args.no_token_id else torch.zeros_like(
                context_tokens, dtype=torch.long, device=context_tokens.device)
            if args.beam:
                out = beam_search_naive(model,
                                        context_tokens,
                                        position_ids=None,
                                        token_type_ids=token_ids,
                                        length=args.generation_length,
                                        beam_width=args.beam_width,
                                        device=args.device)
            else:
                out = generate_sequence(model,
                                        context_tokens,
                                        position_ids=None,
                                        token_type_ids=token_ids,
                                        length=args.generation_length,
                                        start_token=None,
                                        temperature=args.temperature,
                                        top_k=args.top_k,
                                        sample=args.is_sampling)
            out = out.tolist()
            text = enc.decode(cut_seq_to_eos(
                out[0])).encode('utf-8').decode('utf-8')
            print("=" * 40 + " RESPONSE " + str(generated) + " " + "=" * 40)
            print(text)
            print("=" * 80)
Example #8
for a in args_dict:
    logger.info('%-28s  %s' % (a, args_dict[a]))

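# pick the sequence-length vs. batch-size lookup table matching the precision (fp16 or fp32)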
if args.fp16:
    config = join(abspath(PROJECT_FOLDER),
                  'config_file/SeqLen_vs_BatchSize_1GPU_fp16.csv')
else:
    config = join(abspath(PROJECT_FOLDER),
                  'config_file/SeqLen_vs_BatchSize_1GPU_fp32.csv')
seq_len_mapper = get_len_mapper(config)
#########################################################################
# Prepare Data Set
##########################################################################
enc = GPT2Tokenizer.from_pretrained(args.model_name_or_path)

config = GPT2Config.from_json_file(
    join(args.model_name_or_path, 'config.json'))

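# single-process bucketing loader, or a distributed one when local_rank is set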
if args.local_rank == -1:
    train_dataloader = BucketingDataLoader(args.train_input_file,
                                           args.train_batch_size,
                                           args.max_seq_length)
else:
    train_dataloader = DistributedBucketingDataLoader(
        get_rank(), get_world_size(),
        args.train_input_file, args.train_batch_size,
        args.max_seq_length)

eval_dataloader_loss = DynamicBatchingLoader(
    args.eval_input_file, enc, args.normalize_data,
    args.eval_batch_size, args.max_seq_length,
    is_train=True)
Example #9
def run_model():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--model_name_or_path',
        type=str,
        default='',
        help='pretrained model name or path to local checkpoint')
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--load_checkpoint", '-c', type=str, default='')
    parser.add_argument("--fp16", type=boolean_string, default=False)
    parser.add_argument("--max_seq_length", type=int, default=128)

    parser.add_argument("--generation_length", type=int, default=20)
    parser.add_argument("--max_history", type=int, default=2)

    parser.add_argument("--temperature", type=float, default=1)
    parser.add_argument("--top_k", type=int, default=0)
    parser.add_argument("--top_p", type=float, default=0.9)

    parser.add_argument('--use_gpu', action='store_true')
    parser.add_argument("--gpu", type=int, default=0)

    args = parser.parse_args()
    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)

    device = torch.device(
        "cuda" if torch.cuda.is_available() and args.use_gpu else "cpu")
    n_gpu = torch.cuda.device_count()
    args.device, args.n_gpu = device, n_gpu

    np.random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    #### load the GPT-2 model
    config = GPT2Config.from_json_file(
        os.path.join(args.model_name_or_path, 'config.json'))
    enc = GPT2Tokenizer.from_pretrained(args.model_name_or_path)
    model = load_model(GPT2LMHeadModel(config),
                       args.load_checkpoint,
                       args,
                       verbose=True)
    model.to(device)
    model.eval()

    bot = DialogptIrcBot(CHANNEL, NICKNAME, REALNAME, SERVER, PORT)
    thread_dialog = threading.Thread(target=bot.start)
    thread_dialog.daemon = True
    thread_dialog.start()

    history = []
    sleep(1)
    while bot.alive:
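        # poll the IRC bot for a newly asked question that has not been answered yet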
        a = 0
        num = bot.num
        if bot.quest_rep:
            if len(bot.quest_rep) == num + 1:
                if len(bot.quest_rep[num]) == 1:
                    a = 1
                    question = bot.quest_rep[num][0]

        if a == 1:
            try:
                history.append(question)
                context_tokens = sum(
                    [enc.encode(h) + [EOS_ID] for h in history], [])
                context_tokens = torch.tensor(context_tokens,
                                              device=device,
                                              dtype=torch.long).unsqueeze(0)
                position_ids = torch.arange(0,
                                            context_tokens.size(-1),
                                            dtype=torch.long,
                                            device=context_tokens.device)

                out = generate_sequence(model,
                                        context_tokens,
                                        position_ids=position_ids,
                                        length=args.generation_length,
                                        temperature=args.temperature,
                                        top_k=args.top_k,
                                        top_p=args.top_p)

                out = out.tolist()
                text = enc.decode(cut_seq_to_eos(out[0])).encode(
                    'ascii', 'ignore').decode('ascii')

                history.append(text)
                history = history[-(2 * args.max_history + 1):]

            except Exception:
                text = "Je ne comprends pas la question!"

            # Send the reply
            print("\nQuestion n°:", num)
            print("Question:", bot.quest_rep[num])
            print("Response:", text)
            bot.quest_rep[num].append(text)
Example #10
 def __init__(self,
              model_name,
              epochs=1,
              batch_size=64,
              base_batch_size=32,
              part=1.,
              half=1,
              last=True,
              seed=1234,
              debug_mode=False):
     self.device = torch.device('cuda')
     self.input_dir = "../input"
     self.work_dir = "../working/"
     self.debug_mode = debug_mode
     self.model_name = model_name
     self.half = half
     self.last = last
     self.seed = seed
     self.identity_list = [
         'male', 'female', 'homosexual_gay_or_lesbian', 'christian',
         'jewish', 'muslim', 'black', 'white',
         'psychiatric_or_mental_illness'
     ]
     self.toxicity_type_list = [
         'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat'
     ]
     if part == 1.:
         self.weight_dict = {
             "severe_toxicity": 1000,
             "obscene": 235,
             "identity_attack": 236,
             "insult": 22,
             "threat": 646,
             "male": 45,
             "female": 35,
             "homosexual_gay_or_lesbian": 176,
             "christian": 50,
             "jewish": 249,
             "muslim": 91,
             "black": 130,
             "white": 75,
             "psychiatric_or_mental_illness": 442,
             "pp": 101,
             "np": 13,
             "pn": 20,
             "nn": 1,
             "pp_male": 431,
             "np_male": 50,
             "pn_male": 17,
             "nn_male": 1,
             "pp_female": 384,
             "np_female": 39,
             "pn_female": 17,
             "nn_female": 1,
             "pp_homosexual_gay_or_lesbian": 900,
             "np_homosexual_gay_or_lesbian": 219,
             "pn_homosexual_gay_or_lesbian": 17,
             "nn_homosexual_gay_or_lesbian": 1,
             "pp_christian": 859,
             "np_christian": 54,
             "pn_christian": 17,
             "nn_christian": 1,
             "pp_jewish": 2365,
             "np_jewish": 278,
             "pn_jewish": 17,
             "nn_jewish": 1,
             "pp_muslim": 606,
             "np_muslim": 108,
             "pn_muslim": 17,
             "nn_muslim": 1,
             "pp_black": 586,
             "np_black": 167,
             "pn_black": 17,
             "nn_black": 1,
             "pp_white": 387,
             "np_white": 94,
             "pn_white": 17,
             "nn_white": 1,
             "pp_psychiatric_or_mental_illness": 2874,
             "np_psychiatric_or_mental_illness": 523,
             "pn_psychiatric_or_mental_illness": 17,
             "nn_psychiatric_or_mental_illness": 1
         }
     else:
         self.weight_dict = {
             "severe_toxicity": 1000,
             "obscene": 196,
             "identity_attack": 278,
             "insult": 22,
             "threat": 609,
             "male": 45,
             "female": 33,
             "homosexual_gay_or_lesbian": 198,
             "christian": 48,
             "jewish": 243,
             "muslim": 133,
             "black": 131,
             "white": 90,
             "psychiatric_or_mental_illness": 369,
             "pp": 107,
             "np": 13,
             "pn": 19,
             "nn": 1,
             "pp_male": 434,
             "np_male": 51,
             "pn_male": 17,
             "nn_male": 1,
             "pp_female": 324,
             "np_female": 37,
             "pn_female": 17,
             "nn_female": 1,
             "pp_homosexual_gay_or_lesbian": 1055,
             "np_homosexual_gay_or_lesbian": 244,
             "pn_homosexual_gay_or_lesbian": 17,
             "nn_homosexual_gay_or_lesbian": 1,
             "pp_christian": 986,
             "np_christian": 50,
             "pn_christian": 17,
             "nn_christian": 1,
             "pp_jewish": 2680,
             "np_jewish": 268,
             "pn_jewish": 16,
             "nn_jewish": 1,
             "pp_muslim": 772,
             "np_muslim": 161,
             "pn_muslim": 17,
             "nn_muslim": 1,
             "pp_black": 633,
             "np_black": 165,
             "pn_black": 17,
             "nn_black": 1,
             "pp_white": 465,
             "np_white": 111,
             "pn_white": 17,
             "nn_white": 1,
             "pp_psychiatric_or_mental_illness": 2748,
             "np_psychiatric_or_mental_illness": 427,
             "pn_psychiatric_or_mental_illness": 16,
             "nn_psychiatric_or_mental_illness": 1
         }
     self.stopwords = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n“”’\'∞θ÷α•à−β∅³π‘₹´°£€\×™√²—'
     self.seed_everything()
     self.max_len = 220
     self.epochs = epochs
     self.base_batch_size = base_batch_size
     self.batch_size = batch_size
     self.split_ratio = 0.95
     self.sample_num = 1804874
     if not self.debug_mode:
         self.train_df = pd.read_csv(
             os.path.join(
                 "../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv"
             )).sample(int(self.sample_num * part),
                       random_state=1234).fillna(0.)
         self.test_df = pd.read_csv(
             os.path.join(
                 "../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv"
             ))
     else:
         self.train_df = pd.read_csv(
             os.path.join(
                 "../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv"
             )).head(1000).fillna(0.)
         self.test_df = pd.read_csv(
             os.path.join(
                 "../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv"
             )).head(1000)
     self.train_len = int(len(self.train_df) * self.split_ratio)
     self.evaluator = self.init_evaluator()
     self.gpt2_config = GPT2Config("../input/gpt2-models/config.json")
     self.gpt2_model_path = '../input/gpt2-models/'
Example #11
def question_generation(_input):
    metadata, output = _input
    args = DotMap()
    """
    parser = ArgumentParser()
    parser.add_argument("--model_type", type=str, default="gpt", help="gpt or gpt2")
    parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--filename", type=str, default="data/instances_dev.pkl", help="File to use for decoding")
    parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=50, help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--temperature", type=int, default=0.7, help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9,
                        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")

    # While using SQUASH in the pipeline mode, prefer using the --key flag
    parser.add_argument("--key", type=str, default=None,
                        help="Override the default settings if the key is set, used in pipeline mode")
    args = parser.parse_args()
    """
    """
    if args.key is not None:
        # Override some the filename and top_p default settings if args.key is set
        # This is done when the question generation module is being used in the SQUASH pipeline mode
        args.filename = "squash/temp/%s/input.pkl" % args.key

        with open("squash/temp/%s/metadata.json" % args.key, "r") as f:
            metadata = json.loads(f.read())
        args.top_p = metadata["settings"]["top_p"]
    args.filename = "squash/temp/%s/input.pkl" % args.key

    with open("squash/temp/%s/metadata.json" % args.key, "r") as f:
        metadata = json.loads(f.read())

    args.top_p = metadata["settings"]["top_p"]
    """
    args.top_p = metadata["settings"]["top_p"]

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    args.seed = 42
    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")

    # NEW BLOCK
    model_checkpoint = "question_generation/gpt2_corefs_question_generation"
    model_checkpoint = "/home/gpt2_corefs_question_generation"
    model_type = "gpt2"
    #model_checkpoint = "https://storage.cloud.google.com/ds-playground/squash/gpt2_qa.tar.gz"
    SAVED_MODEL_DIR = "gpt2_corefs_question_generation"
    dir_path = os.path.dirname(os.path.realpath(__file__))
    model_checkpoint = os.path.join(dir_path, SAVED_MODEL_DIR)
    model_checkpoint = "question_generation/gpt2_corefs_question_generation"

    tokenizer = GPT2Tokenizer.from_pretrained(model_checkpoint)
    model = GPT2LMHeadModel.from_pretrained(model_checkpoint)
    """ OLD BLOCK
    if args.model_type == 'gpt2':
        tokenizer = GPT2Tokenizer.from_pretrained(args.model_checkpoint)
        model = GPT2LMHeadModel.from_pretrained(args.model_checkpoint)
    else:
        tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_checkpoint)
        model = OpenAIGPTLMHeadModel.from_pretrained(args.model_checkpoint)
    """

    output_config_file = "/content/squash-generation/question_generation/gpt2_corefs_question_generation/config.json"
    output_model_file = "/content/squash-generation/question_generation/gpt2_corefs_question_generation/pytorch_model.bin"
    output_vocab_file = "/content/squash-generation/question_generation/gpt2_corefs_question_generation/vocab.json"
    merges_file = "/content/squash-generation/question_generation/gpt2_corefs_question_generation/merges.txt"

    output_config_file = SparkFiles.get("config.json")
    output_model_file = SparkFiles.get("pytorch_model.bin")
    output_vocab_file = SparkFiles.get("vocab.json")
    merges_file = SparkFiles.get("merges.txt")

    config = GPT2Config.from_json_file(output_config_file)
    model = GPT2LMHeadModel(config)
    state_dict = torch.load(output_model_file,
                            map_location=torch.device('cpu'))
    model.load_state_dict(state_dict)
    tokenizer = GPT2Tokenizer(output_vocab_file, merges_file=merges_file)
    args.device = "cpu"
    model.to(args.device)
    model.eval()

    #data = get_positional_dataset_from_file(tokenizer, args.filename)
    data = get_positional_dataset_from_file(tokenizer, output)
    final_output_dict = {"version": "squash-2.0", "data": [{"paragraphs": []}]}
    question_number = 0

    para_cache = {"index": None, "hidden_states": None}

    for inst in tqdm.tqdm(data):
        with torch.no_grad():
            para_index = inst["para_index"]
            # Questions from the same paragraph all appear together
            # We can re-use the paragraph hidden representations for different questions in the same paragraph
            if para_index != para_cache["index"]:
                # Since we have moved to a new paragraph, generate its cache
                para_cache["hidden_states"] = None
                # Ignore the answer and question while building the input
                instance, _ = build_para_only_input_from_segments(
                    inst, tokenizer)
                input_ids = torch.tensor(instance['input_ids'],
                                         device=args.device).unsqueeze(0)
                token_type_ids = torch.tensor(instance['token_type_ids'],
                                              device=args.device).unsqueeze(0)

                # Run a forward pass to generate the para caches
                _, para_cache["hidden_states"] = model(
                    input_ids, token_type_ids=token_type_ids)

            # Sample a question using the paragraph cache
            output = sample_sequence(inst, tokenizer, model, args, para_cache)

        original_paragraph = tokenizer.decode(output['paragraph'])
        generated_question = tokenizer.decode(output['question'],
                                              skip_special_tokens=True)
        original_answer = tokenizer.decode(output['answer'],
                                           skip_special_tokens=True)
        para_index = inst['para_index']
        para_cache["index"] = inst['para_index']

        # verify whether the answer position is correct, since this will be utilized for filtering
        original_ans_position = output["answer_position"]
        if original_paragraph[
                output["answer_position"]:output["answer_position"] +
                len(original_answer)] != original_answer:
            # This should never be executed, only used as a last resort
            logger.info("Answer mismatch!")
            original_ans_position = original_paragraph.index(original_answer)

        # Output in a SQUAD-like format with questions clumped together under their parent paragraph
        if len(final_output_dict["data"][0]["paragraphs"]) > para_index:
            # verify whether the paragraph text is identical
            assert original_paragraph == final_output_dict["data"][0][
                "paragraphs"][para_index]['context']
            # append the question answer pair
            final_output_dict["data"][0]["paragraphs"][para_index][
                'qas'].append({
                    'id':
                    'question_%d' % question_number,
                    'question':
                    generated_question,
                    'answers': [{
                        'text': original_answer,
                        'answer_start': original_ans_position,
                    }],
                    'class':
                    output['class'],
                    'algorithm':
                    output['algorithm'],
                    'is_impossible':
                    False
                })
        else:
            # add a new question to the list of QA pairs
            final_output_dict['data'][0]['paragraphs'].append({
                'context':
                original_paragraph,
                'qas': [{
                    'id':
                    'question_%d' % question_number,
                    'question':
                    generated_question,
                    'answers': [{
                        'text': original_answer,
                        'answer_start': original_ans_position,
                    }],
                    'class':
                    output['class'],
                    'algorithm':
                    output['algorithm'],
                    'is_impossible':
                    False
                }]
            })

        question_number += 1

    #with open("squash/temp/%s/generated_questions.json" % args.key, "w") as f:
    #    f.write(json.dumps(final_output_dict))

    return final_output_dict
Example #12
gpu = 0
use_gpu = True

os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu)

device = torch.device(
    "cuda" if torch.cuda.is_available() and use_gpu else "cpu")
n_gpu = torch.cuda.device_count()
fp16 = False

np.random.seed(42)
torch.random.manual_seed(42)
torch.cuda.manual_seed(42)

#### load the GPT-2 model
config = GPT2Config.from_json_file(
    os.path.join(model_name_or_path, 'config.json'))
enc = GPT2Tokenizer.from_pretrained(model_name_or_path)
model = load_model(GPT2LMHeadModel(config),
                   load_checkpoint,
                   n_gpu,
                   device,
                   fp16,
                   verbose=True)
model.to(device)
model.eval()

history = []
while True:
    raw_text = input("USR >>> ")
    while not raw_text:
        print('Prompt should not be empty!')
Example #13
DATA_DIR = "../input/jigsaw-unintended-bias-in-toxicity-classification"
LOGGER_PATH = os.path.join(SAVE_DIR, "log.txt")
FOLD_PATH = "../input/toxic-folds/fold01.csv"
TRAIN_PATH = os.path.join(DATA_DIR, "train.csv")
TEST_PATH = os.path.join(DATA_DIR, "test.csv")
SUB_PATH = os.path.join(DATA_DIR, "sample_submission.csv")
identity_columns = [
    'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
    'muslim', 'black', 'white', 'psychiatric_or_mental_illness'
]
AUX_COLUMNS = [
    'target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult',
    'threat'
]
GPT2_MODEL_PATH = '../input/gpt2-models/'
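# GPT2Config's first positional argument (vocab_size_or_config_json_file) also accepts a path to a JSON config file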
bert_config = GPT2Config(os.path.join(GPT2_MODEL_PATH, 'config.json'))

# ===============
# Settings
# ===============
fold_id = 0
seed = 0
device = "cuda:0"
epochs = 1
n_labels = len(AUX_COLUMNS) + 1
max_len = 220
head_len = 80
batch_size = 16
base_lr = 2e-5
gammas = [0.75, 0.5, 0.25]
accumulation_steps = 2