Example #1
def run_model():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name_or_path', type=str, default='', help='pretrained model name or path to local checkpoint')
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--load_checkpoint", '-c', type=str, default='')
    parser.add_argument("--fp16", type=boolean_string, default=False)
    parser.add_argument("--max_seq_length", type=int, default=128)
    
    parser.add_argument("--generation_length", type=int, default=20)
    parser.add_argument("--max_history", type=int, default=2)

    parser.add_argument("--temperature", type=float, default=1)
    parser.add_argument("--top_k", type=int, default=0)
    parser.add_argument("--top_p", type=float, default=0.9)

    parser.add_argument('--use_gpu', action='store_true')
    parser.add_argument("--gpu", type=int, default=0)

    args = parser.parse_args()
    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)


    device = torch.device("cuda" if torch.cuda.is_available() and args.use_gpu else "cpu")
    n_gpu = torch.cuda.device_count()
    args.device, args.n_gpu = device, n_gpu

    np.random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    #### load the GPT-2 model 
    config = GPT2Config.from_json_file(os.path.join(args.model_name_or_path, 'config.json'))
    enc = GPT2Tokenizer.from_pretrained(args.model_name_or_path)
    model = load_model(GPT2LMHeadModel(config), args.load_checkpoint, args, verbose=True)
    model.to(device)
    model.eval()

    history = []
    while True:
        raw_text = input("USR >>> ")
        while not raw_text:
            print('Prompt should not be empty!')
            raw_text = input("USR >>> ")
        if raw_text.lower() == 'quit':
            print('SYS >>> Goodbye!')
            break
        history.append(raw_text)
        context_tokens = sum([enc.encode(h) + [EOS_ID] for h in history],[]) #+ [EOS_ID]
        context_tokens = torch.tensor(context_tokens, device=device, dtype=torch.long).unsqueeze(0)
        position_ids = torch.arange(0, context_tokens.size(-1), dtype=torch.long, device=context_tokens.device)

        out = generate_sequence(model, context_tokens, position_ids=position_ids,
                                length=args.generation_length, temperature=args.temperature, 
                                top_k=args.top_k, top_p=args.top_p)

        out = out.tolist()                        
        text = enc.decode(cut_seq_to_eos(out[0])).encode('ascii','ignore').decode('ascii')
        print("SYS >>> ", text)
        history.append(text)
        history = history[-(2*args.max_history+1):]
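The snippet above leans on a few helpers defined elsewhere in the DialoGPT-style codebase (boolean_string, cut_seq_to_eos, EOS_ID). A minimal sketch of what they typically look like, assuming EOS_ID is GPT-2's <|endoftext|> id (50256):

# Hedged sketch of the helper utilities assumed by Example #1.
EOS_ID = 50256  # assumption: GPT-2 <|endoftext|> token id

def boolean_string(s):
    # argparse "type" that accepts the strings 'True' / 'False'
    if s not in {'False', 'True'}:
        raise ValueError('Not a valid boolean string')
    return s == 'True'

def cut_seq_to_eos(sentence, remove_id=None):
    # truncate a generated id sequence at the first EOS token
    remove_id = remove_id or [-1]
    out = []
    for tok in sentence:
        if tok in remove_id:
            continue
        if tok == EOS_ID:
            break
        out.append(tok)
    return out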
Example #2
def get_model(args, device):
    if args.scratch:
        config = GPT2Config(n_ctx=args.context_length,
                            n_positions=args.context_length)
        model = GPT2LMHeadModel(config)
    else:
        model = GPT2LMHeadModel.from_pretrained(args.model_name_or_path)
    #import torchsummary
    #torchsummary.summary(model, (args.context_length, vocab_size), args.train_batch_size)
    return model.to(device)
Example #3
    def create_gpt2_lm_head(self, config, input_ids, token_type_ids, position_ids,
                            mc_labels, lm_labels, mc_token_ids):
        model = GPT2LMHeadModel(config)
        model.eval()
        loss = model(input_ids, position_ids, token_type_ids, lm_labels)
        lm_logits, presents = model(input_ids, position_ids, token_type_ids)
        outputs = {
            "loss": loss,
            "lm_logits": lm_logits,
            "presents": presents,
        }
        return outputs
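For reference, in the pytorch_pretrained_bert API used throughout these examples, GPT2LMHeadModel returns the language-modeling loss when lm_labels is supplied and the pair (lm_logits, presents) otherwise. A small sanity check, assuming a toy config so it runs quickly:

# Minimal sketch (assumes pytorch_pretrained_bert is installed).
import torch
from pytorch_pretrained_bert import GPT2Config, GPT2LMHeadModel

config = GPT2Config(vocab_size_or_config_json_file=1000, n_positions=64,
                    n_ctx=64, n_embd=32, n_layer=2, n_head=2)
model = GPT2LMHeadModel(config)
model.eval()

input_ids = torch.randint(0, 1000, (1, 10))
with torch.no_grad():
    lm_logits, presents = model(input_ids)        # no labels -> logits + key/value cache
    loss = model(input_ids, lm_labels=input_ids)  # labels -> scalar LM loss
print(lm_logits.shape, loss.item())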
Example #4
def get_prob(context, topk, genre, title):
    os.environ["CUDA_VISIBLE_DEVICES"] = '0'  
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = tokenization_bert.BertTokenizer(vocab_file='cache/vocab_fine_tuning.txt')

    model_config = pytorch_pretrained_bert.GPT2Config.from_json_file('cache/model_config_single.json')
    model_state_dict = torch.load('cache/model_single/model_epoch_1.pt')

    model = GPT2LMHeadModel(config=model_config)
    model.load_state_dict(model_state_dict)
    model.to(device)
    model.eval()

    batch_size = 1
    temperature = 1

    context_tokens = []

    with open('./cache/label_to_id.json','r',encoding='utf-8') as f:
        title_to_ids = json.load(f)
    try:
        ids = title_to_ids[genre]
        context_tokens.append(ids)
    except KeyError:
        # fall back to the default genre 七言律诗 ("seven-character regulated verse")
        ids = title_to_ids['七言律诗']
        context_tokens.append(ids)

    context_tokens.append(100)
    context_tokens.extend(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(title)))
    context_tokens.append(4282) # 4282 is #

    raw_text = context
    if raw_text != "":
        context_tokens.extend(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(raw_text)))


    watcher = WatchProb(model=model, context=context_tokens, tokenizer=tokenizer, temperature=temperature, top_k=topk, device=device)
    prob_dis = watcher.show_prob(topk=topk)

    eight_cumu = watcher.show_cumulative(0.8)
    nine_cumu = watcher.show_cumulative(0.9)
    ninefive_cumu = watcher.show_cumulative(0.95)
    prob_dis.append("")
    prob_dis.append("")
    prob_dis.append("0.8累计覆盖: "+str(eight_cumu))
    prob_dis.append("0.9累计覆盖: "+str(nine_cumu))
    prob_dis.append("0.95累计覆盖: "+str(ninefive_cumu))

    return prob_dis
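WatchProb is project-specific, but the cumulative-coverage figures it reports amount to a standard nucleus-style measurement: how many of the highest-probability next tokens are needed to cover a given fraction of the distribution. A hedged sketch, assuming model(input_ids) returns (logits, presents) as in pytorch_pretrained_bert:

import torch
import torch.nn.functional as F

def cumulative_coverage(model, context_ids, threshold, device='cpu'):
    # Count how many top-probability tokens are needed so that their
    # probabilities sum to at least `threshold` for the next position.
    input_ids = torch.tensor(context_ids, dtype=torch.long, device=device).unsqueeze(0)
    with torch.no_grad():
        logits, _ = model(input_ids)              # (1, seq_len, vocab)
    probs = F.softmax(logits[0, -1, :], dim=-1)
    sorted_probs, _ = torch.sort(probs, descending=True)
    cumulative = torch.cumsum(sorted_probs, dim=-1)
    return int((cumulative < threshold).sum().item()) + 1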
Example #5
    def load_model(self,
                   model_path='./cache/model/model_epoch_1.pt',
                   model_config='./cache/model_config.json',
                   device='cpu'):
        # /data/disk1/private/hujinyi/gpt_poem/model_with_title/model_epoch_1.pt
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        model_config = pytorch_pretrained_bert.GPT2Config.from_json_file(
            model_config)
        model_state_dict = torch.load(model_path)
        model = GPT2LMHeadModel(config=model_config)
        model.load_state_dict(model_state_dict)
        model.to(self.device)
        model.eval()
        self.model = model
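One caveat with the load above: torch.load on a checkpoint saved from a GPU run fails on a CPU-only machine unless a map_location is given. A hedged variant of that single load line:

# Variant: load a GPU-saved checkpoint onto whatever device was selected.
model_state_dict = torch.load(model_path, map_location=self.device)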
Example #6
def main():
    LENGTH = -1
    BATCH_SIZE = 1
    NSAMPLES = 18
    TEMPERATURE = 0.5
    TOPK = 5

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = tokenization.BertTokenizer(vocab_file='cache/vocab.txt')
    model_config = pytorch_pretrained_bert.GPT2Config.from_json_file(
        'model_config.json')
    model_state_dict = torch.load('./model.pt')
    model = GPT2LMHeadModel(config=model_config)
    model.load_state_dict(model_state_dict)
    model.to(device)
    model.eval()

    if LENGTH == -1:
        LENGTH = model.config.n_ctx // 2
    elif LENGTH > model.config.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" %
                         model.config.n_ctx)

    while True:
        raw_text = input("Model prompt >>> ")
        while not raw_text:
            print('Prompt should not be empty!')
            raw_text = input("Model prompt >>> ")
        context_tokens = tokenizer.convert_tokens_to_ids(
            tokenizer.tokenize(raw_text))
        generated = 0
        for _ in range(NSAMPLES // BATCH_SIZE):
            out = sample_sequence(model=model,
                                  length=LENGTH,
                                  context=context_tokens,
                                  start_token=None,
                                  batch_size=BATCH_SIZE,
                                  temperature=TEMPERATURE,
                                  top_k=TOPK,
                                  device=device)
            out = out[:, len(context_tokens):].tolist()
            for i in range(BATCH_SIZE):
                generated += 1
                text = tokenizer.convert_ids_to_tokens(out[i])
                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
                print(text)
        print("=" * 80)
Example #7
    def init(self, model_path, model_checkpoint):
        self.config = GPT2Config.from_json_file(os.path.join(model_path, "config.json"))
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_path)
        self.model = GPT2LMHeadModel(self.config)

        model_state_dict = fix_state_dict_namespace(torch.load(model_checkpoint))

        start_model = self.model
        if hasattr(self.model, "transformer") and all(not s.startswith('transformer.') for s in model_state_dict.keys()):
            print('loading transformer only')
            start_model = self.model.transformer
        start_model.load_state_dict(model_state_dict)

        if self.fp16:
            self.model.half()

        self.model.to(self.device)
        self.model.eval()
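fix_state_dict_namespace is defined elsewhere in this codebase; its job is to make checkpoint keys line up with the model's parameter names. A hedged sketch of the usual pattern (stripping a DataParallel-style 'module.' prefix):

def fix_state_dict_namespace(state_dict):
    # Sketch: drop a leading 'module.' (added by torch.nn.DataParallel)
    # so keys match the bare model's parameter names.
    fixed = {}
    for key, value in state_dict.items():
        new_key = key[len('module.'):] if key.startswith('module.') else key
        fixed[new_key] = value
    return fixed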
Example #8
def run_model():
    print(socket.gethostname())

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--model_name_or_path',
        type=str,
        default='',
        help='pretrained model name or path to local checkpoint')
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--load_checkpoint", '-c', type=str, default='')
    parser.add_argument("--fp16", type=boolean_string, default=False)
    parser.add_argument("--test_file",
                        '-t',
                        type=str,
                        default=None,
                        help='input file for testing')
    parser.add_argument("--output_file",
                        '-o',
                        type=str,
                        default=None,
                        help='output file for testing')
    parser.add_argument("--normalize_data", type=boolean_string, default=True)
    parser.add_argument("--batch_size", '-b', type=int, default=256)
    parser.add_argument("--max_seq_length", type=int, default=512)
    parser.add_argument("--no_token_id", action='store_true')
    parser.add_argument("--no_attn_mask", action='store_true')
    parser.add_argument("--no_eos", action='store_true')

    parser.add_argument("--generation_length", type=int, default=20)
    parser.add_argument("--temperature", type=float, default=1)
    parser.add_argument("--top_k", type=int, default=0)
    parser.add_argument('--unconditional',
                        action='store_true',
                        help='If true, unconditional generation.')
    parser.add_argument('--is_sampling',
                        action='store_true',
                        help='If true, sampling for generation.')
    parser.add_argument('--output_ref',
                        action='store_true',
                        help='If true, output ref')

    #BEAM
    parser.add_argument("--beam",
                        action='store_true',
                        help='If true, beam search')
    parser.add_argument("--beam_width", type=int, default=1)

    parser.add_argument('--use_gpu', action='store_true')
    parser.add_argument("--gpu", type=int, default=0)
    parser.add_argument('--config', help='JSON config file')
    parser.add_argument('--eval', action='store_true')
    parser.add_argument('--cstr_decode', action='store_true')
    parser.add_argument("--bonus", type=float, default=0.0)

    args = parser.parse_args()
    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)

    if args.config is not None:
        # override argparse defaults by config JSON
        opts = json.load(open(args.config))
        for k, v in opts.items():
            if isinstance(v, str):
                # PHILLY ENV special cases
                if 'PHILLY_JOB_DIRECTORY' in v:
                    v = v.replace('PHILLY_JOB_DIRECTORY',
                                  os.environ['PHILLY_JOB_DIRECTORY'])
                elif 'PHILLY_LOG_DIRECTORY' in v:
                    v = v.replace('PHILLY_LOG_DIRECTORY',
                                  os.environ['PHILLY_LOG_DIRECTORY'])
            setattr(args, k, v)

        # command line should override config JSON
        argv = sys.argv[1:]
        overrides, _ = parser.parse_known_args(argv)
        for k, v in vars(overrides).items():
            if f'--{k}' in argv:
                setattr(args, k, v)
        # setattr(args, 'local_rank', overrides.local_rank)

    # do normal parsing
    device = torch.device(
        "cuda" if torch.cuda.is_available() and args.use_gpu else "cpu")
    n_gpu = torch.cuda.device_count()
    args.device, args.n_gpu = device, n_gpu
    print(args)

    np.random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    config = GPT2Config.from_json_file(
        os.path.join(args.model_name_or_path, 'config.json'))
    enc = GPT2Tokenizer.from_pretrained(args.model_name_or_path)
    model = load_model(GPT2LMHeadModel(config),
                       args.load_checkpoint,
                       args,
                       verbose=True)
    model.to(device)
    model.eval()

    if args.test_file:
        eval_dataloader = get_eval_list_same_length_with_order(
            args.test_file, enc, args.batch_size, True)

        model.eval()
        outs = []
        targets = []
        loss_all = []
        ppl_all = []
        sources = []
        conv_ids = []
        with torch.no_grad():
            with tqdm.tqdm(total=len(eval_dataloader), desc=f"Test") as pbar:
                for step, batch in enumerate(
                        tqdm.tqdm(eval_dataloader, desc="Iteration")):

                    new_batch = []
                    for t in batch:
                        if isinstance(t, list):
                            new_batch.append(t)
                        else:
                            new_batch.append(t.to(device))

                    input_ids, position_ids, token_ids, attn_masks, label_ids, context_len, conv_id = new_batch

                    if args.no_token_id:
                        token_ids = None
                    if args.no_eos:
                        input_ids = input_ids[:, :-1]
                    if args.no_attn_mask:
                        attn_masks = None
                    if args.beam:
                        out = beam_search_naive(model,
                                                input_ids,
                                                position_ids=position_ids,
                                                token_type_ids=token_ids,
                                                attn_masks=attn_masks,
                                                length=args.generation_length,
                                                beam_width=args.beam_width,
                                                device=args.device,
                                                use_bonus=args.cstr_decode,
                                                bonus=args.bonus,
                                                enc=enc)
                    else:
                        out = generate_sequence(model,
                                                input_ids,
                                                position_ids=position_ids,
                                                token_type_ids=token_ids,
                                                attn_masks=attn_masks,
                                                length=args.generation_length,
                                                start_token=None,
                                                temperature=args.temperature,
                                                top_k=args.top_k,
                                                sample=args.is_sampling,
                                                use_bonus=args.cstr_decode,
                                                bonus=args.bonus,
                                                enc=enc)

                    sources.extend(input_ids.cpu().numpy())
                    out = out.tolist()
                    outs.extend(out)
                    targets.extend(label_ids)
                    conv_ids.extend(conv_id.cpu().numpy())

                conv_id_map = {conv_ids[i]: i for i in range(len(conv_ids))}
                val_src = [
                    enc.decode(
                        cut_seq_to_eos(s)).encode('utf-8').decode('utf-8')
                    for s in sources
                ]
                #print(len(val_src),len(targets))

                val_set = [
                    enc.decode(s).encode('utf-8').decode('utf-8')
                    for s in targets
                ]
                gen = [
                    enc.decode(
                        cut_seq_to_eos(s)).encode('utf-8').decode('utf-8')
                    for s in outs
                ]

                val_src_orders = [
                    val_src[conv_id_map[i]] for i in sorted(conv_id_map)
                ]
                val_set_orders = [
                    val_set[conv_id_map[i]] for i in sorted(conv_id_map)
                ]
                gen_orders = [gen[conv_id_map[i]] for i in sorted(conv_id_map)]

                print("=" * 40 + " SAMPLE " + "=" * 40)
                src = enc.decode([
                    x for x in input_ids[-1].cpu().numpy() if x != 0
                ]).encode('utf-8').decode('utf-8')
                gt = val_set[-1]
                resp = gen[-1]
                print(
                    f"Source: \t {src} \n Oracle: \t {gt} \n Resp: \t {resp}\n"
                )
                if args.output_file:
                    with open(args.output_file + '.resp.txt', "w") as resp_f:
                        for i, r in enumerate(gen_orders):
                            r = re.sub("\n", "", r)
                            if args.output_ref:
                                # import pdb; pdb.set_trace()
                                resp_f.write(val_src_orders[i] + '\t' +
                                             val_set_orders[i] + '\t' + r +
                                             '\n')
                            else:
                                resp_f.write(r + '\n')
                print("=" * 80)

                sys.stdout.flush()

    else:
        generated = 0
        while True:
            raw_text = input("Model prompt >>> ")
            while not raw_text:
                print('Prompt should not be empty!')
                raw_text = input("Model prompt >>> ")
            context_tokens = enc.encode(raw_text) + [EOS_ID]
            context_tokens = torch.tensor(context_tokens,
                                          device=device,
                                          dtype=torch.long).unsqueeze(
                                              0)  #.repeat(batch_size, 1)
            generated += 1
            position_ids = torch.arange(0,
                                        context_tokens.size(-1),
                                        dtype=torch.long,
                                        device=context_tokens.device)
            token_ids = None if args.no_token_id else torch.zeros_like(
                context_tokens, dtype=torch.long, device=context_tokens.device)
            if args.beam:
                out = beam_search_naive(model,
                                        context_tokens,
                                        position_ids=None,
                                        token_type_ids=token_ids,
                                        length=args.generation_length,
                                        beam_width=args.beam_width,
                                        device=args.device)
            else:
                out = generate_sequence(model,
                                        context_tokens,
                                        position_ids=None,
                                        token_type_ids=token_ids,
                                        length=args.generation_length,
                                        start_token=None,
                                        temperature=args.temperature,
                                        top_k=args.top_k,
                                        sample=args.is_sampling)
            out = out.tolist()
            text = enc.decode(cut_seq_to_eos(
                out[0])).encode('utf-8').decode('utf-8')
            print("=" * 40 + " RESPONSE " + str(generated) + " " + "=" * 40)
            print(text)
            print("=" * 80)
Example #9
    args.eval_batch_size, args.max_seq_length,
    is_train=True)

eval_dataloader_gen = get_eval_list_same_length(
    args.eval_input_file, enc, args.eval_batch_size, True)

logger.info("***** For training dataset *****")
logger.info("***** For dev dataset *****")
logger.info('num example = %d, batch_size = %d, num_batches = %d'
            % (eval_dataloader_loss.num_examples, args.eval_batch_size,
               len(eval_dataloader_gen)))

#########################################################################
# Prepare Model and Optimizer
##########################################################################
model = load_model(GPT2LMHeadModel(config), args.init_checkpoint,
                   args, verbose=True)
if args.local_rank != -1:
    # when from scratch make sure initial models are the same
    params = [p.data for p in model.parameters()]
    all_reduce_and_rescale_tensors(
        params, float(torch.distributed.get_world_size()))
    # FIXME is averaging the best way? init variance will change

model_parameters = filter(lambda p: p.requires_grad, model.parameters())
total_params = sum([np.prod(p.size()) for p in model_parameters])
logger.info('Number of parameter = {}'.format(total_params))

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'ln']   # no decay for bias and LayerNorm (ln)
optimizer_grouped_parameters = [
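The snippet is cut off at optimizer_grouped_parameters; the complete pattern appears in Example #12 below. A hedged sketch of how the list typically continues, using the no_decay names defined just above (the exact decay value may instead come from args):

# Hedged continuation sketch; the actual decay value may differ.
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer
                if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
]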
Example #10
eval_dataloader_gen = get_eval_list(args.eval_input_file, enc,
                                    args.eval_batch_size,
                                    args.eval_range_begin, args.eval_range_end,
                                    True)

logger.info("***** For training dataset *****")
logger.info("***** For dev dataset *****")
logger.info('num example = %d, batch_size = %d, num_batches = %d' %
            (eval_dataloader_loss.num_examples, args.eval_batch_size,
             len(eval_dataloader_gen)))

#########################################################################
# Prepare Model
##########################################################################
model = load_model(GPT2LMHeadModel(config),
                   args.init_checkpoint,
                   args,
                   verbose=True)
logger.info('Loaded model from %s in device %s' %
            (args.init_checkpoint, args.device))

#########################################################################
# Prepare BERT reranker and Coordinator and Optimizer
##########################################################################
ranker, ranker_tokenizer = load_ranker(args)
logger.info('Loaded pretrained BERT reranker in device %s' % args.device)

# override
args.coord_config = AttnCoordConfig.from_json_file(args.coord_config)
args.gpt2_lm_head = model.lm_head.decoder.weight.clone()
Example #11
def run_model():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--model_name_or_path',
        type=str,
        default='',
        help='pretrained model name or path to local checkpoint')
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--load_checkpoint", '-c', type=str, default='')
    parser.add_argument("--fp16", type=boolean_string, default=False)
    parser.add_argument("--max_seq_length", type=int, default=128)

    parser.add_argument("--generation_length", type=int, default=20)
    parser.add_argument("--max_history", type=int, default=2)

    parser.add_argument("--temperature", type=float, default=1)
    parser.add_argument("--top_k", type=int, default=0)
    parser.add_argument("--top_p", type=float, default=0.9)

    parser.add_argument('--use_gpu', action='store_true')
    parser.add_argument("--gpu", type=int, default=0)

    args = parser.parse_args()
    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)

    device = torch.device(
        "cuda" if torch.cuda.is_available() and args.use_gpu else "cpu")
    n_gpu = torch.cuda.device_count()
    args.device, args.n_gpu = device, n_gpu

    np.random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    #### load the GPT-2 model
    config = GPT2Config.from_json_file(
        os.path.join(args.model_name_or_path, 'config.json'))
    enc = GPT2Tokenizer.from_pretrained(args.model_name_or_path)
    model = load_model(GPT2LMHeadModel(config),
                       args.load_checkpoint,
                       args,
                       verbose=True)
    model.to(device)
    model.eval()

    bot = DialogptIrcBot(CHANNEL, NICKNAME, REALNAME, SERVER, PORT)
    thread_dialog = threading.Thread(target=bot.start)
    thread_dialog.daemon = True
    thread_dialog.start()

    history = []
    sleep(1)
    while bot.alive:
        a = 0
        num = bot.num
        if bot.quest_rep:
            if len(bot.quest_rep) == num + 1:
                if len(bot.quest_rep[num]) == 1:
                    a = 1
                    question = bot.quest_rep[num][0]

        if a == 1:
            try:
                history.append(question)
                context_tokens = sum(
                    [enc.encode(h) + [EOS_ID] for h in history], [])
                context_tokens = torch.tensor(context_tokens,
                                              device=device,
                                              dtype=torch.long).unsqueeze(0)
                position_ids = torch.arange(0,
                                            context_tokens.size(-1),
                                            dtype=torch.long,
                                            device=context_tokens.device)

                out = generate_sequence(model,
                                        context_tokens,
                                        position_ids=position_ids,
                                        length=args.generation_length,
                                        temperature=args.temperature,
                                        top_k=args.top_k,
                                        top_p=args.top_p)

                out = out.tolist()
                text = enc.decode(cut_seq_to_eos(out[0])).encode(
                    'ascii', 'ignore').decode('ascii')

                history.append(text)
                history = history[-(2 * args.max_history + 1):]

            except Exception:
                text = "Je ne comprends pas la question!"  # "I don't understand the question!"

            # Send the reply
            print("\nQuestion #:", num)
            print("Question:", bot.quest_rep[num])
            print("Response:", text)
            bot.quest_rep[num].append(text)
Example #12
def main():
    # Parse the arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, default='gpt2',
                        help='pretrained model name')
    parser.add_argument("--bucket_name", type=str, default="al-ml-data")
    parser.add_argument("--s3_key", type=str,
                        default="e2e_training/ gpt2_train_with_ids_indexed.pkl")
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=1)
    parser.add_argument('--train_batch_size', type=int, default=16)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)

    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Set the seed for random, numpy, PyTorch
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # This loading functions also add new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the RocStories dataset
    #special_tokens = ['<POS>', '<NEG>','<END>']
    tokenizer = GPT2Tokenizer.from_pretrained(args.model_name)
    #start_token_id = tokenizer.convert_tokens_to_ids(['<START>'])[0]
    model = GPT2LMHeadModel.from_pretrained(args.model_name)
    model.to(device)
    
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Load and encode dataset
    def tokenize_and_encode(bucket_name, s3_key):
        '''
        This method tokenizes the input data and encodes it using the OpenAIGPTTokenizer
        :param file_path: Path of the input file, dtype: str
        :return: encoded dataset  dtype: list
        '''
        s3 = boto3.resource('s3')
        try:
            s3.Bucket(bucket_name).download_file(s3_key, '/tmp/gpt2_train_v1.pkl')
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == "404":
                print("The object does not exist.")
            else:
                raise
                
        with open("/tmp/gpt2_train_v1.pkl","rb") as fp1:
            data = pickle.load(fp1)
        os.remove("/tmp/gpt2_train_v1.pkl")
        
        return data

    logger.info("Encoding dataset...")
    train_dataset = tokenize_and_encode(args.bucket_name, args.s3_key)
    print(len(train_dataset))
    train_dataset = [c for c in train_dataset if len(c) > 0]
    print(len(train_dataset))
    #eval_dataset = tokenize_and_encode(args.eval_dataset)
    print("Training samples = {}".format(len(train_dataset)))
    #print("Validation samples = {}".format(len(eval_dataset)))
    print("Example = {}".format(train_dataset[0]))
    time.sleep(2)
    #train_dataset = [x for x in train_dataset if len(x) <= 300]
    #eval_dataset = [x for x in eval_dataset if len(x) <= 300]
    # Compute the max input length for the Transformer
    #input_length = max(max(len(t) for t in train_dataset), max(len(q) for q in eval_dataset))
    input_length = max(len(t) for t in train_dataset)
    if n_gpu > 1:
        input_length = min(input_length, model.module.config.n_positions)
    else:
        input_length = min(input_length, model.config.n_positions)  # Max size of input for the pre-trained model
    print("Input Length = {}".format(input_length))
                       
    def pre_process_dataset(encoded_dataset, input_length):
        """
        This method is to create torch tensor of input ids and lm labels
        :param encoded_dataset: Input dataset, dtype: list
        :param input_length: Maximum length of sentence from training and eval dataset, dtype: int
        :return: torch.tensor of size [len(encoded_dataset), 2]
        """

        n_batch = len(encoded_dataset)
        input_ids = np.zeros(shape=(n_batch, input_length), dtype=np.int64)
        lm_labels = np.full(shape=(n_batch, input_length), fill_value=-1, dtype=np.int64)

        for i, tokens in enumerate(encoded_dataset):
            input_ids[i, :len(tokens)] = tokens[:input_length]
            start_token_index = tokens.index(9688) # 9688 is id for token 'start'
            lm_labels[i, start_token_index+2 : len(tokens)-1] = tokens[start_token_index+3 : input_length] 

        input_ids = torch.tensor(input_ids)
        lm_labels = torch.tensor(lm_labels)
        tensor_dataset = (input_ids, lm_labels)
        #tensor_dataset.append(torch.tensor(d) for d in all_inputs)

        return tensor_dataset

    # Prepare input tensors and dataloders
    train_tensor_dataset = pre_process_dataset(train_dataset, input_length=input_length)
    #eval_tensor_dataset = pre_process_dataset(eval_dataset, input_length=input_length)
    print(train_tensor_dataset[0].shape, train_tensor_dataset[1].shape)
    print("Training Example Input ids= {}".format(train_tensor_dataset[0][0]))
    print("Training Example Language Modeling ids = {}".format(train_tensor_dataset[1][0]))
    time.sleep(10)
    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

    #eval_data = TensorDataset(*eval_tensor_dataset)
    #eval_sampler = RandomSampler(eval_data)
    #eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size
    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                           lr=args.learning_rate,
                           warmup=args.warmup_proportion,
                           max_grad_norm=args.max_grad_norm,
                           weight_decay=args.weight_decay,
                           t_total=num_train_optimization_steps)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, lm_labels = batch
                loss = model(input_ids, lm_labels=lm_labels)
                if n_gpu > 1:
                    loss.mean().backward()
                else:
                    loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                if n_gpu > 1:
                    tmp_loss = loss.mean().item()
                else:
                    tmp_loss = loss.item()
                exp_average_loss = tmp_loss if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * tmp_loss
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, optimizer.get_lr()[0])
                '''
                if (step > 0 and step % 20 == 0):
                    print("Saving Model....")
                    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
                    output_model_file = os.path.join(args.output_dir, "language_model_{}.bin".format(epoch+1))
                    config = model.module.config if hasattr(model, 'module') else model.config
                    torch.save(model_to_save.state_dict(), output_model_file)
                '''
            model_to_save = model.module if hasattr(model, 'module') else model
            output_model_file = os.path.join(args.output_dir, "pytorch_model_final.bin")
            config = model.module.config if hasattr(model, 'module') else model.config
            torch.save(model_to_save.state_dict(), output_model_file)
            model_state_dict = torch.load(output_model_file)
            model = GPT2LMHeadModel(config)
            model.load_state_dict(model_state_dict)
            model.to(device)
            if n_gpu > 1:
                model = torch.nn.DataParallel(model)
		
    # Save a trained model
    # if args.do_train:
    #     model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
    #     output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    #     config = model.config
    #     torch.save(model_to_save.state_dict(), output_model_file)
    #
    #     # Load a trained model that you have fine-tuned
    #     model_state_dict = torch.load(output_model_file)
    #     model = OpenAIGPTLMHeadModel(config)
    #     model.load_state_dict(model_state_dict)
    #     model.to(device)

    if args.do_eval:
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
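        # Note: eval_dataloader is only defined if the commented-out eval dataset
        # preparation above (eval_dataset / eval_tensor_dataset / eval_dataloader) is re-enabled.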
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, lm_labels = batch
            with torch.no_grad():
                lm_loss = model(input_ids, lm_labels=lm_labels)

            eval_loss += lm_loss.mean().item()

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        train_loss = tr_loss/nb_tr_steps if args.do_train else None
        result = {'eval_loss': eval_loss,
                  'train_loss': train_loss}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
Example #13
def main():
    """Main training program."""

    print('Evaluate GPT2 model')

    # Disable CuDNN.
    torch.backends.cudnn.enabled = False

    # Timer.
    timers = Timers()

    # Arguments.
    args = get_args()

    # Pytorch distributed.
    initialize_distributed(args)

    # Random seeds for reproducibility.
    set_random_seed(args.seed)

    # Data stuff.
    eval_data = get_eval_data(args)

    # Model, optimizer, and learning rate.
    if args.eval_hf:
        from pytorch_pretrained_bert import GPT2LMHeadModel
        from pytorch_pretrained_bert import GPT2Model as HFGPT2Model
        if args.num_layers == 24:
            model_path = args.load
            #model_path = '/home/universal-lm-data.cosmos549/repos/gpt2_mp/models/345M'
            hfmodel = HFGPT2Model.from_pretrained(model_path, cache_dir='gpt2_weights', from_tf=True).cuda()
            model = GPT2LMHeadModel(hfmodel.config)
            model.transformer.load_state_dict(hfmodel.state_dict())
            model.cuda()
        else:
            model = GPT2LMHeadModel.from_pretrained('gpt2', cache_dir='gpt2_weights').cuda()
    else:
        if args.load_openai:
            from utils import move_weights
            model_path = args.load
            args.load = None
            model = setup_model(args)
            from pytorch_pretrained_bert import GPT2LMHeadModel
            from pytorch_pretrained_bert import GPT2Model as HFGPT2Model

            model_path = 'gpt2'
            from_tf = False
            print('loading openai weights')
            model.cpu()
            if args.num_layers == 24:
                #model_path = '/home/universal-lm-data.cosmos549/repos/gpt2_mp/models/345M'
                hfmodel = HFGPT2Model.from_pretrained(model_path, cache_dir='gpt2_weights', from_tf=True)
                gpt2model = GPT2LMHeadModel(hfmodel.config)
                gpt2model.transformer.load_state_dict(hfmodel.state_dict())
                gpt2model
            else:
                gpt2model = GPT2LMHeadModel.from_pretrained('gpt2', cache_dir='gpt2_weights')
            model2fill = model
            while isinstance(model2fill, (DDP, FP16_Module)):
                model2fill = model2fill.module
            move_weights(model2fill, gpt2model)
            model.cuda()
        else:
            model = setup_model(args)

    # Run on test data.
    prefix = "wiki" #os.path.basename(args.valid_data)
    evaluate_and_print_results(prefix, eval_data,
                               model, args, timers)
Example #14
def question_generation(_input):
    metadata, output = _input
    args = DotMap()
    """
    parser = ArgumentParser()
    parser.add_argument("--model_type", type=str, default="gpt", help="gpt or gpt2")
    parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--filename", type=str, default="data/instances_dev.pkl", help="File to use for decoding")
    parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=50, help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--temperature", type=int, default=0.7, help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9,
                        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")

    # While using SQUASH in the pipeline mode, prefer using the --key flag
    parser.add_argument("--key", type=str, default=None,
                        help="Override the default settings if the key is set, used in pipeline mode")
    args = parser.parse_args()
    """
    """
    if args.key is not None:
        # Override some the filename and top_p default settings if args.key is set
        # This is done when the question generation module is being used in the SQUASH pipeline mode
        args.filename = "squash/temp/%s/input.pkl" % args.key

        with open("squash/temp/%s/metadata.json" % args.key, "r") as f:
            metadata = json.loads(f.read())
        args.top_p = metadata["settings"]["top_p"]
    args.filename = "squash/temp/%s/input.pkl" % args.key

    with open("squash/temp/%s/metadata.json" % args.key, "r") as f:
        metadata = json.loads(f.read())

    args.top_p = metadata["settings"]["top_p"]
    """
    setattr(args, "top_p", metadata["settings"]["top_p"])
    args.top_p = metadata["settings"]["top_p"]

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    args.seed = 42
    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")

    # NEW BLOCK
    model_checkpoint = "question_generation/gpt2_corefs_question_generation"
    model_checkpoint = "/home/gpt2_corefs_question_generation"
    model_type = "gpt2"
    #model_checkpoint = "https://storage.cloud.google.com/ds-playground/squash/gpt2_qa.tar.gz"
    SAVED_MODEL_DIR = "gpt2_corefs_question_generation"
    dir_path = os.path.dirname(os.path.realpath(__file__))
    model_checkpoint = os.path.join(dir_path, SAVED_MODEL_DIR)
    model_checkpoint = "question_generation/gpt2_corefs_question_generation"

    tokenizer = GPT2Tokenizer.from_pretrained(model_checkpoint)
    model = GPT2LMHeadModel.from_pretrained(model_checkpoint)
    """ OLD BLOCK
    if args.model_type == 'gpt2':
        tokenizer = GPT2Tokenizer.from_pretrained(args.model_checkpoint)
        model = GPT2LMHeadModel.from_pretrained(args.model_checkpoint)
    else:
        tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_checkpoint)
        model = OpenAIGPTLMHeadModel.from_pretrained(args.model_checkpoint)
    """

    output_config_file = "/content/squash-generation/question_generation/gpt2_corefs_question_generation/config.json"
    output_model_file = "/content/squash-generation/question_generation/gpt2_corefs_question_generation/pytorch_model.bin"
    output_vocab_file = "/content/squash-generation/question_generation/gpt2_corefs_question_generation/vocab.json"
    merges_file = "/content/squash-generation/question_generation/gpt2_corefs_question_generation/merges.txt"

    output_config_file = SparkFiles.get("config.json")
    output_model_file = SparkFiles.get("pytorch_model.bin")
    output_vocab_file = SparkFiles.get("vocab.json")
    merges_file = SparkFiles.get("merges.txt")

    config = GPT2Config.from_json_file(output_config_file)
    model = GPT2LMHeadModel(config)
    state_dict = torch.load(output_model_file,
                            map_location=torch.device('cpu'))
    model.load_state_dict(state_dict)
    tokenizer = GPT2Tokenizer(output_vocab_file, merges_file=merges_file)
    model.to("cpu")
    model.eval()
    args.device = "cpu"

    args.device = "cpu"
    model.to(args.device)
    model.eval()

    return {"break": "point"}
    #data = get_positional_dataset_from_file(tokenizer, args.filename)
    data = get_positional_dataset_from_file(tokenizer, output)
    final_output_dict = {"version": "squash-2.0", "data": [{"paragraphs": []}]}
    question_number = 0

    para_cache = {"index": None, "hidden_states": None}

    for inst in tqdm.tqdm(data):
        with torch.no_grad():
            para_index = inst["para_index"]
            # Questions from the same paragraph all appear together
            # We can re-use the paragraph hidden representations for different questions in the same paragraph
            if para_index != para_cache["index"]:
                # Since we have moved to a new paragraph, generate its cache
                para_cache["hidden_states"] = None
                # Ignore the answer and question while building the input
                instance, _ = build_para_only_input_from_segments(
                    inst, tokenizer)
                input_ids = torch.tensor(instance['input_ids'],
                                         device=args.device).unsqueeze(0)
                token_type_ids = torch.tensor(instance['token_type_ids'],
                                              device=args.device).unsqueeze(0)

                # Run a forward pass to generate the para caches
                _, para_cache["hidden_states"] = model(
                    input_ids, token_type_ids=token_type_ids)

            # Sample a question using the paragraph cache
            output = sample_sequence(inst, tokenizer, model, args, para_cache)

        original_paragraph = tokenizer.decode(output['paragraph'])
        generated_question = tokenizer.decode(output['question'],
                                              skip_special_tokens=True)
        original_answer = tokenizer.decode(output['answer'],
                                           skip_special_tokens=True)
        para_index = inst['para_index']
        para_cache["index"] = inst['para_index']

        # verify whether the answer position is correct, since this will be utilized for filtering
        original_ans_position = output["answer_position"]
        if original_paragraph[
                output["answer_position"]:output["answer_position"] +
                len(original_answer)] != original_answer:
            # This should never be executed, only used as a last resort
            logger.info("Answer mismatch!")
            original_ans_position = original_paragraph.index(original_answer)

        # Output in a SQUAD-like format with questions clumped together under their parent paragraph
        if len(final_output_dict["data"][0]["paragraphs"]) > para_index:
            # verify whether the paragraph text is identical
            assert original_paragraph == final_output_dict["data"][0][
                "paragraphs"][para_index]['context']
            # append the question answer pair
            final_output_dict["data"][0]["paragraphs"][para_index][
                'qas'].append({
                    'id':
                    'question_%d' % question_number,
                    'question':
                    generated_question,
                    'answers': [{
                        'text': original_answer,
                        'answer_start': original_ans_position,
                    }],
                    'class':
                    output['class'],
                    'algorithm':
                    output['algorithm'],
                    'is_impossible':
                    False
                })
        else:
            # add a new question to the list of QA pairs
            final_output_dict['data'][0]['paragraphs'].append({
                'context':
                original_paragraph,
                'qas': [{
                    'id':
                    'question_%d' % question_number,
                    'question':
                    generated_question,
                    'answers': [{
                        'text': original_answer,
                        'answer_start': original_ans_position,
                    }],
                    'class':
                    output['class'],
                    'algorithm':
                    output['algorithm'],
                    'is_impossible':
                    False
                }]
            })

        question_number += 1

    #with open("squash/temp/%s/generated_questions.json" % args.key, "w") as f:
    #    f.write(json.dumps(final_output_dict))

    return final_output_dict