Example #1
def mine_triples(device, input_file, output_file, use_local_model=False):
    if use_local_model:
        print('loading BERT...')
        bert = BertForMaskedLM.from_pretrained("../models/BertForMaskedLM")
        print('loading GPT2...')
        gpt = GPT2LMHeadModel.from_pretrained("../models/GPT2LMHeadModel")
    else:
        print('loading BERT...')
        bert = BertForMaskedLM.from_pretrained(bert_model)
        print('loading GPT2...')
        gpt = GPT2LMHeadModel.from_pretrained(gpt2_model)
    """
        'concat': KnowledgeMiner(
            os.path.join(data_repo, candidate_file),
            device,
            DirectTemplate,
            bert
        ),
        'template': KnowledgeMiner(
            os.path.join(data_repo, candidate_file),
            device,
            PredefinedTemplate,
            bert,
            grammar=False,
            template_loc=os.path.join(template_repo, single_templates)
        ),
        'template_grammar': KnowledgeMiner(
            os.path.join(data_repo, candidate_file),
            device,
            PredefinedTemplate,
            bert,
            grammar=True,
            template_loc=os.path.join(template_repo, single_templates)
        ),
    """

    knowledge_miners = {
        'coherency':
        KnowledgeMiner(input_file,
                       device,
                       EnumeratedTemplate,
                       bert,
                       language_model=gpt,
                       template_loc=os.path.join(template_repo,
                                                 multiple_templates),
                       use_local_model=use_local_model)
    }

    for template_type in knowledge_miners.keys():
        predictions = run_experiment(template_type, knowledge_miners)
        triples = knowledge_miners[template_type].sentences.tuples
        scored_samples = list(zip(triples, predictions))
        scored_samples.sort(key=lambda x: x[1], reverse=True)
        with open(output_file, "w") as f:
            for triple, pred in scored_samples:
                rel, head, tail = triple
                triple = (rel.lower(), head, tail)
                f.write("\t".join(triple) + "\t" + "{:.5f}".format(pred))
                f.write("\n")
Example #2
 def __init__(self, type, model_name_or_path="gpt2"):
     super(LM, self).__init__()
     self.enc = GPT2Tokenizer.from_pretrained(model_name_or_path)
     if type == '345M':
         self.model = GPT2LMHeadModel.from_pretrained('output/')
     elif type == '117M':
         self.model = GPT2LMHeadModel.from_pretrained(model_name_or_path)
     self.model.to(self.device)
     self.model.eval()
     self.start_token = '<|endoftext|>'
def fluency_score(rated_a, opt):

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    enc = GPT2Tokenizer.from_pretrained(opt.pretrained_model_path)
    model = GPT2LMHeadModel.from_pretrained(opt.pretrained_model_path)
    model.to(device)

    model.eval()
    nb_steps, eval_loss, exp_average_loss = 0, 0, None
    score_list = []
    # k = "the book is on the desk. These impressions show , when alive , they had smooth skin , robust limbs with webbed feet , and a ridge of skin on their undersides." tensor(169.6684, device='cuda:0')
    with torch.no_grad():
        for step, s in enumerate(
                rated_a):  # actually here is a batch with batchsize=1
            # Put model in training mode.
            if not s:
                print('space sentence')
                score_list.append(1e6)
                continue
            s = enc.encode(
                s)  # + [50256]  #50256 is the token_id for <|endoftext|>
            batch = torch.tensor([s]).to(device)
            loss = model(batch, lm_labels=batch)  # average -log p per token
            # print(loss*len(s))
            eval_loss += loss.item()
            nb_steps += 1

            score_list.append(loss.item())

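    # Scores are negated average NLLs; anything below the 5th percentile is clipped to the
    # cutoff, then shifted and scaled so the clipped (worst) scores map to 0.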
    cutoff = np.quantile([-t for t in score_list], 0.05)
    modified_rating = np.array(
        [cutoff if -t < cutoff else -t for t in score_list])
    normed_rating = (modified_rating - cutoff) / np.abs(cutoff)
    return normed_rating
Example #4
def init_model(seed=0, model_path='gpt2'):
    '''
    Parameters:
    ----------
    seed : int
        seed number for the different randomizers
    model_path : string, optional
        either a model name for an existing model or a path to a trained model
    '''
    np.random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    enc = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')
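    # Note: the checkpoint at model_path was presumably saved from an nn.DataParallel-wrapped
    # model, so its state-dict keys carry a 'module.' prefix; wrapping before load_state_dict
    # makes the keys line up, and .module unwraps the plain model afterwards.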
    
    model = nn.DataParallel(model)
    model.load_state_dict(torch.load(model_path))
    model = model.module
    
    model.to(device)
    model.eval()
    return model, enc, device
Example #5
def mine_from_wikipedia(hardware):
    print('loading BERT...')
    bert = BertForMaskedLM.from_pretrained(bert_model)
    print('loading GPT2...')
    gpt = GPT2LMHeadModel.from_pretrained(gpt2_model)

    knowledge_miners = {
        'concat':
        KnowledgeMiner(data_repo + wikipedia_candidates, hardware,
                       DirectTemplate, bert),
        'template':
        KnowledgeMiner(data_repo + wikipedia_candidates,
                       hardware,
                       PredefinedTemplate,
                       bert,
                       grammar=False,
                       template_loc=template_repo + single_templates),
        'template_grammar':
        KnowledgeMiner(data_repo + wikipedia_candidates,
                       hardware,
                       PredefinedTemplate,
                       bert,
                       grammar=True,
                       template_loc=template_repo + single_templates),
        'coherency':
        KnowledgeMiner(data_repo + wikipedia_candidates,
                       hardware,
                       EnumeratedTemplate,
                       bert,
                       language_model=gpt,
                       template_loc=template_repo + multiple_templates)
    }

    for template_type in knowledge_miners.keys():
        run_experiment(template_type, knowledge_miners)
def download_model(name):
    if name not in MODELS:
        raise Exception(str(name) + ' not a model in the list')
    if not exists(PATH):
        print("# ", str(PATH), "not found, creating dir.")
        mkdir(PATH)
    print('# Downloading model: ' + str(name))
    name_path = MODEL_PATH_DICT[name]
    if name == 'word2vec':
        if not exists(join(PATH, name_path)):
            wget.download(
                'https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz'
            )
            shutil.move(name_path, join(PATH, name_path))
            print('# Downloaded word2vec')
        else:
            print('# Already downloaded')
    if name == 'glove':
        if not exists(join(PATH, name_path)):
            wget.download(
                'http://nlp.stanford.edu/data/wordvecs/glove.840B.300d.zip')
            zip = zipfile.ZipFile('./glove.840B.300d.zip')
            zip.extractall()
            _ = glove2word2vec('./glove.840B.300d.txt', join(PATH, name_path))
            print('# Downloaded glove')
        else:
            print('# Already downloaded')
    if name == 'dict2vec':
        if not exists(join(PATH, name_path)):
            wget.download(
                'https://dict2vec.s3.amazonaws.com/dict2vec300.tar.bz2')
            tar = tarfile.open("dict2vec300.tar.bz2")
            tar.extractall()
            tar.close()
            shutil.move(name_path, join(PATH, name_path))
            print('# Downloaded dict2vec')
        else:
            print('# Already downloaded')

    if name == 'conceptnet':
        if not exists(join(PATH, name_path)):
            wget.download(
                'https://conceptnet.s3.amazonaws.com/downloads/2019/numberbatch/numberbatch-en-19.08.txt.gz'
            )
            shutil.move(name_path, join(PATH, name_path))
            print('# Downloaded Conceptnet Numberbatch')
        else:
            print('# Already downloaded')
    if name == 'bert' or name == 'bert-context':
        _ = BertTokenizer.from_pretrained('bert-large-uncased')
        _ = BertModel.from_pretrained(
            'bert-large-uncased').embeddings.word_embeddings.weight.data.numpy(
            )
        print('# Downloaded bert')
    if name == 'gpt2' or name == 'gpt2-context':
        _ = GPT2Tokenizer.from_pretrained('gpt2')
        _ = GPT2LMHeadModel.from_pretrained('gpt2')
        _ = GPT2Model.from_pretrained('gpt2')
        print('# Downloaded gpt-2')
Example #7
 def __init__(self,GPU, model_name_or_path="gpt2"):
     self.device = torch.device(GPU if torch.cuda.is_available() else "cpu")
     self.enc = GPT2Tokenizer.from_pretrained(model_name_or_path)
     self.model = GPT2LMHeadModel.from_pretrained(model_name_or_path)
     self.model.to(self.device)
     self.model.eval()
     self.start_token = '<|endoftext|>'
     print("Loaded GPT-2 model!")
Example #8
 def __init__(self, model_name_or_path="gpt2"):
     super(LM, self).__init__()
     self.enc = GPT2Tokenizer.from_pretrained(model_name_or_path)
     self.model = GPT2LMHeadModel.from_pretrained(model_name_or_path)
     self.model.to(self.device)
     self.model.eval()
     self.start_token = '<|endoftext|>'
     print("Loaded GPT-2 model!")
def run_model():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name_or_path', type=str, default='gpt2', help='pretrained model name or path to local checkpoint')
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--nsamples", type=int, default=1)
    parser.add_argument("--batch_size", type=int, default=-1)
    parser.add_argument("--length", type=int, default=-1)
    parser.add_argument("--temperature", type=int, default=1)
    parser.add_argument("--top_k", type=int, default=0)
    parser.add_argument('--unconditional', action='store_true', help='If true, unconditional generation.')
    args = parser.parse_args()
    print(args)

    if args.batch_size == -1:
        args.batch_size = 1
    assert args.nsamples % args.batch_size == 0

    np.random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    enc = GPT2Tokenizer.from_pretrained(args.model_name_or_path)
    model = GPT2LMHeadModel.from_pretrained(args.model_name_or_path)
    model.to(device)
    model.eval()

    if args.length == -1:
        args.length = model.config.n_ctx // 2
    elif args.length > model.config.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" % model.config.n_ctx)

    while True:
        context_tokens = []
        if not args.unconditional:
            raw_text = input("Model prompt >>> ")
            while not raw_text:
                print('Prompt should not be empty!')
                raw_text = input("Model prompt >>> ")
            context_tokens = enc.encode(raw_text)
        generated = 0
        for _ in range(args.nsamples // args.batch_size):
            out = sample_sequence(
                model=model, length=args.length,
                context=context_tokens if not args.unconditional else None,
                start_token=enc.encoder['<|endoftext|>'] if args.unconditional else None,
                batch_size=args.batch_size,
                temperature=args.temperature, top_k=args.top_k, device=device
            )
            out = out[:, len(context_tokens):].tolist()
            for i in range(args.batch_size):
                generated += 1
                text = enc.decode(out[i])
                print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
                print(text)
        print("=" * 80)
        if args.unconditional:
            break
Example #10
 def __init__(
         self,
         model_name_or_path="/data/pradeesh/detecting-fake-text/pytorch/"):
     super(LM, self).__init__()
     self.enc = GPT2Tokenizer.from_pretrained(model_name_or_path)
     self.model = GPT2LMHeadModel.from_pretrained(model_name_or_path)
     self.model.to(self.device)
     self.model.eval()
     self.start_token = '<|endoftext|>'
     print("Loaded GPT-2 model!")
Example #11
def get_model(args, device):
    if args.scratch:
        config = GPT2Config(n_ctx=args.context_length,
                            n_positions=args.context_length)
        model = GPT2LMHeadModel(config)
    else:
        model = GPT2LMHeadModel.from_pretrained(args.model_name_or_path)
    #import torchsummary
    #torchsummary.summary(model, (args.context_length, vocab_size), args.train_batch_size)
    return model.to(device)
Example #12
    def __init__(self, args):
        super().__init__()

        if args.gpt2_model_dir is not None:
            # load GPT2 model from file
            gpt_model_name = str(args.gpt2_model_dir) + "/"
            dict_file = gpt_model_name
            print("loading GPT2 model from {}".format(gpt_model_name))
        else:
            # load GPT2 model from huggingface cache
            gpt_model_name = args.gpt2_model_name
            dict_file = gpt_model_name

        # Load pre-trained model tokenizer (vocabulary)
        self.tokenizer = GPT2Tokenizer.from_pretrained(dict_file)

        # GPT uses a different way to represent BPE than BERT. Namely, the
        # final suffixes are indicated with a </w> suffix, while pieces that must
        # be followed are written as is. In BERT the prefixes are written as is
        # while the parts that must follow (not be followed!) have a '##' prefix.
        # There is no one-to-one conversion, but at least we can make pieces that
        # may form a full word look the same.
        # Note that we should be very careful now:
        # tokenizer.convert_tokens_to_ids won't work with our vocabulary.

        def convert_word(word):
            if word == GPT2_EOS:
                return word

            if word.startswith('Ġ'):  # the token starts with a whitespace
                return word[1:]

            return f'_{word}_'  # the token does not start with a whitespace:
            # it may not be the head of a word,
            # or it may be the head of a sentence.
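        # Illustrative (hypothetical tokens, assuming GPT-2's byte-level BPE conventions):
        #   convert_word('Ġhouse') -> 'house'   (piece that starts a new word)
        #   convert_word('ing')    -> '_ing_'   (piece that continues a previous piece)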

        _, gpt_vocab = zip(*sorted(self.tokenizer.decoder.items()))
        self.vocab = [convert_word(word) for word in gpt_vocab]
        self._init_inverse_vocab()

        # Load pre-trained model (weights)
        self.gpt_model = GPT2LMHeadModel.from_pretrained(gpt_model_name)
        self.gpt_model.eval()
        # print(self.gpt_model.config)

        # Sanity check.
        assert len(self.vocab) == self.gpt_model.config.vocab_size
        #assert 0 == self.gpt_model.config.n_special

        self.eos_id = self.gpt_model.config.eos_token_id
        self.pad_id = self.gpt_model.config.eos_token_id
        self.unk_id = self.gpt_model.config.eos_token_id
        self.bos_id = self.gpt_model.config.bos_token_id
        self.model_vocab = self.vocab
Example #13
def init():
    #seed = 42
    #np.random.seed(seed)
    #torch.random.manual_seed(seed)
    #torch.cuda.manual_seed(seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    enc = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    model.to(device)
    model.eval()
    return enc, model
Example #14
def mine(hardware):
    print('Loading GPT2...')
    gpt = GPT2LMHeadModel.from_pretrained(gpt2_model)

    knowledge_miners = {
        'coherency': KnowledgeMiner(
            data_repo + test_data,
            hardware,
            EnumeratedTemplate,
            language_model = gpt,
            template_loc = template_repo + multiple_templates)
    }

    return run_experiment('coherency', knowledge_miners)
Example #15
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size',default=1,type=int,help='Batch size for inference')

    parser.add_argument('--model_name',default='gpt2',type=str,
                        help='Pre-trained model name')
    parser.add_argument('--max_seq_length',default=128,type=int,
                        help='Maximum total input sequence length after tokenization')

    args = parser.parse_args()

    input_ids = torch.zeros([args.batch_size,args.max_seq_length],dtype=torch.long)

    model = GPT2LMHeadModel.from_pretrained(args.model_name)
    torch.onnx.export(model,input_ids,'gpt2_'+'batch'+str(args.batch_size)+'.onnx')
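A quick way to sanity-check the exported graph is to load it back and run a dummy batch through it. This is a sketch, assuming onnxruntime is installed and the default --batch_size of 1 (which makes the export above write 'gpt2_batch1.onnx'); the input name is looked up from the session rather than assumed.

import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession('gpt2_batch1.onnx')
input_name = sess.get_inputs()[0].name          # name assigned by the exporter
dummy_ids = np.zeros((1, 128), dtype=np.int64)  # (batch_size, max_seq_length)
logits = sess.run(None, {input_name: dummy_ids})[0]
print(logits.shape)                             # expected: (1, 128, vocab_size)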
 def __init__(self, text_sequence, model_type, temperature = 1.0, top_k = 0, batch_size = 1, length = 1, nsamples =1, debug = True):
     self.text_sequence = text_sequence
     #eventually will differentiate between gpt-2, BERT, etc.
     self.model_type = model_type
     model_name = 'gpt2'
     self.debug = debug
     #detect device
     self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     self.temperature = temperature
     self.top_k = top_k
     self.batch_size = batch_size
     self.length = length
     self.nsamples = nsamples
     #create encoder and model
     self.enc = GPT2Tokenizer.from_pretrained(model_name)
     self.model = GPT2LMHeadModel.from_pretrained(model_name)
     self.model.to(self.device)
     self.model.eval()
def context_score(questions, answers, opt):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    enc = GPT2Tokenizer.from_pretrained(opt.pretrained_model_path)
    model = GPT2LMHeadModel.from_pretrained(opt.pretrained_model_path)
    model.to(device)

    model.eval()

    score_list = []
    with torch.no_grad():
        for step, (question, answer) in enumerate(
                zip(questions,
                    answers)):  # actually here is a batch with batchsize=1
            # Put model in training mode.
            if not answer:
                print('space sentence')
                score_list.append(-1e6)

                continue
            joint_enc = enc.encode(
                question + ' ' +
                answer)  # + [50256]  #50256 is the token_id for <|endoftext|>
            q = enc.encode(question)
            batch_joint = torch.tensor([joint_enc]).to(device)
            batch_q = torch.tensor([q]).to(device)

            loss_joint = model(batch_joint,
                               lm_labels=batch_joint)  # average -log p per token
            loss_q = model(batch_q, lm_labels=batch_q)
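            # loss_* is the average per-token negative log-likelihood, so multiplying by the
            # number of predicted tokens gives total log-probabilities; the difference below
            # amounts to log p(answer | question).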

            p_joint = -loss_joint * (len(joint_enc) - 1)
            p_q = -loss_q * (len(q) - 1)

            score = p_joint - (p_q)

            score_list.append(score.item())

    cutoff = np.quantile(score_list, 0.05)
    modified_rating = np.array(
        [cutoff if t < cutoff else t for t in score_list])
    normed_rating = (modified_rating - cutoff) / np.abs(cutoff)
    return normed_rating
def load_model_fromlist(name):
    if name not in MODELS:
        raise Exception(str(name) + ' not a model in the list')
    print('# Loading model: ' + str(name))
    name_path = MODEL_PATH_DICT[name]
    if name == 'word2vec':
        if not exists(join(PATH, name_path)): download_model(name)
        return (gensim.models.KeyedVectors.load_word2vec_format(join(
            PATH, name_path),
                                                                binary=True))
    if name == 'glove':
        if not exists(join(PATH, name_path)): download_model(name)
        return (gensim.models.KeyedVectors.load_word2vec_format(
            join(PATH, name_path)))
    if name == 'dict2vec':
        if not exists(join(PATH, name_path)): download_model(name)
        return (gensim.models.KeyedVectors.load_word2vec_format(
            join(PATH, name_path), binary=False, unicode_errors="ignore"))
    if name == 'conceptnet':
        if not exists(join(PATH, name_path)): download_model(name)
        return (gensim.models.KeyedVectors.load_word2vec_format(
            join(PATH, name_path)))
    if name == 'bert':
        tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
        model = BertModel.from_pretrained(
            'bert-large-uncased').embeddings.word_embeddings.weight.data.numpy(
            )
        return ([model, tokenizer])
    if name == 'bert-context':
        tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
        model = BertModel.from_pretrained('bert-large-uncased',
                                          output_hidden_states=True)
        return ([model, tokenizer])
    if name == 'gpt2':
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        model = GPT2LMHeadModel.from_pretrained(
            'gpt2').transformer.wte.weight.data.numpy()
        return ([model, tokenizer])
    if name == 'gpt2-context':
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        model = GPT2Model.from_pretrained('gpt2', output_hidden_states=True)
        return ([model, tokenizer])
def run_model():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name_or_path', type=str, default='gpt2', help='pretrained model name or path to local checkpoint')
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--nsamples", type=int, default=1)
    parser.add_argument("--batch_size", type=int, default=-1)
    parser.add_argument("--length", type=int, default=-1)
    parser.add_argument("--temperature", type=float, default=1.0)
    parser.add_argument("--top_k", type=int, default=0)
    parser.add_argument('--unconditional', action='store_true', help='If true, unconditional generation.')
    parser.add_argument('--inputs_file', type=str, default=None)
    parser.add_argument('--output_file', type=str, default='results.json')
    parser.add_argument('--do_beam_search', action='store_true')
    args = parser.parse_args()
    print(args)

    if args.batch_size == -1:
        args.batch_size = 1
    assert args.nsamples % args.batch_size == 0

    np.random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    enc = GPT2Tokenizer.from_pretrained(args.model_name_or_path)
    model = GPT2LMHeadModel.from_pretrained(args.model_name_or_path)
    model.to(device)
    model.eval()

    if args.length == -1:
        args.length = model.config.n_ctx // 2
    elif args.length > model.config.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" % model.config.n_ctx)

    if args.inputs_file is None:
        decode_interactive(model, enc, device, args)
    else:
        decode_from_file(model, enc, device, args)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--output_dir", default='tuned_gpt2', type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument('--train_dataset', type=str, default='')
    
    parser.add_argument('--source_eval', type=str, default='')
    parser.add_argument('--target_eval', type=str, default='')
    parser.add_argument('--source_train', type=str, default='')
    parser.add_argument('--target_train', type=str, default='')
    
    
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=10)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--effective_batch_size',type=int, default=64)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)
    parser.add_argument('--bsz', type=int, default = 20)
    parser.add_argument('--bptt', type=int, default = 40)

    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()
#    print(args)

    model_type = 'gpt2'


    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

#    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device = torch.device(type='cuda')
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

#    if not args.do_train and not args.do_eval:
#        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
        

    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2').to('cuda')

    model.to(device)


    #file_train = args.train_dataset #'cnn_train.txt'
    #file_eval =  args.eval_dataset #'cnn_valid.txt'
    bptt = args.bptt
    bsz = args.bsz
    

#    X_eval, nbatch_eval = load_dataset(file_eval, tokenizer, bptt, bsz)
#    X_train, nbatch_train =  load_dataset(file_train, tokenizer, bptt, bsz)
    
    batches_eval, labels_eval, nbatch_eval = load_dataset(args.source_eval, args.target_eval, tokenizer, bptt, bsz)
    batches_train, labels_train, nbatch_train =  load_dataset(args.source_train, args.target_train, tokenizer, bptt, bsz)
    
    

    # Prepare optimizer
#    param_optimizer = list(model.parameters())
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    
    print('here 3')
#    num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size
    num_train_optimization_steps = nbatch_train * args.num_train_epochs
    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                           lr=args.learning_rate,
                           warmup=args.warmup_proportion,
                           max_grad_norm=args.max_grad_norm,
                           weight_decay=args.weight_decay,
                           t_total=num_train_optimization_steps)

    eval_loss_min = None
    print('here 4')
    model.to(device)

    nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
    model.train()
    for epoch_i in trange(int(args.num_train_epochs), desc="Epoch"):
        tr_loss = 0
        nb_tr_steps = 0
        
        for i_batch in tqdm(list(range(nbatch_train)), desc='Training epoch {}'.format(epoch_i)):
            batch = batches_train[i_batch]#X_train[:, i_batch*bsz:(1+i_batch)*bsz].permute(1,0)
            
            batch = batch.cuda()
            lm_labels = labels_train[i_batch].cuda()
            if batch.numel() == 0:
                break
            
            #loss = model(batch, lm_labels = labels_train[i_batch].cuda())
                            # TRY DOING IT MANUALLY
            loss_fct = CrossEntropyLoss(reduction = 'none')
            lm_logits,_ = model(batch)
            shift_logits = lm_logits[:, :-1, :].contiguous()
            shift_labels = batch[:,1:].contiguous()
            
            shift_labels_mask = (lm_labels[:,1:].contiguous().view(-1) != -1).float()
            
            loss_mat = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                        shift_labels.view(-1))
            loss = (loss_mat*shift_labels_mask).view(-1).sum()/shift_labels_mask.sum() # avg over non-masked indices
            
            loss.backward()
            
            # only step the model if you've gone through 'effective_batch_size' examples
            if (i_batch*args.train_batch_size) % args.effective_batch_size == 0 and i_batch != 0:
                optimizer.step()
                optimizer.zero_grad()
                
            tr_loss += loss.item()
            

            exp_average_loss = loss.item() if exp_average_loss is None else 0.7*exp_average_loss+0.3*loss.item()
            nb_tr_steps += 1
         
            
            
            ###
            # Evaluations
            ###
            
            
            if i_batch % 1000 == 0: # get eval score
                eval_loss = eval_model(model, nbatch_eval,batches_eval,labels_eval, bsz)
                
                # if eval_loss improves, save model
                if eval_loss_min is None or eval_loss < eval_loss_min:
                    eval_loss_min = eval_loss
                    
                    # save model if eval loss is lower
                    model_to_save = model
                    # If we save using the predefined names, we can load using `from_pretrained`
                    output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
                    output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        
                    torch.save(model_to_save.state_dict(), output_model_file)
                    to_json_file(model_to_save.config,output_config_file)
                
                print('eval_loss {}'.format(eval_loss))
                model.train()
                
            if i_batch % 200 == 0: # try generating from model 
                print("Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, optimizer.get_lr()[0]))

                model.eval()
                if model_type == 'gpt':
                    encode = lambda a: tokenizer.convert_tokens_to_ids(tokenizer.tokenize(a))
                    decode = tokenizer.decode
                elif model_type == 'gpt2':
                    encode = tokenizer.encode
                    decode = tokenizer.decode
                
                generate_from_model(encode, decode, model = model,model_type = model_type)
                model.train()
import numpy as np
import torch
import torch.nn.functional as F
import tqdm
from tensorboardX import SummaryWriter
from torch.utils.data import DataLoader, Dataset
from tqdm import trange

import pytorch_pretrained_bert
from data_loader import get_data_loader
from model_sampler import print_samples
from pytorch_pretrained_bert import GPT2LMHeadModel, GPT2Tokenizer, OpenAIAdam
from torch.utils.data import DataLoader, Dataset, Subset
model_name = 'gpt2'
enc = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)


device='cpu'
beam_width = 130
stopwords = []

def to_list(tensor):
    return list(tensor.cpu().numpy())

def predict(line, max_predictions):
    """Give continuation of the line with at most max_predictions BPE tokens. Returns line extended with predictions of
     the model."""
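    # The original body is not shown here; below is a minimal greedy-decoding sketch, assuming
    # the pytorch_pretrained_bert interface used above (calling the model without lm_labels
    # returns (logits, past)) and ignoring beam_width / stopwords, which the original
    # presumably uses for beam search.
    tokens = enc.encode(line)
    past = None
    with torch.no_grad():
        for _ in range(max_predictions):
            input_ids = torch.tensor([tokens] if past is None else [tokens[-1:]], device=device)
            logits, past = model(input_ids, past=past)
            next_id = int(torch.argmax(logits[0, -1, :]).item())
            tokens.append(next_id)
    return enc.decode(tokens)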
Example #22
def run():
    parser = ArgumentParser()
    parser.add_argument("--model_type", type=str, default="gpt", help="gpt or gpt2")
    parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")
    parser.add_argument("--filename", type=str, default="data/instances_dev.pkl", help="File to use for decoding")
    parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=50, help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--temperature", type=int, default=0.7, help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")

    if args.model_type == 'gpt2':
        tokenizer = GPT2Tokenizer.from_pretrained(args.model_checkpoint)
        model = GPT2LMHeadModel.from_pretrained(args.model_checkpoint)
    else:
        tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_checkpoint)
        model = OpenAIGPTLMHeadModel.from_pretrained(args.model_checkpoint)

    model.to(args.device)
    model.eval()

    data = get_dataset_from_file(tokenizer, args.filename)
    final_output_dict = {
        "version": "squash-2.0",
        "data": [{
            "paragraphs": []
        }]
    }
    question_number = 0
    # For all the instances corresponding to one paragraph, the model input format is: paragraph + answer + question.
    # The paragraph will be common across all the instances.
    # "past" can be used to reuse the precomputed hidden state for the paragraph in subsequent predictions.
    
    import copy
    
    previous_para_index = None
    past = None
    for inst in tqdm.tqdm(data):
        with torch.no_grad():
            current_para_index = inst['para_index']
            if current_para_index != previous_para_index:
                previous_para_index = current_para_index
                past = None
                current_inst = copy.deepcopy(inst)
                # only keeping paragraph details in the instance to get its hidden states
                current_inst['question'] = []
                current_inst['answer'] = []
                instance, _ = build_input_from_segments(current_inst, tokenizer, with_eos=False)
                input_ids = torch.tensor(instance['input_ids'][:-2], device=args.device).unsqueeze(0)
                token_type_ids = torch.tensor(instance['token_type_ids'][:-2], device=args.device).unsqueeze(0)
                _, past = model(input_ids, token_type_ids=token_type_ids, past=past)  # output "past" will hold the paragraph embeddings
            output = sample_sequence(inst, tokenizer, model, args,past)

        original_paragraph = tokenizer.decode(output['paragraph'])
        generated_question = tokenizer.decode(output['question'], skip_special_tokens=True)
        original_answer = tokenizer.decode(output['answer'], skip_special_tokens=True)
        para_index = inst['para_index']

        # Output in a SQUAD-like format with questions clumped together under their parent paragraph
        if len(final_output_dict["data"][0]["paragraphs"]) > para_index:
            # verify whether the paragraph text is identical
            assert original_paragraph == final_output_dict["data"][0]["paragraphs"][para_index]['context']
            # append the question answer pair
            final_output_dict["data"][0]["paragraphs"][para_index]['qas'].append({
                'id': 'question_%d' % question_number,
                'question': generated_question,
                'answers': [{
                    'text': original_answer,
                    'answer_start': original_paragraph.index(original_answer)
                }],
                'class': output['class'],
                'algorithm': output['algorithm'],
                'is_impossible': False
            })
        else:
            # add a new question to the list of QA pairs
            final_output_dict['data'][0]['paragraphs'].append({
                'context': original_paragraph,
                'qas': [{
                    'id': 'question_%d' % question_number,
                    'question': generated_question,
                    'answers': [{
                        'text': original_answer,
                        'answer_start': original_paragraph.index(original_answer)
                    }],
                    'class': output['class'],
                    'algorithm': output['algorithm'],
                    'is_impossible': False
                }]
            })

        question_number += 1

    with open("squash/temp/generated_questions.json", "w") as f:
        f.write(json.dumps(final_output_dict))
Example #23
 def __init__(self):
     self.model = GPT2LMHeadModel.from_pretrained('gpt2')
     self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
     self.model.cuda()
     self.model.eval()
Example #24
def run_model():
    parser = argparse.ArgumentParser(description="")
    parser.add_argument('--model-path',
                        type=str,
                        help='pretrained model path to local checkpoint')
    parser.add_argument("--batch-size", type=int, default=40)
    parser.add_argument('--data-dir', type=str, default='../data')
    parser.add_argument('--dataset', type=str, default='../data')
    parser.add_argument("--test", action='store_true', default=False)
    args = parser.parse_args()
    print(args)

    if args.batch_size == -1:
        args.batch_size = 1

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = GPT2LMHeadModel.from_pretrained('gpt2', cache_dir='out/cache')

    if args.model_path:
        state = torch.load(args.model_path, map_location='cpu')
        model.load_state_dict(state)

    tokenizer = GPT2Tokenizer(os.path.join(args.data_dir, 'gpt2-vocab.json'),
                              os.path.join(args.data_dir, 'gpt2-merges.txt'))
    # Hack to allow tokenizing longer sequences.
    tokenizer.max_len = int(1e12)

    model.half().to(device)
    model.eval()
    print('Model loaded.')

    d_val = PromptDataset(
        os.path.join(
            args.data_dir, 'writingPrompts/{}.wp_source'.format(
                'test' if args.test else 'valid')),
        os.path.join(
            args.data_dir, 'writingPrompts/{}.wp_target'.format(
                'test' if args.test else 'valid')), wp_preprocess)
    d_val_raw = PromptDataset(
        os.path.join(
            args.data_dir, 'writingPrompts/{}.wp_source'.format(
                'test' if args.test else 'valid')),
        os.path.join(
            args.data_dir, 'writingPrompts/{}.wp_target'.format(
                'test' if args.test else 'valid')))

    print('Data loaded.')

    print('Running evaluation...')
    with torch.no_grad():
        ppls = []
        word_ppls = []
        token_diffs = []
        num_errs = 0

        batch = []
        for sample_id, (text, check_text) in enumerate(zip(d_val, d_val_raw)):
            bpe_tokens = [tokenizer.encoder['<|endoftext|>']
                          ] + tokenizer.encode(text)
            # (This limit applies to GPT2)
            bpe_tokens = bpe_tokens[:1025]
            # Pad
            batch.append(
                (bpe_tokens + [0] * (1025 - len(bpe_tokens)), len(bpe_tokens),
                 check_text.split('---\n')[1].split(' ')))

            if len(batch) == args.batch_size or len(
                    word_ppls) == len(d_val) - 1:
                x, x_lens, raw_tokens = zip(*batch)
                token_tensor = torch.tensor(x, dtype=torch.long, device=device)

                # Compute log probs
                lps = compute_logprobs(token_tensor, model)
                token_tensor = token_tensor.cpu().numpy()

                # Compute individually
                for i in range(lps.shape[0]):
                    try:
                        # Mask out some tokens
                        target_tokens = token_tensor[i, 1:x_lens[i]]
                        log_probs = lps[i, :x_lens[i] - 1]
                        ppl, token_diff = word_level_ppl(
                            target_tokens,
                            log_probs.cpu().float().numpy(), tokenizer,
                            raw_tokens[i])
                        token_diffs.append(token_diff)
                        word_ppls.append(ppl)
                        ppls.append(torch.exp(-log_probs.mean()).item())
                    except Exception as e:
                        print('Skipping anomaly.')
                        print(e)
                        num_errs += 1
                print(
                    'Word Level PPL {:.2f} BPE PPL {:.2f} Diff {:.2f} Done: {:.2f}% Skip {}'
                    .format(np.mean(word_ppls), np.mean(ppls),
                            np.mean(token_diffs), sample_id / len(d_val) * 100,
                            num_errs))
                batch = []
#   "vocab_size": 50257
# }

## Predict hidden states features for each layer
with torch.no_grad():
    hidden_states_1, past = model(tokens_tensor_1)
    print(hidden_states_1.shape)  # torch.Size([1, 6, 768])
    print(len(past), past[0].shape)  # 12 torch.Size([2, 1, 12, 6, 64])
    hidden_states_2, past = model(tokens_tensor_2, past=past)
    print(hidden_states_2.shape)  # torch.Size([1, 8, 768])
    print(len(past), past[0].shape)  # 12 torch.Size([2, 1, 12, 14, 64]); 14 = 8 + 6
    ## past can be used to reuse precomputed hidden state in a subsequent predictions (see beam-search examples in the run_gpt2.py example).

##################################################################
## GPT2LMHeadModel
model = GPT2LMHeadModel.from_pretrained('/Users/coder352/datasets/WordVec/pytorch_pretrained_bert/gpt2/')
model.eval()

## Predict all tokens
with torch.no_grad():
    predictions_1, past = model(tokens_tensor_1)
    predictions_2, past = model(tokens_tensor_2, past=past)
    print(predictions_2.shape)  # torch.Size([1, 8, 50257])
    print(len(past), past[0].shape)  # 12 torch.Size([2, 1, 12, 14, 64])

## get the predicted last token
predicted_index = torch.argmax(predictions_2[0, -1, :]).item(); print(predicted_index)  # 508
predicted_token = tokenizer.decode([predicted_index]); print(predicted_token)  #  who

##################################################################
## Transformer-XL
Example #26
    def __init__(self, opt, shared=None):
        super(TransformerAgent, self).__init__(opt, shared)

        args = AttrDict(
            opt)  # to keep most commands identical to the interact.py script
        self.args = args

        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__file__)
        self.logger.info(pformat(args))

        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

        if shared is None:
            self.logger.info("Get pretrained model and tokenizer")
            if args.model_checkpoint == "":
                args.model_checkpoint = download_pretrained_model()

            if args.model.startswith('gpt2'):
                self.tokenizer = GPT2Tokenizer.from_pretrained(
                    args.model_checkpoint)
                if self.args.eval_type == "hits@1":
                    self.model_checkpoint = GPT2DoubleHeadsModel.from_pretrained(
                        args.model_checkpoint)
                else:
                    self.model_checkpoint = GPT2LMHeadModel.from_pretrained(
                        args.model_checkpoint)
            elif args.model == 'openai-gpt':
                self.tokenizer = OpenAIGPTTokenizer.from_pretrained(
                    args.model_checkpoint)
                if self.args.eval_type == "hits@1":
                    self.model_checkpoint = OpenAIGPTDoubleHeadsModel.from_pretrained(
                        args.model_checkpoint)
                else:
                    self.model_checkpoint = OpenAIGPTLMHeadModel.from_pretrained(
                        args.model_checkpoint)
            else:
                raise NotImplementedError(
                    'model type "%s" not implemented. Use either "openai-gpt" or "gpt2"'
                    % args.model)
            self.model_checkpoint.to(args.device)
            self.model_checkpoint.eval()

            self.logger.info("Build BPE prefix dictionary")
            convai_dict = build_dict()
            assert len(convai_dict) == 19304
            self.prefix2words = self.get_prefix2words(convai_dict)
        else:
            self.model_checkpoint = shared['model']
            self.tokenizer = shared['tokenizer']
            self.prefix2words = shared['prefix2words']

        self.special_tokens_ids = self.tokenizer.convert_tokens_to_ids(
            SPECIAL_TOKENS)

        self.persona = []
        self.history = []
        self.labels = []

        self.reset()
Example #27
def main():
    # Parse the arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, default='gpt2',
                        help='pretrained model name')
    parser.add_argument("--bucket_name", type=str, default="al-ml-data")
    parser.add_argument("--s3_key", type=str,
                        default="e2e_training/ gpt2_train_with_ids_indexed.pkl")
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=1)
    parser.add_argument('--train_batch_size', type=int, default=16)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)

    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Set the seed for random, numpy, PyTorch
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # This loading function also adds new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the RocStories dataset
    #special_tokens = ['<POS>', '<NEG>','<END>']
    tokenizer = GPT2Tokenizer.from_pretrained(args.model_name)
    #start_token_id = tokenizer.convert_tokens_to_ids(['<START>'])[0]
    model = GPT2LMHeadModel.from_pretrained(args.model_name)
    model.to(device)
    
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Load and encode dataset
    def tokenize_and_encode(bucket_name, s3_key):
        '''
        This method downloads the pre-tokenized dataset from S3 and loads it
        :param bucket_name: Name of the S3 bucket, dtype: str
        :param s3_key: Key of the pickled dataset within the bucket, dtype: str
        :return: encoded dataset, dtype: list
        '''
        s3 = boto3.resource('s3')
        try:
            s3.Bucket(bucket_name).download_file(s3_key, '/tmp/gpt2_train_v1.pkl')
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == "404":
                print("The object does not exist.")
            else:
                raise
                
        with open("/tmp/gpt2_train_v1.pkl","rb") as fp1:
            data = pickle.load(fp1)
        os.remove("/tmp/gpt2_train_v1.pkl")
        
        return data

    logger.info("Encoding dataset...")
    train_dataset = tokenize_and_encode(args.bucket_name, args.s3_key)
    print(len(train_dataset))
    train_dataset = [c for c in train_dataset if len(c) > 0]
    print(len(train_dataset))
    #eval_dataset = tokenize_and_encode(args.eval_dataset)
    print("Training samples = {}".format(len(train_dataset)))
    #print("Validation samples = {}".format(len(eval_dataset)))
    print("Example = {}".format(train_dataset[0]))
    time.sleep(2)
    #train_dataset = [x for x in train_dataset if len(x) <= 300]
    #eval_dataset = [x for x in eval_dataset if len(x) <= 300]
    # Compute the max input length for the Transformer
    #input_length = max(max(len(t) for t in train_dataset), max(len(q) for q in eval_dataset))
    input_length = max(len(t) for t in train_dataset)
    if n_gpu > 1:
        input_length = min(input_length, model.module.config.n_positions)
    else:
        input_length = min(input_length, model.config.n_positions)  # Max size of input for the pre-trained model
    print("Input Length = {}".format(input_length))
                       
    def pre_process_dataset(encoded_dataset, input_length):
        """
        This method is to create torch tensor of input ids and lm labels
        :param encoded_dataset: Input dataset, dtype: list
        :param input_length: Maximum length of sentence from training and eval dataset, dtype: int
        :return: a tuple (input_ids, lm_labels) of torch tensors, each of shape [len(encoded_dataset), input_length]
        """

        n_batch = len(encoded_dataset)
        input_ids = np.zeros(shape=(n_batch, input_length), dtype=np.int64)
        lm_labels = np.full(shape=(n_batch, input_length), fill_value=-1, dtype=np.int64)
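        # -1 is the label value ignored by the LM loss in pytorch_pretrained_bert
        # (CrossEntropyLoss(ignore_index=-1)), so only the span after the 'start'
        # marker contributes to the loss below.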

        for i, tokens in enumerate(encoded_dataset):
            input_ids[i, :len(tokens)] = tokens[:input_length]
            start_token_index = tokens.index(9688) # 9688 is id for token 'start'
            lm_labels[i, start_token_index+2 : len(tokens)-1] = tokens[start_token_index+3 : input_length] 

        input_ids = torch.tensor(input_ids)
        lm_labels = torch.tensor(lm_labels)
        tensor_dataset = (input_ids, lm_labels)
        #tensor_dataset.append(torch.tensor(d) for d in all_inputs)

        return tensor_dataset

    # Prepare input tensors and dataloders
    train_tensor_dataset = pre_process_dataset(train_dataset, input_length=input_length)
    #eval_tensor_dataset = pre_process_dataset(eval_dataset, input_length=input_length)
    print(train_tensor_dataset[0].shape, train_tensor_dataset[1].shape)
    print("Training Example Input ids= {}".format(train_tensor_dataset[0][0]))
    print("Training Example Language Modeling ids = {}".format(train_tensor_dataset[1][0]))
    time.sleep(10)
    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

    #eval_data = TensorDataset(*eval_tensor_dataset)
    #eval_sampler = RandomSampler(eval_data)
    #eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size
    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                           lr=args.learning_rate,
                           warmup=args.warmup_proportion,
                           max_grad_norm=args.max_grad_norm,
                           weight_decay=args.weight_decay,
                           t_total=num_train_optimization_steps)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, lm_labels = batch
                loss = model(input_ids, lm_labels=lm_labels)
                if n_gpu > 1:
                    loss.mean().backward()
                else:
                    loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                if n_gpu > 1:
                    tmp_loss = loss.mean().item()
                else:
                    tmp_loss = loss.item()
                exp_average_loss = tmp_loss if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * tmp_loss
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, optimizer.get_lr()[0])
                '''
                if (step > 0  and step % 20 == 0):
                    print("SAving Model....")
                    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
                    output_model_file = os.path.join(args.output_dir, "language_model_{}.bin".format(epoch+1))
                    config = model.module.config if hasattr(model, 'module') else model.config
                    torch.save(model_to_save.state_dict(), output_model_file)
		'''
            model_to_save = model.module if hasattr(model, 'module') else model
            output_model_file = os.path.join(args.output_dir, "pytorch_model_final.bin")
            config = model.module.config if hasattr(model, 'module') else model.config
            torch.save(model_to_save.state_dict(), output_model_file)
            model_state_dict = torch.load(output_model_file)
            model = GPT2LMHeadModel(config)
            model.load_state_dict(model_state_dict)
            model.to(device)
            if n_gpu > 1:
                model = torch.nn.DataParallel(model)
		
    # Save a trained model
    # if args.do_train:
    #     model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
    #     output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    #     config = model.config
    #     torch.save(model_to_save.state_dict(), output_model_file)
    #
    #     # Load a trained model that you have fine-tuned
    #     model_state_dict = torch.load(output_model_file)
    #     model = OpenAIGPTLMHeadModel(config)
    #     model.load_state_dict(model_state_dict)
    #     model.to(device)

    if args.do_eval:
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, lm_labels = batch
            with torch.no_grad():
                lm_loss = model(input_ids, lm_labels=lm_labels)

            eval_loss += lm_loss.mean().item()

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        train_loss = tr_loss/nb_tr_steps if args.do_train else None
        result = {'eval_loss': eval_loss,
                  'train_loss': train_loss}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
Beispiel #28
0
def run_lm(data, year, model_name, predictions_dict):
    """
    Using BERT or GPT2 as Language models
    :param data: The actual data of the year stored on dictionary
    :param year: The corresponding year of the data. It is used when we save the predictions
    :param model_name: Name of LM_experiments we used (BERT or GPT2). It is used on the output file name
    :param predictions_dict: A dict where we save the predictions from our experiments
    :return: The updated prediction_dict
    """

    model, tokenizer, vocab_size = None, None, None
    if model_name == 'GPT2_LM':
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        vocab_size = len(tokenizer.encoder)

        model = GPT2LMHeadModel.from_pretrained('gpt2')

    elif model_name == 'BERT_LM':
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                                  do_lower_case=True)
        vocab_size = len(tokenizer.vocab)

        model = BertForMaskedLM.from_pretrained('bert-base-uncased')

    model.eval()
    model.to('cuda')

    # Used to normalize the LM's raw prediction scores into probabilities in [0, 1]
    soft_max = torch.nn.Softmax(dim=-1)

    # For each value of k, initialize a dict to store the predictions
    each_case_k_predictions = [{} for _ in range(MAX_BPES_TO_SEARCH)]

    for doc_id, doc in data.items():

        for j in range(MAX_BPES_TO_SEARCH):
            each_case_k_predictions[j].update({doc_id: {}})

        for peer_id, peer in doc['peer_summarizers'].items():
            summary = peer['system_summary']

            if not_valid(peer_id=peer_id, doc_id=doc_id):
                for j in range(MAX_BPES_TO_SEARCH):
                    each_case_k_predictions[j][doc_id].update(
                        {peer_id: vocab_size})
                continue

            indexed_summary = None
            if model_name == 'GPT2_LM':
                indexed_summary = tokenizer.encode(summary)

            elif model_name == 'BERT_LM':
                # BERT can handle at most 512 BPE tokens
                tokenized_summary = tokenizer.tokenize(summary)[:512]
                indexed_summary = tokenizer.convert_tokens_to_ids(
                    tokenized_summary)

            # Convert the SUMMARY to PyTorch tensor
            tokens_tensor = torch.tensor([indexed_summary])
            tokens_tensor = tokens_tensor.to('cuda')

            with torch.no_grad():
                if summary != '':

                    if model_name == 'GPT2_LM':
                        predictions, _ = model(
                            tokens_tensor)  # GPT-2 also returns the attention cache (presents), which we discard

                    elif model_name == 'BERT_LM':
                        predictions = model(
                            tokens_tensor)  # BERT returns only the predictions

                    probability_distribution = []

                    # i --> index of the word we are looking at (i+1 is the next one)
                    for i in range(predictions.shape[1] - 1):
                        # Normalize the predictions of LM_experiments by passing them through the softmax
                        soft_predictions = soft_max(
                            predictions[0, i, :]).reshape(vocab_size)

                        if model_name == 'GPT2_LM':
                            # GPT-2 -> the probabilities (predictions) correspond to the next word
                            p = soft_predictions[tokens_tensor[0,
                                                               i + 1]].item()

                        elif model_name == 'BERT_LM':
                            # BERT -> the probabilities (predictions) correspond to the current (masked) word
                            p = soft_predictions[tokens_tensor[0, i]].item()

                        probability_distribution.append(math.log(p, 2))

                    perplexities = get_perplexity(
                        probabilities=probability_distribution)

                    for j in range(MAX_BPES_TO_SEARCH):
                        each_case_k_predictions[j][doc_id].update(
                            {peer_id: perplexities[j]})

                else:
                    print('BLANK')
                    for j in range(MAX_BPES_TO_SEARCH):
                        each_case_k_predictions[j][doc_id].update(
                            {peer_id: vocab_size})

    k = compute_correlations_of_each_k(data=data,
                                       predictions=each_case_k_predictions,
                                       model_name=model_name,
                                       year=year)

    return save_the_best_predictions(
        best_predictions=each_case_k_predictions[k - 1],
        predictions_dict=predictions_dict,
        year=year,
        model_name=model_name)
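get_perplexity is defined elsewhere in this repository; from the call site it receives base-2 log-probabilities and returns one value per k in [1, MAX_BPES_TO_SEARCH]. A hedged sketch of what such a helper could look like, under the assumption that each k restricts the computation to the k least probable BPEs of the summary (the exact definition in the original code may differ):

MAX_BPES_TO_SEARCH = 10  # assumption: mirrors the constant used above

def get_perplexity(probabilities):
    """Hypothetical helper: `probabilities` are per-token log2-probabilities.

    Returns one perplexity per k in [1, MAX_BPES_TO_SEARCH], each computed
    over the k least probable tokens of the summary.
    """
    sorted_log_probs = sorted(probabilities)  # ascending: least probable tokens first
    perplexities = []
    for k in range(1, MAX_BPES_TO_SEARCH + 1):
        worst_k = sorted_log_probs[:k]
        mean_log2_prob = sum(worst_k) / len(worst_k)
        perplexities.append(2 ** (-mean_log2_prob))
    return perplexities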
Beispiel #29
0
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--nsamples", type=int, default=1)
    parser.add_argument("--batch_size", type=int, default=-1)
    parser.add_argument("--length", type=int, default=randrange(50, 150, 1))
    parser.add_argument("--temperature", type=float, default=1.0)
    parser.add_argument("--top_k", type=int, default=5)
    parser.add_argument('--unconditional',
                        action='store_true',
                        help='If true, unconditional generation.')
    args = parser.parse_args()
    print(args)
    if args.batch_size == -1:
        args.batch_size = 1
    assert args.nsamples % args.batch_size == 0

    np.random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained("gpt2")
    model.to(device)

    if args.length == -1:
        args.length = model.config.n_ctx // 2
    elif args.length > model.config.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" %
                         model.config.n_ctx)

    app.run("0.0.0.0", port=int(os.environ.get("PORT", 5000)))
Beispiel #30
0
def run():
    parser = ArgumentParser()
    parser.add_argument("--model_type",
                        type=str,
                        default="gpt",
                        help="gpt or gpt2")
    parser.add_argument("--model_checkpoint",
                        type=str,
                        default="",
                        help="Path, url or short name of the model")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--filename",
                        type=str,
                        default="data/instances_dev.pkl",
                        help="File to use for decoding")
    parser.add_argument("--no_sample",
                        action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length",
                        type=int,
                        default=50,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length",
                        type=int,
                        default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--temperature",
                        type=int,
                        default=0.7,
                        help="Sampling softmax temperature")
    parser.add_argument(
        "--top_k",
        type=int,
        default=0,
        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument(
        "--top_p",
        type=float,
        default=0.9,
        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")

    # While using SQUASH in the pipeline mode, prefer using the --key flag
    parser.add_argument(
        "--key",
        type=str,
        default=None,
        help=
        "Override the default settings if the key is set, used in pipeline mode"
    )
    args = parser.parse_args()

    if args.key is not None:
        # Override the filename and top_p default settings if args.key is set
        # This is done when the question generation module is being used in the SQUASH pipeline mode
        args.filename = "squash/temp/%s/input.pkl" % args.key

        with open("squash/temp/%s/metadata.json" % args.key, "r") as f:
            metadata = json.loads(f.read())
        args.top_p = metadata["settings"]["top_p"]

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")

    if args.model_type == 'gpt2':
        tokenizer = GPT2Tokenizer.from_pretrained(args.model_checkpoint)
        model = GPT2LMHeadModel.from_pretrained(args.model_checkpoint)
    else:
        tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_checkpoint)
        model = OpenAIGPTLMHeadModel.from_pretrained(args.model_checkpoint)

    model.to(args.device)
    model.eval()

    data = get_positional_dataset_from_file(tokenizer, args.filename)
    final_output_dict = {"version": "squash-2.0", "data": [{"paragraphs": []}]}
    question_number = 0

    para_cache = {"index": None, "hidden_states": None}

    for inst in tqdm.tqdm(data):
        with torch.no_grad():
            para_index = inst["para_index"]
            # Questions from the same paragraph all appear together
            # We can re-use the paragraph hidden representations for different questions in the same paragraph
            if para_index != para_cache["index"]:
                # Since we have moved to a new paragraph, generate its cache
                para_cache["hidden_states"] = None
                # Ignore the answer and question while building the input
                instance, _ = build_para_only_input_from_segments(
                    inst, tokenizer)
                input_ids = torch.tensor(instance['input_ids'],
                                         device=args.device).unsqueeze(0)
                token_type_ids = torch.tensor(instance['token_type_ids'],
                                              device=args.device).unsqueeze(0)

                # Run a forward pass to generate the para caches
                _, para_cache["hidden_states"] = model(
                    input_ids, token_type_ids=token_type_ids)

            # Sample a question using the paragraph cache
            output = sample_sequence(inst, tokenizer, model, args, para_cache)

        original_paragraph = tokenizer.decode(output['paragraph'])
        generated_question = tokenizer.decode(output['question'],
                                              skip_special_tokens=True)
        original_answer = tokenizer.decode(output['answer'],
                                           skip_special_tokens=True)
        para_index = inst['para_index']
        para_cache["index"] = inst['para_index']

        # verify whether the answer position is correct, since this will be utilized for filtering
        original_ans_position = output["answer_position"]
        if original_paragraph[
                output["answer_position"]:output["answer_position"] +
                len(original_answer)] != original_answer:
            # This should never be executed, only used as a last resort
            logger.info("Answer mismatch!")
            original_ans_position = original_paragraph.index(original_answer)

        # Output in a SQUAD-like format with questions clumped together under their parent paragraph
        if len(final_output_dict["data"][0]["paragraphs"]) > para_index:
            # verify whether the paragraph text is identical
            assert original_paragraph == final_output_dict["data"][0][
                "paragraphs"][para_index]['context']
            # append the question answer pair
            final_output_dict["data"][0]["paragraphs"][para_index][
                'qas'].append({
                    'id':
                    'question_%d' % question_number,
                    'question':
                    generated_question,
                    'answers': [{
                        'text': original_answer,
                        'answer_start': original_ans_position,
                    }],
                    'class':
                    output['class'],
                    'algorithm':
                    output['algorithm'],
                    'is_impossible':
                    False
                })
        else:
            # add a new question to the list of QA pairs
            final_output_dict['data'][0]['paragraphs'].append({
                'context':
                original_paragraph,
                'qas': [{
                    'id':
                    'question_%d' % question_number,
                    'question':
                    generated_question,
                    'answers': [{
                        'text': original_answer,
                        'answer_start': original_ans_position,
                    }],
                    'class':
                    output['class'],
                    'algorithm':
                    output['algorithm'],
                    'is_impossible':
                    False
                }]
            })

        question_number += 1

    with open("squash/temp/%s/generated_questions.json" % args.key, "w") as f:
        f.write(json.dumps(final_output_dict))
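The paragraph cache above works because this (older) GPT-2 head returns its attention key/value states alongside the logits; feeding them back as past lets later forward passes skip re-encoding the paragraph. A minimal sketch of that pattern (the real sample_sequence lives elsewhere in the repository; with recent transformers versions the keyword is past_key_values):

import torch

def continue_with_cache(model, past, new_token_ids, device="cuda"):
    """Illustrative: run only the new tokens, reusing cached key/value states.

    `past` is the second element returned by a previous forward pass over the
    paragraph, so the model does not recompute attention over those tokens.
    """
    input_ids = torch.tensor([new_token_ids], device=device)
    with torch.no_grad():
        logits, past = model(input_ids, past=past)
    return logits[:, -1, :], past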
Beispiel #31
0
 def __init__(self):
     # Load pre-trained model tokenizer (vocabulary)
     self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
     self.model = GPT2LMHeadModel.from_pretrained('gpt2')
     self.model.eval()
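A wrapper like this is usually paired with a scoring helper. A hedged sketch of one possible score method (not part of the original snippet; it assumes torch is imported at module level and a recent transformers release, where passing labels makes the model return the loss as the first output, while other examples on this page use the older lm_labels API):

 def score(self, sentence):
     # Illustrative: mean LM loss of the sentence (lower roughly means more fluent).
     input_ids = torch.tensor([self.tokenizer.encode(sentence)])
     with torch.no_grad():
         loss = self.model(input_ids, labels=input_ids)[0]
     return loss.item()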
Beispiel #32
0
                              size_hint=(1, 1.5))
        self.window.add_widget(self.user)

        # button widget
        self.button = Button(text="Generate !",
                             size_hint=(1, 0.5),
                             bold=True,
                             background_color='32A67F',
                             background_normal='')
        self.button.bind(on_press=self.callback)
        self.window.add_widget(self.button)

        self.window.add_widget(self.m_output0)

        self.window.add_widget(self.m_output1)
        # self.window.add_widget(self.m_output2)

        return self.window

    def callback(self, instance):
        self.m_output1.text, _ = Gen_new(self.user.text, 20)


if __name__ == "__main__":
    # Configuration
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    translator = Translator()

    SayHello().run()
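Gen_new and Translator are defined elsewhere in this script. A hedged, illustrative sketch of a generation helper with the same call signature, assuming it returns the generated text plus the raw token ids (the original implementation may sample and translate instead):

def Gen_new(prompt, max_new_tokens):
    """Illustrative only: greedy continuation of `prompt` with the global GPT-2 model."""
    import torch
    input_ids = torch.tensor([tokenizer.encode(prompt)])
    with torch.no_grad():
        for _ in range(max_new_tokens):
            logits = model(input_ids)[0][:, -1, :]
            next_id = torch.argmax(logits, dim=-1, keepdim=True)
            input_ids = torch.cat([input_ids, next_id], dim=-1)
    text = tokenizer.decode(input_ids[0].tolist())
    return text, input_ids[0].tolist()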