Example #1
def load_openai_gpt(n_special=1, n_ctx=512):
    text_encoder = TextEncoder("pytorch-openai-transformer-lm/model/encoder_bpe_40000.json",
                               "pytorch-openai-transformer-lm/model/vocab_40000.bpe")
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)
    vocab = n_vocab + n_special + n_ctx

    args = DEFAULT_CONFIG
    lm_model = LMModel(args, vocab, n_ctx, return_probs=True)
    load_openai_pretrained_model(lm_model.transformer, n_ctx=n_ctx, n_special=n_special,
                                 path="pytorch-openai-transformer-lm/model/",
                                 path_names="pytorch-openai-transformer-lm/")
    # lm_model.to(device)
    lm_model.return_probs = False
    lm_model.eval()
    return lm_model, text_encoder
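
A minimal, hedged usage sketch for the loader above; the prompt is illustrative, and batching follows the make_batch helper shown in Example #4:

# Sketch (assumed prompt): load the pretrained LM and BPE-encode a context.
lm_model, text_encoder = load_openai_gpt()
tokens = text_encoder.encode(["The cat sat on the"])  # list with one list of BPE ids
# The model consumes (token id, position id) pairs; see make_batch in Example #4.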
Example #2
    def __init__(self):
        # initialize lm and text encoder and everything

        # set up the encoder to turn words into indices
        encoder_path = 'model/encoder_bpe_40000.json'
        bpe_path = 'model/vocab_40000.bpe'
        self.text_encoder = TextEncoder(encoder_path, bpe_path)

        self.nvocab = len(self.text_encoder.encoder)
        nctx = 512 # number of positional embeddings (n_ctx = context length)
        vocab = self.nvocab + nctx

        # set up pretrained openai model
        args = DEFAULT_CONFIG
        self.lm_model = LMModel(args, vocab, nctx, return_probs=True)
        load_openai_pretrained_model(self.lm_model.transformer, n_ctx=nctx, n_special=0)
        self.lm_model.eval() # this line puts the model in eval mode so we don't do dropout :) 


        # set up spacy for pos tagging
        self.nlp = spacy.load('en', disable=['ner', 'textcat', 'parser'])
Example #3
    log_dir = args.log_dir
    submission_dir = args.submission_dir

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)

    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    n_special = 0   # XD: useless for language modeling task
    vocab = n_vocab + n_special + n_ctx  # embedding-table size: BPE vocabulary plus special and positional slots


    lm_model = LMModel(args, vocab, n_ctx, return_probs=True)
    load_openai_pretrained_model(lm_model.transformer, n_ctx=n_ctx, n_special=n_special)
    lm_model.to(device)

    lm_model.eval()
    # up to this point the pretrained model and its vocabulary have been loaded
    text = input('Input some beginning words:')  # seed text for generation
    create_dictionary(text_encoder)

    while text != 'q':
        X = text_encoder.encode([text,])
        XMB = make_batch(X)

        for _ in range(args.gen_len):
            lm_probs = lm_model(XMB)  # probabilities over the vocabulary at each position
            if args.topk == 0:
Example #4
class SurprisalAnalyzer:

    def __init__(self):
        # initialize lm and text encoder and everything

        # set up the encoder to turn words into indices
        encoder_path = 'model/encoder_bpe_40000.json'
        bpe_path = 'model/vocab_40000.bpe'
        self.text_encoder = TextEncoder(encoder_path, bpe_path)

        self.nvocab = len(self.text_encoder.encoder)
        nctx = 512 # number of positional embeddings (n_ctx = context length)
        vocab = self.nvocab + nctx

        # set up pretrained openai model
        args = DEFAULT_CONFIG
        self.lm_model = LMModel(args, vocab, nctx, return_probs=True)
        load_openai_pretrained_model(self.lm_model.transformer, n_ctx=nctx, n_special=0)
        self.lm_model.eval() # this line puts the model in eval mode so we don't do dropout :) 


        # set up spacy for pos tagging
        self.nlp = spacy.load('en', disable=['ner', 'textcat', 'parser'])

    def make_batch(self, X):
        X = np.array(X)
        assert X.ndim in [1, 2]
        if X.ndim == 1:
            X = np.expand_dims(X, axis=0)
        # add position indices as a second channel so the model knows where each word sits
        pos_enc = np.arange(self.nvocab, self.nvocab + X.shape[-1])
        pos_enc = np.expand_dims(pos_enc, axis=0)
        batch = np.stack([X, pos_enc], axis=-1)
        batch = torch.tensor(batch, dtype=torch.long)
        return batch
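    # Shape sketch (assuming nvocab == 40478, the size of the BPE vocabulary):
    # make_batch([5, 17, 99]) returns a LongTensor of shape (1, 3, 2), where
    # batch[..., 0] holds the token ids [5, 17, 99] and batch[..., 1] holds the
    # position ids [40478, 40479, 40480], which index the positional embeddings
    # stored after the vocabulary in the embedding table.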

    def _get_continuation_tensor(self, sent_vec):
        """
        Deals strictly with tensors
        """
        sent_batch = self.make_batch(sent_vec)
        sent_res = self.lm_model(sent_batch)
        return sent_res

    def tensor_to_probs(self, tensor):
        """
        converts torch tensor to clean numpy array holding probabilities
        (Basically just hides some nasty code)
        """
        return tensor[:, -1, :].flatten().detach().numpy()

    def get_continuation_probs(self, sentence):
        sent_vec = self.text_encoder.encode([sentence])
        tensor = self._get_continuation_tensor(sent_vec)
        return self.tensor_to_probs(tensor)

    def _get_continuations(self, sent_res, k=10, verbose=False):
        """
        Making this private so I can access it externally... that's awful

        This is a helper function for the `get_continuations` wrapper that 
        separates the actual processing of the sentence from getting top
        continuations
        """
        probs, decode = sent_res[:,-1,:].topk(k)
        if verbose:
            for p, d in zip(probs.flatten(), decode.flatten()):
                print("\t...%s (%.4f)"%(self.text_encoder.decoder[d.item()], p.item()))
        words = [self.text_encoder.decoder[d.item()] for d in decode.flatten()]
        # strip off the word-ending tag "</w>" where present; partial continuations are kept as-is
        for i in range(len(words)):
            if words[i][-4:] == "</w>":
                words[i] = words[i][:-4]
        probs = probs.flatten().detach().numpy() # convert probs from tensor to numpy array
        return words, probs

    def get_continuations(self, sentence, k=10, verbose=False):
        """
        sentence: a string that you want to get next words for
        k: how many next words you want to get
        verbose: do you want to print the output
        """
        sent_vec = self.text_encoder.encode([sentence])
        sent_res = self._get_continuation_tensor(sent_vec)
        if verbose:
            print(sentence)

        return self._get_continuations(sent_res, k, verbose)


    def _get_pos_continuations(self, sentence, words, probs):
        """
        helper function for `get_pos_continuations` that takes the lists of words and
        probabilities and performs all the computation to get the most common pos
        tags independently of processing an individual sentence
        """
        # get POS of all of k continuations
        pos_counter = Counter()

        for word, prob in zip(words, probs):
            sentence_continuation = "{} {}".format(sentence, word)
            encoded = self.nlp(sentence_continuation)
            pos_counter[encoded[-1].pos_] += prob

        # format pos_counter most common output as two lists, one of probs and one of pos tags
        pos_counter_list = list(zip(*pos_counter.most_common()))
        pos_tags, pos_tag_probs = list(pos_counter_list[0]), np.array((pos_counter_list[1]), dtype=np.float32)
        return pos_tags, pos_tag_probs

    def get_pos_continuations(self, sentence, k=10, verbose=False):
        """
        sentence: string you want next parts of speech for
        k: how many top words to analyze 
        NOTE: unlike in the `get_continuation` function, the k is NOT how many
        unique POS tags you want to look at, it's how many words you want to consider
        """
        # get likely next words
        words, probs = self.get_continuations(sentence, k, verbose=False)
        return self._get_pos_continuations(sentence, words, probs)



    ################################################################################
    # The following three functions calculate entropy/surprisal of a SINGLE continuation
    ################################################################################
    def _get_surprisal(self, distribution, index):
        word_prob = distribution[index]
        return -np.log2(word_prob)
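    # Worked example (illustrative numbers): a continuation with probability
    # 0.25 has surprisal -log2(0.25) = 2 bits; halving the probability to
    # 0.125 raises it to 3 bits.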
    
    def get_surprisal(self, sentence, word):
        """
        get the -log2 probability of the word following the sentence
        """
        all_probs = self.get_continuation_probs(sentence)
        # if the word is not in the vocabulary in full, represent its probability by the
        # probability of the first part of its encoding (the 0 index)
        word_index = self.text_encoder.encode([word])[0][0]
        return self._get_surprisal(all_probs, word_index)

    def _get_entropy(self, distribution):
        return -np.sum([p*np.log2(p) if p > 0 else 0 for p in distribution])
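    # Worked example (illustrative numbers): a uniform distribution over four
    # continuations has entropy 4 * (-0.25 * log2(0.25)) = 2 bits, the maximum
    # for four outcomes; a one-hot distribution has entropy 0.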

    def get_entropy(self, sentence):
        """
        finds the shannon entropy of predicting the word following sentence
        """
        all_probs = self.get_continuation_probs(sentence)
        return self._get_entropy(all_probs)

    def get_surprisal_entropy_ratio(self, sentence, word):
        "gets ratio betwen surprisal and entropy at the end of the sentence for a given word"
        all_probs = self.get_continuation_probs(sentence)
        word_index = self.text_encoder.encode([word])[0][0]  # first BPE piece of the word
        entropy = self._get_entropy(all_probs)
        surprisal = self._get_surprisal(all_probs, word_index)
        return surprisal/entropy

    ####################################################################
    # Same as above but for part of speech
    ####################################################################
    def get_surprisal_pos(self, sentence, pos, k=1000):
        """
        Because the language model is not a POS tagger, we cannot directly
        calculate the surprisal of the POS from a full probability distribution;
        instead we have to use the degenerate distribution computed from the
        top k most probable POS continuations

        sentence is full sentence
        pos is pos we want to get surprisal of
        k is how many possible continuations to check
        """
        pos_tags, pos_tag_probs = self.get_pos_continuations(sentence, k)
        pos_index = pos_tags.index(pos) # assume the POS we want is in the list somewhere...
        return self._get_surprisal(pos_tag_probs, pos_index)
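
    # Hedged sketch (illustrative call): get_surprisal_pos(sentence, "NOUN")
    # measures, in bits, how unexpected a noun continuation is under the
    # degenerate distribution built from the top-k next words.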

        
    def get_entropy_pos(self, sentence, k=1000):
        """
        Disclaimer about degenerate distribution same as above
        """
        pos_tags, pos_tag_probs = self.get_pos_continuations(sentence, k)
        return self._get_entropy(pos_tag_probs)


    
    #####################################################################
    # Gets all of the above metrics for every word in a single sentence #
    #####################################################################
    def get_surprisal_sentence(self, sentence, prepend=None, start=1):
        """
        A little uglier, but perhaps faster

        """
        surprisals = []
        sent_enc = self.text_encoder.encode([sentence])[0] # 1-D list of encoder indices
        if prepend is not None:
            sent_enc = prepend + sent_enc
        sent_dec = [self.text_encoder.decoder[ind] for ind in sent_enc]


        # if you run the language model with the whole sentence the outputs for each
        # word are the probabilities for the next word!
        sent_batch = self.make_batch([sent_enc])
        sent_tensor = self.lm_model(sent_batch)
        for i in range(start, len(sent_enc)):
            surprisals.append(-np.log2(sent_tensor[:,i-1,sent_enc[i]].item()))
        return surprisals, sent_dec
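
    # Note: one forward pass suffices here because the output at position i-1
    # is the distribution over token i, so each token's probability can be read
    # off directly, without re-running the model per prefix (contrast
    # get_s_h_shr_sentence below, which re-encodes each prefix).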
        
    def get_s_h_shr_sentence(self, sentence, prepend=None, start=1):
        """
        calculates the surprisal, entropy, and surprisal-entropy-ratio at each word (defined by bpe)
        in the sentence

        returns, in order
        1. The list of surprisals (len(sentence) - 1)
        2. The list of entropies  (len(sentence) - 1)
        3. The list of ratios between surprisals and entropies (len(sentence) - 1)
        4. The decoded tokens that are used by the BPE encoder wrapper
        """
        surprisals, entropies, surprisal_entropy_ratios = [],[],[]
        sent_enc = self.text_encoder.encode([sentence])[0] # 1-D list of encoder indices
        if prepend is not None:
            sent_enc = prepend + sent_enc
        sent_dec = [self.text_encoder.decoder[ind] for ind in sent_enc]

        # start = max(0, min(1, start)) # doesn't work because language model needs to condition on something
        start = 1

        for i in range(start, len(sent_enc)):
            partial_sent_enc = [sent_enc[:i]]
            cont_tensor = self._get_continuation_tensor(partial_sent_enc)
            partial_probs = self.tensor_to_probs(cont_tensor)

            surprisals.append(self._get_surprisal(partial_probs, sent_enc[i]))
            entropies.append(self._get_entropy(partial_probs))
            surprisal_entropy_ratios.append(surprisals[-1]/entropies[-1])

        return surprisals, entropies, surprisal_entropy_ratios, sent_dec
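
A short, hedged usage sketch for the SurprisalAnalyzer class above; the sentence and word are illustrative assumptions, not outputs from the source:

# Sketch: probe the model's expectations at the end of a context.
sa = SurprisalAnalyzer()
words, probs = sa.get_continuations("The cat sat on the", k=5)  # likely next words
s = sa.get_surprisal("The cat sat on the", "mat")  # -log2 P(mat | context), in bits
h = sa.get_entropy("The cat sat on the")           # entropy of next-token distribution
print(words, s, h, s / h)                          # s / h is the surprisal-entropy ratio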
Example #5
    n_special = 3
    max_len = n_ctx // 2 - 2
    n_ctx = 626 * 2 + 4
    vocab = n_vocab + n_special + n_ctx
    print(vocab)
    trX, trM = transform_roc(trX)
    vaX, vaM = transform_roc(vaX)

    n_train = len(trX)
    n_valid = len(vaX)

    n_batch_train = args.n_batch * max(n_gpu, 1)
    n_updates_total = (n_train // n_batch_train) * args.n_iter

    lm_model = LMModel(args, vocab, n_ctx)

    criterion = nn.CrossEntropyLoss(reduction='none')  # per-position losses
    model_opt = OpenAIAdam(lm_model.parameters(),
                           lr=args.lr,
                           schedule=args.lr_schedule,
                           warmup=args.lr_warmup,
                           t_total=n_updates_total,
                           b1=args.b1,
                           b2=args.b2,
                           e=args.e,
                           l2=args.l2,
                           vector_l2=args.vector_l2,
                           max_grad_norm=args.max_grad_norm)
    compute_loss_fct = LMLossCompute(criterion, model_opt)
    load_openai_pretrained_model(lm_model.transformer,
                                 n_ctx=n_ctx, n_special=n_special)
Example #6
    firstbpe, secondbpe = encode_dataset(*(firstsent, secondsent),
                                         encoder=text_encoder)
    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)

    n_special = 3
    max_len = n_ctx // 2 - 2
    n_ctx = 1256
    vocab = n_vocab + n_special + n_ctx
    n_train = len(firstsent)
    n_valid = len(secondsent)

    n_batch_train = args.n_batch * max(n_gpu, 1)
    n_updates_total = (n_train // n_batch_train) * args.n_iter

    dh_model = LMModel(args, vocab, n_ctx)

    load_openai_pretrained_model(dh_model.transformer,
                                 n_ctx=n_ctx,
                                 n_special=n_special)
    dh_model.to(device)
    dh_model = nn.DataParallel(dh_model)
    n_updates = 0
    n_epochs = 0

    desc = "challenge"
    path = os.path.join(save_dir, desc, 'best_params')
    dh_model.load_state_dict(torch.load(path))
    arr = predict(firstsent, secondsent, firstbpe, secondbpe)
    with open(os.path.join(os.getcwd(), 'part1.txt'), 'w') as w:
        for pred in arr:
            w.write('{}\n'.format(pred))
Example #7
def main(args):
    init(args)

    # Constants
    n_ctx = args.n_ctx
    data_dir = args.data_dir

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)

    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)
    text_encoder.decoder[len(encoder)] = '_start_'
    encoder['_start_'] = len(encoder)
    text_encoder.decoder[len(encoder)] = '_delimiter_'
    encoder['_delimiter_'] = len(encoder)
    text_encoder.decoder[len(encoder)] = '_classify_'
    encoder['_classify_'] = len(encoder)

    n_special = 3  # _start_, _delimiter_, _classify_ tokens added above
    vocab = n_vocab + n_special + n_ctx

    lm_model = LMModel(args,
                       vocab,
                       n_ctx,
                       return_probs=True,
                       doc_embed=args.doc_model)
    load_openai_pretrained_model(lm_model.transformer,
                                 n_ctx=n_ctx,
                                 n_special=n_special)
    if args.checkpoint != "none":
        checkpoint = torch.load(args.checkpoint, map_location='cpu')
        state_dict = checkpoint["state_dict"]
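        # keys in the checkpoint were saved from a DataParallel-wrapped model,
        # so each begins with the 7-character prefix "module."; strip it so the
        # names match the unwrapped model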
        for key in list(state_dict.keys()):
            state_dict[key[7:]] = state_dict[key]
            del state_dict[key]
        # block generation of positional-embedding indices: mask out the last
        # n_ctx entries of the extended (vocab + positions) softmax
        pos_emb_mask = torch.zeros(1, 1, vocab)
        pos_emb_mask[:, :, -n_ctx:] = -1e12
        state_dict['pos_emb_mask'] = pos_emb_mask
        lm_model.load_state_dict(state_dict)
    lm_model.to(device)
    lm_model = DataParallelModel(lm_model)

    train_bar = get_loader(os.path.join(data_dir, "val_encoded.jsonl"),
                           n_gpu,
                           encoder,
                           num_workers=1,
                           shuffle=True,
                           max_size=args.n_iter)
    srcs, hyps, refs = [], [], []
    with torch.no_grad():
        lm_model.eval()
        for i, (pad_output, mask_output) in enumerate(tqdm(train_bar), 1):
            src_strs, tgt_strs, gen_strs = generate_outputs(
                lm_model, pad_output, mask_output, text_encoder, device,
                args.beam, args.gen_len, args.k, args.decoding_strategy)
            srcs.extend(src_strs)
            hyps.extend(gen_strs)
            refs.extend(tgt_strs)

    for i in range(len(hyps)):
        print("*" * 50)
        print("Source: {}".format(srcs[i]))
        print('Hypothesis: {}'.format(hyps[i]))
        print("Reference: {}".format(refs[i]))
Example #8
    n_ctx = min(max(
        [len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
         for x1, x2, x3 in zip(trX1, trX2, trX3)]
        + [len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
           for x1, x2, x3 in zip(vaX1, vaX2, vaX3)]
        + [len(x1[:max_len]) + max(len(x2[:max_len]), len(x3[:max_len]))
           for x1, x2, x3 in zip(teX1, teX2, teX3)]
        ) + 3,
        n_ctx)
    vocab = n_vocab + n_special + n_ctx
    trX, trM = transform_roc(trX1, trX2, trX3)
    vaX, vaM = transform_roc(vaX1, vaX2, vaX3)
    if submit:
        teX, teM = transform_roc(teX1, teX2, teX3)

    n_train = len(trY)
    n_valid = len(vaY)
    n_batch_train = args.n_batch * max(n_gpu, 1)
    n_updates_total = (n_train // n_batch_train) * args.n_iter

    # change this one to LMModel
    # dh_model = DoubleHeadModel(args, clf_token, 'multiple_choice', vocab, n_ctx)
    lm_model = LMModel(args, vocab, n_ctx, return_probs=True)
    # load_openai_pretrained_model(lm_model.transformer, n_ctx=n_ctx, n_special=n_special)
    lm_model.to(device)

    criterion = nn.CrossEntropyLoss(reduction='none')  # per-position losses
    model_opt = OpenAIAdam(lm_model.parameters(),
                           lr=args.lr,
                           schedule=args.lr_schedule,
                           warmup=args.lr_warmup,
                           t_total=n_updates_total,
                           b1=args.b1,
                           b2=args.b2,
                           e=args.e,
                           l2=args.l2,
                           vector_l2=args.vector_l2,
                           max_grad_norm=args.max_grad_norm)
Example #9
def main(args):
    # Constants
    n_ctx = args.n_ctx
    desc = args.desc

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)

    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3

    print("Loading dataset...")
    test_loader = get_loader(args.data_file,
                             args.n_batch,
                             encoder,
                             num_workers=1,
                             shuffle=False,
                             subset=args.subset)

    vocab = n_vocab + n_special + n_ctx
    dh_model = LMModel(args,
                       vocab=vocab,
                       n_ctx=n_ctx,
                       doc_embed=args.doc_model)

    print("Loading model...")
    load_openai_pretrained_model(dh_model.transformer,
                                 n_ctx=n_ctx,
                                 n_special=n_special,
                                 path="./model/",
                                 path_names="./")
    if args.checkpoint != "none":
        checkpoint = torch.load(args.checkpoint, map_location='cpu')
        state_dict = checkpoint["state_dict"]
        for key in list(state_dict.keys()):
            state_dict[key[7:]] = state_dict[key]
            del state_dict[key]
        pos_emb_mask = torch.zeros(1, 1, vocab)
        pos_emb_mask[:, :, -n_ctx:] = -1e12  # mask positional indices, as in Example #7
        state_dict['pos_emb_mask'] = pos_emb_mask
        dh_model.load_state_dict(state_dict)

    dh_model.to(device)
    dh_model = DataParallelModel(dh_model)

    stop_words = []
    if args.stop_words is not None:
        with open(args.stop_words) as f:
            for line in f:
                stop_words.append(line.strip())  # drop the trailing newline
    evaluate_model(dh_model, test_loader, text_encoder, device, args.beam,
                   args.gen_len, args.k, args.decoding_strategy,
                   args.save_file, args.gen_dir, args.tgt_dir, args.max_len,
                   stop_words, args)
Example #10
def main(args):
    init(args)

    # Constants
    n_ctx = args.n_ctx
    save_dir = os.path.join(args.output_dir, args.experiment_name, "checkpoints")
    desc = args.desc
    data_dir = args.data_dir
    log_dir = os.path.join(args.output_dir, args.experiment_name, "logs")
    train_log_interval = args.train_log_interval
    val_log_interval = args.val_log_interval
    beam = args.beam
    gen_len = args.gen_len
    k = args.k
    decoding_strategy = args.decoding_strategy
    accum_iter = args.accum_iter

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)
    logger = Logger(log_dir)

    text_encoder = TextEncoder(args.encoder_path, args.vocab_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)
    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3

    print("Loading dataset...")
    train_loader = get_loader(os.path.join(data_dir, "train_encoded.jsonl"), args.n_batch, encoder, num_workers=3, shuffle=True)
    val_loader = get_loader(os.path.join(data_dir, "val_encoded.jsonl"), n_gpu, encoder, num_workers=0, shuffle=False, max_size=args.num_val_examples)
    print("Train length: {}, Validation length: {}".format(len(train_loader), len(val_loader)))

    vocab = n_vocab + n_special + n_ctx
    n_updates_total = (len(train_loader) // args.accum_iter) * (args.num_epochs_dat + args.num_epochs_ft)

    dh_model = LMModel(args, vocab=vocab, n_ctx=n_ctx, doc_embed=args.doc_model)

    criterion = nn.CrossEntropyLoss(reduction="none")
    model_opt = OpenAIAdam(dh_model.parameters(),
                           lr=args.lr,
                           schedule=args.lr_schedule,
                           warmup=args.lr_warmup,
                           t_total=n_updates_total,
                           b1=args.b1,
                           b2=args.b2,
                           e=args.e,
                           l2=args.l2,
                           vector_l2=args.vector_l2,
                           max_grad_norm=args.max_grad_norm)

    lm_loss = LMLoss(criterion)
    summary_loss = SummaryLoss(criterion)

    print("Loading Model")
    if args.use_pretrain:
        load_openai_pretrained_model(dh_model.transformer, n_ctx=n_ctx, n_special=n_special, path="./model/", path_names="./")
    start_iter, running_loss = load_checkpoint(args.checkpoint, dh_model, model_opt, vocab, n_ctx)

    dh_model.to(device)
    dh_model = DataParallelModel(dh_model)
    lm_loss = DataParallelCriterion(lm_loss)
    summary_loss = DataParallelCriterion(summary_loss)

    for i in range(args.num_epochs_dat):
        start_iter, running_loss = run_epoch(
            start_iter, running_loss, dh_model, lm_loss, model_opt,
            train_loader, val_loader, train_log_interval, val_log_interval,
            device, beam, gen_len, k, decoding_strategy, accum_iter,
            "DAT Training Epoch [{}/{}]".format(i + 1, args.num_epochs_dat),
            save_dir, logger, text_encoder,
            show_progress=args.show_progress, summary_loss=summary_loss)
    for i in range(args.num_epochs_ft):
        start_iter, running_loss = run_epoch(
            start_iter, running_loss, dh_model, summary_loss, model_opt,
            train_loader, val_loader, train_log_interval, val_log_interval,
            device, beam, gen_len, k, decoding_strategy, accum_iter,
            "FT Training Epoch [{}/{}]".format(i + 1, args.num_epochs_ft),
            save_dir, logger, text_encoder, show_progress=args.show_progress)