Example #1
def main(args):
    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
    with jsonl.open(args.original_file, gzip=True) as test_file:
        data = test_file.read()

    with jsonl.open(args.out_file, gzip=True) as out_file:
        out_file.write(data[-args.n:])
Example #2
def main(args):
    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
    train_split, val_split, test_split = load_splits(args.splits_file)
    summaries = os.listdir(args.summary_dir)

    num_summaries = 0
    train_data, val_data, test_data = [], [], []
    for file_name in tqdm(summaries):
        summary_data = load_summary(os.path.join(args.summary_dir, file_name))
        if len(summary_data["summary"]) == 0 or len(summary_data["text"]) == 0:
            continue
        summary_data["summary"] = encode_line(summary_data["summary"],
                                              text_encoder)
        summary_data["text"] = encode_line(summary_data["text"], text_encoder)
        file_id = file_name.split(".")[0]
        if file_id in train_split:
            train_data.append(summary_data)
            num_summaries += 1
        elif file_id in val_split:
            val_data.append(summary_data)
            num_summaries += 1
        elif file_id in test_split:
            test_data.append(summary_data)
            num_summaries += 1

    with jsonl.open(args.train_file, gzip=True) as train_file:
        train_file.write(train_data)
    with jsonl.open(args.val_file, gzip=True) as val_file:
        val_file.write(val_data)
    with jsonl.open(args.test_file, gzip=True) as test_file:
        test_file.write(test_data)
    print("Number of successful conversions: {}".format(num_summaries))
Example #3
def encode(encoder=None):
    if encoder is None:
        ENCODER_PATH = 'model/encoder_bpe_40000.json'
        BPE_PATH = 'model/vocab_40000.bpe'
        encoder = TextEncoder(ENCODER_PATH, BPE_PATH)

    tokens = encoder(get_paragraphs(), verbose=False)
    with open('Data/tokens.pkl', 'wb') as pkl:
        pickle.dump(tokens, pkl)
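A minimal companion sketch (not from the original file) for loading the pickled token ids back; it only assumes the path used above:

import pickle

with open('Data/tokens.pkl', 'rb') as pkl:
    tokens = pickle.load(pkl)  # one list of BPE token ids per paragraph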
Example #4
    def __init__(self):
        # initialize lm and text encoder and everything

        # set up the encoder to turn words into indices
        encoder_path = 'model/encoder_bpe_40000.json'
        bpe_path = 'model/vocab_40000.bpe'
        self.text_encoder = TextEncoder(encoder_path, bpe_path)

        self.nvocab = len(self.text_encoder.encoder)
        nctx = 512 # number of positional embeddings (i.e. the context length)
        vocab = self.nvocab + nctx

        # set up pretrained openai model
        args = DEFAULT_CONFIG
        self.lm_model = LMModel(args, vocab, nctx, return_probs = True)
        load_openai_pretrained_model(self.lm_model.transformer, n_ctx=nctx, n_special=0)
        self.lm_model.eval() # put the model in eval mode so dropout is disabled


        # set up spacy for pos tagging
        self.nlp = spacy.load('en', disable=['ner', 'textcat', 'parser'])
Example #5
    def __init__(self, cfg, vocab=40990, n_ctx=512, return_probs=True,
                 encoder_path='./model/encoder_bpe_40000.json', bpe_path='./model/vocab_40000.bpe'):
        super(CustomLMModel, self).__init__()
        self.transformer = TransformerModel(cfg, vocab=vocab, n_ctx=n_ctx)
        self.lm_head = LMHead(self.transformer, cfg, trunc_and_reshape=False)
        self.return_probs = return_probs
        self.text_encoder = TextEncoder(encoder_path, bpe_path)

        if self.return_probs:
            pos_emb_mask = torch.zeros(1, 1, vocab)
            pos_emb_mask[:, :, -n_ctx:] = -1e12
            self.register_buffer('pos_emb_mask', pos_emb_mask)
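The pos_emb_mask registered above covers the full extended vocabulary, with -1e12 on the last n_ctx entries (the positional-embedding rows of the tied output matrix). A hedged sketch of how such a mask is typically applied when probabilities are requested; the exact forward() is not shown in this snippet:

import torch
import torch.nn.functional as F

vocab, n_ctx = 40990, 512
pos_emb_mask = torch.zeros(1, 1, vocab)
pos_emb_mask[:, :, -n_ctx:] = -1e12

lm_logits = torch.randn(1, 4, vocab)                 # placeholder logits over the extended vocab
probs = F.softmax(lm_logits + pos_emb_mask, dim=-1)  # positional rows end up with ~0 probability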
Example #6
def transformer_predict(input_file: str, text_encoder: TextEncoder,
                        device: int):
    if device > -1:
        device_name = "cuda"
    else:
        device_name = "cpu"

    print(input_file)
    n_ctx = 512

    transformer = TransformerModel(DEFAULT_CONFIG,
                                   n_ctx=n_ctx,
                                   requires_grad=False)
    load_openai_pretrained_model(transformer, n_ctx=n_ctx)

    with open(input_file) as f:
        sentences = f.readlines()

    encoded_sentences = text_encoder.encode(sentences)

    masks = [
        np.concatenate((np.ones(len(s)), np.zeros(n_ctx - len(s))))
        for s in encoded_sentences
    ]

    input_tensor = torch.LongTensor([
        pad_sequence_to_length(s, desired_length=512)
        for s in encoded_sentences
    ])
    if device_name == "cuda":
        input_tensor = input_tensor.cuda()

    batch_size, num_timesteps = input_tensor.size()

    positional_encodings = get_range_vector(num_timesteps, device) + n_ctx

    batch_tensor = torch.stack(
        [input_tensor,
         positional_encodings.expand(batch_size, num_timesteps)],
        dim=-1)

    if device_name == "cuda":
        transformer = transformer.cuda()

    transformer_embeddings = transformer(batch_tensor)

    np.save("openai_transformer_test_input.npy",
            batch_tensor.data.cpu().numpy())
    np.save("openai_transformer_test_output.npy",
            transformer_embeddings.data.cpu().numpy())
Example #7
def load_openai_gpt(n_special=1, n_ctx=512):
    text_encoder = TextEncoder("pytorch-openai-transformer-lm/model/encoder_bpe_40000.json",
                               "pytorch-openai-transformer-lm/model/vocab_40000.bpe")
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)
    vocab = n_vocab + n_special + n_ctx

    args = DEFAULT_CONFIG
    lm_model = LMModel(args, vocab, n_ctx, return_probs=True)
    load_openai_pretrained_model(lm_model.transformer, n_ctx=n_ctx, n_special=n_special,
                                 path="pytorch-openai-transformer-lm/model/",
                                 path_names="pytorch-openai-transformer-lm/")
    # lm_model.to(device)
    lm_model.return_probs = False
    lm_model.eval()
    return lm_model, text_encoder
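A brief usage sketch for the loader above, assuming the pretrained weights are available under pytorch-openai-transformer-lm/model/ as in the paths it hard-codes:

lm_model, text_encoder = load_openai_gpt(n_special=1, n_ctx=512)
ids = text_encoder.encode(["The quick brown fox"])[0]  # BPE ids for one sentence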
Example #8
def main(args):
    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
    num_summaries = 0
    out_data = []
    with jsonl.open(args.in_file, gzip=True) as in_file:
        data = in_file.read()
        for entry in tqdm(data):
            if entry["summary"] is None or entry["text"] is None:
                continue
            entry["summary"] = encode_line(entry["summary"], text_encoder)
            entry["text"] = encode_line(entry["text"], text_encoder)
            num_summaries += 1
            out_data.append(entry)
    with jsonl.open(args.out_file, gzip=True) as out_file:
        out_file.write(out_data)
    print("Number of successful conversions: {}".format(num_summaries))
Example #9
def encode_dataset(*splits: Tuple[
    # the four lists are first_four_sentences (len=1497), first_choice(len=1497), second_choice(len=1497), true_choice(len=1497)
    Tuple[List[str], List[str], List[str], ndarray],  # each list of len 1497, train instances,
    Tuple[List, List, List, List],  # each list of len 374, val instances
    Tuple[List, List, List, List]  # each list of len 1871, test instances
], encoder: TextEncoder):
    encoded_splits = []
    for split in splits:  # loop over trainInstances, valInstances and testInstances
        fields = []
        for field in split: #  a field is one list of str (sentences) or int (true answers)
            if isinstance(field[0], str): # check first element in field to see if str
                # each str element in the field list is encoded as a list of int, hence field becomes List[List[int]]
                field = encoder.encode(field)  # only encode sentences, not encoding true answers (type int choice: {0,1})
            fields.append(field)
        encoded_splits.append(fields)
    return encoded_splits
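A minimal usage sketch for encode_dataset (the toy data and the standard model paths are assumptions); string fields come back as lists of BPE ids while the label array passes through unchanged:

import numpy as np

text_encoder = TextEncoder('model/encoder_bpe_40000.json', 'model/vocab_40000.bpe')
toy_split = (["The cat sat on the mat."], ["It purred."], ["It flew away."], np.array([0]))
(encoded_split,) = encode_dataset(toy_split, encoder=text_encoder)
# encoded_split[0] is now List[List[int]]; encoded_split[3] is still the original ndarray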
Example #10
    def __init__(self, args):
        globals().update(args.__dict__)
        random.seed(seed)
        np.random.seed(seed)
        tf.set_random_seed(seed)

        self.text_encoder = TextEncoder(encoder_path)
        self.encoder = self.text_encoder.encoder
        self.n_vocab = len(self.text_encoder.encoder)
        self.n_y = 2
        self.encoder['_start_'] = len(self.encoder)
        self.encoder['_delimiter_'] = len(self.encoder)
        self.encoder['_end_'] = len(self.encoder)
        self.clf_token = self.encoder['_end_']
        self.n_special = 3
        self.n_batch_train = n_batch * n_gpu
        self.n_updates_total = n_iter
Example #11
def main(args):
    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
    num_summaries = 0
    out_data = []
    with open(args.src_file) as src_file, open(args.tgt_file) as tgt_file:
        src_lines = src_file.readlines()
        tgt_lines = tgt_file.readlines()
        for i in tqdm(range(len(src_lines))):
            num_summaries += 1
            out_data.append({
                "summary":
                encode_line(tgt_lines[i].strip(), text_encoder),
                "text":
                encode_line(src_lines[i].strip(), text_encoder)
            })
    with jsonl.open(args.out_file, gzip=True) as out_file:
        out_file.write(out_data)
    print("Number of successful conversions: {}".format(num_summaries))
Example #12
    def __init__(self, args):
        #globals().update(args.__dict__)
        random.seed(args.seed)
        np.random.seed(args.seed)
        tf.set_random_seed(args.seed)
        # self.ps_hosts = ps_hosts.split(',')
        # self.worker_hosts = worker_hosts.split(',')
        #self.logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)), **args.__dict__)
        self.text_encoder = TextEncoder(args.vocab_path)
        self.encoder = self.text_encoder.encoder
        self.n_vocab = len(self.text_encoder.encoder)
        self.encoder['_start_'] = len(self.encoder)
        self.encoder['_delimiter_'] = len(self.encoder)
        self.encoder['_end_'] = len(self.encoder)
        self.clf_token = self.encoder['_end_']
        self.n_special = 3
        self.n_batch_train = args.n_batch * args.n_gpu
        self.n_updates_total = args.n_step * 10000
        self.n_ctx = args.n_ctx
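Several of these examples extend the BPE vocabulary the same way; a toy sketch of that pattern (the dictionary contents are made up for illustration) showing why len(encoder) yields contiguous ids for the special tokens:

encoder = {'hello</w>': 0, 'world</w>': 1}  # stand-in for the 40k-entry BPE vocab
encoder['_start_'] = len(encoder)           # id 2
encoder['_delimiter_'] = len(encoder)       # id 3
encoder['_end_'] = len(encoder)             # id 4
clf_token = encoder['_end_']
n_special = 3                               # the three ids appended above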
Example #13
    def __init__(self,
                 cfg,
                 vocab=40990,
                 n_ctx=512,
                 return_probs=True,
                 encoder_path='./model/encoder_bpe_40000.json',
                 bpe_path='./model/vocab_40000.bpe'):
        super(CustomLMModel, self).__init__()
        # Transformer block
        self.transformer = TransformerModel(cfg, vocab=vocab, n_ctx=n_ctx)
        # Language modeling head to convert transformer output to word probabilities
        self.lm_head = LMHead(self.transformer, cfg, trunc_and_reshape=False)
        # Whether the model should return probabilities (softmax applied) or raw logits
        self.return_probs = return_probs
        # Text encoder to convert word to index
        self.text_encoder = TextEncoder(encoder_path, bpe_path)

        if self.return_probs:
            pos_emb_mask = torch.zeros(1, 1, vocab)
            pos_emb_mask[:, :, -n_ctx:] = -1e12
            self.register_buffer('pos_emb_mask', pos_emb_mask)
    data_dir = args.data_dir
    log_dir = args.log_dir
    submission_dir = args.submission_dir
    test_path = args.test_path
    pred_path = args.pred_path
    out_path = args.out_path
    topic = args.topic

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)

    logger = ResultLogger(path=os.path.join(log_dir,
                                            '{}log.json'.format(desc)),
                          **args.__dict__)
    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    print("Encoding dataset...")
    dataLoader = DataLoader()
    ((trX, trY), (vaX, vaY),
     (teX, )) = encode_dataset(*dataLoader.veracity(data_dir, topic=topic),
                               encoder=text_encoder)

    encoder['_start_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 2
    max_len = n_ctx - 2
    # Define maximum context as the minimum of [512, x] where x is the max sentence length
    # Constants
    submit = args.submit
    dataset = args.dataset
    n_ctx = args.n_ctx
    save_dir = args.save_dir
    desc = args.desc
    data_dir = args.data_dir  # I think this is the location of the vocabulary?
    log_dir = args.log_dir
    submission_dir = args.submission_dir

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)

    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    n_special = 0   # XD: useless for language modeling task
    vocab = n_vocab + n_special + n_ctx  # the size of the vocabulary - in this case it is letters - so I don't think it's what we need


    lm_model = LMModel(args, vocab, n_ctx, return_probs=True)
    load_openai_pretrained_model(lm_model.transformer, n_ctx=n_ctx, n_special=n_special)
    lm_model.to(device)

    lm_model.eval()
    # up to this point it loaded the previous model and the vocabulary that will be used
    text = input('Input some beginning words:')  # why do we need this?
    create_dictionary(text_encoder)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input_file', required=True)
    parser.add_argument('-o', '--output_file', required=True)
    parser.add_argument('--n_batch', type=int, default=8)
    parser.add_argument('--skip_preprocess', action='store_true')
    parser.add_argument('--sentence_pair', action='store_true')
    parser.add_argument('--force_delimiter', action='store_true')
    parser.add_argument('--encoder_path', type=str, default='model/encoder_bpe_40000.json')
    parser.add_argument('--bpe_path', type=str, default='model/vocab_40000.bpe')
    parser.add_argument('--model_dir', required=True)
    parser.add_argument('--mc_dropout_iter', type=int, default=0)
    args = parser.parse_args()

    meta = json.load(open(os.path.join(args.model_dir, 'meta.json'), 'r', encoding='utf8'))

    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    encoder['_start_'] = len(encoder)
    if args.sentence_pair or args.force_delimiter:
        encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 2 + int('_delimiter_' in encoder)
    n_ctx = meta['dh_model']['n_ctx']
    max_len = meta['encoder']['max_len']
    if args.sentence_pair:
        max_len = min(max_len, n_ctx // 2 - 2)

    texts, labels = load_headerless_tsv(args.input_file, sentence_pair=args.sentence_pair)
    ((X, Y),) = encode_dataset(*[(texts, labels)],
                               encoder=text_encoder,
                               skip_preprocess=args.skip_preprocess)

    X, M = transform_classification(X, max_len, encoder['_start_'], clf_token,
                                    n_vocab, n_special, n_ctx, encoder.get('_delimiter_'))

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    n_batch_train = args.n_batch * max(n_gpu, 1)

    meta['dh_model']['cfg'] = dotdict(meta['dh_model']['cfg'])
    dh_model = DoubleHeadModel(**meta['dh_model'])
    dh_model.to(device)
    dh_model = torch.nn.DataParallel(dh_model)
    path = os.path.join(args.model_dir, 'best_params')
    if device == torch.device('cpu'):
        map_location = lambda storage, loc: storage
    else:
        map_location = None

    dh_model.load_state_dict(torch.load(path, map_location=map_location))
    prediction_output = predict(X=X,
                                submission_dir=None,
                                filename=None,
                                pred_fn=lambda x: x,
                                label_decoder=None,
                                dh_model=dh_model,
                                n_batch_train=n_batch_train,
                                device=device)

    predictions = np.argmax(prediction_output, axis=1)
    if type(texts) is tuple:
        df = pd.DataFrame({'question': texts[0], 'text': texts[1], 'label': labels, 'prediction': predictions})
    else:
        df = pd.DataFrame({'text': texts, 'label': labels, 'prediction': predictions})
    df.to_csv(args.output_file,
              index=False,
              sep='\t',
              header=False,
              columns=['text', 'label', 'prediction'],
              float_format='%.0f')

    accuracy = accuracy_score(Y, predictions) * 100.
    print('Accuracy: {}%'.format(accuracy))

    basename = os.path.splitext(args.output_file)[0]

    prediction_output_file = basename + '_output.npy'
    np.savetxt(prediction_output_file, prediction_output)
    prediction_probs = np_softmax(prediction_output)
    prediction_probs_file = basename + '_probs.npy'
    np.savetxt(prediction_probs_file, prediction_probs)

    mc_dropout_prediction_output = []
    for _ in tqdm(range(args.mc_dropout_iter)):
        prediction_output = predict(X=X,
                                    submission_dir=None,
                                    filename=None,
                                    pred_fn=lambda x: x,
                                    label_decoder=None,
                                    dh_model=dh_model,
                                    n_batch_train=n_batch_train,
                                    device=device,
                                    enable_dropout=True)
        mc_dropout_prediction_output.append(prediction_output)

    if mc_dropout_prediction_output:
        mc_dropout_prediction_output = np.asarray(mc_dropout_prediction_output)
        mc_dropout_prediction_probs = np.zeros(mc_dropout_prediction_output.shape)
        for i in range(mc_dropout_prediction_output.shape[0]):
            mc_dropout_prediction_probs[i, ...] = np_softmax(mc_dropout_prediction_output[i, ...])

        transpose_dims = (2, 1, 0)
        mc_dropout_prediction_output = mc_dropout_prediction_output.transpose(transpose_dims)
        mc_dropout_prediction_probs = mc_dropout_prediction_probs.transpose(transpose_dims)
        for i in range(mc_dropout_prediction_output.shape[0]):
            prediction_output_file = '{}_class{}_{}'.format(basename, i, 'output.npy')
            np.savetxt(prediction_output_file, mc_dropout_prediction_output[i, ...])
            prediction_probs_file = '{}_class{}_{}'.format(basename, i, 'probs.npy')
            np.savetxt(prediction_probs_file, mc_dropout_prediction_probs[i, ...])
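A hedged follow-up sketch (not part of the original script) of how the saved MC-dropout probabilities are commonly summarized into a predictive mean and a per-example uncertainty estimate; it assumes the pre-transpose shape (n_iter, n_examples, n_classes) used above:

import numpy as np

def summarize_mc_dropout(mc_probs):
    # mc_probs: softmax outputs of shape (n_iter, n_examples, n_classes)
    mean_probs = mc_probs.mean(axis=0)  # predictive mean per example and class
    std_probs = mc_probs.std(axis=0)    # spread across stochastic forward passes
    return mean_probs, std_probs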
Example #17
class SurprisalAnalyzer:

    def __init__(self):
        # initialize lm and text encoder and everything

        # set up the encoder to turn words into indices
        encoder_path = 'model/encoder_bpe_40000.json'
        bpe_path = 'model/vocab_40000.bpe'
        self.text_encoder = TextEncoder(encoder_path, bpe_path)

        self.nvocab = len(self.text_encoder.encoder)
        nctx = 512 # number of positional embeddings (i.e. the context length)
        vocab = self.nvocab + nctx

        # set up pretrained openai model
        args = DEFAULT_CONFIG
        self.lm_model = LMModel(args, vocab, nctx, return_probs = True)
        load_openai_pretrained_model(self.lm_model.transformer, n_ctx=nctx, n_special=0)
        self.lm_model.eval() # put the model in eval mode so dropout is disabled


        # set up spacy for pos tagging
        self.nlp = spacy.load('en', disable=['ner', 'textcat', 'parser'])

    def make_batch(self, X):
        X = np.array(X)
        assert X.ndim in [1, 2]
        if X.ndim == 1:
            X = np.expand_dims(X, axis=0)
        # add positional encodings - just second dimension that says which word is where
        pos_enc = np.arange(self.nvocab, self.nvocab + X.shape[-1])
        pos_enc = np.expand_dims(pos_enc, axis=0)
        batch = np.stack([X, pos_enc], axis=-1)
        batch = torch.tensor(batch, dtype=torch.long)
        return batch

    def _get_continuation_tensor(self, sent_vec):
        """
        Deals strictly with tensors
        """
        sent_batch = self.make_batch(sent_vec)
        sent_res = self.lm_model(sent_batch)
        return sent_res

    def tensor_to_probs(self, tensor):
        """
        converts torch tensor to clean numpy array holding probabilities
        (Basically just hides some nasty code)
        """
        return tensor[:, -1, :].flatten().detach().numpy()

    def get_continuation_probs(self, sentence):
        sent_vec = self.text_encoder.encode([sentence])
        tensor = self._get_continuation_tensor(sent_vec)
        return self.tensor_to_probs(tensor)

    def _get_continuations(self, sent_res, k=10, verbose=False):
        """
        Making this private so I can access it externally... that's awful

        This is a helper function for the `get_continuations` wrapper that 
        separates the actual processing of the sentence from getting top
        continuations
        """
        probs, decode = sent_res[:,-1,:].topk(k)
        if verbose:
            for p, d in zip(probs.flatten(), decode.flatten()):
                print("\t...%s (%.4f)"%(self.text_encoder.decoder[d.item()], p.item()))
        words = [self.text_encoder.decoder[d.item()] for d in decode.flatten()]
        # strip off the word-ending tags if there are any - if it's not a full continuation, what to do?
        for i in range(len(words)):
            if words[i][-4:] == "</w>":
                words[i] = words[i][:-4]
        probs = probs.flatten().detach().numpy() # convert probs from tensor to numpy array
        return words, probs

    def get_continuations(self, sentence, k=10, verbose=False):
        """
        sentence: a string that you want to get next words for
        k: how many next words you want to get
        verbose: do you want to print the output
        """
        sent_vec = self.text_encoder.encode([sentence])
        sent_res = self._get_continuation_tensor(sent_vec)
        if verbose:
            print(sentence)

        return self._get_continuations(sent_res, k, verbose)


    def _get_pos_continuations(self, sentence, words, probs):
        """
        helper function for `get_pos_continuations` that takes the lists of words and
        probabilities and performs all the computation to get the most common pos
        tags independently of processing an individual sentence
        """
        # get POS of all of k continuations
        pos_counter = Counter()

        for word, prob in zip(words, probs):
            sentence_continuation = "{} {}".format(sentence, word)
            encoded = self.nlp(sentence_continuation)
            pos_counter[encoded[-1].pos_] += prob

        # format pos_counter most common output as two lists, one of probs and one of pos tags
        pos_counter_list = list(zip(*pos_counter.most_common()))
        pos_tags, pos_tag_probs = list(pos_counter_list[0]), np.array((pos_counter_list[1]), dtype=np.float32)
        return pos_tags, pos_tag_probs

    def get_pos_continuations(self, sentence, k=10, verbose=False):
        """
        sentence: string you want next parts of speech for
        k: how many top words to analyze 
        NOTE: unlike in the `get_continuation` function, the k is NOT how many
        unique POS tags you want to look at, it's how many words you want to consider
        """
        # get likely next words
        words, probs = self.get_continuations(sentence, k, verbose=False)
        return self._get_pos_continuations(sentence, words, probs)



    ################################################################################
    # The following three functions calculate entropy/surprisal of a SINGLE word
    ################################################################################
    def _get_surprisal(self, distribution, index):
        word_prob = distribution[index]
        return -np.log2(word_prob)
    
    def get_surprisal(self, sentence, word):
        """
        get the -log2 probability of the word following the sentence
        """
        all_probs = self.get_continuation_probs(sentence)
        # if the word is not in the vocabulary in full, represent its probability by the 
        # probability of the first part of its encoding (the 0 index)
        word_index = self.text_encoder.encode([word])[0]
        return self._get_surprisal(all_probs, word_index)

    def _get_entropy(self, distribution):
        return -np.sum([p*np.log2(p) if p > 0 else 0 for p in distribution])

    def get_entropy(self, sentence):
        """
        finds the shannon entropy of predicting the word following sentence
        """
        all_probs = self.get_continuation_probs(sentence)
        return self._get_entropy(all_probs)

    def get_surprisal_entropy_ratio(self, sentence, word):
        "gets ratio betwen surprisal and entropy at the end of the sentence for a given word"
        all_probs = self.get_continuation_probs(sentence)
        word_index = self.text_encoder.encode([word])[0]
        entropy = self._get_entropy(all_probs)
        surprisal = self._get_surprisal(all_probs, word_index)
        return surprisal/entropy

    ####################################################################
    # Same as above but for part of speech
    ####################################################################
    def get_surprisal_pos(self, sentence, pos, k=1000):
        """
        Because the language model is not a POS tagger, we cannot directly
        calculate the surprisal of the pos from a full probability distribution,
        instead we have to use the degenerate distribution computed from the 
        top k most probable POS continuations

        sentence is full sentence
        pos is pos we want to get surprisal of
        k is how many possible continuations to check
        """
        pos_tags, pos_tag_probs = self.get_pos_continuations(sentence, k)
        pos_index = pos_tags.index(pos) # assume the POS we want is in the list somewhere...
        return self._get_surprisal(pos_tag_probs, pos_index)

        
    def get_entropy_pos(self, sentence, k=1000):
        """
        Disclaimer about degenerate distribution same as above
        """
        pos_tags, pos_tag_probs = self.get_pos_continuations(sentence, k)
        return self._get_entropy(pos_tag_probs)


    
    #####################################################################
    # Gets all of the above metrics for every word in a single sentence #
    #####################################################################
    def get_surprisal_sentence(self, sentence, prepend=None, start=1):
        """
        A little uglier, but perhaps faster

        """
        surprisals = []
        sent_enc = self.text_encoder.encode([sentence])[0]  # 1-d list of encoder indices
        if prepend is not None:
            sent_enc = prepend + sent_enc
        sent_dec = [self.text_encoder.decoder[ind] for ind in sent_enc]

        sent_batch = None

        # if you run the language model with the whole sentence the outputs for each
        # word are the probabilities for the next word!
        sent_batch = self.make_batch([sent_enc])
        sent_tensor = self.lm_model(sent_batch)
        for i in range(start, len(sent_enc)):
            surprisals.append(-np.log2(sent_tensor[:,i-1,sent_enc[i]].item()))
        return surprisals, sent_dec
        
    def get_s_h_shr_sentence(self, sentence, prepend=None, start=1):
        """
        calculates the surprisal, entropy, and surprisal-entropy-ratio at each word (defined by bpe)
        in the sentence

        returns, in order
        1. The list of surprisals (len(sentence) - 1)
        2. The list of entropies  (len(sentence) - 1)
        3. The list of ratios between surprisals and entropies (len(sentence) - 1)
        4. The decoded tokens that are used by the BPE encoder wrapper
        """
        surprisals, entropies, surprisal_entropy_ratios = [],[],[]
        sent_enc = self.text_encoder.encode([sentence])[0]  # 1-d list of encoder indices
        if prepend is not None:
            sent_enc = prepend + sent_enc
        sent_dec = [self.text_encoder.decoder[ind] for ind in sent_enc]

        # start = max(0, min(1, start)) # doesn't work because language model needs to condition on something
        start = 1

        for i in range(start, len(sent_enc)):
            partial_sent_enc = [sent_enc[:i]]
            cont_tensor = self._get_continuation_tensor(partial_sent_enc)
            partial_probs = self.tensor_to_probs(cont_tensor)

            surprisals.append(self._get_surprisal(partial_probs, sent_enc[i]))
            entropies.append(self._get_entropy(partial_probs))
            surprisal_entropy_ratios.append(surprisals[-1]/entropies[-1])

        return surprisals, entropies, surprisal_entropy_ratios, sent_dec
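A short usage sketch for the class above (the model files under model/ are assumed to be in place, as in its __init__):

analyzer = SurprisalAnalyzer()
words, probs = analyzer.get_continuations("The cat sat on the", k=5)
print(words)
print(analyzer.get_surprisal("The cat sat on the", "mat"))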
Example #18
def main(args):
    init(args)

    # Constants
    n_ctx = args.n_ctx
    save_dir = os.path.join(args.output_dir, args.experiment_name, "checkpoints")
    desc = args.desc
    data_dir = args.data_dir
    log_dir = os.path.join(args.output_dir, args.experiment_name, "logs")
    train_log_interval = args.train_log_interval
    val_log_interval = args.val_log_interval
    beam = args.beam
    gen_len = args.gen_len
    k = args.k
    decoding_strategy = args.decoding_strategy
    accum_iter = args.accum_iter

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)
    logger = Logger(log_dir)

    text_encoder = TextEncoder(args.encoder_path, args.vocab_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)
    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3

    print("Loading dataset...")
    train_loader = get_loader(os.path.join(data_dir, "train_encoded.jsonl"), args.n_batch, encoder, num_workers=3, shuffle=True)
    val_loader = get_loader(os.path.join(data_dir, "val_encoded.jsonl"), n_gpu, encoder, num_workers=0, shuffle=False, max_size=args.num_val_examples)
    print("Train length: {}, Validation length: {}".format(len(train_loader), len(val_loader)))

    vocab = n_vocab + n_special + n_ctx
    n_updates_total = (len(train_loader) // args.accum_iter) * (args.num_epochs_dat + args.num_epochs_ft)

    dh_model = LMModel(args, vocab=vocab, n_ctx=n_ctx, doc_embed=args.doc_model)

    criterion = nn.CrossEntropyLoss(reduction="none")
    model_opt = OpenAIAdam(dh_model.parameters(),
                           lr=args.lr,
                           schedule=args.lr_schedule,
                           warmup=args.lr_warmup,
                           t_total=n_updates_total,
                           b1=args.b1,
                           b2=args.b2,
                           e=args.e,
                           l2=args.l2,
                           vector_l2=args.vector_l2,
                           max_grad_norm=args.max_grad_norm)

    lm_loss = LMLoss(criterion)
    summary_loss = SummaryLoss(criterion)

    print("Loading Model")
    if args.use_pretrain:
        load_openai_pretrained_model(dh_model.transformer, n_ctx=n_ctx, n_special=n_special, path="./model/", path_names="./")
    start_iter, running_loss = load_checkpoint(args.checkpoint, dh_model, model_opt, vocab, n_ctx)

    dh_model.to(device)
    dh_model = DataParallelModel(dh_model)
    lm_loss = DataParallelCriterion(lm_loss)
    summary_loss = DataParallelCriterion(summary_loss)

    for i in range(args.num_epochs_dat):
        start_iter, running_loss = run_epoch(start_iter, running_loss, dh_model, lm_loss, model_opt, train_loader, val_loader, train_log_interval, val_log_interval, device, beam, gen_len, k, decoding_strategy, accum_iter, "DAT Training Epoch [{}/{}]".format(i + 1, args.num_epochs_dat), save_dir, logger, text_encoder, show_progress=args.show_progress, summary_loss=summary_loss)
    for i in range(args.num_epochs_ft):
        start_iter, running_loss = run_epoch(start_iter, running_loss, dh_model, summary_loss, model_opt, train_loader, val_loader, train_log_interval, val_log_interval, device, beam, gen_len, k, decoding_strategy, accum_iter, "FT Training Epoch [{}/{}]".format(i + 1, args.num_epochs_ft), save_dir, logger, text_encoder, show_progress=args.show_progress)
Example #19
N_EMBD = 768
N_HEAD = 12
N_LAYER = 12
EMBD_PDROP = 0.1
ATTN_PDROP = 0.1
RESID_PDROP = 0.1
AFN = 'gelu'
ENCODER_PATH = 'model/encoder_bpe_40000.json'
BPE_PATH = 'model/vocab_40000.bpe'
N_TRANSFER = 12

random.seed(SEED)
np.random.seed(SEED)
tf.set_random_seed(SEED)

TEXT_ENCODER = TextEncoder(ENCODER_PATH, BPE_PATH)
ENCODER = TEXT_ENCODER.encoder
N_VOCAB = len(TEXT_ENCODER.encoder)

# parser.add_argument('--n_batch', type=int, default=8)
# parser.add_argument('--n_gpu', type=int, default=4)
# parser.add_argument('--lm_coef', type=float, default=0.5)


def transform_texts(list_of_texts):
    tokens = TEXT_ENCODER.encode(list_of_texts, verbose=False)
    n_batch = len(tokens)
    xmb = np.zeros((n_batch, N_CTX, 2), dtype=np.int32)
    mmb = np.zeros((n_batch, N_CTX), dtype=np.float32)
    for i, x in enumerate(tokens):
        x1 = x[:N_CTX]
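The snippet above is cut off inside the loop. A hedged sketch of the pattern it appears to be building, consistent with the other examples on this page (token ids in channel 0, a mask over real tokens, position ids in channel 1); this is an illustration, not the original file's code:

def transform_texts_sketch(list_of_texts, n_ctx=512):
    tokens = TEXT_ENCODER.encode(list_of_texts, verbose=False)
    xmb = np.zeros((len(tokens), n_ctx, 2), dtype=np.int32)
    mmb = np.zeros((len(tokens), n_ctx), dtype=np.float32)
    for i, x in enumerate(tokens):
        x1 = x[:n_ctx]
        xmb[i, :len(x1), 0] = x1  # BPE token ids in channel 0
        mmb[i, :len(x1)] = 1      # mask marks the real (non-padded) positions
    xmb[:, :, 1] = np.arange(N_VOCAB, N_VOCAB + n_ctx)  # position ids in channel 1
    return xmb, mmb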
Example #20
def fever_app(caller):


    global db, tokenizer, text_encoder, encoder, X_train, M_train, X, M, Y_train, Y,params,sess, n_batch_train, db_file, \
        drqa_index, max_page, max_sent, encoder_path, bpe_path, n_ctx, n_batch, model_file
    global n_vocab,n_special,n_y,max_len,clf_token,eval_lm_losses,eval_clf_losses,eval_mgpu_clf_losses,eval_logits, \
        eval_mgpu_logits,eval_logits

    LogHelper.setup()
    logger = LogHelper.get_logger("papelo")

    logger.info("Load config")
    config = json.load(open(os.getenv("CONFIG_FILE","configs/config-docker.json")))
    globals().update(config)
    print(globals())

    logger.info("Set Seeds")
    random.seed(42)
    np.random.seed(42)
    tf.set_random_seed(42)

    logger.info("Load FEVER DB")
    db = FeverDocDB(db_file)
    retrieval = TopNDocsTopNSents(db, max_page, max_sent, True, False, drqa_index)

    logger.info("Init word tokenizer")
    tokenizer = SimpleWordSplitter()

    # Prepare text encoder
    logger.info("Load BPE Text Encoder")
    text_encoder = TextEncoder(encoder_path, bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    n_y = 3
    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3
    max_len = n_ctx // 2 - 2

    n_batch_train = n_batch

    logger.info("Create TF Placeholders")
    X_train = tf.placeholder(tf.int32, [n_batch, 1, n_ctx, 2])
    M_train = tf.placeholder(tf.float32, [n_batch, 1, n_ctx])
    X = tf.placeholder(tf.int32, [None, 1, n_ctx, 2])
    M = tf.placeholder(tf.float32, [None, 1, n_ctx])

    Y_train = tf.placeholder(tf.int32, [n_batch])
    Y = tf.placeholder(tf.int32, [None])

    logger.info("Model Setup")
    eval_logits, eval_clf_losses, eval_lm_losses = model(X, M, Y, train=False, reuse=None)
    eval_mgpu_logits, eval_mgpu_clf_losses, eval_mgpu_lm_losses = mgpu_predict(X_train, M_train, Y_train)

    logger.info("Create TF Session")
    params = find_trainable_variables('model')

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=float(os.getenv("TF_GPU_MEMORY_FRACTION","0.5")))
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options))
    sess.run(tf.global_variables_initializer())
    sess.run([p.assign(ip) for p, ip in zip(params, joblib.load(model_file))])

    logger.info("Ready")

    def predict(instances):
        predictions = []

        for instance in tqdm(instances):
            sents = retrieval.get_sentences_for_claim(instance["claim"])
            found_evidence = resolve_evidence(sents)
            instance["tokenized_claim"] = " ".join(map(lambda x: x.text, tokenizer.split_words(instance["claim"])))

            sub_instances = make_instances(instance, found_evidence)
            sub_predictions = predict_sub_instances(text_encoder, sub_instances)

            refute_evidence =  [i for i, x in enumerate(sub_predictions) if x == 2]
            support_evidence = [i for i, x in enumerate(sub_predictions) if x == 0]

            if len(support_evidence):
                predicted_label = "SUPPORTS"
                predicted_evidence = [[found_evidence[i]["title"], found_evidence[i]["line_number"]] for i in support_evidence]
            elif len(refute_evidence):
                predicted_label = "REFUTES"
                predicted_evidence = [[found_evidence[i]["title"], found_evidence[i]["line_number"]] for i in refute_evidence]
            else:
                predicted_label = "NOT ENOUGH INFO"
                predicted_evidence = []

            predictions.append({"predicted_label":predicted_label,
                                "predicted_evidence": predicted_evidence})




        return predictions

    return caller(predict)
Example #21
def main(args):
    # Constants
    n_ctx = args.n_ctx
    desc = args.desc

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)

    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3

    print("Loading dataset...")
    test_loader = get_loader(args.data_file,
                             args.n_batch,
                             encoder,
                             num_workers=1,
                             shuffle=False,
                             subset=args.subset)

    vocab = n_vocab + n_special + n_ctx
    dh_model = LMModel(args,
                       vocab=vocab,
                       n_ctx=n_ctx,
                       doc_embed=args.doc_model)

    print("Loading model...")
    load_openai_pretrained_model(dh_model.transformer,
                                 n_ctx=n_ctx,
                                 n_special=n_special,
                                 path="./model/",
                                 path_names="./")
    if args.checkpoint != "none":
        checkpoint = torch.load(args.checkpoint, map_location='cpu')
        state_dict = checkpoint["state_dict"]
        for key in list(state_dict.keys()):
            state_dict[key[7:]] = state_dict[key]
            del state_dict[key]
        pos_emb_mask = torch.zeros(1, 1, vocab)
        pos_emb_mask[:, :, -n_ctx:] = -1e12
        state_dict['pos_emb_mask'] = pos_emb_mask
        dh_model.load_state_dict(state_dict)

    dh_model.to(device)
    dh_model = DataParallelModel(dh_model)

    stop_words = []
    if args.stop_words is not None:
        with open(args.stop_words) as f:
            for line in f:
                stop_words.append(line)
    evaluate_model(dh_model, test_loader, text_encoder, device, args.beam,
                   args.gen_len, args.k, args.decoding_strategy,
                   args.save_file, args.gen_dir, args.tgt_dir, args.max_len,
                   stop_words, args)
Example #22
    parser.add_argument('--b1', type=float, default=0.9)
    parser.add_argument('--b2', type=float, default=0.999)
    parser.add_argument('--e', type=float, default=1e-8)

    args = parser.parse_args()
    print(args)
    globals().update(
        args.__dict__
    )  ## https://thepythonguru.com/python-builtin-functions/globals/
    random.seed(seed)
    np.random.seed(seed)
    tf.set_random_seed(seed)

    logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)),
                          **args.__dict__)  ## logging utility
    text_encoder = TextEncoder(encoder_path, bpe_path)  ## create the BPE encoder
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    ## build the train, valid and test sets with the TextEncoder (BPE)
    (trX1, trX2, trX3,
     trY), (vaX1, vaX2, vaX3, vaY), (teX1, teX2, teX3) = encode_dataset(
         rocstories(data_dir),
         encoder=text_encoder)  ## loading the train, valid and test data
    n_y = 2
    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3
    max_len = n_ctx // 2 - 2
Example #23
    parser.add_argument('--encoder_path',
                        type=str,
                        default=pretrained_model_path +
                        '/encoder_bpe_40000.json')
    parser.add_argument('--bpe_path',
                        type=str,
                        default=pretrained_model_path + '/vocab_40000.bpe')

    args = parser.parse_args()
    print(args)

    # Constants
    n_ctx = args.n_ctx

    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
    #encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    tokens_regular = n_vocab
    token_start = text_encoder.encoder['_start_'] = len(
        text_encoder.encoder)  # Last number (increments)
    token_delim = text_encoder.encoder['_delimiter_'] = len(
        text_encoder.encoder)  # Last number (increments)
    token_clf = text_encoder.encoder['_classify_'] = len(
        text_encoder.encoder)  # Last number (increments)

    tokens_special = len(
        text_encoder.encoder) - tokens_regular  # Number of extra tokens

    vocab_count = tokens_regular + tokens_special
        'embd_pdrop': 0.1,
        'attn_pdrop': 0.1,
        'resid_pdrop': 0.1,
        'afn': 'gelu',
        'clf_pdrop': 0.1
    })

    args = DEFAULT_CONFIG

    encoder = pickle.load(open('vect.p', 'rb')).vocabulary_

    #device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    #n_gpu = torch.cuda.device_count()
    #print("device", device, "n_gpu", n_gpu)

    text_encoder = TextEncoder()
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    x = pd.read_csv('../notes_small.csv').iloc[:200]
    x['NOTE_TEXT'] = x['NOTE_TEXT'].apply(u2.cleanNotes)

    seq = text_encoder.encode(x['NOTE_TEXT'])
    seq = [s[:64] if len(s) > 64 else s for s in seq]
    seq = sorted(seq, key=lambda x: len(x))

    #Setup Model
    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    parser.add_argument('--bpe_path', type=str, default='model/vocab_40000.bpe')
    parser.add_argument('--n_transfer', type=int, default=12)
    parser.add_argument('--lm_coef', type=float, default=0.5)
    parser.add_argument('--b1', type=float, default=0.9)
    parser.add_argument('--b2', type=float, default=0.999)
    parser.add_argument('--e', type=float, default=1e-8)

    args = parser.parse_args()
    print(args)
    globals().update(args.__dict__)
    random.seed(seed)
    np.random.seed(seed)
    tf.set_random_seed(seed)

    logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)), **args.__dict__)
    text_encoder = TextEncoder(encoder_path, bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    #(trX1, trX2, trX3, trY), (vaX1, vaX2, vaX3, vaY), (teX1, teX2, teX3) = encode_dataset(rocstories(data_dir), encoder=text_encoder)
    #enco_ry = ruoyao(data_dir)
    #(trX1,trX2,tyY), (vaX1, vaX2, vaY), (teX1, teX2) = ruoyao(data_dir)
    #print(trX1[0])
    (trX1,trX2,trY), (vaX1, vaX2, vaY), (teX1, teX2, teY) = encode_dataset(ruoyao(data_dir), encoder=text_encoder)
    n_y = 2
    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3
    max_len = n_ctx//2-2
def main(args):
    init(args)

    # Constants
    n_ctx = args.n_ctx
    data_dir = args.data_dir

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)

    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)
    text_encoder.decoder[len(encoder)] = '_start_'
    encoder['_start_'] = len(encoder)
    text_encoder.decoder[len(encoder)] = '_delimiter_'
    encoder['_delimiter_'] = len(encoder)
    text_encoder.decoder[len(encoder)] = '_classify_'
    encoder['_classify_'] = len(encoder)

    n_special = 3  # XD: useless for language modeling task
    vocab = n_vocab + n_special + n_ctx

    lm_model = LMModel(args,
                       vocab,
                       n_ctx,
                       return_probs=True,
                       doc_embed=args.doc_model)
    load_openai_pretrained_model(lm_model.transformer,
                                 n_ctx=n_ctx,
                                 n_special=n_special)
    if args.checkpoint != "none":
        checkpoint = torch.load(args.checkpoint, map_location='cpu')
        state_dict = checkpoint["state_dict"]
        for key in list(state_dict.keys()):
            state_dict[key[7:]] = state_dict[key]
            del state_dict[key]
        pos_emb_mask = torch.zeros(1, 1, vocab)
        pos_emb_mask[:, :, -n_ctx:] = -1e12
        state_dict['pos_emb_mask'] = pos_emb_mask
        lm_model.load_state_dict(state_dict)
    lm_model.to(device)
    lm_model = DataParallelModel(lm_model)

    train_bar = get_loader(os.path.join(data_dir, "val_encoded.jsonl"),
                           n_gpu,
                           encoder,
                           num_workers=1,
                           shuffle=True,
                           max_size=args.n_iter)
    srcs, hyps, refs = [], [], []
    with torch.no_grad():
        lm_model.eval()
        for i, (pad_output, mask_output) in enumerate(tqdm(train_bar), 1):
            src_strs, tgt_strs, gen_strs = generate_outputs(
                lm_model, pad_output, mask_output, text_encoder, device,
                args.beam, args.gen_len, args.k, args.decoding_strategy)
            srcs.extend(src_strs)
            hyps.extend(gen_strs)
            refs.extend(tgt_strs)

    for i in range(len(hyps)):
        print("*" * 50)
        print("Source: {}".format(srcs[i]))
        print('Hypothesis: {}'.format(hyps[i]))
        print("Reference: {}".format(refs[i]))