# --- Example 1 (scraped snippet; original score: 0) ---
    def infilling_ngram(self, context: str):
        """Fill the single ' _' blank in *context* with an n-gram infill.

        The first ' _' marker is replaced by the '<|infill_ngram|>' control
        token and the ILM model generates five candidate infills.

        Args:
            context: Input text containing a ' _' blank marker.

        Returns:
            The module-level ``result`` list, cleared and refilled with the
            five decoded candidate strings.

        Raises:
            ValueError: if *context* contains no ' _' blank token
                (``list.index`` fails).
        """
        result.clear()
        # Reuse the vocabulary map and model loaded once on the instance
        # (as the sibling infilling_word does) instead of re-reading the
        # pickle and re-loading GPT-2 weights on every call.
        context_ids = ilm.tokenize_util.encode(context, tokenizer)
        _blank_id = ilm.tokenize_util.encode(' _', tokenizer)[0]
        # Swap the first blank for the ngram-infill control token.
        context_ids[context_ids.index(
            _blank_id)] = self.additional_tokens_to_ids['<|infill_ngram|>']

        generated = infill_with_ilm(self.model,
                                    self.additional_tokens_to_ids,
                                    context_ids,
                                    num_infills=5)
        for g in generated:
            result.append(str(ilm.tokenize_util.decode(g, tokenizer)))
        return result
# --- Example 2 (scraped snippet; original score: 0) ---
    def infilling_word(self, context: str, order: int, mask: 'list[str]'):
        """Fill each ' _' blank in *context* per the corresponding *mask* entry.

        Blanks are replaced left-to-right: the i-th blank becomes the
        '<|infill_word|>' or '<|infill_sentence|>' control token depending on
        whether ``mask[i]`` is 'word' or 'sent'.

        Args:
            context: Input text containing one ' _' marker per blank.
            order: Number of candidate infills to generate.
            mask: One entry per blank, each 'word' or 'sent'.
                (Annotation fixed: the original said ``str``, but entries are
                compared against multi-character strings.)

        Returns:
            The module-level ``result`` list, cleared and refilled with the
            decoded candidate strings.

        Raises:
            IndexError: if *mask* has fewer entries than there are blanks.
        """
        result.clear()
        context_ids = ilm.tokenize_util.encode(context, tokenizer)
        _blank_id = ilm.tokenize_util.encode(' _', tokenizer)[0]
        # Walk the token ids, consuming one mask entry per blank found.
        count_mask = 0
        for i, tok in enumerate(context_ids):
            if tok == _blank_id:
                if mask[count_mask] == 'word':
                    context_ids[i] = self.additional_tokens_to_ids[
                        '<|infill_word|>']
                elif mask[count_mask] == 'sent':
                    context_ids[i] = self.additional_tokens_to_ids[
                        '<|infill_sentence|>']
                count_mask += 1

        generated = infill_with_ilm(self.model,
                                    self.additional_tokens_to_ids,
                                    context_ids,
                                    num_infills=order)
        for g in generated:
            result.append(str(ilm.tokenize_util.decode(g, tokenizer)))
        return result
# --- Example 3 (scraped snippet; original score: 0) ---
# Script fragment: replace two word blanks, sample infills, then scan a
# GLUE SST-2 alternatives file.
# NOTE(review): `context_ids`, `tokenizer`, `model`, and
# `additional_tokens_to_ids` are defined earlier in the original script
# (not visible in this chunk).

# Replace blanks with appropriate tokens from left to right
_blank_id = ilm.tokenize_util.encode(' _', tokenizer)[0]
# `.index` always finds the leftmost remaining blank; each assignment below
# consumes one, so the two statements replace the first two blanks.
# Raises ValueError if fewer than two blanks are present.
context_ids[context_ids.index(
    _blank_id)] = additional_tokens_to_ids['<|infill_word|>']
context_ids[context_ids.index(
    _blank_id)] = additional_tokens_to_ids['<|infill_word|>']
#context_ids[context_ids.index(_blank_id)] = additional_tokens_to_ids['<|infill_ngram|>']
#context_ids[context_ids.index(_blank_id)] = additional_tokens_to_ids['<|infill_sentence|>']
#context_ids[context_ids.index(_blank_id)] = additional_tokens_to_ids['<|infill_sentence|>']
#context_ids[context_ids.index(_blank_id)] = additional_tokens_to_ids['<|infill_sentence|>']
# Sanity check: show the context with blanks swapped for control tokens.
print(ilm.tokenize_util.decode(context_ids, tokenizer))

from ilm.infer import infill_with_ilm

# Sample 10 candidate infillings and print each, separated by a rule.
generated = infill_with_ilm(model,
                            additional_tokens_to_ids,
                            context_ids,
                            num_infills=10)
for g in generated:
    print('-' * 80)
    print(ilm.tokenize_util.decode(g, tokenizer))

# Presumably filled further down in the original script — this chunk only
# initializes it. TODO(review): confirm against the full file.
blankCandidates = []

with open(
        f"/u/scr/mhahn/PRETRAINED/GLUE/glue_data/SST-2/dev_alternatives_c_sentBreak_new_finetuned_large.tsv",
        "r") as inFile:
    for line in inFile:
        if line.startswith("####"):
            # `next(inFile)` advances the same iterator the for-loop uses:
            # the line right after "####" is skipped, and the one after that
            # is split on spaces as the tokenized sentence.
            next(inFile)
            tokenized = next(inFile).strip().split(" ")
            print("TOK", tokenized)