def create_openai_lm_head(self, config, input_ids, token_type_ids, position_ids,
                          mc_labels, lm_labels, mc_token_ids):
    model = OpenAIGPTLMHeadModel(config)
    model.eval()
    loss = model(input_ids, position_ids, token_type_ids, lm_labels)
    lm_logits = model(input_ids, position_ids, token_type_ids)
    outputs = {
        "loss": loss,
        "lm_logits": lm_logits,
    }
    return outputs
Example #2
def main():
  # 3 examples
  train_dataset = 'small brown fox jumps over the lazy dog\n' \
                  'small brown fox jumps over the lazy dog\n' \
                  'small brown fox jumps over the lazy dog\n'
  tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt',
                                                 special_tokens=[])
  tokenized = [tokenizer.tokenize(t) for t in train_dataset.strip().split('\n')]

  encoded = [tokenizer.convert_tokens_to_ids(t) for t in tokenized]  # 3x8
  dataset = TensorDataset(torch.tensor(encoded))
  sampler = SequentialSampler(dataset)
  dataloader = DataLoader(dataset, sampler=sampler, batch_size=1)
  model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')

  optimizer = torch.optim.SGD(model.parameters(), lr=0.0001, momentum=0.9)

  batch = next(iter(dataloader))
  batch = batch[0]  # TensorDataset yields a tuple of tensors per item, so each batch arrives as [tensor]

  for i in range(20):
    loss = model(input_ids=batch, lm_labels=batch)
    print(loss.detach().numpy())
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
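The [batch] wrapping noted above is standard PyTorch behaviour: TensorDataset returns a tuple of tensors per item even when it stores a single tensor, and the default collate function keeps that tuple, so each batch arrives as a one-element list. A small standalone sketch (illustration only, not part of the example above):

import torch
from torch.utils.data import TensorDataset, DataLoader

data = torch.arange(12).reshape(3, 4)              # 3 "sentences" of 4 token ids
dataset = TensorDataset(data)
print(dataset[0])                                  # (tensor([0, 1, 2, 3]),) -- a 1-tuple
batch = next(iter(DataLoader(dataset, batch_size=1)))
print(type(batch), len(batch))                     # <class 'list'> 1
print(batch[0].shape)                              # torch.Size([1, 4])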
Example #3
def perplexity_filtering(sentences_df, threshold=1000, sentence_col="sentence"):
    """
    Function used to filter sentences by perplexity

    ---

    **Arguments**\n
    `sentences_df` (DataFrame): DataFrame with sentences; must contain a *sentence* column.\n
    `threshold` (int): Perplexity threshold used for filtering. Default value = 1000.\n
    `sentence_col` (String): Name of the sentence column in data frame. Default value = "sentence".

    ---

    **Returns**\n
    `sentences_df` (DataFrame): DataFrame filtered by perplexity.
    """

    # Load pre-trained model (weights)
    model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
    model.eval()
    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')

    def score(sentence):
        tokenize_input = tokenizer.tokenize(sentence)
        tensor_input = torch.tensor([tokenizer.convert_tokens_to_ids(tokenize_input)])
        loss = model(tensor_input, lm_labels=tensor_input)
        return math.exp(loss.item())  # perplexity = exp(mean token cross-entropy)

    cols = list(sentences_df)  # original columns, so the helper 'perplexity' column can be dropped at the end
    sentences_df['perplexity'] = sentences_df[sentence_col].apply(lambda x: score(x) if len(re.sub('[^0-9a-zA-Z ]', '', x)) > 0 else -1.0)
    return sentences_df[(sentences_df['perplexity'] <= threshold) & (sentences_df['perplexity'] != -1.0)][cols]
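A hypothetical usage sketch for the function above (the DataFrame contents and the pandas import are assumptions; the function itself expects torch, math, re and the pytorch_pretrained_bert classes to be importable):

import pandas as pd

df = pd.DataFrame({"sentence": [
    "the quick brown fox jumps over the lazy dog",
    "dog lazy the over jumps fox brown quick the",
]})
filtered = perplexity_filtering(df, threshold=1000)
print(filtered)  # keeps only rows whose GPT perplexity is at most 1000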
Example #4
    def __init__(self):
        self.lm_model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
        self.lm_model.eval()
        self.cuda = torch.cuda.is_available()
        if self.cuda:
            self.lm_model = self.lm_model.cuda()
        self.tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
Example #5
def main():
    global tokenizer, model

    train_dataset = 'the quick brown fox jumps over the lazy dog'
    tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
    tokenized = [tokenizer.tokenize(train_dataset)]

    # [[481, 2279, 2507, 8573, 11670, 715, 481, 8447, 2585]]
    encoded = [tokenizer.convert_tokens_to_ids(t) for t in tokenized]
    model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')

    optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

    batch = torch.tensor(encoded)

    start_words = ['the']
    start_tokens = [tokenizer.convert_tokens_to_ids(w) for w in start_words]

    for i in range(20):
        loss = model(input_ids=batch, lm_labels=batch)
        perplexity = math.exp(loss.item())
        print('%5.2f -- %s' % (perplexity, decode(start_tokens)))

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
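`decode` is not defined in this snippet; a minimal greedy-decoding stand-in (an assumption for illustration, not the original helper) that reuses the global `model` and `tokenizer` could look like:

def decode(start_tokens, max_len=10):
    # Greedily extend the prompt one token at a time with the LM head.
    ids = list(start_tokens)
    with torch.no_grad():
        for _ in range(max_len):
            logits = model(torch.tensor([ids]))        # (1, seq_len, vocab)
            ids.append(int(torch.argmax(logits[0, -1, :])))
    # Rough detokenization: BPE pieces still carry their '</w>' markers.
    return ' '.join(tokenizer.convert_ids_to_tokens(ids))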
Example #6
    def __init__(self,
                 opt,
                 vocab_size,
                 pad_idx,
                 start_idx,
                 end_idx,
                 special_token_len,
                 dict,
                 longest_label=1,
                 length_penalty=1.0,
                 diversity_groups=1,
                 diversity_coef=0.2,
                 annealing_topk=None,
                 annealing=0,
                 sample=False,
                 temperature=0.7):
        super().__init__()
        # original vocab size plus special vocab
        self.vocab_size = vocab_size + 40478
        self.token_type_dict = {}
        # up to 30 extra token-type ids: 29 distance embeddings plus one prediction-turn embedding
        for i in range(29):
            self.token_type_dict['dis' + str(i)] = self.vocab_size + i
        # 'pred' marks the prediction-turn embedding
        self.token_type_dict['pred'] = self.vocab_size + 29
        # the remaining 30 special slots cover these distance/turn embeddings
        special_token_len += 30
        self.vocab_size += 29
        # Treat input and output as one sequence: given the input as context, generate the next sentence.
        self.transformer_module = OpenAIGPTLMHeadModel.from_pretrained(
            'openai-gpt', num_special_tokens=special_token_len)
        self.pad_idx = pad_idx
        self.start_idx = start_idx
        self.end_idx = end_idx
        self.register_buffer('start_tensor', torch.LongTensor([start_idx]))
        self.register_buffer('pred_turn_tensor',
                             torch.LongTensor([self.token_type_dict['pred']]))
        # default beam equal to 1
        self.beam_size = opt.get('beam_size', 1)
        self.rank = opt.get('rank_candidates', False)

        self.use_turn = opt.get('encoder_turn_use', False)
        self.use_dis = opt.get('encoder_dis_use', False)
        # longest label
        self.longest_label = min(longest_label,
                                 opt.get('decode_max_seq_len', 100))
        self.length_penalty_coef = length_penalty
        self.diversity_groups = diversity_groups
        self.diversity_coef = diversity_coef
        self.annealing_topk = annealing_topk
        self.annealing = annealing
        self.temperature = temperature
        self.topk = opt.get('top_k', 0)
        self.dict = dict
        self.no_repeat_ngram_size = 2
        self.dropout = nn.Dropout(p=0.2)
        self.linear = nn.Linear(768, 2, bias=False)
        nn.init.normal_(self.linear.weight, std=0.02)
Example #7
    def __init__(self, perplexity_threshold=137):
        ### Lang Model:
        # Load Language Model
        # Load pre-trained model (weights)
        self.model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
        self.model.eval()
        # Load pre-trained model tokenizer (vocabulary)
        self.tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')

        ### For clarity error:
        self.perplexity_threshold = perplexity_threshold
Example #8
    def _load_model(self):
        """ Helper function for loading model and tokenizer in one shot
        and assigning as class attributes

        """
        # Load tokenizer and model within `main` function
        ckpt = download_pretrained_model()
        print("Model location:", ckpt)

        self.tokenizer = OpenAIGPTTokenizer.from_pretrained(ckpt)
        self.model = OpenAIGPTLMHeadModel.from_pretrained(ckpt)
        print("Tokenizer and model loaded...")
Example #9
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache")
    parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2, help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")

    parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=20, help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--temperature", type=int, default=0.7, help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        args.model_checkpoint = download_pretrained_model()

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_checkpoint)
    model = OpenAIGPTLMHeadModel.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    model.eval()

    logger.info("Sample a personality")
    personalities = get_dataset_personalities(tokenizer, args.dataset_path, args.dataset_cache)
    personality = random.choice(personalities)
    logger.info("Selected personality: %s", tokenizer.decode(chain(*personality)))

    history = []
    while True:
        raw_text = input(">>> ")
        while not raw_text:
            print('Prompt should not be empty!')
            raw_text = input(">>> ")
        history.append(tokenizer.encode(raw_text))
        with torch.no_grad():
            out_ids = sample_sequence(personality, history, tokenizer, model, args)
        history.append(out_ids)
        history = history[-(2*args.max_history+1):]
        out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
        print(out_text)
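`sample_sequence`, `download_pretrained_model` and `get_dataset_personalities` come from the surrounding project and are not shown here. The top-k / top-p options above are typically applied to the next-token logits before sampling; a common filtering sketch along those lines (an illustration, not the project's exact code):

import torch
import torch.nn.functional as F

def top_filtering(logits, top_k=0, top_p=0.9, filter_value=-float('inf')):
    # logits: 1-D tensor of next-token logits.
    if top_k > 0:
        # Mask everything below the k-th best logit.
        kth_best = torch.topk(logits, top_k)[0][..., -1, None]
        logits[logits < kth_best] = filter_value
    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        # Remove tokens outside the smallest set whose cumulative probability exceeds top_p.
        sorted_indices_to_remove = cumulative_probs > top_p
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = False
        logits[sorted_indices[sorted_indices_to_remove]] = filter_value
    return logits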
Example #10
    def __init__(self, args):
        super().__init__()

        if args.gpt_model_dir is not None:
            # load the GPT model from a local directory
            gpt_model_name = str(args.gpt_model_dir) + "/"
            dict_file = gpt_model_name
            print("loading Open AI GPT model from {}".format(gpt_model_name))
        else:
            # load GPT model from huggingface cache
            gpt_model_name = args.gpt_model_name
            dict_file = gpt_model_name

        # Load pre-trained model tokenizer (vocabulary)
        self.tokenizer = OpenAIGPTTokenizer.from_pretrained(dict_file)

        # GPT represents BPE pieces differently from BERT. In GPT, word-final
        # pieces carry a '</w>' suffix, while pieces that must be continued are
        # written as-is. In BERT, word-initial pieces are written as-is, while
        # continuation pieces carry a '##' prefix. There is no one-to-one
        # conversion, but we can at least make pieces that can stand as a full
        # word look the same in both vocabularies.
        # Note that after this remapping, tokenizer.convert_tokens_to_ids
        # no longer works with our vocabulary.
        def convert_word(word):
            if word == OPENAI_UNK:
                return word
            if word == '\n</w>':
                # Redefine symbol EOS to improve visualization.
                return OPENAI_EOS
            return word[:-4] if word.endswith('</w>') else f'{word}##'

        _, gpt_vocab = zip(*sorted(self.tokenizer.decoder.items()))
        self.vocab = [convert_word(word) for word in gpt_vocab]
        self._init_inverse_vocab()

        # Get UNK symbol as it's written in the origin GPT vocab.
        unk_index = self.inverse_vocab[OPENAI_UNK]
        self.unk_symbol = self.tokenizer.decoder[unk_index]

        # Load pre-trained model (weights)
        self.gpt_model = OpenAIGPTLMHeadModel.from_pretrained(gpt_model_name)
        self.gpt_model.eval()
        print(self.gpt_model.config)

        # Sanity check.
        assert len(self.vocab) == self.gpt_model.config.vocab_size
        assert 0 == self.gpt_model.config.n_special

        self.eos_id = self.inverse_vocab[OPENAI_EOS]
        self.model_vocab = self.vocab
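A small standalone illustration of that remapping on hand-picked BPE pieces (the constant values and example tokens are assumptions, shown only to make the convention concrete):

OPENAI_UNK = '<unk>'
OPENAI_EOS = '<eos>'

def convert_word(word):
    if word == OPENAI_UNK:
        return word
    if word == '\n</w>':
        return OPENAI_EOS
    return word[:-4] if word.endswith('</w>') else f'{word}##'

for piece in ['dog</w>', 'play', 'ing</w>', '<unk>']:
    print(piece, '->', convert_word(piece))
# dog</w> -> dog      (may end a word, written plainly)
# play    -> play##   (must be continued, marked BERT-style)
# ing</w> -> ing
# <unk>   -> <unk>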
Example #11
    def __init__(self, opt, shared=None):
        super(TransformerAgent, self).__init__(opt, shared)

        # keep most commands identical to the interact.py script
        args = AttrDict(opt)
        self.args = args

        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__file__)
        self.logger.info(pformat(args))

        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

        if shared is None:
            self.logger.info("Get pretrained model and tokenizer")
            if args.model_checkpoint == "":
                args.model_checkpoint = download_pretrained_model()

            self.tokenizer = OpenAIGPTTokenizer.from_pretrained(
                args.model_checkpoint)
            if self.args.eval_type == "hits@1":
                self.model_checkpoint = OpenAIGPTDoubleHeadsModel.from_pretrained(
                    args.model_checkpoint)
            else:
                self.model_checkpoint = OpenAIGPTLMHeadModel.from_pretrained(
                    args.model_checkpoint)
            self.model_checkpoint.to(args.device)
            self.model_checkpoint.eval()

            self.logger.info("Build BPE prefix dictionary")
            convai_dict = build_dict()
            assert len(convai_dict) == 19304
            self.prefix2words = self.get_prefix2words(convai_dict)
        else:
            self.model_checkpoint = shared['model']
            self.tokenizer = shared['tokenizer']
            self.prefix2words = shared['prefix2words']

        # self.special_tokens_ids = self.tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS)

        self.persona = []
        self.history = []
        self.labels = []

        self.reset()
Example #12
def load(small=False):
    """
    Load OpenAI model and NLP model

    Requires running

    > python -m spacy download en_core_web_lg
    """
    # Load pretrained model and tokenizer
    global model, tokenizer, nlp
    model = OpenAIGPTLMHeadModel.from_pretrained("openai-gpt").eval()
    tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
    if small:
        nlp = spacy.load("en_core_web_sm")
    else:
        nlp = spacy.load("en_core_web_lg")
    return nlp
Example #13
    def __init__(self):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        checkpoint = download_pretrained_model()  # download/resolve once and reuse for both
        self.tokenizer = OpenAIGPTTokenizer.from_pretrained(checkpoint)
        self.model = OpenAIGPTLMHeadModel.from_pretrained(checkpoint)
        self.model.to(self.device)
        self.model.eval()

        with open(join(dirname(realpath(__file__)), "RoboyPersonality.txt"),
                  "r") as input_file:
            roboy_personality = input_file.read().split('\n')
        self.personality = []
        for p in roboy_personality:
            self.personality.append(self.tokenizer.encode(p))
        self.history = []
        self.fix_spaces = re.compile(r'\s*([?!.,]+(?:\s+[?!.,]+)*)\s*')
Example #14
def getParams():
    model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
    model.eval()
    tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
    return model, tokenizer
Example #15
import torch

from pytorch_pretrained_bert import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel

if __name__ == '__main__':
    tokenizer = OpenAIGPTTokenizer.from_pretrained(
        '/home/work/waka/projects/pytorch-pretrained-BERT/samples/LM/Ads/models/epoch1'
    )
    model = OpenAIGPTLMHeadModel.from_pretrained(
        '/home/work/waka/projects/pytorch-pretrained-BERT/samples/LM/Ads/models/epoch1'
    )
    device = torch.device("cuda")
    model.to(device)
    txt = '[BOA] Locked Out? Call Us Now [SEP] Get The Facts You Need To Know [SEP] Fast, Reliable Auto Lockout Services. Call now to Schedule a Service! [EOA]'
    ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(txt))
    model.eval()
    with torch.no_grad():
        ppls = model.forward_ppl(
            torch.tensor([ids]).to(device),
            torch.tensor([len(ids)]).to(device))
    print(ppls.cpu().numpy().item())
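`forward_ppl` is a custom method added in that fork; it is not part of the stock `OpenAIGPTLMHeadModel`. With the standard API, a comparable sentence perplexity could be computed roughly like this (a sketch reusing the same `model`, `ids` and `device`):

import math

with torch.no_grad():
    # Passing lm_labels makes the LM head return the mean token cross-entropy.
    inputs = torch.tensor([ids]).to(device)
    loss = model(inputs, lm_labels=inputs)
print(math.exp(loss.item()))  # perplexity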
Example #16
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "openai-gpt"
print("using model: {}".format(model_name), file=sys.stderr)

split_words = True
if 'no_split' in sys.argv:
    split_words = False
    print("We don't split words", file=sys.stderr)

use_postfix = False
if 'use_postfix' in sys.argv:
    use_postfix = True
    print("We compute probabilities over the entire sentence", file=sys.stderr)

model = OpenAIGPTLMHeadModel.from_pretrained(model_name)
tokenizer = OpenAIGPTTokenizer.from_pretrained(model_name)
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model.eval()
model.to(device)


def get_probs_for_words(sent, w1, w2):
    pre, target, post = sent.split("***")
    if "mask" in target.lower():
        target = ["[MASK]"]
    else:
        target = tokenizer.tokenize(target)
    tokens = tokenizer.tokenize(pre)
    target_idx = len(tokens)
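The function above is truncated in this example. The remainder (not shown) scores the two candidate words `w1` and `w2` at the target position; a rough standalone illustration of that kind of comparison, using the `model`, `tokenizer` and `device` defined above (an assumption about the idea, not the original code):

def candidate_probs(prefix_tokens, candidates):
    # Probability the LM assigns to each candidate's first BPE piece right after the prefix.
    ids = tokenizer.convert_tokens_to_ids(prefix_tokens)
    with torch.no_grad():
        logits = model(torch.tensor([ids]).to(device))      # (1, seq_len, vocab)
    probs = torch.softmax(logits[0, -1, :], dim=-1)
    cand_ids = tokenizer.convert_tokens_to_ids([tokenizer.tokenize(c)[0] for c in candidates])
    return [probs[i].item() for i in cand_ids]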
Example #17
def run():
    parser = ArgumentParser()
    parser.add_argument("--model_type",
                        type=str,
                        default="gpt",
                        help="gpt or gpt2")
    parser.add_argument("--model_checkpoint",
                        type=str,
                        default="",
                        help="Path, url or short name of the model")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--filename",
                        type=str,
                        default="data/instances_dev.pkl",
                        help="File to use for decoding")
    parser.add_argument("--no_sample",
                        action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length",
                        type=int,
                        default=50,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length",
                        type=int,
                        default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--temperature",
                        type=int,
                        default=0.7,
                        help="Sampling softmax temperature")
    parser.add_argument(
        "--top_k",
        type=int,
        default=0,
        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument(
        "--top_p",
        type=float,
        default=0.9,
        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")

    # While using SQUASH in the pipeline mode, prefer using the --key flag
    parser.add_argument(
        "--key",
        type=str,
        default=None,
        help=
        "Override the default settings if the key is set, used in pipeline mode"
    )
    args = parser.parse_args()

    if args.key is not None:
        # Override the filename and top_p default settings if args.key is set
        # This is done when the question generation module is being used in the SQUASH pipeline mode
        args.filename = "squash/temp/%s/input.pkl" % args.key

        with open("squash/temp/%s/metadata.json" % args.key, "r") as f:
            metadata = json.loads(f.read())
        args.top_p = metadata["settings"]["top_p"]

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")

    if args.model_type == 'gpt2':
        tokenizer = GPT2Tokenizer.from_pretrained(args.model_checkpoint)
        model = GPT2LMHeadModel.from_pretrained(args.model_checkpoint)
    else:
        tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_checkpoint)
        model = OpenAIGPTLMHeadModel.from_pretrained(args.model_checkpoint)

    model.to(args.device)
    model.eval()

    data = get_positional_dataset_from_file(tokenizer, args.filename)
    final_output_dict = {"version": "squash-2.0", "data": [{"paragraphs": []}]}
    question_number = 0

    para_cache = {"index": None, "hidden_states": None}

    for inst in tqdm.tqdm(data):
        with torch.no_grad():
            para_index = inst["para_index"]
            # Questions from the same paragraph all appear together
            # We can re-use the paragraph hidden representations for different questions in the same paragraph
            if para_index != para_cache["index"]:
                # Since we have moved to a new paragraph, generate its cache
                para_cache["hidden_states"] = None
                # Ignore the answer and question while building the input
                instance, _ = build_para_only_input_from_segments(
                    inst, tokenizer)
                input_ids = torch.tensor(instance['input_ids'],
                                         device=args.device).unsqueeze(0)
                token_type_ids = torch.tensor(instance['token_type_ids'],
                                              device=args.device).unsqueeze(0)

                # Run a forward pass to generate the para caches
                _, para_cache["hidden_states"] = model(
                    input_ids, token_type_ids=token_type_ids)

            # Sample a question using the paragraph cache
            output = sample_sequence(inst, tokenizer, model, args, para_cache)

        original_paragraph = tokenizer.decode(output['paragraph'])
        generated_question = tokenizer.decode(output['question'],
                                              skip_special_tokens=True)
        original_answer = tokenizer.decode(output['answer'],
                                           skip_special_tokens=True)
        para_index = inst['para_index']
        para_cache["index"] = inst['para_index']

        # verify whether the answer position is correct, since this will be utilized for filtering
        original_ans_position = output["answer_position"]
        if original_paragraph[
                output["answer_position"]:output["answer_position"] +
                len(original_answer)] != original_answer:
            # This should never be executed, only used as a last resort
            logger.info("Answer mismatch!")
            original_ans_position = original_paragraph.index(original_answer)

        # Output in a SQUAD-like format with questions clumped together under their parent paragraph
        if len(final_output_dict["data"][0]["paragraphs"]) > para_index:
            # verify whether the paragraph text is identical
            assert original_paragraph == final_output_dict["data"][0][
                "paragraphs"][para_index]['context']
            # append the question answer pair
            final_output_dict["data"][0]["paragraphs"][para_index][
                'qas'].append({
                    'id':
                    'question_%d' % question_number,
                    'question':
                    generated_question,
                    'answers': [{
                        'text': original_answer,
                        'answer_start': original_ans_position,
                    }],
                    'class':
                    output['class'],
                    'algorithm':
                    output['algorithm'],
                    'is_impossible':
                    False
                })
        else:
            # add a new question to the list of QA pairs
            final_output_dict['data'][0]['paragraphs'].append({
                'context': original_paragraph,
                'qas': [{
                    'id': 'question_%d' % question_number,
                    'question': generated_question,
                    'answers': [{
                        'text': original_answer,
                        'answer_start': original_ans_position,
                    }],
                    'class': output['class'],
                    'algorithm': output['algorithm'],
                    'is_impossible': False
                }]
            })

        question_number += 1

    with open("squash/temp/%s/generated_questions.json" % args.key, "w") as f:
        f.write(json.dumps(final_output_dict))
Example #18
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)

    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load tokenizer and model
    # These loading functions also add new tokens and embeddings, called `special tokens`
    # These new embeddings will be fine-tuned on the RocStories dataset
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name, special_tokens=special_tokens)
    special_tokens_ids = list(tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
    model = OpenAIGPTLMHeadModel.from_pretrained(args.model_name, num_special_tokens=len(special_tokens))
    model.to(device)

    # Load and encode the datasets
    if not args.train_dataset and not args.eval_dataset:
        roc_stories = cached_path(ROCSTORIES_URL)
    def tokenize_and_encode(obj):
        """ Tokenize and encode a nested object """
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        return list(tokenize_and_encode(o) for o in obj)
    logger.info("Encoding dataset...")
    train_dataset = load_rocstories_dataset(args.train_dataset)
    eval_dataset = load_rocstories_dataset(args.eval_dataset)
    datasets = (train_dataset, eval_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    # Compute the max input length for the Transformer
    max_length = model.config.n_positions // 2 - 2
    input_length = max(len(sentence[:max_length]) + max(len(cand1[:max_length]), len(cand2[:max_length]), len(cand3[:max_length]), len(cand4[:max_length])) + 3  \
                           for dataset in encoded_datasets for sentence, cand1, cand2, cand3, cand4, _ in dataset)
    input_length = min(input_length, model.config.n_positions)  # Max size of input for the pre-trained model

    # Prepare inputs tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length, max_length, *special_tokens_ids)
    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[0], tensor_datasets[1]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Prepare optimizer
    if args.do_train:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
            ]
        num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size
        optimizer = OpenAIAdam(optimizer_grouped_parameters,
                               lr=args.learning_rate,
                               warmup=args.warmup_proportion,
                               max_grad_norm=args.max_grad_norm,
                               weight_decay=args.weight_decay,
                               t_total=num_train_optimization_steps)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, lm_labels = batch
                loss = model(input_ids, lm_labels=lm_labels)
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                tr_loss += loss.item()
                exp_average_loss = loss.item() if exp_average_loss is None else 0.7*exp_average_loss+0.3*loss.item()
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, optimizer.get_lr()[0])

    # Save a trained model
    if args.do_train:
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Load a trained model and vocabulary that you have fine-tuned
        model = OpenAIGPTLMHeadModel.from_pretrained(args.output_dir)
        tokenizer = OpenAIGPTTokenizer.from_pretrained(args.output_dir)
        model.to(device)

    if args.do_eval:
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, lm_labels = batch
            with torch.no_grad():
                lm_loss = model(input_ids, lm_labels=lm_labels)
                lm_logits = model(input_ids)

            lm_logits = lm_logits.detach().cpu().numpy()
            lm_labels = lm_labels.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(lm_logits, lm_labels)

            eval_loss += lm_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        train_loss = tr_loss/nb_tr_steps if args.do_train else None
        result = {'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy,
                  'train_loss': train_loss}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
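`accuracy` above is a project helper that is not defined in this example; a plausible stand-in for next-token accuracy over the LM logits (an assumption about its behaviour, not the original definition) might be:

import numpy as np

def accuracy(lm_logits, lm_labels):
    # Next-token prediction: logits at position i predict the label at position i + 1.
    preds = np.argmax(lm_logits[:, :-1], axis=-1)
    labels = lm_labels[:, 1:]
    mask = labels != -1                      # ignore padding positions marked with -1
    return float((preds[mask] == labels[mask]).sum())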
Example #19
def run_model():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--model_name',
        type=str,
        default='openai-gpt',
        help='pretrained model name or path to local checkpoint')
    parser.add_argument('--setting', type=str, default='explain_predict')
    parser.add_argument('--eval_preds_prefix', type=str, default='preds_')
    parser.add_argument('--test_preds_prefix', type=str, default='test_preds_')  # used when writing test predictions
    parser.add_argument("--n_train_print", type=int, default=10)
    parser.add_argument("--n_gen", type=int, default=20)
    parser.add_argument("--batch_size", type=int, default=-1)
    parser.add_argument("--length", type=int, default=-1)
    parser.add_argument("--temperature", type=int, default=1)
    parser.add_argument("--top_k", type=int, default=0)
    parser.add_argument('--unconditional',
                        action='store_true',
                        help='If true, unconditional generation.')
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_test",
                        action='store_true',
                        help="Whether to run eval on the test set.")
    parser.add_argument("--do_eval_train",
                        action='store_true',
                        help="Whether to run eval on the test set.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=10)
    parser.add_argument('--num_eval_print', type=int, default=15)
    parser.add_argument('--train_batch_size', type=int, default=36)
    parser.add_argument('--eval_batch_size', type=int, default=60)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=1e-6)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--data',
                        type=str,
                        default='/stage/examples/commonsenseqa/')

    args = parser.parse_args()
    print(args)

    if args.batch_size == -1:
        args.batch_size = 1

    np.random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval and not args.do_test:
        raise ValueError(
            "At least one of `do_train` or `do_eval`  or do_test must be True."
        )

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    special_tokens = [
        '_start_</w>', 'or</w>', '_answer_</w>', '_classify_</w>', '_end_</w>'
    ]
    tokenizer = OpenAIGPTTokenizer.from_pretrained(
        args.model_name, special_tokens=special_tokens)
    special_tokens_ids = list(
        tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
    model = OpenAIGPTLMHeadModel.from_pretrained(
        args.model_name, num_special_tokens=len(special_tokens))
    model.to(device)

    datasets = parse_cqa(args.data, args.setting)
    numericalized = [
        CommonsenseExample.numericalize_list(
            CommonsenseExample.tokenize_list(d, tokenizer), tokenizer)
        for d in datasets
    ]

    tensor_datasets = pre_process_datasets(numericalized, *special_tokens_ids)

    #    train_tensor_dataset, eval_tensor_dataset, test_tensor_dataset = tensor_datasets[0], tensor_datasets[1], tensor_datasets[2]
    train_sampler, train_data = None, None
    if args.do_train or args.do_eval_train:
        train_tensor_dataset = tensor_datasets[0]
        train_data = TensorDataset(*train_tensor_dataset)
        train_sampler = RandomSampler(train_data)
        if args.do_eval_train:
            train_sampler = SequentialSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

    if args.do_eval:
        if args.do_eval_train:
            eval_data = train_data
            eval_sampler = train_sampler
        else:
            eval_tensor_dataset = tensor_datasets[1]
            eval_data = TensorDataset(*eval_tensor_dataset)
            eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

    if args.do_test:
        test_tensor_dataset = tensor_datasets[-1]
        test_data = TensorDataset(*test_tensor_dataset)
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data,
                                     sampler=test_sampler,
                                     batch_size=args.eval_batch_size)

    # Prepare optimizer
    if args.do_train:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        num_train_optimization_steps = len(
            train_data) * args.num_train_epochs // args.train_batch_size
        optimizer = OpenAIAdam(optimizer_grouped_parameters,
                               lr=args.learning_rate,
                               warmup=args.warmup_proportion,
                               max_grad_norm=args.max_grad_norm,
                               weight_decay=args.weight_decay,
                               t_total=num_train_optimization_steps)

    def trim_unks(x):
        try:
            unk_id = x.index('_end_</w>')
            return x[:unk_id]
        except ValueError:
            return x

    def detokenize(x):
        y = ''.join(trim_unks(x))
        y = y.replace('</w>', ' ')
        y = y.replace(' .', '.')
        y = y.replace(' ,', ',')
        y = y.replace(' ?', '?')
        y = y.replace(' !', '!')
        y = y.replace(' \' ', '\'')
        y = y.replace(' \'re', '\'re')
        y = y.replace(' \'s', '\'s')
        y = y.replace(' n\'t', 'n\'t')
        return y

    def detok_batch(x):
        if not isinstance(x, list):
            x = x.tolist()
        return [
            detokenize(
                tokenizer.convert_ids_to_tokens([z for z in y if z >= 0]))
            for y in x
        ]

    if args.do_train:
        best_eval = 0
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in range(int(args.num_train_epochs)):
            tr_loss, train_ppl, n_train_examples = 0, 0, 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            train_pred_strs, train_lab_strs = [], []
            for step, batch in enumerate(tqdm_bar):
                inputs = batch[0].to(device)
                labels = batch[1].to(device)
                loss = model(inputs, lm_labels=labels)
                train_ppl += loss.item() * inputs.size(0)
                n_train_examples += inputs.size(0)
                loss.backward()
                optimizer.step()
                tr_loss += loss.item()
                exp_average_loss = loss.item() if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item()
                nb_tr_steps += 1
                if args.n_train_print > 0:
                    with torch.no_grad():
                        preds = sample(model, batch[2], 10, device)

                    pred_str = detok_batch(preds)
                    label_str = detok_batch(labels)
                    train_lab_strs.extend(label_str)
                    train_pred_strs.extend(pred_str)
                    input_str = detok_batch(inputs)
                    for print_idx in range(
                            min(args.n_train_print, inputs.size(0))):
                        print('INPT: ', input_str[print_idx])
                        print('GOLD: ', label_str[print_idx])
                        print('PRED: ', pred_str[print_idx])
                        print()

            train_bleu = None
            if args.n_train_print > 0:
                train_bleu = computeBLEU(train_pred_strs,
                                         [[x] for x in train_lab_strs])
                train_ppl = math.exp(train_ppl / n_train_examples)

            if args.do_eval:
                model.eval()
                eval_loss, eval_em, eval_ppl = 0, 0, 0
                nb_eval_steps, nb_eval_examples = 0, 0
                label_strs, prediction_strs = [], []
                n_words = 0
                for batch in eval_dataloader:
                    inputs = batch[0].to(device)
                    labels = batch[1].to(device)

                    with torch.no_grad():
                        loss = model(inputs, lm_labels=labels)
                        preds = sample(model, batch[2], args.n_gen, device)

                    eval_loss += loss.item()
                    eval_ppl += loss.item() * inputs.size(0)
                    nb_eval_examples += inputs.size(0)
                    nb_eval_steps += 1
                    pred_str = detok_batch(preds)
                    label_str = detok_batch(labels)
                    label_strs.extend(label_str)
                    prediction_strs.extend(pred_str)
                    input_str = detok_batch(inputs)
                    eval_em += sum(
                        [x == y for x, y in zip(pred_str, label_str)])
                    for print_idx in range(
                            min(inputs.size(0), args.num_eval_print)):
                        print('INPT: ', input_str[print_idx])
                        print('GOLD: ', label_str[print_idx])
                        print('PRED: ', pred_str[print_idx])
                        print()

                eval_bleu = computeBLEU(prediction_strs,
                                        [[x] for x in label_strs])
                eval_ppl = math.exp(eval_ppl / nb_eval_examples)
                eval_em = eval_em / nb_eval_examples
                eval_loss = eval_loss / nb_eval_steps
                train_loss = tr_loss / nb_tr_steps if args.do_train else None
                result = {
                    'eval_loss': eval_loss,
                    'eval_em': eval_em,
                    'eval_bleu': eval_bleu,
                    'eval_ppl': eval_ppl,
                    'train_loss': train_loss,
                    'train_bleu': train_bleu,
                    'train_ppl': train_ppl
                }

                output_eval_file = os.path.join(args.output_dir,
                                                "eval_results.txt")
                with open(output_eval_file, "a") as writer:
                    for key in sorted(result.keys()):
                        logger.info("  %s = %s", key, str(result[key]))
                        writer.write("%s = %s\n" % (key, str(result[key])))

                if eval_bleu > best_eval:
                    best_eval = eval_bleu

                    # Save a trained model
                    # Only save the model itself
                    model_to_save = model.module if hasattr(model, 'module') else model
                    output_model_file = os.path.join(args.output_dir,
                                                     "pytorch_model.bin")
                    config = model.config
                    torch.save(model_to_save.state_dict(), output_model_file)

    if args.do_eval:
        # Load a trained model that you have fine-tuned
        output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
        model_state_dict = torch.load(output_model_file)
        model = OpenAIGPTLMHeadModel(model.config)
        model.load_state_dict(model_state_dict)
        # uncomment to try out the default, not fine-tuned model
        #        model = OpenAIGPTLMHeadModel.from_pretrained(args.model_name, num_special_tokens=len(special_tokens), cache_dir=os.path.dirname(args.data))
        model.to(device)
        model.eval()
        eval_loss, eval_em, eval_ppl = 0, 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        label_strs, prediction_strs = [], []
        n_words = 0
        for batch in eval_dataloader:
            inputs = batch[0].to(device)
            labels = batch[1].to(device)

            with torch.no_grad():
                loss = model(inputs, lm_labels=labels)
                preds = sample(model, batch[2], args.n_gen, device)

            eval_loss += loss.item()
            eval_ppl += loss.item() * inputs.size(0)
            nb_eval_examples += inputs.size(0)
            nb_eval_steps += 1
            pred_str = detok_batch(preds)
            label_str = detok_batch(labels)
            label_strs.extend(label_str)
            prediction_strs.extend(pred_str)
            input_str = detok_batch(inputs)
            eval_em += sum([x == y for x, y in zip(pred_str, label_str)])
            for print_idx in range(min(inputs.size(0), args.num_eval_print)):
                print('INPT: ', input_str[print_idx])
                print('GOLD: ', label_str[print_idx])
                print('PRED: ', pred_str[print_idx])
                print()

        eval_bleu = computeBLEU(prediction_strs, [[x] for x in label_strs])
        eval_ppl = math.exp(eval_ppl / nb_eval_examples)
        eval_em = eval_em / nb_eval_examples
        eval_loss = eval_loss / nb_eval_steps
        train_loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {
            'eval_loss': eval_loss,
            'eval_em': eval_em,
            'eval_bleu': eval_bleu,
            'eval_ppl': eval_ppl,
            'train_loss': train_loss
        }

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "a") as writer:
            logger.info("***** Best Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        output_preds_file = os.path.join(
            args.output_dir, f"{args.eval_preds_prefix}_{args.setting}.txt")
        with open(output_preds_file, 'w') as writer:
            logger.info("Writing predictions")
            for p in prediction_strs:
                writer.write(p + '\n')

    if args.do_test:
        # Load a trained model that you have fine-tuned
        output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
        model_state_dict = torch.load(output_model_file)
        model = OpenAIGPTLMHeadModel(model.config)
        model.load_state_dict(model_state_dict)
        model.to(device)
        model.eval()
        eval_loss, eval_em, eval_ppl = 0, 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        label_strs, prediction_strs = [], []
        n_words = 0
        for batch in test_dataloader:
            inputs = batch[0].to(device)

            with torch.no_grad():
                preds = sample(model, batch[1], args.n_gen, device)

            pred_str = detok_batch(preds)
            prediction_strs.extend(pred_str)

        output_preds_file = os.path.join(
            args.output_dir, f"{args.test_preds_prefix}_{args.setting}.txt")
        with open(output_preds_file, 'w') as writer:
            logger.info("Writing predictions")
            for p in prediction_strs:
                writer.write(f'"{p.strip()}"\n')
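`computeBLEU`, `sample`, `parse_cqa` and `pre_process_datasets` come from the surrounding project and are not shown. A minimal stand-in for `computeBLEU` using NLTK, with the same call signature as used above (an illustrative sketch, not the project's implementation):

from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

def computeBLEU(hypotheses, references):
    # hypotheses: list of predicted strings; references: list of lists of reference strings.
    refs = [[r.split() for r in ref_group] for ref_group in references]
    hyps = [h.split() for h in hypotheses]
    return 100 * corpus_bleu(refs, hyps, smoothing_function=SmoothingFunction().method1)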
Example #20
        os.makedirs(args.output_dir)

    # Load tokenizer and model
    # These loading functions also add new tokens and embeddings, called `special tokens`
    # These new embeddings will be fine-tuned on the RocStories dataset
    if args.single_part:
        special_tokens = ['<BOA>', '<EOA>']
    else:
        special_tokens = ['<BOA>', '<SEP>', '<EOA>']

    model_name = args.model_name
    tokenizer = OpenAIGPTTokenizer.from_pretrained(
        args.model_name, special_tokens=special_tokens)
    special_tokens_ids = list(
        tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
    model = OpenAIGPTLMHeadModel.from_pretrained(
        args.model_name, num_special_tokens=len(special_tokens))
    model.to(device)
    original_model = model
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    logger.info("Encoding dataset...")
    train_dataset = json_load(args.train_dataset)
    eval_dataset = json_load(args.eval_dataset)

    tasks = chunk(train_dataset, 20)
    with multiprocessing.Pool(processes=20) as pool:
        if args.single_part:
            sub_results = pool.map(tokenize_and_encode_single_part, tasks)
        else:
            sub_results = pool.map(tokenize_and_encode, tasks)
Example #21
text = "Who was Jim Henson ? Jim Henson was a puppeteer"
tokenized_text = tokenizer.tokenize(text)
print(tokenized_text)

# Convert token to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])

# Load pre-trained model (weights)
model = OpenAIGPTModel.from_pretrained('openai-gpt')
model.eval()

# Predict hidden states features for each layer
with torch.no_grad():
    hidden_states = model(tokens_tensor)

# Load pre-trained model (weights)
model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
model.eval()

# If you have a GPU, put everything on cuda
if torch.cuda.is_available():
    tokens_tensor = tokens_tensor.to('cuda')
    model.to('cuda')

# Predict all tokens
with torch.no_grad():
    predictions = model(tokens_tensor)

# get the predicted last token
predicted_index = torch.argmax(predictions[0, -1, :]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
print(predicted_token)
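A small follow-up sketch (not part of the original snippet): instead of only the argmax, inspect the top few candidates with a softmax over the final position's logits.

probs = torch.softmax(predictions[0, -1, :], dim=-1)
top_probs, top_ids = probs.topk(5)
for p, i in zip(top_probs.tolist(), top_ids.tolist()):
    print(tokenizer.convert_ids_to_tokens([i])[0], round(p, 4))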
Example #22
import os

import torch

from pytorch_pretrained_bert import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel
from pytorch_pretrained_bert.modeling_openai import OpenAIGPTConfig, WEIGHTS_NAME, CONFIG_NAME

model_path = 'openai-gpt'
output_dir = './language-quality-subreward/gpt_output'
WEIGHTS_NAME = 'pytorch_model.bin'
special_tokens = ['_start_', '_delimiter_', '_classify_']
# Load pre-trained model (weights)
with torch.no_grad():
    output_config_file = os.path.join(output_dir, CONFIG_NAME)
    config = OpenAIGPTConfig(output_config_file)

    output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
    model_state_dict = torch.load(output_model_file, map_location='cpu')
    model = OpenAIGPTLMHeadModel(config)
    model.load_state_dict(model_state_dict)

    # model = OpenAIGPTLMHeadModel.from_pretrained(model_path)
    # model.load_state_dict(torch.load(output_model_file, map_location='cpu'))
    model.eval()
    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = OpenAIGPTTokenizer.from_pretrained(model_path, cache_dir='./tmp/', special_tokens=special_tokens)

'''
model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
model.eval()
# Load pre-trained model tokenizer (vocabulary)
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
'''
Example #23
def main():
    # Parse the arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name',
                        type=str,
                        default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=1)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)
    parser.add_argument('--max_seq_length', type=int, default=110)

    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Set the seed for random, numpy, PyTorch
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # These loading functions also add new tokens and embeddings, called `special tokens`
    # These new embeddings will be fine-tuned
    special_tokens = ['<POS>', '<NEG>', '<CON_START>', '<START>', '<END>']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(
        args.model_name, special_tokens=special_tokens)
    start_token_id = tokenizer.convert_tokens_to_ids(['<START>'])[0]
    model = OpenAIGPTLMHeadModel.from_pretrained(
        args.model_name, num_special_tokens=len(special_tokens))
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Load and encode dataset
    def tokenize_and_encode(file_path):
        '''
        This method tokenizes the input data and encodes it using the OpenAIGPTTokenizer
        :param file_path: Path of the input file, dtype: str
        :return: encoded dataset  dtype: list
        '''
        with open(file_path, 'r') as in_fp:
            lines = in_fp.read().splitlines()

        tokenized_dataset = lines
        for i, line in enumerate(tqdm(lines)):
            token = tokenizer.tokenize(line)[:512]
            tokenized_dataset[i] = tokenizer.convert_tokens_to_ids(token)
        return tokenized_dataset

    logger.info("Encoding dataset...")
    train_dataset = tokenize_and_encode(args.train_dataset)
    eval_dataset = tokenize_and_encode(args.eval_dataset)
    print("Training samples = {}".format(len(train_dataset)))
    print("Validation samples = {}".format(len(eval_dataset)))
    print("Example = {}".format(train_dataset[0]))
    time.sleep(2)
    # Compute the max input length for the Transformer
    train_dataset = [
        x for x in train_dataset
        if len(x) <= args.max_seq_length and start_token_id in x
    ]  # Keep only sentences no longer than max_seq_length that contain the <START> token
    eval_dataset = [
        x for x in eval_dataset
        if len(x) <= args.max_seq_length and start_token_id in x
    ]
    input_length = max(max(len(t) for t in train_dataset),
                       max(len(q) for q in eval_dataset))
    if n_gpu > 1:
        input_length = min(input_length, model.module.config.n_positions)
    else:
        input_length = min(input_length, model.config.n_positions
                           )  # Max size of input for the pre-trained model
    print("Input Length = {}".format(input_length))

    def pre_process_dataset(encoded_dataset, input_length, start_token_id):
        """
        This method is to create torch tensor of input ids and lm labels
        :param encoded_dataset: Input dataset, dtype: list
        :param input_length: Maximum length of sentence from training and eval dataset, dtype: int
        :param start_token_id: id of the '<START>' token, dtype: int
        :return: tuple (input_ids, lm_labels) of torch.LongTensors, each of shape [len(encoded_dataset), input_length]
        """

        n_batch = len(encoded_dataset)
        input_ids = np.zeros(shape=(n_batch, input_length), dtype=np.int64)
        lm_labels = np.full(shape=(n_batch, input_length),
                            fill_value=-1,
                            dtype=np.int64)

        for i, tokens in enumerate(encoded_dataset):
            try:
                start_id_index = tokens.index(start_token_id)
                input_ids[i, :len(tokens)] = tokens
                # LM loss is calculated only for tokens after the <START> token in the sentence
                lm_labels[i, start_id_index:len(tokens) - 1] = tokens[start_id_index + 1:]
            except ValueError:
                print("Index {} doesn't have start token".format(i))

        input_ids = torch.tensor(input_ids)
        lm_labels = torch.tensor(lm_labels)
        tensor_dataset = (input_ids, lm_labels)
        #tensor_dataset.append(torch.tensor(d) for d in all_inputs)

        return tensor_dataset

    # Prepare input tensors and dataloaders
    train_tensor_dataset = pre_process_dataset(train_dataset,
                                               input_length,
                                               start_token_id=start_token_id)
    eval_tensor_dataset = pre_process_dataset(eval_dataset,
                                              input_length,
                                              start_token_id=start_token_id)

    print("Training Example Input ids= {}".format(train_tensor_dataset[0][0]))
    print("Training Example Language Modeling ids = {}".format(
        train_tensor_dataset[1][0]))
    time.sleep(10)
    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = RandomSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    num_train_optimization_steps = len(
        train_data) * args.num_train_epochs // args.train_batch_size
    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                           lr=args.learning_rate,
                           warmup=args.warmup_proportion,
                           max_grad_norm=args.max_grad_norm,
                           weight_decay=args.weight_decay,
                           t_total=num_train_optimization_steps)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, lm_labels = batch
                loss = model(input_ids, lm_labels=lm_labels)
                if n_gpu > 1:
                    loss.mean().backward()
                else:
                    loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                if n_gpu > 1:
                    tmp_loss = loss.mean().item()
                else:
                    tmp_loss = loss.item()
                exp_average_loss = tmp_loss if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * tmp_loss
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(
                    exp_average_loss,
                    optimizer.get_lr()[0])

            model_to_save = model.module if hasattr(
                model, 'module') else model  # Only save the model itself
            output_model_file = os.path.join(
                args.output_dir,
                "pytorch_model_zero_grad_{}.bin".format(epoch + 1))
            config = model.module.config if hasattr(model,
                                                    'module') else model.config
            torch.save(model_to_save.state_dict(), output_model_file)

            model_state_dict = torch.load(output_model_file)
            model = OpenAIGPTLMHeadModel(config)
            model.load_state_dict(model_state_dict)
            model.to(device)
            if n_gpu > 1:
                model = torch.nn.DataParallel(model)

    if args.do_eval:
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, lm_labels = batch
            with torch.no_grad():
                lm_loss = model(input_ids, lm_labels=lm_labels)

            eval_loss += lm_loss.mean().item()

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        train_loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {'eval_loss': eval_loss, 'train_loss': train_loss}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
Example #24
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--input_file", default=None, type=str, required=True)
    parser.add_argument("--output_file", default=None, type=str, required=True)
    parser.add_argument("--abbr_file", default=None, type=str, required=True)
    parser.add_argument("--freq_file", default=None, type=str, required=False)
    parser.add_argument('--model_name', type=str, default='openai-gpt',
                        help='pretrained model name')

    ## Other parameters
    parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.")
    parser.add_argument("--layers", default="-1,-2,-3,-4", type=str)
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. Sequences longer "
                            "than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--batch_size", default=1, type=int, help="Batch size for predictions.")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help = "local_rank for distributed training on gpus")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {} distributed training: {}".format(device, n_gpu, bool(args.local_rank != -1)))

    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name)

    examples,tlist, alist = read_examples(args.input_file, args.abbr_file, args.freq_file, tokenizer)

    features = convert_examples_to_features(
        examples=examples, seq_length=args.max_seq_length, tokenizer=tokenizer)

    unique_id_to_feature = {}
    for feature in features:
        unique_id_to_feature[feature.unique_id] = feature

    model = OpenAIGPTLMHeadModel.from_pretrained(args.model_name)
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_target_ids = torch.tensor([f.target_ids for f in features], dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)

    eval_data = TensorDataset(all_input_ids, all_input_mask, all_target_ids, all_example_index)
    if args.local_rank == -1:
        eval_sampler = SequentialSampler(eval_data)
    else:
        eval_sampler = DistributedSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)

    model.eval()
    with open(args.output_file, "w", encoding='utf-8') as writer:
        for input_ids, input_mask, target_ids, example_indices in eval_dataloader:
            input_ids = input_ids.to(device)
            target_ids=target_ids.to(device)
            input_mask = input_mask.to(device)
            with torch.no_grad():
                loss = model(input_ids, lm_labels=target_ids)
                print(example_indices,loss)
            for b, example_index in enumerate(example_indices):
                feature = features[example_index.item()]
                unique_id = int(feature.unique_id)
                raw_id=int(feature.raw_id)
                # feature = unique_id_to_feature[unique_id]
                output_json = collections.OrderedDict()
                output_json["index"] = unique_id
                output_json['sent_id']=raw_id
                output_json['text']=tlist[unique_id]
                output_json['expansion']=alist[unique_id]
                output_json["loss"] = float(loss)
                writer.write(json.dumps(output_json) + "\n")
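
# Follow-up sketch (not part of the original): the JSON lines written above hold one LM
# loss per candidate, so one way they might be consumed is to group them by 'sent_id'
# and keep the lowest-loss expansion. The path is whatever was passed as --output_file.
def pick_best_expansions(output_file):
    import json
    best = {}  # sent_id -> (loss, expansion)
    with open(output_file, encoding='utf-8') as f:
        for line in f:
            rec = json.loads(line)
            if rec['sent_id'] not in best or rec['loss'] < best[rec['sent_id']][0]:
                best[rec['sent_id']] = (rec['loss'], rec['expansion'])
    return {sid: expansion for sid, (loss, expansion) in best.items()}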
Example #25
 def __init__(self, pad_idx):
     self.transformer_module = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt',
                                                                    cache_dir='/apdcephfs/share_916081/chencxu/pegg/data/models/gpt_models')
     self.transformer_module.eval()
     self.pad_idx = pad_idx
Example #26
class Rewarder():
    def __init__(self, args, tokenizer):

        self.args = args

        self.nli_tokenizer = BertTokenizer.from_pretrained(
            args.bert_model,
            do_lower_case=args.do_lower_case,
            cache_dir='.pytorch_pretrained_bert')
        self.output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        self.output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        self.nli_config = BertConfig(self.output_config_file)
        self.nli_model = BertForSequenceClassification(self.nli_config,
                                                       num_labels=3)
        self.nli_model.load_state_dict(
            torch.load(self.output_model_file,
                       map_location=torch.device('cpu')))
        self.nli_model.to(args.device)
        self.nli_model.eval()

        if args.nli_uu_reward or args.nli_allres_reward:
            uu_output_config_file = os.path.join(args.uu_output_dir,
                                                 CONFIG_NAME)
            uu_output_model_file = os.path.join(args.uu_output_dir,
                                                WEIGHTS_NAME)
            self.uu_nli_config = BertConfig(uu_output_config_file)
            self.uu_nli_model = BertForSequenceClassification(
                self.uu_nli_config, num_labels=3)
            self.uu_nli_model.load_state_dict(
                torch.load(uu_output_model_file,
                           map_location=torch.device('cpu')))
            self.uu_nli_model.to(args.device)
            self.uu_nli_model.eval()

        bert_emb_modelpath = "bert-base-uncased"
        self.bert_emb_tokenizer = BertTokenizer.from_pretrained(
            bert_emb_modelpath, cache_dir='.pytorch_pretrained_bert')
        self.bert_emb_model = BertModel.from_pretrained(
            bert_emb_modelpath,
            cache_dir='.pytorch_pretrained_bert').to(args.device)
        self.bert_emb_model.eval()

        self.tokenizer = tokenizer

        if args.lm_reward:
            lm_model_path = 'openai-gpt'
            lm_output_dir = 'language-quality-subreward/gpt_output'
            lm_special_tokens = ['_start_', '_delimiter_', '_classify_']
            # Load pre-trained model (weights)
            with torch.no_grad():
                lm_output_config_file = os.path.join(lm_output_dir,
                                                     CONFIG_NAME)
                lm_config = OpenAIGPTConfig(lm_output_config_file)

                lm_output_model_file = os.path.join(lm_output_dir,
                                                    WEIGHTS_NAME)
                #lm_model_state_dict = torch.load(lm_output_model_file)
                lm_model_state_dict = torch.load(lm_output_model_file,
                                                 map_location='cpu')
                self.lm_model = OpenAIGPTLMHeadModel(lm_config)
                self.lm_model.load_state_dict(lm_model_state_dict)

                # Load pre-trained model tokenizer (vocabulary)
                self.lm_tokenizer = OpenAIGPTTokenizer.from_pretrained(
                    lm_model_path,
                    special_tokens=lm_special_tokens,
                    cache_dir='.pytorch_pretrained_bert')

            self.special_tokens_ids = list(
                self.lm_tokenizer.convert_tokens_to_ids(token)
                for token in lm_special_tokens)
            self.lm_model.to(args.device)
            self.lm_model.eval()

    def persona_rewarder(self, response, rl_train_personas_org):
        # cancat all the personas
        '''
        personas_org_chain = [''.join(rl_train_personas_org)]
        reward = nli_engine(response, personas_org_chain, nli_tokenizer, nli_model)[0]
        '''
        scores = nli_engine(response, rl_train_personas_org,
                            self.nli_tokenizer, self.nli_model)
        current_persona_reward_0 = (
            (sum(scores) / len(rl_train_personas_org)) + 2) / 3
        current_persona_reward = current_persona_reward_0 * self.args.nli_weight
        logger.info('persona_reward before/after weighting = %f/%f' %
                    (current_persona_reward_0, current_persona_reward))
        return current_persona_reward

    def nli_allres_rewarder(self, response, history):
        # history_chain = list(chain(*history))
        # history_text = tokenizer.decode(history_chain, skip_special_tokens=True, clean_up_tokenization_spaces=False)
        pre_responses = []
        for i in range(-len(history), 0):
            if i % 2 == 0:
                current_text = self.tokenizer.decode(
                    history[i],
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=False)
                pre_responses.append(current_text)
        response_scores = nli_engine(response, pre_responses,
                                     self.nli_tokenizer, self.nli_model)
        if response_scores == []:
            current_response_reward = 0.5  # TODO: test if single allres will work
        else:
            current_response_reward = sum(response_scores) / len(
                response_scores)
        current_response_reward_0 = (current_response_reward + 2) / 3
        current_response_reward = current_response_reward_0 * self.args.nli_allres_weight
        logger.info('allres_reward before/after weighting = %f/%f' %
                    (current_response_reward_0, current_response_reward))
        return current_response_reward

    def cos_sim_bert_rewarder(self, response, history):
        pre_utt = history[-1]
        pre_utt_text = self.tokenizer.decode(
            pre_utt,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False)
        pre_utt_vec = bert_vector(pre_utt_text, self.bert_emb_tokenizer,
                                  self.bert_emb_model, self.args)
        response_vec = bert_vector(response, self.bert_emb_tokenizer,
                                   self.bert_emb_model, self.args)
        cos_sim_bert_score = cosine_similarity(pre_utt_vec.reshape(1, -1),
                                               response_vec.reshape(1,
                                                                    -1))[0][0]
        current_cos_sim_bert_reward = cos_sim_bert_score * self.args.cos_sim_bert_weight
        logger.info('cos_sim_bert before/after weighting = %f/%f' %
                    (cos_sim_bert_score, current_cos_sim_bert_reward))
        return current_cos_sim_bert_reward

    def intern_rep_rewarder(self, response):
        # response = 'i\'m 16 years years years years years old bye bye.'
        # intrep_word
        response_tok = response.split()
        intrep_1gram = intrep_frac(response_tok)
        # intrep_2gram
        response_tok_2gram = get_ngrams(response, 2)
        intrep_2gram = intrep_frac(response_tok_2gram)
        # intrep_3gram
        response_tok_3gram = get_ngrams(response, 3)
        intrep_3gram = intrep_frac(response_tok_3gram)
        current_intern_rep_reward = (
            1 - intrep_1gram
        ) * self.args.intern_rep_weight  # TODO: How to design this reward?
        logger.info('intern_rep before/after weighting = %f/%f' %
                    ((1 - intrep_1gram), current_intern_rep_reward))
        return current_intern_rep_reward

    def extern_rep_rewarder(self, response, history):
        pre_responses = []
        for i in range(-len(history), 0):
            if i % 2 == 0:
                current_text = self.tokenizer.decode(
                    history[i],
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=False)
                pre_responses.append(current_text)

        # extrep_word
        response_tok = response.split()
        prev_tok = [s.split() for s in pre_responses]  # list of list of ints
        prev_tok = list(set(flatten(prev_tok)))  # list of ints, no duplicates
        extrep_1gram = extrep_frac(response_tok, prev_tok)
        # extrep_2gram
        response_tok_2gram = get_ngrams(response, 2)
        prev_2grams = [get_ngrams(prev, 2)
                       for prev in pre_responses]  # list of list of strings
        prev_2grams = list(set(
            flatten(prev_2grams)))  # list of strings, no duplicates
        extrep_2gram = extrep_frac(response_tok_2gram, prev_2grams)
        # extrep_3gram
        response_tok_3gram = get_ngrams(response, 3)
        prev_3grams = [get_ngrams(prev, 3)
                       for prev in pre_responses]  # list of list of strings
        prev_3grams = list(set(
            flatten(prev_3grams)))  # list of strings, no duplicates
        extrep_3gram = extrep_frac(response_tok_3gram, prev_3grams)

        current_extern_rep_reward = 0  # TODO: How to design this reward?
        logger.info('extern_rep before/after weighting = %f/%f' %
                    (current_extern_rep_reward, current_extern_rep_reward))
        return current_extern_rep_reward

    def lm_rewarder(self, response):
        lm_tokenize_input = self.lm_tokenizer.tokenize(response)
        # lm_tensor_input = torch.tensor([lm_tokenizer.convert_tokens_to_ids(lm_tokenize_input)]).to(args.device)
        lm_tensor_input = torch.tensor(
            [[self.special_tokens_ids[0]] +
             self.lm_tokenizer.convert_tokens_to_ids(lm_tokenize_input) +
             [self.special_tokens_ids[-1]]]).to(self.args.device)
        lm_loss = self.lm_model(lm_tensor_input, lm_labels=lm_tensor_input)
        # lm_ppl = math.exp(lm_loss.item())
        nll = -lm_loss.item()  # average log-likelihood per token (negative of the LM loss)
        if nll < -4:
            nll = -4
        current_lm_score = (nll + 4) / 4  # map the clipped value from [-4, 0] into [0, 1]
        current_lm_reward = current_lm_score * self.args.lm_weight  # TODO: 1/lm_ppl?
        logger.info('lm_reward before/after weighting = %f/%f' %
                    (current_lm_score, current_lm_reward))
        return current_lm_reward

    def qback_rewarder(self, response):
        response_tok = response.split()
        num_in_list = len([w for w in response_tok if w in QN_WORDS])
        current_qback_reward = (num_in_list /
                                len(response_tok)) * self.args.qback_weight
        logger.info('qback_reward before/after weighting = %f/%f' %
                    ((num_in_list / len(response_tok)), current_qback_reward))
        return current_qback_reward

    def get_reward(self, response, rl_train_personas_org, history):

        R = {
            'reward': 0,
            'persona_reward': 0,
            'response_reward': 0,
            'uu_reward': 0,
            'cos_sim_bert_reward': 0,
            'intern_rep_reward': 0,
            'extern_rep_reward': 0,
            'lm_reward': 0,
            'qback_reward': 0,
            'f1_reward': 0,
            'bleu_reward': 0
        }

        if self.args.nli_reward:
            R['persona_reward'] = self.persona_rewarder(
                response, rl_train_personas_org)

        if self.args.nli_allres_reward:
            R['response_reward'] = self.nli_allres_rewarder(response, history)

        if self.args.nli_uu_reward:
            R['uu_reward'] = self.nli_uu_rewarder(response, history)

        if self.args.cos_sim_bert_reward:
            R['cos_sim_bert_reward'] = self.cos_sim_bert_rewarder(
                response, history)

        if self.args.intern_rep_reward:
            R['intern_rep_reward'] = self.intern_rep_rewarder(response)

        if self.args.extern_rep_reward:
            R['extern_rep_reward'] = self.extern_rep_rewarder(
                response, history)

        if self.args.lm_reward:
            R['lm_reward'] = self.lm_rewarder(response)

        if self.args.qback_reward:
            R['qback_reward'] = self.qback_rewarder(response)

        R['reward'] = R['persona_reward'] + \
             R['response_reward'] + \
             R['uu_reward'] + \
             R['cos_sim_bert_reward']+ \
             R['intern_rep_reward'] + \
             R['extern_rep_reward'] + \
             R['lm_reward'] + \
             R['qback_reward'] + \
             R['f1_reward'] + \
             R['bleu_reward']

        return R
Example #27
import torch, os
from pytorch_pretrained_bert import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel
from tqdm import tqdm
import numpy as np
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig, WEIGHTS_NAME, CONFIG_NAME
#from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.optimization import BertAdam
from bertviz.bertviz.pytorch_pretrained_bert import BertModel, BertTokenizer

special_tokens = ['<POS>', '<NEG>', '<CON_START>', '<START>',
                  '<END>']  # Set the special tokens
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt',
                                               special_tokens=special_tokens)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = OpenAIGPTLMHeadModel.from_pretrained(
    'openai-gpt', num_special_tokens=len(special_tokens))
path = os.path.join(os.getcwd(),
                    "./pytorch_model_zero_grad_1.bin")  ## Model Path
model_state_dict = torch.load(path, map_location=device)
model.load_state_dict(model_state_dict)
model.to(device)
model.eval()
bert_classifier_dir = "./bert_classifier/"
model_cls = BertForSequenceClassification.from_pretrained(bert_classifier_dir,
                                                          num_labels=2)
tokenizer_cls = BertTokenizer.from_pretrained('bert-base-uncased',
                                              do_lower_case=True)
#model_cls.to(device)
#model_cls.eval()
max_seq_len = 70
sm = torch.nn.Softmax(dim=-1)
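
# Decoding/scoring sketch (not part of the original snippet): greedy generation from a
# '<NEG> ... <CON_START> ... <START>' style prompt with the fine-tuned GPT model above,
# then class probabilities from the BERT classifier. The prompt layout and the use of
# max_seq_len for truncation are assumptions about how the checkpoint was fine-tuned.
def greedy_generate(condition_text, max_new_tokens=50):
    # Append special-token ids directly so the BPE tokenizer does not split them.
    neg_id, con_id, start_id, end_id = [
        tokenizer.convert_tokens_to_ids(t)
        for t in ('<NEG>', '<CON_START>', '<START>', '<END>')]
    ids = [neg_id, con_id] + tokenizer.convert_tokens_to_ids(
        tokenizer.tokenize(condition_text)) + [start_id]
    prompt_len = len(ids)
    with torch.no_grad():
        for _ in range(max_new_tokens):
            lm_logits = model(torch.tensor([ids], device=device))  # [1, len(ids), vocab]
            next_id = int(torch.argmax(lm_logits[0, -1]))
            if next_id == end_id:
                break
            ids.append(next_id)
    return tokenizer.decode(ids[prompt_len:])

def style_probs(sentence):
    # Class probabilities from the (CPU) BERT classifier loaded above.
    tokens = ['[CLS]'] + tokenizer_cls.tokenize(sentence)[:max_seq_len - 2] + ['[SEP]']
    input_ids = torch.tensor([tokenizer_cls.convert_tokens_to_ids(tokens)])
    with torch.no_grad():
        logits = model_cls(input_ids)
    return sm(logits)[0].tolist()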
Example #28
    def __init__(self, args, tokenizer):

        self.args = args

        self.nli_tokenizer = BertTokenizer.from_pretrained(
            args.bert_model,
            do_lower_case=args.do_lower_case,
            cache_dir='.pytorch_pretrained_bert')
        self.output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        self.output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        self.nli_config = BertConfig(self.output_config_file)
        self.nli_model = BertForSequenceClassification(self.nli_config,
                                                       num_labels=3)
        self.nli_model.load_state_dict(
            torch.load(self.output_model_file,
                       map_location=torch.device('cpu')))
        self.nli_model.to(args.device)
        self.nli_model.eval()

        if args.nli_uu_reward or args.nli_allres_reward:
            uu_output_config_file = os.path.join(args.uu_output_dir,
                                                 CONFIG_NAME)
            uu_output_model_file = os.path.join(args.uu_output_dir,
                                                WEIGHTS_NAME)
            self.uu_nli_config = BertConfig(uu_output_config_file)
            self.uu_nli_model = BertForSequenceClassification(
                self.uu_nli_config, num_labels=3)
            self.uu_nli_model.load_state_dict(
                torch.load(uu_output_model_file,
                           map_location=torch.device('cpu')))
            self.uu_nli_model.to(args.device)
            self.uu_nli_model.eval()

        bert_emb_modelpath = "bert-base-uncased"
        self.bert_emb_tokenizer = BertTokenizer.from_pretrained(
            bert_emb_modelpath, cache_dir='.pytorch_pretrained_bert')
        self.bert_emb_model = BertModel.from_pretrained(
            bert_emb_modelpath,
            cache_dir='.pytorch_pretrained_bert').to(args.device)
        self.bert_emb_model.eval()

        self.tokenizer = tokenizer

        if args.lm_reward:
            lm_model_path = 'openai-gpt'
            lm_output_dir = 'language-quality-subreward/gpt_output'
            lm_special_tokens = ['_start_', '_delimiter_', '_classify_']
            # Load pre-trained model (weights)
            with torch.no_grad():
                lm_output_config_file = os.path.join(lm_output_dir,
                                                     CONFIG_NAME)
                lm_config = OpenAIGPTConfig(lm_output_config_file)

                lm_output_model_file = os.path.join(lm_output_dir,
                                                    WEIGHTS_NAME)
                #lm_model_state_dict = torch.load(lm_output_model_file)
                lm_model_state_dict = torch.load(lm_output_model_file,
                                                 map_location='cpu')
                self.lm_model = OpenAIGPTLMHeadModel(lm_config)
                self.lm_model.load_state_dict(lm_model_state_dict)

                # Load pre-trained model tokenizer (vocabulary)
                self.lm_tokenizer = OpenAIGPTTokenizer.from_pretrained(
                    lm_model_path,
                    special_tokens=lm_special_tokens,
                    cache_dir='.pytorch_pretrained_bert')

            self.special_tokens_ids = list(
                self.lm_tokenizer.convert_tokens_to_ids(token)
                for token in lm_special_tokens)
            self.lm_model.to(args.device)
            self.lm_model.eval()
Example #29
def run():
    parser = ArgumentParser()
    parser.add_argument("--model_type", type=str, default="gpt", help="gpt or gpt2")
    parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")
    parser.add_argument("--filename", type=str, default="data/instances_dev.pkl", help="File to use for decoding")
    parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=50, help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--temperature", type=int, default=0.7, help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")

    if args.model_type == 'gpt2':
        tokenizer = GPT2Tokenizer.from_pretrained(args.model_checkpoint)
        model = GPT2LMHeadModel.from_pretrained(args.model_checkpoint)
    else:
        tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_checkpoint)
        model = OpenAIGPTLMHeadModel.from_pretrained(args.model_checkpoint)

    model.to(args.device)
    model.eval()

    data = get_dataset_from_file(tokenizer, args.filename)
    final_output_dict = {
        "version": "squash-2.0",
        "data": [{
            "paragraphs": []
        }]
    }
    question_number = 0
    # For all the instances corresponding to one paragraph, the model input format is: paragraph + answer + question.
    # The paragraph is common across all of these instances, so "past" can be used to reuse the
    # precomputed hidden states for the paragraph in subsequent predictions.

    import copy

    previous_para_index = None
    past = None
    for inst in tqdm.tqdm(data):
        with torch.no_grad():
            current_para_index = inst['para_index']
            if current_para_index != previous_para_index:
                past = None
                previous_para_index = current_para_index
                current_inst = copy.deepcopy(inst)
                # keep only the paragraph details in the instance to get its hidden states
                current_inst['question'] = []
                current_inst['answer'] = []
                instance, _ = build_input_from_segments(current_inst, tokenizer, with_eos=False)
                input_ids = torch.tensor(instance['input_ids'][:-2], device=args.device).unsqueeze(0)
                token_type_ids = torch.tensor(instance['token_type_ids'][:-2], device=args.device).unsqueeze(0)
                # the returned "past" holds the paragraph hidden states
                _, past = model(input_ids, token_type_ids=token_type_ids, past=past)
            output = sample_sequence(inst, tokenizer, model, args, past)

        original_paragraph = tokenizer.decode(output['paragraph'])
        generated_question = tokenizer.decode(output['question'], skip_special_tokens=True)
        original_answer = tokenizer.decode(output['answer'], skip_special_tokens=True)
        para_index = inst['para_index']

        # Output in a SQUAD-like format with questions clumped together under their parent paragraph
        if len(final_output_dict["data"][0]["paragraphs"]) > para_index:
            # verify whether the paragraph text is identical
            assert original_paragraph == final_output_dict["data"][0]["paragraphs"][para_index]['context']
            # append the question answer pair
            final_output_dict["data"][0]["paragraphs"][para_index]['qas'].append({
                'id': 'question_%d' % question_number,
                'question': generated_question,
                'answers': [{
                    'text': original_answer,
                    'answer_start': original_paragraph.index(original_answer)
                }],
                'class': output['class'],
                'algorithm': output['algorithm'],
                'is_impossible': False
            })
        else:
            # add a new question to the list of QA pairs
            final_output_dict['data'][0]['paragraphs'].append({
                'context': original_paragraph,
                'qas': [{
                    'id': 'question_%d' % question_number,
                    'question': generated_question,
                    'answers': [{
                        'text': original_answer,
                        'answer_start': original_paragraph.index(original_answer)
                    }],
                    'class': output['class'],
                    'algorithm': output['algorithm'],
                    'is_impossible': False
                }]
            })

        question_number += 1

    with open("squash/temp/generated_questions.json", "w") as f:
        f.write(json.dumps(final_output_dict))
Example #30
 def __init__(self, pad_idx):
     self.transformer_module = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
     self.transformer_module.eval()
     self.pad_idx = pad_idx
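
# Plausible scoring method for the wrapper above (not part of the original, which only
# shows __init__): assign an LM loss to a padded batch, excluding pad positions from the
# labels. Label value -1 is the ignore index of OpenAIGPTLMHeadModel's LM loss, the same
# convention used for lm_labels in Example #23.
 def score(self, token_ids):
     # token_ids: LongTensor of shape [batch, seq_len], padded with self.pad_idx
     lm_labels = token_ids.clone()
     lm_labels[token_ids == self.pad_idx] = -1  # do not compute loss on padding
     with torch.no_grad():
         loss = self.transformer_module(token_ids, lm_labels=lm_labels)
     return loss.item()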