def run_pplm_example(pretrained_model="gpt2-medium",
                     cond_text="",
                     uncond=False,
                     num_samples=1,
                     bag_of_words=None,
                     discrim=None,
                     discrim_weights=None,
                     discrim_meta=None,
                     class_label=-1,
                     length=100,
                     stepsize=0.02,
                     temperature=1.0,
                     top_k=10,
                     sample=False,
                     num_iterations=3,
                     grad_length=10000,
                     horizon_length=1,
                     window_length=0,
                     decay=False,
                     gamma=1.5,
                     gm_scale=0.9,
                     kl_scale=0.01,
                     seed=0,
                     colorama=False):
    # set the random seed
    torch.manual_seed(seed)
    np.random.seed(seed)

    # set the device
    device = "cuda" if torch.cuda.is_available() else "cpu"

    if discrim == 'generic':
        set_generic_model_params(discrim_weights, discrim_meta)

    if discrim is not None:
        pretrained_model = DISCRIMINATOR_MODELS_PARAMS[discrim][
            "pretrained_model"]
        print("discrim = {}, pretrained_model set to discriminator's = {}".
              format(discrim, pretrained_model))

    # load pretrained model
    model = GPT2LMHeadModel.from_pretrained(pretrained_model,
                                            output_hidden_states=True)
    model.to(device)
    model.eval()

    # load tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)

    # Freeze GPT-2 weights
    for param in model.parameters():
        param.requires_grad = False

    # figure out conditioning text
    if uncond:
        tokenized_cond_text = tokenizer.encode([tokenizer.bos_token])
    else:
        raw_text = cond_text
        while not raw_text:
            #print("Did you forget to add `--cond_text`? ")
            raw_text = input("Model prompt >>> ")
        tokenized_cond_text = tokenizer.encode(tokenizer.bos_token + raw_text)

    unpert_gen_tok_text, pert_gen_tok_texts, _, _ = full_text_generation(
        model=model,
        tokenizer=tokenizer,
        context=tokenized_cond_text,
        device=device,
        num_samples=num_samples,
        bag_of_words=bag_of_words,
        discrim=discrim,
        class_label=class_label,
        length=length,
        stepsize=stepsize,
        temperature=temperature,
        top_k=top_k,
        sample=sample,
        num_iterations=num_iterations,
        grad_length=grad_length,
        horizon_length=horizon_length,
        window_length=window_length,
        decay=decay,
        gamma=gamma,
        gm_scale=gm_scale,
        kl_scale=kl_scale,
    )

    # collect single-token bag-of-words ids for colorama highlighting
    bow_word_ids = set()
    if bag_of_words and colorama:
        bow_indices = get_bag_of_words_indices(bag_of_words.split(";"),
                                               tokenizer)
        for single_bow_list in bow_indices:
            # keep only entries that consist of a single token
            filtered = list(filter(lambda x: len(x) <= 1, single_bow_list))
            # w[0] is safe: the previous filter guarantees exactly one token per entry
            bow_word_ids.update(w[0] for w in filtered)

    # iterate through the perturbed texts
    pert_gen_text = ''
    for i, pert_gen_tok_text in enumerate(pert_gen_tok_texts):
        try:
            # untokenize unperturbed text
            if colorama:
                import colorama

                pert_gen_text = ''
                for word_id in pert_gen_tok_text.tolist()[0]:
                    if word_id in bow_word_ids:
                        pert_gen_text += '{}{}{}'.format(
                            colorama.Fore.RED, tokenizer.decode([word_id]),
                            colorama.Style.RESET_ALL)
                    else:
                        pert_gen_text += tokenizer.decode([word_id])
            else:
                pert_gen_text = tokenizer.decode(pert_gen_tok_text.tolist()[0])

        except Exception:
            # skip samples that fail to decode
            pass

    return pert_gen_text.replace('<|endoftext|>', '')
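

# Hedged usage sketch for run_pplm_example() above; the bag-of-words topic and
# the hyperparameter values are illustrative assumptions, not values prescribed
# by the example.
if __name__ == '__main__':
    generated = run_pplm_example(cond_text="The potato",
                                 bag_of_words="military",  # assumed to be a known bag-of-words key
                                 length=50,
                                 stepsize=0.03,
                                 sample=True,
                                 num_samples=1,
                                 colorama=False)
    print(generated)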
Example #2
def run_model(batch_size, learning_rate, n_ctx, n_head, n_embd, n_layer,
              adaptive, bpe, masked_lm, classification, bpe_model_path,
              datasets, lm_corpus_file, pos_tags, dict_path, rnn, crf,
              config_id):
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=2019)
    parser.add_argument("--nsamples", type=int, default=1)
    parser.add_argument("--batch_size", type=int, default=batch_size)
    parser.add_argument("--length", type=int, default=-1)
    parser.add_argument("--temperature", type=int, default=1)
    parser.add_argument("--top_k", type=int, default=0)
    parser.add_argument('--unconditional',
                        action='store_true',
                        help='If true, unconditional generation.')

    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--lr_warmup', type=float, default=0.002)
    parser.add_argument('--lr', type=float, default=learning_rate)
    parser.add_argument('--b1', type=float, default=0.9)
    parser.add_argument('--b2', type=float, default=0.999)
    parser.add_argument('--e', type=float, default=1e-8)
    parser.add_argument('--l2', type=float, default=0.01)
    parser.add_argument('--vector_l2', action='store_true')
    parser.add_argument('--max_grad_norm', type=int, default=1)

    parser.add_argument("--initializer_range", type=float, default=0.02)
    parser.add_argument("--layer_norm_epsilon", type=float, default=1e-6)

    parser.add_argument("--n_ctx", type=int, default=n_ctx)
    parser.add_argument("--n_positions", type=int, default=n_ctx)
    parser.add_argument("--n_embd", type=int, default=n_embd)
    parser.add_argument("--n_head", type=int, default=n_head)
    parser.add_argument("--n_layer", type=int, default=n_layer)
    parser.add_argument("--max_vocab_size",
                        type=int,
                        default=0,
                        help='Zero means no limit.')

    parser.add_argument('--max_step',
                        type=int,
                        default=100000,
                        help='upper epoch limit')
    parser.add_argument('--eta_min',
                        type=float,
                        default=0.0,
                        help='min learning rate for cosine scheduler')
    parser.add_argument('--clip',
                        type=float,
                        default=0.25,
                        help='gradient clipping')
    parser.add_argument('--kw_cut',
                        type=int,
                        default=10,
                        help='Precision and recall @ cutoff')

    parser.add_argument("--num_epoch", type=int, default=10)

    parser.add_argument('--data_path', type=str, default='data')
    parser.add_argument('--result_path',
                        type=str,
                        default='gpt_results_final.txt')

    parser.add_argument('--adaptive',
                        action='store_true',
                        help='If true, use adaptive softmax.')
    parser.add_argument('--bpe',
                        action='store_true',
                        help='If true, use byte pair encoding.')
    parser.add_argument(
        '--masked_lm',
        action='store_true',
        help=
        'If true, use masked language model objective for pretraining instead of regular language model.'
    )
    parser.add_argument('--transfer_learning',
                        action='store_true',
                        help='If true, use a pretrained language model.')
    parser.add_argument('--POS_tags', action='store_true', help='POS tags')
    parser.add_argument('--classification',
                        action='store_true',
                        help='If true, train a classifier.')
    parser.add_argument(
        '--rnn',
        action='store_true',
        help='If true, use an RNN with attention in classification head.')
    parser.add_argument(
        '--crf',
        action='store_true',
        help=
        'If true, use CRF instead of custom loss function in classification head.'
    )

    parser.add_argument('--bpe_model_path', type=str, default=bpe_model_path)
    parser.add_argument('--datasets', type=str, default=datasets)
    parser.add_argument('--lm_corpus_file', type=str, default=lm_corpus_file)
    parser.add_argument('--trained_language_models_dir',
                        type=str,
                        default='trained_language_models')
    parser.add_argument('--trained_classification_models_dir',
                        type=str,
                        default='trained_classification_models')

    parser.add_argument('--dict_path',
                        type=str,
                        default=dict_path,
                        help='Path to dictionary')
    parser.add_argument('--lang',
                        type=str,
                        default='english',
                        help='Language of the dataset')
    parser.add_argument(
        '--config_id',
        type=str,
        default=config_id,
        help=
        'Used to connect trained language models with classification models')
    parser.add_argument('--cuda',
                        action='store_false',
                        help='GPU is used by default; pass this flag to disable it.')

    args = parser.parse_args()
    args.adaptive = adaptive
    args.classification = classification
    args.transfer_learning = True
    args.POS_tags = pos_tags
    args.bpe = bpe
    args.masked_lm = masked_lm
    args.rnn = rnn
    args.crf = crf
    args.cuda = True

    if not os.path.exists(args.trained_classification_models_dir):
        os.makedirs(args.trained_classification_models_dir)

    if not os.path.exists(args.trained_language_models_dir):
        os.makedirs(args.trained_language_models_dir)

    if args.bpe:
        sp = GPT2Tokenizer.from_pretrained("gpt2")
    else:
        sp = None

    if args.crf:
        assert not args.rnn
    if args.rnn:
        assert not args.crf

    print(args)

    if args.lang == 'english':
        stemmer = PorterStemmer()
    elif args.lang == 'estonian':
        stemmer = Lemmatizer('et')
    elif args.lang == 'croatian':
        stemmer = Lemmatizer('hr')

    np.random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    if not args.classification:
        df_data = file_to_df(os.path.join(args.data_path, args.lm_corpus_file),
                             classification=False)
        df_data = df_data.sample(frac=1, random_state=2019)
        val_idx = int(0.8 * df_data.shape[0])
        test_idx = int(0.9 * df_data.shape[0])
        df_train = df_data[:val_idx]
        df_valid = df_data[val_idx:test_idx]
        df_test = df_data[test_idx:]

        print(
            '------------------------------------------------------------------------------------------------------'
        )
        print('Training language model on all data')
        print("Train size: ", df_train.shape, "Valid size: ", df_valid.shape,
              "Test size: ", df_test.shape)
        print(
            '------------------------------------------------------------------------------------------------------'
        )
        print()
        train_test(df_train, df_valid, df_test, args, stemmer, sp)

    else:
        result_file = open(args.result_path, 'a', encoding='utf8')
        result_file.write("Classification results for config " + config_id +
                          ":\n\n")
        result_file.write("Parameters:\n")
        result_file.write(
            str(args) + '\n------------------------------------------------\n')

        for folder in args.datasets.split(';'):

            print(
                '------------------------------------------------------------------------------------------------------'
            )
            print('Training on: ', folder)
            print(
                '------------------------------------------------------------------------------------------------------'
            )

            if folder == 'duc' or folder == 'nus':
                #cross validation
                kf = model_selection.KFold(n_splits=10)
                df_data = file_to_df(os.path.join(args.data_path, folder,
                                                  folder + '_test.json'),
                                     classification=True)
                df_data = df_data.sample(frac=1, random_state=2019)
                print()
                print('Cross validation on', folder)

                fold_counter = 0

                total_pred = []
                total_true = []

                for train_index, test_index in kf.split(df_data):
                    fold_counter += 1
                    df_train, df_test = df_data.iloc[
                        train_index], df_data.iloc[test_index]
                    sep_idx = int(df_train.shape[0] / 10)
                    df_valid = df_train[:sep_idx]
                    df_train = df_train[sep_idx:]

                    print("Train fold ", fold_counter, "fold size: ",
                          df_train.shape, "Valid fold size: ", df_valid.shape,
                          "Test fold  size: ", df_test.shape)
                    print()

                    fold_pred, fold_true, num_parameters = train_test(
                        df_train, df_valid, df_test, args, stemmer, sp, folder)
                    total_pred.extend(fold_pred)
                    total_true.extend(fold_true)
                print()
                print(
                    '--------------------------------------------------------------------'
                )
                print('Final CV results:')
                print()

            else:
                df_train = file_to_df(os.path.join(args.data_path, folder,
                                                   folder + '_valid.json'),
                                      classification=True)
                df_train = df_train.sample(frac=1, random_state=2019)
                val_idx = int(0.8 * df_train.shape[0])
                df_valid = df_train[val_idx:]
                df_train = df_train[:val_idx]
                df_test = file_to_df(os.path.join(args.data_path, folder,
                                                  folder + '_test.json'),
                                     classification=True)

                print("Train size: ", df_train.shape, "Valid size: ",
                      df_valid.shape, "Test size: ", df_test.shape)
                print()

                total_pred, total_true, num_parameters = train_test(
                    df_train, df_valid, df_test, args, stemmer, sp, folder)

            p_5, r_5, f_5, p_10, r_10, f_10, p_k, r_k, f_k, p_M, r_M, f_M = eval(
                total_pred, total_true, lang=args.lang)

            result_file.write("Dataset: " + folder + '\n')
            result_file.write('Precision@5: ' + str(p_5) + ' Recall@5: ' +
                              str(r_5) + ' F1@5: ' + str(f_5) + '\n')
            result_file.write('Precision@10: ' + str(p_10) + ' Recall@10: ' +
                              str(r_10) + ' F1@10: ' + str(f_10) + '\n')
            result_file.write('Precision@k: ' + str(p_k) + ' Recall@k: ' +
                              str(r_k) + ' F1@k: ' + str(f_k) + '\n')
            result_file.write('Precision@M: ' + str(p_M) + ' Recall@M: ' +
                              str(r_M) + ' F1@M: ' + str(f_M) + '\n')
            result_file.write('Num. trainable parameters: ' +
                              str(num_parameters) + '\n')

            outputs = []

            for pred, true in zip(total_pred, total_true):
                pred = ";".join(list(pred))
                true = ";".join(list(true))
                outputs.append((pred, true))

            df_preds = pd.DataFrame(outputs, columns=['Predicted', 'True'])
            df_preds.to_csv('predictions/' + folder + '.csv',
                            sep=',',
                            encoding='utf8')

        result_file.write(
            "\n-----------------------------------------------------------\n")
        result_file.write(
            "\n-----------------------End of the run----------------------\n")
        result_file.write(
            "\n-----------------------------------------------------------\n")
        result_file.close()
Example #3
def run_model():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model-path',
                        type=str,
                        help='pretrained model path to local checkpoint')
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--nsamples", type=int, default=1)
    parser.add_argument("--batch_size", type=int, default=1)
    parser.add_argument("--length", type=int, default=-1)
    parser.add_argument("--temperature", type=int, default=0.95)
    parser.add_argument('--top_p', type=float, default=0.95)
    parser.add_argument('--top_k', type=int, default=100)
    parser.add_argument('--data-dir', type=str, default='data')
    parser.add_argument('--out-dir', type=str, default='out')

    parser.add_argument('--data_type',
                        type=str,
                        default='t1',
                        choices=['t' + str(i) for i in range(9)],
                        help="t: type")
    parser.add_argument('--model_type',
                        type=str,
                        default='cvae',
                        choices=['cvae', 'ae_vae_fusion'])
    parser.add_argument('--dataset',
                        type=str,
                        default='wi',
                        choices=['wp', 'wi'],
                        help="Dataset to use for training")

    # use GPU
    parser.add_argument('--gpu', default=2, type=int)
    parser.add_argument('--no_gpu', action="store_true")

    parser.add_argument('--add_input', action="store_true")
    parser.add_argument('--add_attn', action="store_true")
    parser.add_argument('--add_softmax', action="store_true")
    parser.add_argument('--attn_proj_vary', action="store_true")

    parser.add_argument('--learn_prior', action="store_true")

    args = parser.parse_args(
        '--model-path out/wi.1.proj_vary_cyc_cvae/model_0030000.pt '
        '--add_input --learn_prior '.split())
    print(args)

    if args.model_type == 'cvae':
        args.learn_prior = True
    else:
        args.learn_prior = False

    # GPU
    if not torch.cuda.is_available(): args.no_gpu = True
    gpu = not args.no_gpu
    if gpu: torch.cuda.set_device(args.gpu)
    device = torch.device(args.gpu if gpu else "cpu")

    # randomness
    np.random.seed(args.seed)
    prng = np.random.RandomState()
    torch.random.manual_seed(args.seed)
    if gpu: torch.cuda.manual_seed(args.seed)

    if args.batch_size == -1:
        args.batch_size = 1
    assert args.nsamples % args.batch_size == 0

    # logging
    save_folder = args.model_path + '.eval/'
    os.makedirs(save_folder, exist_ok=True)
    importlib.reload(logging)
    logging.basicConfig(filename=os.path.join(save_folder, 'eval.log'),
                        level=logging.INFO,
                        format='%(asctime)s--- %(message)s')
    logging.info(
        '\n----------------------------------------------------------------------'
    )

    print('Loading models...')
    cache_dir = os.path.join(args.out_dir, 'model_cache')
    os.makedirs(cache_dir, exist_ok=True)
    # Load pre-trained teacher tokenizer (vocabulary)
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2', cache_dir=cache_dir)
    tokenizer.max_len = int(1e12)
    gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2', cache_dir=cache_dir)
    print('gpt2_params:', num_params(gpt2_model))  # gpt2: 124439808
    config = GPT2Config()

    # # add special tokens
    # special_tokens_dict = {
    #     'pad_token': '<|startoftext|>',
    #     'cls_token': '<|startofcond|>',
    #     'sep_token': '<|sepofcond|>',
    #     'mask_token': '<|endofcond|>'
    # }
    # num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
    # print('We have added', num_added_toks, 'special tokens')
    # # Notice: resize_token_embeddings expect to receive the full size of the new vocab
    # gpt2_model.resize_token_embeddings(len(tokenizer))
    # assert tokenizer.pad_token == '<|startoftext|>'

    VAE = VAEModel(config,
                   add_input=args.add_input,
                   add_attn=args.add_attn,
                   add_softmax=args.add_softmax,
                   attn_proj_vary=args.attn_proj_vary,
                   learn_prior=args.learn_prior)
    init_para_frompretrained(VAE.transformer,
                             gpt2_model.transformer,
                             share_para=True)
    init_para_frompretrained(VAE.encoder,
                             gpt2_model.transformer,
                             share_para=False)
    if args.learn_prior:
        init_para_frompretrained(VAE.encoder_prior,
                                 VAE.encoder,
                                 share_para=True)
        VAE.encoder_prior.averageSelfAttention.attention_weights = VAE.encoder.averageSelfAttention.attention_weights
    VAE.lm_head.weight = gpt2_model.lm_head.weight
    if VAE.add_softmax:
        VAE.lm_head_rep = Conv1D(*gpt2_model.lm_head.weight.size())
        # VAE.lm_head_rep = LM_head_rep(*gpt2_model.lm_head.weight.size()[::-1])
    print('VAE_params:', num_params(VAE))  # 286694400
    args.load = args.model_path
    if args.load:
        print('Loading model weights...')
        state = torch.load(os.path.join(args.load), map_location='cpu')
        if 'module' in list(state.keys(
        ))[0]:  # model_path is data parallel model with attr 'module'
            state_copy = copy.copy(state)
            keys = state_copy.keys()
            for k in keys:
                state[k.replace('module.', '')] = state.pop(k)
        VAE.load_state_dict(state)
        gc.collect()
    print('Model loaded.')

    print('Setup data...')
    seq_len = VAE.config.n_ctx
    test_loader = prepare_dataset(args.data_dir,
                                  args.dataset,
                                  tokenizer,
                                  1,
                                  seq_len,
                                  1,
                                  seq_len,
                                  args.batch_size,
                                  seq_len,
                                  make_train=False,
                                  make_val=False,
                                  make_test=True,
                                  data_type=args.data_type)[0]
    print('Done.')

    VAE.eval()  # be careful about VAE.eval() vs VAE.train()
    VAE.to(device)
    loss_fn = nn.CrossEntropyLoss(reduction='none')

    logging.info(
        '\n----------------------------------------------------------------------'
    )
    logging.info("Testing loop. batches: %d" % len(test_loader))

    endoftext = tokenizer.convert_tokens_to_ids("<|endoftext|>")
    startofcond = tokenizer.convert_tokens_to_ids("<|startofcond|>")
    endofcond = tokenizer.convert_tokens_to_ids("<|endofcond|>")

    n_samples = 0
    bleu4_sum = 0.0
    rouge_scores_values_sum = [0.0] * 9

    model_type = args.model_type

    # test_iter = iter(test_loader); x_mask, x_tokens, y_mask, y_tokens, input_tokens, target_tokens, mask = next(test_iter)
    with tqdm(total=len(test_loader)) as pbar:
        for i_test, (x_mask, x_tokens, y_mask, y_tokens, input_tokens,
                     target_tokens, mask) in enumerate(test_loader):

            length = args.length
            if length == -1:
                length = VAE.config.n_ctx - 1
            elif length > VAE.config.n_ctx - 1:
                raise ValueError(
                    "Can't get samples longer than window size: %s" %
                    VAE.config.n_ctx)

            eff_samples = []
            n, l = target_tokens.size()
            storys = [tokenizer.decode(target_tokens[i, :]) for i in range(n)]
            storys_str = [
                s[:s.find("<|endoftext|>") +
                  len("<|endoftext|>")] if "<|endoftext|>" in s else s
                for s in storys
            ]

            for _ in range(args.nsamples // args.batch_size):
                # model, batch_size, temperature, top_k, top_p, eos_token, sample = VAE, args.batch_size, args.temperature, args.top_k, args.top_p, tokenizer.encoder['<|endoftext|>'], True
                out, _ = sample_sequence(
                    model=VAE,
                    tokenizer=tokenizer,
                    length=length,
                    batch_size=args.batch_size,
                    x_mask=x_mask,
                    x_tokens=x_tokens,
                    y_mask=y_mask,
                    y_tokens=y_tokens,
                    temperature=args.temperature,
                    top_k=args.top_k,
                    top_p=args.top_p,
                    device=device,
                    eos_token=tokenizer.encoder['<|endoftext|>'],
                    model_type=model_type)
                out = out.tolist()

                # extract story, check metrics
                for i in range(len(out)):
                    text = out[i]
                    text = text[text.index(endoftext) + 1:]

                    if endoftext in text:
                        idx = text.index(endoftext)
                        text = text[:idx]

                    text = tokenizer.decode(text).strip()

                    # score for one long text, higher than 0.075 usually means repetition
                    # rep_score = repeat_score(text.split(), ngram=[3, 4, 5, 6, 7, 8])
                    # if rep_score > 0.075:
                    #     # print(rep_score)
                    #     continue

                    try:
                        # check bleu
                        bleu4 = sentence_bleu(
                            [storys_str[i].split()],
                            text,
                            smoothing_function=SmoothingFunction().method7)

                        # check rouge
                        rouge = Rouge()
                        rouge_scores = rouge.get_scores(text, storys_str[i])
                        rouge_scores_values = [
                            v for k in rouge_scores[0].keys()
                            for v in rouge_scores[0][k].values()
                        ]

                        bleu4_sum += bleu4
                        rouge_scores_values_sum = [
                            v1 + v2 for v1, v2 in zip(rouge_scores_values_sum,
                                                      rouge_scores_values)
                        ]
                        n_samples += 1
                    except Exception:
                        bleu4 = 0.0
                        rouge_scores = [{
                            'rouge-1': {
                                'f': 0.0,
                                'p': 0.0,
                                'r': 0.0
                            },
                            'rouge-2': {
                                'f': 0.0,
                                'p': 0.0,
                                'r': 0.0
                            },
                            'rouge-l': {
                                'f': 0.0,
                                'p': 0.0,
                                'r': 0.0
                            }
                        }]

                    eff_samples.append((text, bleu4, rouge_scores))

                # write samples to file
                samples_file = open(save_folder + 'batch-' + '%04d' % i_test +
                                    '.txt',
                                    'w',
                                    encoding='utf8')
                for i in range(len(eff_samples)):
                    samples_file.write("=" * 50 + " SAMPLE " + str(i) + " " +
                                       "=" * 50)
                    samples_file.write('\n' * 2)

                    samples_file.write("=" * 40 + " Outlines  " + "=" * 40)
                    samples_file.write('\n' * 2)
                    samples_file.write(
                        tokenizer.decode(
                            x_tokens[i, :][x_mask[i, :] == 1].tolist()))
                    samples_file.write('\n' * 2)
                    samples_file.write("=" * 40 + " Story " + "=" * 40)
                    samples_file.write('\n' * 2)
                    samples_file.write(storys_str[i])
                    samples_file.write('\n' * 2)

                    samples_file.write("=" * 40 + " Generated " + "=" * 40)
                    samples_file.write('\n' * 2)
                    samples_file.write(eff_samples[i][0])
                    samples_file.write('\n' * 4)
                    samples_file.flush()

                logging.info('batch %04d finished.', i_test)
                pbar.update(1)

    print('Test complete with %05d samples.' % n_samples)
    logging.info("Test complete with %05d samples.", n_samples)

    bleu4 = round(bleu4_sum / n_samples, 3)
    rouge_scores_values = [
        round(r / n_samples, 3) for r in rouge_scores_values_sum
    ]
    print(' bleu-4:', bleu4)
    print(' rouge :', rouge_scores_values)
    logging.info(' bleu-4: %f', bleu4)
    logging.info(' rouge : %s', str(rouge_scores_values))
    iteration = int(sys.argv[5])

    # Tell pytorch to run this model on the GPU.
    if use_gpu:
        device = torch.device('cuda:' + str(gpu_id))
        secondary_device = torch.device('cuda:' + str(secondary_gpu_id))
    else:
        device = torch.device("cpu")
        secondary_device = torch.device("cpu")

    df = pickle.load(open(pkl_dump_dir + "df_fine.pkl", "rb"))
    parent_to_child = pickle.load(open(pkl_dump_dir + "parent_to_child.pkl", "rb"))

    fine_labels = list(set(df.label.values))

    coarse_tokenizer = GPT2Tokenizer.from_pretrained(coarse_tok_path, do_lower_case=True)
    coarse_model = torch.load(model_path + model_name, map_location=device)
    coarse_model.to(secondary_device)

    seed_val = 42
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    all_true = []
    all_preds = []
    for p in [parent_label]:
        print("Training coarse label:", p)
    random.shuffle(model_files)
    model_files = model_files[:args.max_sample_num]
    print(f"total len of files: {len(model_files)}")
    entropies = []
    max_probs = []
    print(args.spec_name)
    if 'pegasus' in args.model_name:
        from transformers import PegasusTokenizer

        bpe_tokenizer = PegasusTokenizer.from_pretrained(args.model_name)
        EOS_TOK_IDs = [106, bpe_tokenizer.eos_token_id, 2]  # <n>
    elif 'gpt' in args.model_name:
        from transformers import GPT2Tokenizer

        bpe_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        EOS_TOK_IDs = [bpe_tokenizer.eos_token_id]
    elif 'bart' in args.model_name:
        from transformers import BartTokenizer

        bpe_tokenizer = BartTokenizer.from_pretrained(args.model_name)
        EOS_TOK_IDs = [bpe_tokenizer.eos_token_id]
    else:
        raise NotImplementedError
    try:
        outputs = []
        outputs_pos_entropy = []
        for f in model_files:
            with open(os.path.join(args.cur_dir, f), 'rb') as fd:
                data = pickle.load(fd)
            print(f"Finish loading {f}")
Example #6
 def __init__(self, model_path='gpt2', tokenizer_path='gpt2'):
     self.tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
     self.tokenizer.pad_token = '<|endoftext|>'
     self.model = GPT2LMHeadModel.from_pretrained(model_path)
    ap.add_argument('--k', type=int, default=1)
    ap.add_argument('--layer', type=int, default=-1)
    ap.add_argument('--out_dir', type=str, default='results')

    args = ap.parse_args()

    algo = args.algo
    k = args.k
    layer = args.layer
    out_dir = args.out_dir
    model_type = args.model_type

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    tokenizer = GPT2Tokenizer.from_pretrained(model_type)
    model = Model(device='cuda')
    DEVICE = 'cuda'

    templates = get_template_list()

    if args.algo == 'topk':
        marg_contrib_path = out_dir + "/marg_contrib.pickle"
        if os.path.exists(marg_contrib_path):
            print('Using cached marginal contribution')
            marg_contrib = pickle.load(open(marg_contrib_path, "rb"))
            layer_list = marg_contrib['layer']
            neuron_list = marg_contrib['neuron']
        else:
            print('Computing marginal contribution')
            layer_list, neuron_list = get_all_contrib(templates, tokenizer,
 def get_tokenizer(self, **kwargs):
     kwargs.update(self.special_tokens_map)
     return GPT2Tokenizer.from_pretrained(self.tmpdirname, **kwargs)
def main():
    parser = get_parser()
    args = parser.parse_args()

    if not args.model_name:
        args.model_name = args.model_path

    if args.doc_stride >= args.max_seq_length - args.max_query_length:
        logger.warning(
            "WARNING - You've set a doc stride which may be superior to the document length in some "
            "examples. This could result in errors when building features from the examples. Please reduce the doc "
            "stride or increase the maximum length to ensure the features are correctly built."
        )

    if (
        os.path.exists(args.output_dir)
        and os.listdir(args.output_dir)
        and not args.overwrite_output_dir
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir
            )
        )
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Set device
    args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.DEBUG if args.debug else logging.INFO
    )

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    config = GPT2Config.from_pretrained(
        args.config_name if args.config_name else args.model_path,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    tokenizer = GPT2Tokenizer.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    tokenizer.add_tokens(['question:', ':question'])
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.sep_token = tokenizer.eos_token
    tokenizer.encode = partial(tokenizer.encode, is_pretokenized=True, truncation=True)
    tokenizer.encode_plus = partial(tokenizer.encode_plus, is_pretokenized=True, truncation=True)

    model = GPT2LMHeadModel.from_pretrained(
        args.model_path,
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    model.resize_token_embeddings(len(tokenizer))
    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum if args.fp16 is set.
    # Otherwise it'll default to "promote" mode, and we'll get fp32 operations. Note that running `--fp16_opt_level="O2"` will
    # remove the need for this code, but it is still valid.
    if args.fp16:
        try:
            import apex
            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")

    # Training
    train_dataset = load_and_cache_examples(args, tokenizer, 'quest_gen', evaluate=False, gpt=True)
    train_dataset = preprocess_dataset(train_dataset, tokenizer)

    dev_dataset = load_and_cache_examples(args, tokenizer, 'quest_gen', evaluate=True, gpt=True)
    dev_dataset = preprocess_dataset(dev_dataset, tokenizer)

    train(args, train_dataset, dev_dataset, model, tokenizer)
    logging.info('Finished training !')

    # Save a trained model, configuration and tokenizer using `save_pretrained()`.
    # They can then be reloaded using `from_pretrained()`
    # Good practice: save your training arguments together with the trained model
    logger.info("Saving final model checkpoint to %s", args.output_dir)
    model.save_pretrained(args.output_dir)
    tokenizer.save_pretrained(args.output_dir)
    torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
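
    # Hedged sketch, not part of the original script: the checkpoint saved above
    # can be reloaded with from_pretrained(), as the comment describes.
    reloaded_model = GPT2LMHeadModel.from_pretrained(args.output_dir)
    reloaded_tokenizer = GPT2Tokenizer.from_pretrained(args.output_dir)
    reloaded_model.to(args.device)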
Example #10
 def __init__(self, device='cuda'):
     self.model_name = 'sberbank-ai/rugpt3medium_based_on_gpt2'
     self.model_type = 'gpt2'
     self.tokenizer = GPT2Tokenizer.from_pretrained(self.model_name)
     self.model = GPT2LMHeadModel.from_pretrained(self.model_name)
     self.model.to(device)
import re

import pymorphy2
import spacy
import xx_ent_wiki_sm
from deeppavlov import build_model, configs

morph = pymorphy2.MorphAnalyzer()

nlp = xx_ent_wiki_sm.load()
nlp.add_pipe(nlp.create_pipe('sentencizer'), first=True)

syntax_model = build_model(configs.syntax.syntax_ru_syntagrus_bert,
                           download=True)

from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

tokenizer = GPT2Tokenizer.from_pretrained(
    "sberbank-ai/rugpt3large_based_on_gpt2")

model = GPT2LMHeadModel.from_pretrained(
    "sberbank-ai/rugpt3large_based_on_gpt2")
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)
print("Success!")

russian_restricted_pronouns = "я мной меня мною мне мы нас нам нами ты тебя тебе тобою тобой вы вас вам вами".split(
)
extra_marks = re.compile(r"&[a-zA-Z0-9;]+")
expanding_startings = [
    "В то же время",
def test_fused_upper_triangle_mask_softmax():
    from megatron.model.gpt2_model import (
        gpt2_attention_mask_func as attention_mask_func, )
    from megatron.model.fused_softmax import FusedScaleMaskSoftmax, SoftmaxFusionTypes

    gpt = GPT2Model.from_pretrained("gpt2").cuda().half()
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    test_text = (
        "Hello. How are you? I am fine thank you and you? yes Good. "
        "hi hi hi hi hi hi hi"  # 24
    )

    tokens = tokenizer(
        [test_text] * 4,
        return_tensors="pt",
    )

    attention_mask = tokens["attention_mask"].cuda()
    attention_mask = attention_mask.view(attention_mask.size(0), -1)
    attention_mask = attention_mask[:, None, None, :]
    attention_mask = (1.0 - attention_mask) * -10000.0
    attention_mask = attention_mask.repeat(1, 1, attention_mask.size()[-1], 1)
    attn = gpt.h[0]

    hidden_states = gpt.wte(tokens["input_ids"].cuda())
    q, k, v = attn.attn.c_attn(hidden_states).split(768, dim=-1)
    q = attn.attn._split_heads(q, attn.attn.num_heads, attn.attn.head_dim)
    k = attn.attn._split_heads(k, attn.attn.num_heads, attn.attn.head_dim)
    attn_weights = torch.matmul(q, k.transpose(-1, -2))

    sq, sk = q.size(-2), k.size(-2)
    causal_mask = attn.attn.bias[:, :, sk - sq:sk, :sk].bool()
    total_mask = ~(causal_mask & (attention_mask == 0))
    """
    tensor([[[[False,  True,  True,  ...,  True,  True,  True],
              [False, False,  True,  ...,  True,  True,  True],
              [False, False, False,  ...,  True,  True,  True],
              ...,
              [False, False, False,  ..., False,  True,  True],
              [False, False, False,  ..., False, False,  True],
              [False, False, False,  ..., False, False, False]]]
    """

    fused_softmax = (FusedScaleMaskSoftmax(
        input_in_fp16=True,
        input_in_bf16=False,
        mask_func=attention_mask_func,
        fusion_type=SoftmaxFusionTypes.upper_triang,
        scale=None,
        softmax_in_fp32=False,
    ).cuda().half())

    fused_softmax_output = fused_softmax(
        attn_weights,
        total_mask,
    )

    torch_softmax = (FusedScaleMaskSoftmax(
        input_in_fp16=True,
        input_in_bf16=False,
        fusion_type=SoftmaxFusionTypes.none,
        mask_func=attention_mask_func,
        scale=None,
        softmax_in_fp32=False,
    ).cuda().half())

    torch_softmax_output = torch_softmax(
        attn_weights,
        total_mask,
    )

    test_result = (fused_softmax_output - torch_softmax_output).abs()

    while test_result.dim() != 1:
        test_result = test_result.mean(dim=-1)

    diff = test_result.mean(dim=-1)

    if diff <= 1e-3:
        print(
            f"\n[Success] test_fused_upper_triangle_mask_softmax"
            f"\n > mean_difference={diff}"
            f"\n > fused_values={fused_softmax_output[-1][-1][-1][:5].tolist()}"
            f"\n > torch_values={torch_softmax_output[-1][-1][-1][:5].tolist()}"
        )
    else:
        print(
            f"\n[Fail] test_fused_upper_triangle_mask_softmax"
            f"\n > mean_difference={diff}, "
            f"\n > fused_values={fused_softmax_output[-1][-1][-1][:5].tolist()}, "
            f"\n > torch_values={torch_softmax_output[-1][-1][-1][:5].tolist()}"
        )
Example #13
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', pad_token='<PAD>')
# IMPORTANT: Note that setting the <PAD> token like this in the constructor gives the
# pad_token the pad_token_id = 50256, which normally belongs to <BOS> token_ids in GPT2
# This is a very ugly way that works at the moment of setting the pad_token_id to the <BOS> token that is already included in the vocab size. This will be updated in the coming weeks! # noqa: E501

prompt_text = [
    'in this paper we', 'we are trying to',
    'The purpose of this workshop is to check whether we can'
]

# encode plus batch handles multiple batches and automatically creates attention_masks
seq_len = 11
encodings_dict = tokenizer.batch_encode_plus(prompt_text,
                                             max_length=seq_len,
                                             pad_to_max_length=True)

# ideally we should be able to just input the following two variables to the function model.generate() ... => to be implemented soon!  # noqa: E501
input_ids = torch.tensor(encodings_dict['input_ids'])
attn_mask = torch.tensor(encodings_dict['attention_mask'])

num_tokens_to_produce = 20
pad_token_id = tokenizer.pad_token_id
eos_token_id = tokenizer.eos_token_id
eos_not_in_sents = torch.ones(input_ids.shape[0]).long()

# we need to get the token ids of the last non-padded value
last_non_masked_idx = torch.sum(attn_mask, dim=1) - 1
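
# Hedged continuation sketch -- the original example is cut off here. This is a
# manual batched greedy-decoding loop of the kind this setup usually leads into;
# the loop itself is an assumption, not the original author's code. It reuses the
# variables defined above.
for step in range(num_tokens_to_produce):
    outputs = model(input_ids, attention_mask=attn_mask)
    if step == 0:
        # use the logits at the last non-padded position of every sequence
        next_token_logits = outputs[0][torch.arange(input_ids.shape[0]), last_non_masked_idx, :]
    else:
        next_token_logits = outputs[0][:, -1, :]
    next_tokens = torch.argmax(next_token_logits, dim=-1)
    # once a sequence has produced <eos>, keep appending pad tokens to it
    tokens_to_add = next_tokens * eos_not_in_sents + pad_token_id * (1 - eos_not_in_sents)
    eos_not_in_sents.mul_(tokens_to_add.ne(eos_token_id).long())
    input_ids = torch.cat([input_ids, tokens_to_add.unsqueeze(-1)], dim=-1)
    attn_mask = torch.cat([attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long)], dim=1)

print([tokenizer.decode(ids, skip_special_tokens=True) for ids in input_ids])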
Example #14
def run_model():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model-path', type=str, help='pretrained model path to local checkpoint')
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--nsamples", type=int, default=8)
    parser.add_argument("--batch_size", type=int, default=8)
    parser.add_argument("--length", type=int, default=-1)
    parser.add_argument("--temperature", type=int, default=0.95)
    parser.add_argument('--top_p', type=float, default=0.95)
    parser.add_argument('--top_k', type=int, default=100)
    parser.add_argument('--data-dir', type=str, default='data')
    parser.add_argument('--out-dir', type=str, default='out')

    parser.add_argument('--model_type', type=str, default='m', choices=['b0', 'b1', 'm'], help="b: baseline, m: model")
    parser.add_argument('--dataset', type=str, default='wp', choices=['wp', 'wi'], help="Dataset to use for training")

    # use GPU
    parser.add_argument('--gpu', default=0, type=int)
    parser.add_argument('--no_gpu', action="store_true")

    args = parser.parse_args('--model-path out/wp4.0223/model_latest.pt'.split())
    print(args)

    # GPU
    if not torch.cuda.is_available(): args.no_gpu = True
    gpu = not args.no_gpu
    if gpu: torch.cuda.set_device(args.gpu)
    device = torch.device(args.gpu if gpu else "cpu")

    # randomness
    np.random.seed(args.seed)
    prng = np.random.RandomState()
    torch.random.manual_seed(args.seed)
    if gpu: torch.cuda.manual_seed(args.seed)

    if args.batch_size == -1:
        args.batch_size = 1
    assert args.nsamples % args.batch_size == 0

    # logging
    save_folder = args.model_path + '.eval/'
    os.makedirs(save_folder, exist_ok=True)
    importlib.reload(logging)
    logging.basicConfig(filename=os.path.join(save_folder, 'eval.log'),
                        level=logging.INFO, format='%(asctime)s--- %(message)s')
    logging.info('\n----------------------------------------------------------------------')
    #logging.info("the configuration:")
    #logging.info(str(args).replace(',', '\n'))

    print('Loading models...')
    cache_dir = os.path.join(args.out_dir, 'model_cache')
    os.makedirs(cache_dir, exist_ok=True)
    # Load pre-trained teacher tokenizer (vocabulary)
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2', cache_dir=cache_dir)
    tokenizer.max_len = int(1e12)
    model = GPT2LMHeadModel.from_pretrained('gpt2', cache_dir=cache_dir)
    # add special tokens
    special_tokens_dict = {
        'pad_token': '<|startoftext|>',
        'cls_token': '<|startofcond|>',
        'sep_token': '<|sepofcond|>',
        'mask_token': '<|endofcond|>'
    }
    num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
    print('We have added', num_added_toks, 'special tokens')
    # Notice: resize_token_embeddings expect to receive the full size of the new vocab
    model.resize_token_embeddings(len(tokenizer))
    assert tokenizer.pad_token == '<|startoftext|>'
    if args.model_path:
        state = torch.load(args.model_path, map_location='cpu')
        if 'module' in list(state.keys())[0]:  # model_path is data parallel model with attr 'module'
            state_copy = copy.copy(state)
            keys = state_copy.keys()
            for k in keys:
                state[k.replace('module.', '')] = state.pop(k)
        model.load_state_dict(state)
        logging.info('load model from ' + args.model_path)
    model.to(device)
    model.eval()
    print('Model loaded.')

    seq_len = model.config.n_ctx
    test_loader = prepare_dataset(
        args.data_dir, args.dataset, tokenizer,
        1, seq_len, 1, seq_len, args.batch_size, seq_len,
        make_train=False, make_val=False, make_test=True, model_type=args.model_type
    )[0]

    logging.info('\n----------------------------------------------------------------------')
    logging.info("Testing loop. batches: %d" % len(test_loader))

    endoftext = tokenizer.convert_tokens_to_ids("<|endoftext|>")
    startofcond = tokenizer.convert_tokens_to_ids("<|startofcond|>")
    endofcond = tokenizer.convert_tokens_to_ids("<|endofcond|>")

    n_samples = 0
    bleu4_sum = 0.0
    rouge_scores_values_sum = [0.0] * 9

    with tqdm(total=len(test_loader)) as pbar:
        for i_test, (context, context_mask, keys, storys) in enumerate(test_loader):
            # test_iter = iter(test_loader); context, context_mask, keys, storys = next(test_iter)
            if all([len(key)==0 for key in keys]):
                keys = None
            length = args.length
            if length == -1:
                length = model.config.n_ctx - context.size(1)
            elif length > model.config.n_ctx - context.size(1):
                raise ValueError("Can't get samples longer than window size: %s" % model.config.n_ctx)

            eff_samples = []
            storys_str = ['\n\n'.join([tokenizer.decode(s) for s in story]) for story in storys] # use '\n\n' as paragraph separator
            for _ in range(args.nsamples // args.batch_size):
                # batch_size, temperature, top_k, top_p, eos_token, sample = args.batch_size, args.temperature, args.top_k, args.top_p, tokenizer.encoder['<|endoftext|>'], True
                out, _ = sample_sequence(
                    model=model,
                    tokenizer=tokenizer,
                    length=length,
                    batch_size=args.batch_size,
                    context=context,
                    context_mask=context_mask,
                    temperature=args.temperature,
                    top_k=args.top_k,
                    top_p=args.top_p,
                    device = device,
                    eos_token = tokenizer.encoder['<|endoftext|>'],
                    keys=keys
                )
                out = out.tolist()

                # just print
                # generated = 0
                # for i in range(args.batch_size):
                #     generated += 1
                #     text = tokenizer.decode(out[i])
                #     print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
                #     print(text)

                # extract story, check metrics
                for i in range(len(out)):
                    text = out[i]
                    text = text[text.index(endoftext) + 1:]

                    if endoftext in text:
                        idx = text.index(endoftext)
                        text = text[:idx]

                    story_sample = []
                    while startofcond in text and endofcond in text and text.index(startofcond) < text.index(endofcond):
                        idx = text.index(startofcond)
                        story_sample.append(text[:idx])
                        idx = text.index(endofcond)
                        text = text[idx + 1:]
                    if startofcond not in text and endofcond not in text:
                        story_sample.append(text)
                    text = '\n\n'.join([tokenizer.decode(s) for s in story_sample]).strip()

                    # score for one long text, higher than 0.075 usually means repetition
                    # rep_score = repeat_score(text.split(), ngram=[3, 4, 5, 6, 7, 8])
                    # if rep_score > 0.075:
                    #     # print(rep_score)
                    #     continue

                    try:
                        # check bleu
                        bleu4 = sentence_bleu([storys_str[i].split()], text, smoothing_function=SmoothingFunction().method7)

                        # check rouge
                        rouge = Rouge()
                        rouge_scores = rouge.get_scores(text, storys_str[i])
                        rouge_scores_values = [v for k in rouge_scores[0].keys() for v in rouge_scores[0][k].values()]

                        bleu4_sum += bleu4
                        rouge_scores_values_sum = [v1 + v2 for v1, v2 in zip(rouge_scores_values_sum, rouge_scores_values)]
                        n_samples += 1
                    except Exception:
                        bleu4 = 0.0
                        rouge_scores = [{'rouge-1': {'f': 0.0, 'p': 0.0, 'r': 0.0},
                                         'rouge-2': {'f': 0.0, 'p': 0.0, 'r': 0.0},
                                         'rouge-l': {'f': 0.0, 'p': 0.0, 'r': 0.0}}]

                    eff_samples.append((text, bleu4, rouge_scores))

                # write samples to file
                samples_file = open(save_folder + 'batch-' + '%04d' % i_test + '.txt', 'w', encoding='utf8')
                for i in range(len(eff_samples)):
                    samples_file.write("=" * 50 + " SAMPLE " + str(i) + " " + "=" * 50)
                    samples_file.write('\n' * 2)

                    samples_file.write("=" * 40 + " Outlines  " + "=" * 40)
                    samples_file.write('\n' * 2)
                    samples_file.write(tokenizer.decode(context[i, :-1][context_mask[i, :] == 1].tolist()))
                    if keys is not None:
                        samples_file.write('\n\n'.join([tokenizer.decode(s) for s in keys[i]]))
                    samples_file.write('\n' * 2)
                    samples_file.write("=" * 40 + " Story " + "=" * 40)
                    samples_file.write('\n' * 2)
                    samples_file.write(storys_str[i])
                    samples_file.write('\n' * 2)

                    samples_file.write("=" * 40 + " Generated " + "=" * 40)
                    samples_file.write('\n' * 2)
                    samples_file.write(eff_samples[i][0])
                    samples_file.write('\n' * 2)
                    samples_file.write(str(eff_samples[i][1:]))
                    samples_file.write('\n' * 4)
                    samples_file.flush()

                logging.info('batch %04d finished.', i_test)
                pbar.update(1)

    print('Test complete with %05d samples.' % n_samples)
    logging.info("Test complete with %05d samples.", n_samples)

    bleu4 = round(bleu4_sum / n_samples, 3)
    rouge_scores_values = [round(r / n_samples, 3) for r in rouge_scores_values_sum]
    print(' bleu-4:', bleu4)
    print(' rouge :', rouge_scores_values)
    logging.info(' bleu-4: %f', bleu4)
    logging.info(' rouge : %s', str(rouge_scores_values))
Example #15
def initialize_model(config, d_out, is_featurizer=False):
    """
    Initializes models according to the config
        Args:
            - config (dictionary): config dictionary
            - d_out (int): the dimensionality of the model output
            - is_featurizer (bool): whether to return a model or a (featurizer, classifier) pair that constitutes a model.
        Output:
            If is_featurizer=True:
            - featurizer: a model that outputs feature Tensors of shape (batch_size, ..., feature dimensionality)
            - classifier: a model that takes in feature Tensors and outputs predictions. In most cases, this is a linear layer.

            If is_featurizer=False:
            - model: a model that is equivalent to nn.Sequential(featurizer, classifier)

        Pretrained weights are loaded according to config.pretrained_model_path using either transformers.from_pretrained (for bert-based models)
        or our own utils.load function (for torchvision models, resnet18-ms, and gin-virtual). 
        There is currently no support for loading pretrained weights from disk for other models.
    """
    if config.model in ('resnet18', 'resnet34', 'resnet50', 'resnet101', 'wideresnet50', 'densenet121'):
        if is_featurizer:
            featurizer = initialize_torchvision_model(
                name=config.model,
                d_out=None,
                **config.model_kwargs)
            classifier = nn.Linear(featurizer.d_out, d_out)
            model = (featurizer, classifier)
        else:
            model = initialize_torchvision_model(
                name=config.model,
                d_out=d_out,
                **config.model_kwargs)

    elif 'bert' in config.model:
        if is_featurizer:
            featurizer = initialize_bert_based_model(config, d_out, is_featurizer)
            classifier = nn.Linear(featurizer.d_out, d_out)
            model = (featurizer, classifier)
        else:
            model = initialize_bert_based_model(config, d_out)

    elif config.model == 'resnet18_ms':  # multispectral resnet 18
        from models.resnet_multispectral import ResNet18
        if is_featurizer:
            featurizer = ResNet18(num_classes=None, **config.model_kwargs)
            classifier = nn.Linear(featurizer.d_out, d_out)
            model = (featurizer, classifier)
        else:
            model = ResNet18(num_classes=d_out, **config.model_kwargs)

    elif config.model == 'gin-virtual':
        from models.gnn import GINVirtual
        if is_featurizer:
            featurizer = GINVirtual(num_tasks=None, **config.model_kwargs)
            classifier = nn.Linear(featurizer.d_out, d_out)
            model = (featurizer, classifier)
        else:
            model = GINVirtual(num_tasks=d_out, **config.model_kwargs)

    elif config.model == 'code-gpt-py':
        from models.code_gpt import GPT2LMHeadLogit, GPT2FeaturizerLMHeadLogit
        from transformers import GPT2Tokenizer
        name = 'microsoft/CodeGPT-small-py'
        tokenizer = GPT2Tokenizer.from_pretrained(name)
        if is_featurizer:
            model = GPT2FeaturizerLMHeadLogit.from_pretrained(name)
            model.resize_token_embeddings(len(tokenizer))
            featurizer = model.transformer
            classifier = model.lm_head
            model = (featurizer, classifier)
        else:
            model = GPT2LMHeadLogit.from_pretrained(name)
            model.resize_token_embeddings(len(tokenizer))

    elif config.model == 'logistic_regression':
        assert not is_featurizer, "Featurizer not supported for logistic regression"
        model = nn.Linear(out_features=d_out, **config.model_kwargs)

    else:
        raise ValueError(f'Model: {config.model} not recognized.')

    # Load pretrained weights from disk using our utils.load function
    # This has only been tested on some models (mostly vision), so we only run it for models where we know it works
    # We've already loaded pretrained weights for bert-based models using the transformers library 
    if config.model not in ('code-gpt-py', 'logistic_regression', 'unet-seq', 'fasterrcnn') and 'bert' not in config.model:
        if config.pretrained_model_path and os.path.exists(config.pretrained_model_path): 
            try:
                if type(model) is tuple: 
                    # load both featurizer and classifier
                    prev_epoch, best_val_metric = load(
                        nn.Sequential(*model), 
                        config.pretrained_model_path, device=config.device
                    )
                else: 
                    prev_epoch, best_val_metric = load(model, config.pretrained_model_path, device=config.device)

                print(
                    (f'Initialized model with pretrained weights from {config.pretrained_model_path} ')
                    + (f'previously trained for {prev_epoch} epochs ' if prev_epoch else '')
                    + (f'with previous val metric {best_val_metric} ' if best_val_metric else '')
                )
            except Exception as e:
                print(f'Something went wrong loading the pretrained model: {e}')

    return model
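
# A minimal usage sketch for initialize_model. The SimpleNamespace config below
# is a hypothetical stand-in for the project's real config object, and its field
# values are illustrative assumptions rather than the project's defaults.
from types import SimpleNamespace

example_config = SimpleNamespace(
    model='resnet50',
    model_kwargs={'pretrained': True},
    pretrained_model_path=None,   # skip the weight-loading branch above
    device='cpu',
)

# Full model: equivalent to nn.Sequential(featurizer, classifier)
full_model = initialize_model(example_config, d_out=10, is_featurizer=False)

# Featurizer/classifier pair, for algorithms that operate on features directly
featurizer, classifier = initialize_model(example_config, d_out=10, is_featurizer=True)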
Example #16
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import numpy as np
import torch
import logging
logging.getLogger().setLevel(logging.CRITICAL)

device = 'cpu'

if torch.cuda.is_available():
    device = 'cuda'
pt_model = 'gpt2'
print("Importing " + pt_model)

tokenizer = GPT2Tokenizer.from_pretrained(pt_model)
model = GPT2LMHeadModel.from_pretrained(pt_model)
model = model.to(device)
print(pt_model + "model imported")

# Select the top-n tokens from the probability distribution, renormalize them,
# and sample one token ID from that restricted distribution


def choose_from_top(probs, n=40):
    print("Selecting Word")
    ind = np.argpartition(probs, -n)[-n:]
    top_prob = probs[ind]
    top_prob = top_prob / np.sum(top_prob)  # Normalize
    choice = np.random.choice(n, 1, p=top_prob)
    token_id = ind[choice][0]
    return int(token_id)
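
# A minimal sampling sketch using choose_from_top with the model, tokenizer and
# device defined above. Indexing the model output with [0] selects the logits in
# both older tuple-returning and newer ModelOutput-returning transformers
# versions; the prompt is an arbitrary illustration.
with torch.no_grad():
    input_ids = tokenizer.encode("The meaning of life is",
                                 return_tensors="pt").to(device)
    logits = model(input_ids)[0]                        # (1, seq_len, vocab_size)
    probs = torch.softmax(logits[0, -1, :], dim=-1).cpu().numpy()
    next_token_id = choose_from_top(probs, n=40)
    print(tokenizer.decode([next_token_id]))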
Example #17
 def __init__(self, config):
     self.device = "cuda" if torch.cuda.is_available() else "cpu"
     print(f"using device: {self.device}")
     self.tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
     self.model = GPT2LMHeadModel.from_pretrained("gpt2").to(self.device)
Example #18
 def __init__(self):
     self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
     self.model = GPT2LMHeadModel.from_pretrained('gpt2')
     self.device = torch.device(
         'cuda:0' if torch.cuda.is_available() else 'cpu')
Example #19
def filter_nan(df):
    sel = df["Summary"].notnull()
    summaries = df["Summary"][sel].values.tolist()
    reviews = df["Text"][sel].values.tolist()
    return reviews, summaries


if __name__ == "__main__":

    import pandas as pd
    from transformers import GPT2Tokenizer

    len_dict = {
        "review": AMAZON_REVIEW_LENGTH,
        "summary": AMAZON_SUMMARY_LENGTH
    }

    PRETRAINED_MODEL_NAME = "gpt2"
    REVIEW_PATH = "../../data/amazon_fine_food_review/Reviews.csv"

    df = pd.read_csv(REVIEW_PATH)
    tokenizer = GPT2Tokenizer.from_pretrained(PRETRAINED_MODEL_NAME,
                                              bos_token=BOS,
                                              eos_token=EOS,
                                              pad_token=PAD)

    # build dataset
    reviews, summaries = filter_nan(df)
    dataset = AmazonReviewV1(reviews, summaries, tokenizer, len_dict, BOS, EOS)
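
    # A possible next step (illustrative sketch): wrap the dataset in a
    # DataLoader. This assumes AmazonReviewV1 behaves like an ordinary
    # torch.utils.data.Dataset; variable-length token sequences may also need
    # a custom collate_fn.
    from torch.utils.data import DataLoader
    loader = DataLoader(dataset, batch_size=8, shuffle=True)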
Example #20
def main():
    #print("MODEL NAME, BATCH SIZE, AVG LATENCY (ms), AVG MEM USAGE (MiB)")
    # parse command-line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str)
    parser.add_argument('--num_inference', type=int)
    parser.add_argument('--batch_size', type=int)
    parser.add_argument('--gpu', action="store_true", default=False)
    args = parser.parse_args()
    model_name = args.model_name
    num_inference = args.num_inference
    batch_size = args.batch_size
    use_gpu = args.gpu and torch.cuda.is_available()
    # stores latency / memory usage values
    l_inference_latency = list()
    l_memory_capacity = list()
    # call corresponding DNN model...
    # TODO: ADD OTHER MODELS - RESNET50, ...
    # TODO: FIX NLP MODELS' SEQUENCE LENGTH
    if (model_name == "resnet18"):
        with torch.no_grad():
            model = models.resnet18(True, True)
            if use_gpu:
                model = model.cuda()
            # inference
            for i in range(num_inference):
                # input
                inputs = torch.zeros(batch_size, 3, 224, 224)
                if use_gpu:
                    inputs = inputs.to('cuda')
                start_time = time.time()
                _ = model(inputs)
                if use_gpu:
                    # wait for queued GPU work to finish before stopping the timer
                    torch.cuda.synchronize()
                end_time = time.time()
                l_inference_latency.append(end_time - start_time)
                l_memory_capacity.append(
                    torch.cuda.memory_allocated() if use_gpu else 0)
            str_avg_inf_time = sec_to_ms(
                average_90_percent(l_inference_latency))
            str_avg_mem_usage = bytes_to_mib(
                average_90_percent(l_memory_capacity))
            print(",".join([
                "RESNET18",
                str(batch_size), str_avg_inf_time, str_avg_mem_usage
            ]))

    elif (model_name == "wide_resnet101_2"):
        with torch.no_grad():
            model = models.wide_resnet101_2(True, True)
            if use_gpu:
                model = model.cuda()
            # inference
            for i in range(num_inference):
                # input
                inputs = torch.zeros(batch_size, 3, 224, 224)
                if use_gpu:
                    inputs = inputs.to('cuda')
                start_time = time.time()
                _ = model(inputs)
                if use_gpu:
                    # wait for queued GPU work to finish before stopping the timer
                    torch.cuda.synchronize()
                end_time = time.time()
                l_inference_latency.append(end_time - start_time)
                l_memory_capacity.append(
                    torch.cuda.memory_allocated() if use_gpu else 0)
            str_avg_inf_time = sec_to_ms(
                average_90_percent(l_inference_latency))
            str_avg_mem_usage = bytes_to_mib(
                average_90_percent(l_memory_capacity))
            print(",".join([
                "WIDE-RESNET101-2",
                str(batch_size), str_avg_inf_time, str_avg_mem_usage
            ]))

    elif (model_name == "mobilenet"):
        with torch.no_grad():
            model = models.mobilenet_v2(True, True)
            if use_gpu:
                model = model.cuda()
            # inference
            for i in range(num_inference):
                inputs = torch.zeros(batch_size, 3, 224, 224)
                if use_gpu:
                    inputs = inputs.to('cuda')
                start_time = time.time()
                _ = model(inputs)
                if use_gpu:
                    # wait for queued GPU work to finish before stopping the timer
                    torch.cuda.synchronize()
                end_time = time.time()
                l_inference_latency.append(end_time - start_time)
                l_memory_capacity.append(
                    torch.cuda.memory_allocated() if use_gpu else 0)
            str_avg_inf_time = sec_to_ms(
                average_90_percent(l_inference_latency))
            str_avg_mem_usage = bytes_to_mib(
                average_90_percent(l_memory_capacity))
            print(",".join([
                "MOBILENET_V2",
                str(batch_size), str_avg_inf_time, str_avg_mem_usage
            ]))

    elif (model_name == "bert"):
        with torch.no_grad():
            tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
            model = AutoModel.from_pretrained("bert-base-uncased")
            if use_gpu:
                model = model.cuda()
            # inference
            for i in range(num_inference):
                # BERT maximum sequence length 512
                sample_text = "BERT" * int(512 / 4)
                texts = [sample_text] * batch_size
                inputs = tokenizer(texts, return_tensors="pt")
                if use_gpu:
                    inputs = inputs.to('cuda')
                start_time = time.time()
                _ = model(**inputs)
                if use_gpu:
                    # wait for queued GPU work to finish before stopping the timer
                    torch.cuda.synchronize()
                end_time = time.time()
                l_inference_latency.append(end_time - start_time)
                l_memory_capacity.append(
                    torch.cuda.memory_allocated() if use_gpu else 0)
            str_avg_inf_time = sec_to_ms(
                average_90_percent(l_inference_latency))
            str_avg_mem_usage = bytes_to_mib(
                average_90_percent(l_memory_capacity))
            print(",".join([
                "BERT-BASE-UNCASED",
                str(batch_size), str_avg_inf_time, str_avg_mem_usage
            ]))

    elif (model_name == "gpt2"):
        with torch.no_grad():
            tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
            model = GPT2Model.from_pretrained("gpt2")
            if use_gpu:
                model = model.cuda()
            # inference
            for i in range(num_inference):
                # GPT2 maximum sequence length 1024
                sample_text = "GPT2" * int(1024 / 4)
                texts = [sample_text] * batch_size
                inputs = tokenizer(texts, return_tensors="pt")
                if use_gpu:
                    inputs = inputs.to('cuda')
                start_time = time.time()
                _ = model(**inputs)
                if use_gpu:
                    # wait for queued GPU work to finish before stopping the timer
                    torch.cuda.synchronize()
                end_time = time.time()
                l_inference_latency.append(end_time - start_time)
                l_memory_capacity.append(
                    torch.cuda.memory_allocated() if use_gpu else 0)
            str_avg_inf_time = sec_to_ms(
                average_90_percent(l_inference_latency))
            str_avg_mem_usage = bytes_to_mib(
                average_90_percent(l_memory_capacity))
            print(",".join(
                ["GPT2",
                 str(batch_size), str_avg_inf_time, str_avg_mem_usage]))

    elif (model_name == "dlrm"):
        print("Unimplemented model: DLRM")
        # TODO: MAKE IT WORK... PLEASE
        '''
        with torch.no_grad():
            model = DLRM_Net()
            if use_gpu:
                model = model.cuda()
            # inference
            for i in range(num_inference):
                inputs = ????
                if use_gpu:
                    inputs = inputs.to('cuda')
                start_time = time.time()
                _ = model(**inputs)
                torch.cuda.synchronize()
                end_time = time.time()
                l_inference_latency.append(end_time - start_time)
                l_memory_capacity.append(torch.cuda.memory_allocated())
            str_avg_inf_time = sec_to_ms(average_90_percent(l_inference_latency))
            str_avg_mem_usage = bytes_to_mib(average_90_percent(l_memory_capacity))
            print(",".join(["GPT2", str(batch_size), str_avg_inf_time, str_avg_mem_usage]))
        '''
    else:
        print("Unidentified model name: {}".format(model_name))
        return
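
# The benchmark above relies on three helpers that are not shown in this
# snippet (sec_to_ms, bytes_to_mib, average_90_percent). The definitions below
# are a plausible sketch, not the original implementations; in particular, how
# average_90_percent trims the measurements is an assumption.
def sec_to_ms(seconds):
    # seconds -> milliseconds, formatted for the CSV-style output
    return "{:.3f}".format(seconds * 1000.0)


def bytes_to_mib(num_bytes):
    # bytes -> MiB, formatted for the CSV-style output
    return "{:.1f}".format(num_bytes / (1024.0 * 1024.0))


def average_90_percent(values):
    # Assumption: drop the largest 10% of samples (e.g. warm-up outliers)
    # and average the remaining 90%.
    trimmed = sorted(values)[:max(1, int(len(values) * 0.9))]
    return sum(trimmed) / len(trimmed)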
Example #21
 def tokenizer(self):
     return GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
Example #22
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
)

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model1 = GPT2LMHeadModel.from_pretrained("gpt2")


def gen_text(min_length=20,
             max_length=40,
             temperature=1.0,
             sentence_prefix=''):
    min_length = int(min_length)
    max_length = int(max_length)
    temperature = float(temperature)

    input_ids = tokenizer.encode(
        sentence_prefix,
        add_special_tokens=False,
        return_tensors="pt",
        add_space_before_punct_symbol=True,
    )

    output_ids = model1.generate(
        input_ids=input_ids,
        temperature=temperature,
        do_sample=True,
        min_length=min_length,
        max_length=max_length,  # desired output sentence length
        pad_token_id=model1.config.eos_token_id,
    )

    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
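
# A short usage example of gen_text; the prompt and sampling values are
# arbitrary illustrations, not defaults from the original snippet.
print(gen_text(min_length=20, max_length=40, temperature=0.9,
               sentence_prefix="The weather today"))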
Example #23
import numpy as np
import pandas as pd
import timeit
import torch

from torch.utils.data import TensorDataset
import transformers
from transformers import GPT2Tokenizer
import json
import argparse
import nltk

from helperGPT2 import execute_tokenization
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
assert (transformers.__version__ == '2.6.0')
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')

special_tokens = {
    'bos_token': '<|startoftext|>',
    'eos_token': '<|endoftext|>',
    'pad_token': '<pad>',
    'additional_special_tokens': ['<|keyword|>', '<|summarize|>']
}
tokenizer.add_special_tokens(special_tokens)
assert (len(tokenizer) == 50261)
"""
=================================================
END OF IMPORT AND INITIALIZATION
START OF THE HELPER FUNCTION SECTION
=================================================
"""
        self.tokenizer = tokenizer
        self.tokenizer.max_len = 1500
        # tokenizer weird behavior
        self.turn_ending = tokenizer.cls_token_id  # [628, 198] = tokenizer.encode("\n\n\n")
    def __len__(self):
        return len(self.data)    
    def __getitem__(self, index):
        dial_tokens = tokenizer.encode(self.data[index][0]) + [self.turn_ending]
        cls_token_location = dial_tokens.index(self.tokenizer.cls_token_id)
        dial_act = self.data[index][1]
        return dial_tokens, cls_token_location, dial_act        
    def collate(self, unpacked_data):
        return unpacked_data

tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
tokenizer.add_special_tokens({'cls_token': '[CLS]'})


class GPT2DoubleHeadsModel_modified(GPT2DoubleHeadsModel):
    def __init__(self, config):
        super().__init__(config)
        # config.num_labels = 1
        config.num_labels = le.classes_.shape[0]
        self.transformer = GPT2Model(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.multiple_choice_head = SequenceSummary(config)
        self.init_weights()

config = GPT2Config.from_pretrained('gpt2-medium')
    return args
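
# batchify is used below but not shown in this snippet. The definition here is a
# plausible sketch (assumed behaviour: split the prompts into chunks of size bsz
# and keep the original indices so outputs can be re-ordered afterwards), not
# necessarily the original implementation.
def batchify(items, bsz):
    batches, batch_indices = [], []
    for start in range(0, len(items), bsz):
        chunk = items[start:start + bsz]
        batches.append(chunk)
        batch_indices.append(list(range(start, start + len(chunk))))
    return batches, batch_indices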


if __name__ == "__main__":
    # Load training parameters
    args = parse_args()

    # Claim the GPU (research cluster-specific issue)
    try:
        torch.ones(1).to(args.device)
    except RuntimeError as err:
        logging.error(err)
        sys.exit(1)

    # Load pre-trained OpenAI GPT-2 model
    tokenizer = GPT2Tokenizer.from_pretrained(args.model_name_or_path)
    model = GPT2LMHeadModel.from_pretrained(args.model_name_or_path)
    model.to(args.device)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"  # Hack to be able to batch generate

    # Load and batch the prompts for quicker generation
    logging.info(f"Loading prompts from {args.prompt_path}")
    prompt_lst = []
    with open(args.prompt_path, 'r') as f:
        prompt_lst = [f"{line.strip()} [RESPONSE]" for line in f.readlines()]
    logging.info("total number of sentences = {}".format(len(prompt_lst)))
    prompt_lst_batch, prompt_lst_batch_idx = batchify(prompt_lst, args.bsz)
    logging.info("total batch size = {}".format(len(prompt_lst_batch)))

    # Start the output file
Example #26
    def __init__(self,
                 model='bert',
                 model_size='base',
                 cased=True,
                 fine_tune=False,
                 use_proj=False,
                 proj_dim=256):
        super(Encoder, self).__init__()
        assert (model in MODEL_LIST)

        self.base_name = model
        self.model = None
        self.tokenizer = None
        self.num_layers = None
        self.hidden_size = None

        # First initialize the model and tokenizer
        model_name = ''
        # Do we want the tokenizer to lower case or not
        do_lower_case = not cased
        # Model is one of the BERT variants
        if 'bert' in model:
            assert (model_size in BERT_MODEL_SIZES)
            model_name = model + "-" + model_size
            if model == 'bert' and not cased:
                # Only original BERT supports uncased models
                model_name += '-uncased'
            elif model == 'roberta':
                # RoBERTa model types have no casing suffix in HuggingFace map
                # So we don't modify the model name
                pass
            else:
                model_name += '-cased'

            if model == 'bert':
                self.model = BertModel.from_pretrained(
                    model_name, output_hidden_states=True)
                self.tokenizer = BertTokenizer.from_pretrained(
                    model_name, do_lower_case=do_lower_case)
            elif model == 'roberta':
                self.model = RobertaModel.from_pretrained(
                    model_name, output_hidden_states=True)
                self.tokenizer = RobertaTokenizer.from_pretrained(
                    model_name, do_lower_case=do_lower_case)
            elif model == 'spanbert':
                # Model is loaded in a different way
                # Earlier "pytorch_transformers" required a .tar.gz URL/file.
                # Updated library "transformers" requires pytorch_model.bin and config.json
                # separately. That's why we have to keep the SpanBERT codebase around and initialize
                # the model using that codebase (based on pytorch_pretrained_bert).
                # NOTE: By default transformer models are initialized to eval() mode!
                # Not using the eval() mode will result in randomness.
                self.model = SpanbertModel.from_pretrained(model_name).eval()
                # SpanBERT uses the same tokenizer as BERT (that's why the slicing in model name).
                # We use the tokenizer from "transformers" since it provides an almost unified API.
                self.tokenizer = BertTokenizer.from_pretrained(
                    model_name[4:], do_lower_case=do_lower_case)

            self.num_layers = self.model.config.num_hidden_layers
            self.hidden_size = self.model.config.hidden_size

        elif model == "xlnet":
            model_name = model + "-" + model_size + "-cased"
            self.model = XLNetModel.from_pretrained(model_name,
                                                    output_hidden_states=True)
            self.tokenizer = XLNetTokenizer.from_pretrained(
                model_name, do_lower_case=do_lower_case)
            self.num_layers = self.model.config.num_hidden_layers
            self.hidden_size = self.model.config.hidden_size
        elif model == 'gpt2':
            assert (model_size in GPT2_MODEL_SIZES)
            model_name = model
            if model_size != "small":
                model_name += "-" + model_size

            self.model = GPT2Model.from_pretrained(model_name,
                                                   output_hidden_states=True)
            # Set the EOS token to be the PAD token since no explicit pad token
            # in GPT2 implementation.
            self.tokenizer = GPT2Tokenizer.from_pretrained(
                model_name,
                do_lower_case=do_lower_case,
                pad_token="<|endoftext|>")

            self.num_layers = self.model.config.n_layer
            self.hidden_size = self.model.config.n_embd

        # Set the model name
        self.model_name = model_name

        # Set shift size due to introduction of special tokens
        if self.base_name == 'xlnet':
            self.start_shift = 0
            self.end_shift = 2
        else:
            self.start_shift = (1 if self.tokenizer._cls_token else 0)
            self.end_shift = (1 if self.tokenizer._sep_token else 0)

        # Set requires_grad to False if not fine tuning
        if not fine_tune:
            for param in self.model.parameters():
                param.requires_grad = False

        if use_proj:
            # Apply a projection layer to output of pretrained models
            self.proj = nn.Linear(self.hidden_size, proj_dim)
            # Update the hidden size
            self.hidden_size = proj_dim
        else:
            self.proj = None
        # Set parameters required on top of pre-trained models
        self.weighing_params = nn.Parameter(torch.ones(self.num_layers))

        # Attention-based Span representation parameters - MIGHT NOT BE USED
        self.attention_params = nn.Linear(self.hidden_size, 1)
        nn.init.constant_(self.attention_params.weight, 0)
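
# Construction sketch for the Encoder above (illustrative only; assumes the
# module-level constants such as MODEL_LIST and BERT_MODEL_SIZES referenced in
# __init__ are defined elsewhere in the original file):
#
#     encoder = Encoder(model='roberta', model_size='base', cased=True,
#                       fine_tune=False, use_proj=True, proj_dim=256)
#     print(encoder.model_name, encoder.hidden_size)   # 'roberta-base', 256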
Example #27
epsilon = 1e-8

# Set the seed value all over the place to make this reproducible.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

SAVE_PATH = "/mnt/nfs/work1/llcao/zonghaiyao/LM/"

# I'm not really doing anything with the config here
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)

# Load the GPT tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained(
    'gpt2', pad_token='<|endoftext|>')  #gpt2-medium

# instantiate the model
model = rerankGPT2LMHeadModel_stage1_all_tokens_no_stage2.from_pretrained(
    "gpt2",
    config=configuration,
    MAX_LEN=MAX_LEN,
    CAN_NUM=CAN_NUM,
    num_of_rerank=num_of_rerank)

# this step is necessary because I've added some tokens (bos_token, etc) to the embeddings
# otherwise the tokenizer and model tensors won't match up
model.resize_token_embeddings(len(tokenizer))

if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
Example #28
    def test_batch_generation_2heads(self):
        model = GPT2DoubleHeadsModel.from_pretrained("gpt2")
        model.to(torch_device)
        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

        tokenizer.padding_side = "left"

        # This tokenizer has no pad token, so we have to set it in some way
        # Define PAD Token = EOS Token = 50256
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = model.config.eos_token_id

        # use different length sentences to test batching
        sentences = [
            "Hello, my dog is a little",
            "Today, I",
        ]

        inputs = tokenizer(sentences, return_tensors="pt", padding=True)
        input_ids = inputs["input_ids"].to(torch_device)
        token_type_ids = torch.cat(
            [
                input_ids.new_full(
                    (input_ids.shape[0], input_ids.shape[1] - 1), 0),
                input_ids.new_full((input_ids.shape[0], 1), 500),
            ],
            dim=-1,
        )

        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=inputs["attention_mask"].to(torch_device),
        )

        outputs_tt = model.generate(
            input_ids=input_ids,
            attention_mask=inputs["attention_mask"].to(torch_device),
            token_type_ids=token_type_ids,
        )

        inputs_non_padded = tokenizer(
            sentences[0], return_tensors="pt").input_ids.to(torch_device)
        output_non_padded = model.generate(input_ids=inputs_non_padded)

        num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][
            -1].long().sum().cpu().item()
        inputs_padded = tokenizer(
            sentences[1], return_tensors="pt").input_ids.to(torch_device)
        output_padded = model.generate(input_ids=inputs_padded,
                                       max_length=model.config.max_length -
                                       num_paddings)

        batch_out_sentence = tokenizer.batch_decode(outputs,
                                                    skip_special_tokens=True)
        batch_out_sentence_tt = tokenizer.batch_decode(
            outputs_tt, skip_special_tokens=True)
        non_padded_sentence = tokenizer.decode(output_non_padded[0],
                                               skip_special_tokens=True)
        padded_sentence = tokenizer.decode(output_padded[0],
                                           skip_special_tokens=True)

        expected_output_sentence = [
            "Hello, my dog is a little bit of a mess. I'm not sure if he's going",
            "Today, I'm going to be doing a lot of research on this. I",
        ]
        self.assertListEqual(expected_output_sentence, batch_out_sentence)
        self.assertTrue(
            batch_out_sentence_tt !=
            batch_out_sentence)  # token_type_ids should change output
        self.assertListEqual(expected_output_sentence,
                             [non_padded_sentence, padded_sentence])
Example #29
def run_pplm_example(pretrained_model="gpt2-medium",
                     cond_text="",
                     uncond=False,
                     num_samples=1,
                     bag_of_words=None,
                     discrim=None,
                     discrim_weights=None,
                     discrim_meta=None,
                     class_label=-1,
                     length=100,
                     stepsize=0.02,
                     temperature=1.0,
                     top_k=10,
                     sample=True,
                     num_iterations=3,
                     grad_length=10000,
                     horizon_length=1,
                     window_length=0,
                     decay=False,
                     gamma=1.5,
                     gm_scale=0.9,
                     kl_scale=0.01,
                     seed=0,
                     no_cuda=False,
                     colorama=False,
                     verbosity='regular'):
    # set Random seed
    torch.manual_seed(seed)
    np.random.seed(seed)

    # set verbosity
    verbosity_level = VERBOSITY_LEVELS.get(verbosity.lower(), REGULAR)

    # set the device
    device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu"

    if discrim == 'generic':
        set_generic_model_params(discrim_weights, discrim_meta)

    if discrim is not None:
        discriminator_pretrained_model = DISCRIMINATOR_MODELS_PARAMS[discrim][
            "pretrained_model"]
        if pretrained_model != discriminator_pretrained_model:
            pretrained_model = discriminator_pretrained_model
            if verbosity_level >= REGULAR:
                print("discrim = {}, pretrained_model set "
                      "to discriminator's = {}".format(discrim,
                                                       pretrained_model))

    # load pretrained model
    model = GPT2LMHeadModel.from_pretrained(pretrained_model,
                                            output_hidden_states=True)
    model.to(device)
    model.eval()

    # load tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)

    # Freeze GPT-2 weights
    for param in model.parameters():
        param.requires_grad = False

    # figure out conditioning text
    if uncond:
        tokenized_cond_text = tokenizer.encode([tokenizer.bos_token],
                                               add_special_tokens=False)
    else:
        raw_text = cond_text
        while not raw_text:
            print("Did you forget to add `--cond_text`? ")
            raw_text = input("Model prompt >>> ")
        tokenized_cond_text = tokenizer.encode(tokenizer.bos_token + raw_text,
                                               add_special_tokens=False)

    print("= Prefix of sentence =")
    print(tokenizer.decode(tokenized_cond_text))
    print()

    # generate unperturbed and perturbed texts

    # full_text_generation returns:
    # unpert_gen_tok_text, pert_gen_tok_texts, discrim_losses, losses_in_time
    unpert_gen_tok_text, pert_gen_tok_texts, _, _ = full_text_generation(
        model=model,
        tokenizer=tokenizer,
        context=tokenized_cond_text,
        device=device,
        num_samples=num_samples,
        bag_of_words=bag_of_words,
        discrim=discrim,
        class_label=class_label,
        length=length,
        stepsize=stepsize,
        temperature=temperature,
        top_k=top_k,
        sample=sample,
        num_iterations=num_iterations,
        grad_length=grad_length,
        horizon_length=horizon_length,
        window_length=window_length,
        decay=decay,
        gamma=gamma,
        gm_scale=gm_scale,
        kl_scale=kl_scale,
        verbosity_level=verbosity_level)

    # untokenize unperturbed text
    unpert_gen_text = tokenizer.decode(unpert_gen_tok_text.tolist()[0])

    if verbosity_level >= REGULAR:
        print("=" * 80)
    print("= Unperturbed generated text =")
    print(unpert_gen_text)
    print()

    generated_texts = []

    bow_word_ids = set()
    if bag_of_words and colorama:
        bow_indices = get_bag_of_words_indices(bag_of_words.split(";"),
                                               tokenizer)
        for single_bow_list in bow_indices:
            # keep only BoW entries that are encoded as a single token
            filtered = list(filter(lambda x: len(x) <= 1, single_bow_list))
            # w[0] is safe because the filter above ensures w has exactly one item
            bow_word_ids.update(w[0] for w in filtered)

    # iterate through the perturbed texts
    for i, pert_gen_tok_text in enumerate(pert_gen_tok_texts):
        try:
            # untokenize unperturbed text
            if colorama:
                import colorama

                pert_gen_text = ''
                for word_id in pert_gen_tok_text.tolist()[0]:
                    if word_id in bow_word_ids:
                        pert_gen_text += '{}{}{}'.format(
                            colorama.Fore.RED, tokenizer.decode([word_id]),
                            colorama.Style.RESET_ALL)
                    else:
                        pert_gen_text += tokenizer.decode([word_id])
            else:
                pert_gen_text = tokenizer.decode(pert_gen_tok_text.tolist()[0])

            print("= Perturbed generated text {} =".format(i + 1))
            print(pert_gen_text)
            print()
        except Exception:
            pass

        # keep the prefix, perturbed seq, original seq for each index
        generated_texts.append(
            (tokenized_cond_text, pert_gen_tok_text, unpert_gen_tok_text))

    return
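
# A minimal invocation sketch for run_pplm_example. The argument values are
# illustrative; "military" is one of the bag-of-words topics shipped with the
# original PPLM release and is assumed to be resolvable by
# get_bag_of_words_indices in this module.
if __name__ == "__main__":
    run_pplm_example(cond_text="The potato",
                     bag_of_words="military",
                     length=50,
                     stepsize=0.03,
                     sample=True,
                     num_iterations=3,
                     verbosity='regular')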
Example #30
def compute_gpt_embeddings(annotation_data):
    pretrained_weights = 'gpt2'
    tokenizer = GPT2Tokenizer.from_pretrained(pretrained_weights)
    model = GPT2Model.from_pretrained(pretrained_weights)
    return compute_transformer_embeddings(model, tokenizer, annotation_data)