def main_metrics(args):
    """Compute diversity/quality metrics on a reference data split.

    Tokenizes the reference split, computes distinct-n fractions,
    perplexity under the loaded model, Zipf coefficient, repetition
    fraction and non-termination ratio, then pickles the resulting dict
    to ``{save_directory}/metrics/ref/all_{filename}.p``.
    """
    device = utils.get_device_from_arg(args.device)
    print(f'Using device: {device}')

    save_directory = f'./outputs/{utils.get_dataset_name_from_datapath(args.data_dir)}_{utils.get_model_basename(args.model_name)}'
    folder = 'ref'
    if args.ds_name is None:
        filename = args.datasplit
    else:
        filename = f'{args.ds_name}_{args.datasplit}'

    # Fix: the path previously contained a dangling "(unknown)" placeholder
    # and `filename` was computed but never used.
    savefilename = f'{save_directory}/metrics/{folder}/all_{filename}.p'
    # Bail out before the expensive model load / tokenization.
    if os.path.isfile(savefilename) and not args.force:
        print('All metrics already computed. Exiting')
        return

    model, tokenizer = utils.get_model_and_tokenizer(
        model_name=args.model_name, device=device)
    ds_tokens = utils.load_and_tokenize_data(tokenizer,
                                             args.data_dir,
                                             args.max_len,
                                             args.max_num_data,
                                             ds_name=args.ds_name,
                                             split=args.datasplit)

    all_sentences = [x[0].numpy().tolist() for x in ds_tokens]
    # Reference text is never a truncated generation, so mark all complete.
    is_completed = [True for _ in all_sentences]

    metrics_all = {}

    # Distinct-n: fraction of unique n-grams for n = 1..6
    n_lst = [1, 2, 3, 4, 5, 6]
    metrics_all['distinct-n'] = src.metrics.get_unique_ngram_fraction(
        all_sentences, n_lst)

    # PPL: perplexity of each sequence under the loaded model
    samples_2 = [
        torch.LongTensor(x).view(1, -1).to(device) for x in all_sentences
    ]
    metrics_all['perplexity'] = src.metrics.get_perplexity_from_samples(
        model, samples_2)

    # Zipf coefficient of the token frequency distribution
    metrics_all['zipf'] = src.metrics.zipf_coeff(all_sentences)

    # Repetition fraction
    metrics_all['repetition'] = src.metrics.get_repetition_fraction(
        all_sentences)

    # Non-termination ratio
    metrics_all[
        'non-termination-ratio'] = src.metrics.get_nontermination_ratio(
            all_sentences, is_completed)

    # save
    with open(savefilename, 'wb') as f:
        pkl.dump(metrics_all, f)
    print(f'Done. Saved "{savefilename}". Bye!')
Example #2
0
def main_metrics(args):
    """Compute diversity/quality metrics for previously generated samples.

    Loads generations (token lists plus completion flags) from
    ``{save_directory}/generations/basic/sample_{filename}.p``, computes
    distinct-n, perplexity under gpt2-large, Zipf coefficient, repetition
    fraction and non-termination ratio, and pickles the dict to
    ``{save_directory}/metrics/basic/all_L_{filename}.p``.
    """
    print(f'device: {args.device}')
    device = utils.get_device_from_arg(args.device)
    print(f'Using device: {device}')

    save_directory = f'./outputs/{utils.get_dataset_name_from_datapath(args.data_dir)}_{utils.get_model_basename(args.model_name)}'
    filename = f'{args.datasplit}_p{args.top_p}_k{args.top_k}_t{args.temp}_seed{args.generate_seed}'
    folder_name = f'{save_directory}/generations/basic'

    # Fix: both paths previously contained a dangling "(unknown)" placeholder
    # and `filename` was computed but never used.
    input_file_name = f'{folder_name}/sample_{filename}.p'
    if not os.path.isfile(input_file_name):
        print(f'File {input_file_name} does not exist. Quitting!')
        return
    with open(input_file_name, 'rb') as f:
        all_sentences, is_completed = pkl.load(f)[:2]

    savefilename = f'{save_directory}/metrics/basic/all_L_{filename}.p'
    if os.path.isfile(savefilename) and not args.force:
        print('All metrics already computed. Exiting')
        return

    # Perplexity is always measured under gpt2-large, independent of the
    # model that produced the generations.
    model, tokenizer = utils.get_model_and_tokenizer(model_name='gpt2-large',
                                                     device=device)

    metrics_all = {}
    # Distinct-n: fraction of unique n-grams for n = 1..6
    n_lst = [1, 2, 3, 4, 5, 6]
    metrics_all['distinct-n'] = src.metrics.get_unique_ngram_fraction(
        all_sentences, n_lst)

    # PPL under gpt2-large
    samples_2 = [
        torch.LongTensor(x).view(1, -1).to(device) for x in all_sentences
    ]
    metrics_all['perplexity'] = src.metrics.get_perplexity_from_samples(
        model, samples_2)

    # Zipf coefficient of the token frequency distribution
    metrics_all['zipf'] = src.metrics.zipf_coeff(all_sentences)

    # Repetition fraction
    metrics_all['repetition'] = src.metrics.get_repetition_fraction(
        all_sentences)

    # Non-termination ratio
    metrics_all[
        'non-termination-ratio'] = src.metrics.get_nontermination_ratio(
            all_sentences, is_completed)

    # save
    with open(savefilename, 'wb') as f:
        pkl.dump(metrics_all, f)
    print(f'Done. Saved "{savefilename}". Bye!')
Example #3
0
    def __init__(self, config):
        """Build the classifier module from a config dict.

        Reads ``config["arch"]["args"]`` for the model/tokenizer arguments
        (which include ``num_classes``); optional top-level keys
        ``loss_weight`` and ``num_main_classes`` configure the bias-aware
        loss.
        """
        super().__init__()
        self.save_hyperparameters()
        arch_args = config["arch"]["args"]
        self.num_classes = arch_args["num_classes"]
        self.model_args = arch_args
        self.model, self.tokenizer = get_model_and_tokenizer(**arch_args)

        # Presence of "num_main_classes" is what switches on the bias loss.
        self.bias_loss = "num_main_classes" in config
        if self.bias_loss:
            self.num_main_classes = config["num_main_classes"]
        else:
            self.num_main_classes = self.num_classes
        if "loss_weight" in config:
            self.loss_weight = config["loss_weight"]

        self.config = config
def main():
    """Sweep (top-p, top-k, temperature) settings and compute metrics.

    Parses CLI args, loads the model and reference tokens, then calls
    ``get_metrics`` once per sampling configuration: 5 top-p values,
    10 top-k values, 5 temperatures, and a 2x2 (t, k) grid.
    """
    parser = make_parser()
    args = parser.parse_args()
    print(args)

    device = utils.get_device_from_arg(args.device)
    print(f'Using device: {device}')

    model, tokenizer = utils.get_model_and_tokenizer(
        model_name=args.model_name, device=device)
    save_directory = f'./outputs/{utils.get_dataset_name_from_datapath(args.data_dir)}_{utils.get_model_basename(args.model_name)}'

    ds_tokens = utils.load_and_tokenize_data(tokenizer,
                                             args.data_dir,
                                             args.max_len,
                                             args.max_num_data,
                                             split=args.datasplit)

    metric_fn_lst = src.metrics.get_probs_metric_fn_lst()
    metric_fn_names = src.metrics.get_metric_names()
    print(metric_fn_names)

    # param triples are (top_p, top_k, temperature)
    for p in [0.8, 0.9, 0.92, 0.95, 0.99]:  # 5
        param = (p, 0, 1.0)
        get_metrics(param, metric_fn_lst, model, ds_tokens, args.datasplit,
                    metric_fn_names, save_directory)

    for k in [1, 5, 10, 50, 100, 500, 1000, 2000, 5000, 10000]:  # 10
        param = (1.0, k, 1.0)
        get_metrics(param, metric_fn_lst, model, ds_tokens, args.datasplit,
                    metric_fn_names, save_directory)

    for t in [0.7, 0.8, 0.9, 0.95, 1.0]:  # 5
        param = (1.0, 0, t)
        get_metrics(param, metric_fn_lst, model, ds_tokens, args.datasplit,
                    metric_fn_names, save_directory)

    # Fix: get_metrics was indented outside the inner loop, so only the
    # last k (100) ran for each t — 2 calls instead of the intended 4.
    for t in [0.75, 0.9]:  # 4
        for k in [10, 100]:
            param = (1.0, k, t)
            get_metrics(param, metric_fn_lst, model, ds_tokens,
                        args.datasplit, metric_fn_names, save_directory)
def main_bleu(args):
    """Compute self-BLEU scores on a reference data split and pickle them.

    Tokenizes the reference split, computes self-BLEU (sequentially or in
    parallel depending on ``args.parallel_bleu``) with method-1 smoothing,
    and saves the scores to
    ``{save_directory}/metrics/ref/bleu_{filename}.p``.
    """
    rng = random.Random(args.seed)

    save_directory = f'./outputs/{utils.get_dataset_name_from_datapath(args.data_dir)}_{utils.get_model_basename(args.model_name)}'
    folder = 'ref'
    if args.ds_name is None:
        filename = args.datasplit
    else:
        filename = f'{args.ds_name}_{args.datasplit}'

    # Fix: the path previously contained a dangling "(unknown)" placeholder
    # and `filename` was computed but never used.
    savefilename = f'{save_directory}/metrics/{folder}/bleu_{filename}.p'
    # Bail out before the expensive tokenization step.
    if os.path.isfile(savefilename) and not args.force:
        print('Bleu metrics already computed. Exiting')
        return

    _, tokenizer = utils.get_model_and_tokenizer(model_name=args.model_name,
                                                 device=utils.CPU_DEVICE)
    ds_tokens = utils.load_and_tokenize_data(tokenizer,
                                             args.data_dir,
                                             args.max_len,
                                             args.max_num_data,
                                             ds_name=args.ds_name,
                                             split=args.datasplit)
    all_sentences = [x[0].numpy().tolist() for x in ds_tokens]

    smoothing_function = SmoothingFunction().method1

    start_time = time.time()
    if args.parallel_bleu:
        bleu_scores = compute_bleus_parallel(all_sentences, smoothing_function,
                                             rng, args)
    else:
        bleu_scores = compute_bleus_sequential(all_sentences,
                                               smoothing_function, rng, args)
    print('Total time for self bleu:', round(time.time() - start_time), 's')

    # save
    with open(savefilename, 'wb') as f:
        pkl.dump(bleu_scores, f)
    print(f'Done. Saved "{savefilename}". Bye!')
Example #6
0
    folder_name = f'{save_directory}/generations/ref'


    device = utils.get_device_from_arg(args.device)
    print(f'Using device: {device}')


    ###### OLD
    ## featurize samples
    # feats = src.model_utils.featurize_sequential(model, ds_tokens)
    # torch.save(feats, f'{folder_name}/feats_{name}.pt')


    feats_prefix = ''
    if args.use_large_feats:
        model, tokenizer = utils.get_model_and_tokenizer(model_name=args.featurize_model_name, device=device)
        ds_tokens = utils.load_and_tokenize_data(tokenizer, args.data_dir, args.max_len, args.max_num_generations,
                                                 ds_name=args.ds_name, split=args.datasplit)
        for l in {128, 256, 512, args.max_len}:
            feats_prefix = f'L{l}'
            feats_out_fn = f'{folder_name}/feats{feats_prefix}_{name}.pt'
            if os.path.isfile(feats_out_fn):
                print(f'Feats {feats_out_fn} exisits. Skipping')
                continue
            else:
                print(f'Featurizing l = {l}...')
                samples_3 = [x[:, :l] for x in ds_tokens]
                feats = src.model_utils.featurize_sequential(model, samples_3)
                torch.save(feats, feats_out_fn)
    else:  # use features from model
        model, tokenizer = utils.get_model_and_tokenizer(model_name=args.model_name, device=device)