def main_metrics(args): device = utils.get_device_from_arg(args.device) print(f'Using device: {device}') save_directory = f'./outputs/{utils.get_dataset_name_from_datapath(args.data_dir)}_{utils.get_model_basename(args.model_name)}' model, tokenizer = utils.get_model_and_tokenizer( model_name=args.model_name, device=device) folder = 'ref' if args.ds_name is None: filename = args.datasplit else: filename = f'{args.ds_name}_{args.datasplit}' ds_tokens = utils.load_and_tokenize_data(tokenizer, args.data_dir, args.max_len, args.max_num_data, ds_name=args.ds_name, split=args.datasplit) savefilename = f'{save_directory}/metrics/{folder}/all_{filename}.p' if os.path.isfile(savefilename) and not args.force: print('All metrics already computed. Exiting') return all_sentences = [x[0].numpy().tolist() for x in ds_tokens] is_completed = [True for _ in all_sentences] metrics_all = {} # Distinct-n n_lst = [1, 2, 3, 4, 5, 6] unique_ngram_frac = src.metrics.get_unique_ngram_fraction( all_sentences, n_lst) metrics_all['distinct-n'] = unique_ngram_frac # PPL samples_2 = [ torch.LongTensor(x).view(1, -1).to(device) for x in all_sentences ] ppl = src.metrics.get_perplexity_from_samples(model, samples_2) metrics_all['perplexity'] = ppl # Zipf metrics_all['zipf'] = src.metrics.zipf_coeff(all_sentences) # Repetition metrics_all['repetition'] = src.metrics.get_repetition_fraction( all_sentences) # Non-termination metrics_all[ 'non-termination-ratio'] = src.metrics.get_nontermination_ratio( all_sentences, is_completed) # save with open(savefilename, 'wb') as f: pkl.dump(metrics_all, f) print(f'Done. Saved "{savefilename}". Bye!')
def main_metrics(args): print(f'device: {args.device}') device = utils.get_device_from_arg(args.device) print(f'Using device: {device}') save_directory = f'./outputs/{utils.get_dataset_name_from_datapath(args.data_dir)}_{utils.get_model_basename(args.model_name)}' filename = f'{args.datasplit}_p{args.top_p}_k{args.top_k}_t{args.temp}_seed{args.generate_seed}' folder_name = f'{save_directory}/generations/basic' input_file_name = f'{folder_name}/sample_{filename}.p' if not os.path.isfile(input_file_name): print(f'File {input_file_name} does not exist. Quitting!') return with open(input_file_name, 'rb') as f: all_sentences, is_completed = pkl.load(f)[:2] savefilename = f'{save_directory}/metrics/basic/all_L_{filename}.p' if os.path.isfile(savefilename) and not args.force: print('All metrics already computed. Exiting') return model, tokenizer = utils.get_model_and_tokenizer(model_name='gpt2-large', device=device) metrics_all = {} # Distinct-n n_lst = [1, 2, 3, 4, 5, 6] unique_ngram_frac = src.metrics.get_unique_ngram_fraction( all_sentences, n_lst) metrics_all['distinct-n'] = unique_ngram_frac # PPL samples_2 = [ torch.LongTensor(x).view(1, -1).to(device) for x in all_sentences ] ppl = src.metrics.get_perplexity_from_samples(model, samples_2) metrics_all['perplexity'] = ppl # Zipf metrics_all['zipf'] = src.metrics.zipf_coeff(all_sentences) # Repetition metrics_all['repetition'] = src.metrics.get_repetition_fraction( all_sentences) # Non-termination metrics_all[ 'non-termination-ratio'] = src.metrics.get_nontermination_ratio( all_sentences, is_completed) # save with open(savefilename, 'wb') as f: pkl.dump(metrics_all, f) print(f'Done. Saved "{savefilename}". Bye!')
def __init__(self, config): super().__init__() self.save_hyperparameters() self.num_classes = config["arch"]["args"]["num_classes"] self.model_args = config["arch"]["args"] self.model, self.tokenizer = get_model_and_tokenizer(**self.model_args) self.bias_loss = False if "loss_weight" in config: self.loss_weight = config["loss_weight"] if "num_main_classes" in config: self.num_main_classes = config["num_main_classes"] self.bias_loss = True else: self.num_main_classes = self.num_classes self.config = config
def main(): parser = make_parser() args = parser.parse_args() print(args) device = utils.get_device_from_arg(args.device) print(f'Using device: {device}') model, tokenizer = utils.get_model_and_tokenizer( model_name=args.model_name, device=device) save_directory = f'./outputs/{utils.get_dataset_name_from_datapath(args.data_dir)}_{utils.get_model_basename(args.model_name)}' ds_tokens = utils.load_and_tokenize_data(tokenizer, args.data_dir, args.max_len, args.max_num_data, split=args.datasplit) metric_fn_lst = src.metrics.get_probs_metric_fn_lst() metric_fn_names = src.metrics.get_metric_names() print(metric_fn_names) for p in [0.8, 0.9, 0.92, 0.95, 0.99]: # 5 param = (p, 0, 1.0) get_metrics(param, metric_fn_lst, model, ds_tokens, args.datasplit, metric_fn_names, save_directory) for k in [1, 5, 10, 50, 100, 500, 1000, 2000, 5000, 10000]: # 10 param = (1.0, k, 1.0) get_metrics(param, metric_fn_lst, model, ds_tokens, args.datasplit, metric_fn_names, save_directory) for t in [0.7, 0.8, 0.9, 0.95, 1.0]: # 5 param = (1.0, 0, t) get_metrics(param, metric_fn_lst, model, ds_tokens, args.datasplit, metric_fn_names, save_directory) for t in [0.75, 0.9]: # 4 for k in [10, 100]: param = (1.0, k, t) get_metrics(param, metric_fn_lst, model, ds_tokens, args.datasplit, metric_fn_names, save_directory)
def main_bleu(args): rng = random.Random(args.seed) save_directory = f'./outputs/{utils.get_dataset_name_from_datapath(args.data_dir)}_{utils.get_model_basename(args.model_name)}' _, tokenizer = utils.get_model_and_tokenizer(model_name=args.model_name, device=utils.CPU_DEVICE) folder = 'ref' if args.ds_name is None: filename = args.datasplit else: filename = f'{args.ds_name}_{args.datasplit}' ds_tokens = utils.load_and_tokenize_data(tokenizer, args.data_dir, args.max_len, args.max_num_data, ds_name=args.ds_name, split=args.datasplit) all_sentences = [x[0].numpy().tolist() for x in ds_tokens] savefilename = f'{save_directory}/metrics/{folder}/bleu_{filename}.p' if os.path.isfile(savefilename) and not args.force: print('Bleu metrics already computed. Exiting') return smoothing_function = SmoothingFunction().method1 start_time = time.time() if args.parallel_bleu: bleu_scores = compute_bleus_parallel(all_sentences, smoothing_function, rng, args) else: bleu_scores = compute_bleus_sequential(all_sentences, smoothing_function, rng, args) print('Total time for self bleu:', round(time.time() - start_time), 's') # save with open(savefilename, 'wb') as f: pkl.dump(bleu_scores, f) print(f'Done. Saved "{savefilename}". Bye!')
folder_name = f'{save_directory}/generations/ref' device = utils.get_device_from_arg(args.device) print(f'Using device: {device}') ###### OLD ## featurize samples # feats = src.model_utils.featurize_sequential(model, ds_tokens) # torch.save(feats, f'{folder_name}/feats_{name}.pt') feats_prefix = '' if args.use_large_feats: model, tokenizer = utils.get_model_and_tokenizer(model_name=args.featurize_model_name, device=device) ds_tokens = utils.load_and_tokenize_data(tokenizer, args.data_dir, args.max_len, args.max_num_generations, ds_name=args.ds_name, split=args.datasplit) for l in {128, 256, 512, args.max_len}: feats_prefix = f'L{l}' feats_out_fn = f'{folder_name}/feats{feats_prefix}_{name}.pt' if os.path.isfile(feats_out_fn): print(f'Feats {feats_out_fn} exisits. Skipping') continue else: print(f'Featurizing l = {l}...') samples_3 = [x[:, :l] for x in ds_tokens] feats = src.model_utils.featurize_sequential(model, samples_3) torch.save(feats, feats_out_fn) else: # use features from model model, tokenizer = utils.get_model_and_tokenizer(model_name=args.model_name, device=device)