def __init__(self, language, model: str = "bleurt-base-128"):
    super().__init__(language)
    # HACK to silence tensorflow and errors related to tf.FLAGS
    from silence_tensorflow import silence_tensorflow
    silence_tensorflow()
    import tensorflow.compat.v1 as tf
    flags = tf.flags
    flags.DEFINE_string("source", "", help="Source segments", required=False)
    flags.DEFINE_string("s", "", help="Source segments", required=False)
    flags.DEFINE_string("hypothesis", "", help="MT segments", required=False)
    flags.DEFINE_string("h", "", help="MT segments", required=False)
    flags.DEFINE_string("reference", "", help="Reference segments", required=False)
    flags.DEFINE_string("r", "", help="Reference segments", required=False)
    flags.DEFINE_string("language", "", help="Language", required=False)
    flags.DEFINE_string("l", "", help="Language", required=False)
    flags.DEFINE_string("metric", "", help="Metric to run.", required=False)
    flags.DEFINE_string("m", "", help="Metric to run.", required=False)

    self.model = model
    if not os.path.isdir(telescope_cache_folder() + model):
        download_file_maybe_extract(
            url=f"https://storage.googleapis.com/bleurt-oss/{model}.zip",
            directory=telescope_cache_folder(),
        )
    self.scorer = score.BleurtScorer(telescope_cache_folder() + model)
    self.system_only = False
def bert_based(gts, res):
    refs, cands = [], []
    for refers in gts.values():
        sub_refs = []
        for ref in refers:
            sub_refs.append(ref + '.')
        refs.append(sub_refs)
    for cand in res.values():
        cands.append(cand[0] + '.')

    # BERTScore over all candidate/reference pairs
    scorer = BERTScorer(lang="en", rescale_with_baseline=True)
    P, R, F1 = scorer.score(cands, refs, verbose=True)
    out_file.write('BERTScore = %s' % F1.mean().item() + "\n")
    BERTScore = F1.mean().item()

    # BLEURT: score each candidate against all of its references and keep the max
    total_bleurt_score = []
    scorer = bleurt_sc.BleurtScorer(bleurt_checkpoint)
    for ref_caption, cand in zip(refs, cands):
        bleurt_score_per_img = []
        for ref in ref_caption:
            bleurt_score_per_img.append(
                scorer.score(references=[ref], candidates=[cand], batch_size=None)[0])
        total_bleurt_score.append(max(bleurt_score_per_img))
    out_file.write('BLEURT = %s' % statistics.mean(total_bleurt_score))
def bleurt_eval(candidates, references, verbose=False):
    checkpoint = "bleurt/bleurt-base-128"
    scorer = bleurt_score.BleurtScorer(checkpoint)
    scores = scorer.score(references=references, candidates=candidates)
    if verbose:
        print("BLEURT scores:", scores)
    return mean(scores)
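# A minimal usage sketch for bleurt_eval above. The sentences are made up,
# and it assumes the "bleurt/bleurt-base-128" checkpoint directory has
# already been downloaded and unzipped locally.
candidates = ["a bird chirps by the window"]
references = ["a bird sings by the window"]
avg_score = bleurt_eval(candidates, references, verbose=True)
print("average BLEURT:", avg_score)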
def _download_and_prepare(self, dl_manager):
    # check that config name specifies a valid BLEURT model
    if self.config_name == "default":
        logger.warning(
            "Using default BLEURT-Base checkpoint for sequence maximum length 128. "
            "You can use a bigger model for better results with e.g.: datasets.load_metric('bleurt', 'bleurt-large-512')."
        )
        self.config_name = "bleurt-base-128"
    if self.config_name.lower() in CHECKPOINT_URLS:
        checkpoint_name = self.config_name.lower()
    elif self.config_name.upper() in CHECKPOINT_URLS:
        checkpoint_name = self.config_name.upper()
    else:
        raise KeyError(
            f"{self.config_name} model not found. You should supply the name of a model checkpoint for bleurt in {CHECKPOINT_URLS.keys()}"
        )

    # download the model checkpoint specified by self.config_name and set up the scorer
    model_path = dl_manager.download_and_extract(CHECKPOINT_URLS[checkpoint_name])
    self.scorer = score.BleurtScorer(os.path.join(model_path, checkpoint_name))
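# How the checkpoint selection above is typically exercised from user code,
# via the (legacy) datasets metric API. A sketch; the config string must be
# a key of CHECKPOINT_URLS, e.g. "bleurt-base-128" or "bleurt-large-512".
from datasets import load_metric

bleurt_metric = load_metric("bleurt", "bleurt-base-128")
results = bleurt_metric.compute(
    predictions=["hello there", "general kenobi"],
    references=["hello there", "general kenobi"],
)
print(results["scores"])  # one BLEURT score per prediction/reference pair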
def test_bleurt_nulls(self):
    bleurt = score.BleurtScorer()
    test_references = []
    test_candidates = []
    scores = bleurt.score(references=test_references, candidates=test_candidates)
    self.assertLen(scores, 0)
def test_bleurt_empty(self):
    bleurt = score.BleurtScorer()
    test_references = [""]
    test_candidates = [""]
    scores = bleurt.score(references=test_references, candidates=test_candidates)
    self.assertLen(scores, 1)
def __init__(self, model_type, model_path):
    self.config_name = model_type
    if self.config_name not in CHECKPOINT_URLS.keys():
        if self.config_name == 'default':
            # Note: the fallback checkpoint here is BLEURT-Tiny, so the
            # warning names the checkpoint actually used.
            logger.warning("Using default BLEURT-Tiny checkpoint for sequence maximum length 128. "
                           "You can use a bigger model for better results with e.g.: nlp.load_metric('bleurt', 'bleurt-large-512').")
            self.config_name = "bleurt-tiny-128"
        else:
            raise KeyError(
                f"{self.config_name} model not found. You should supply the name of a model checkpoint for bleurt in {CHECKPOINT_URLS.keys()}")
    try:
        if model_path.endswith(self.config_name):
            self.scorer = score.BleurtScorer(model_path)
        else:
            self.scorer = score.BleurtScorer(os.path.join(model_path, self.config_name))
    except Exception as e:
        raise Exception(str(e) + f". You can download the checkpoint for {self.config_name} model from {CHECKPOINT_URLS[self.config_name]}")
def score_files(generator, bleurt_checkpoint):
    """Computes BLEURT scores from a sentence pairs generator.

    Requires that a JSONL file containing both candidate and reference
    sentences or two individual candidate and reference text files be
    specified, with the former overriding the latter if both flags are
    specified.

    Args:
      generator: A generator yielding reference and candidate sentences.
      bleurt_checkpoint: BLEURT checkpoint used for scoring.
    """
    ref_buffer = []
    cand_buffer = []
    scores_buffer = []

    if not FLAGS.batch_same_length:
        scorer = score_lib.BleurtScorer(bleurt_checkpoint)
    else:
        logging.warning(
            "Enabling same length batching. BEWARE: this is an experimental "
            "feature.")
        scorer = score_lib.LengthBatchingBleurtScorer(bleurt_checkpoint)

    def _consume_buffer():
        scores = scorer.score(references=ref_buffer,
                              candidates=cand_buffer,
                              batch_size=FLAGS.bleurt_batch_size)
        del ref_buffer[:]
        del cand_buffer[:]
        scores_buffer.extend(scores)

    logging.info("Computing BLEURT scores...")
    for ref_sentence, cand_sentence in generator:
        ref_buffer.append(ref_sentence)
        cand_buffer.append(cand_sentence)
        if len(ref_buffer) >= FLAGS.read_buffer_size:
            _consume_buffer()
    if ref_buffer:
        _consume_buffer()
    logging.info("BLEURT scores computed.")

    if FLAGS.scores_file:
        logging.info("Writing to disk.")
        with tf.io.gfile.GFile(FLAGS.scores_file, "w+") as score_file:
            for s in scores_buffer:
                score_file.write("{}\n".format(str(s)))
    else:
        for s in scores_buffer:
            print("{}".format(str(s)))
    logging.info("Done.")
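# A sketch of a sentence-pair generator that score_files above could consume,
# assuming two parallel plain-text files with one segment per line. The file
# paths are hypothetical; note that score_files also reads absl FLAGS, so it
# must run inside an absl app with those flags parsed.
def text_pair_generator(reference_file, candidate_file):
    with open(reference_file, encoding="utf-8") as refs, \
         open(candidate_file, encoding="utf-8") as cands:
        for ref_line, cand_line in zip(refs, cands):
            yield ref_line.rstrip("\n"), cand_line.rstrip("\n")

# score_files(text_pair_generator("refs.txt", "cands.txt"), "bleurt/bleurt-base-128")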
def __init__(self, decode_fn):
    import tensorflow as tf
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        try:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
        except RuntimeError as e:
            print(e)
    from bleurt import score as bleurt_score
    super(BleurtDiscriminator, self).__init__()
    checkpoint = "bleurt/bleurt-base-128"
    self.scorer = bleurt_score.BleurtScorer(checkpoint)
    self.decode_fn = decode_fn
def run_bleurt(self):
    '''
    Computes the BLEURT scores between the set of hypothesis and reference summaries.
    '''
    print('\n===== BLEURT =====\n')
    sys.argv = [sys.argv[0]]
    checkpoint = self.bleurt_model
    bleurt = score.BleurtScorer(checkpoint)
    for hyps_path, refs_path in zip(self.hyps_paths, self.refs_paths):
        self.load_summs(hyps_path, refs_path)
        # BLEURT expects keyword arguments; the references are the gold
        # summaries and the candidates are the system hypotheses.
        scores = bleurt.score(references=self.refs, candidates=self.hyps, batch_size=64)
        self.df_scores.loc[self.df_scores['hyps_path'] == hyps_path, 'bleurt'] = scores
        self.save_temp_csv()
        print(np.mean(scores))
    del bleurt, scores, checkpoint
    torch.cuda.empty_cache()
def main(_):
    multi_references = _text_reference_reader(FLAGS.reference_path)
    generations = _text_reader(FLAGS.generation_path)

    # FLAGS.bleurt_checkpoint is defined in the BLEURT library. Importing the
    # BLEURT scoring module automatically imports the flags.
    scorer = score.BleurtScorer(FLAGS.bleurt_checkpoint)

    multi_bleurt_scores = []
    for references in multi_references:
        assert len(references) == len(generations)
        # Maximize parallelism.
        bleurt_scores = scorer.score(references=references, candidates=generations)
        multi_bleurt_scores.append(bleurt_scores)

    if len(multi_references) == 1:
        avg_bleurt_score = np.mean(multi_bleurt_scores[0])
    else:
        assert len(multi_references) == 3
        avg_bleurt_scores = []
        for i in range(len(generations)):
            # All examples have at least two references, but some do not have three.
            assert multi_references[0][i] and multi_references[1][i]
            r2 = multi_references[2][i]
            if r2:
                # Take the average over 3 references.
                score_i = (multi_bleurt_scores[0][i] + multi_bleurt_scores[1][i] +
                           multi_bleurt_scores[2][i]) / 3
            else:
                print("only two refs")
                # Take the average over two references.
                score_i = (multi_bleurt_scores[0][i] + multi_bleurt_scores[1][i]) / 2
            avg_bleurt_scores.append(score_i)
        avg_bleurt_score = np.mean(avg_bleurt_scores)

    print("Evaluated %d examples." % len(generations))
    print("Average BLEURT score = %.4f" % avg_bleurt_score)
def __init__(self, args, task):
    super().__init__(args, task)
    self.eps = args.label_smoothing
    from fairseq.sequence_generator import SequenceGenerator
    self.gen = SequenceGenerator(task.target_dictionary, beam_size=args.beam_size)
    if args.reward == "bleurt":
        from fairseq.distributed_utils import get_rank
        sys.argv = sys.argv[:1]
        my_rank = 0 if torch.cuda.device_count() <= 1 else get_rank()
        os.environ["CUDA_VISIBLE_DEVICES"] = str(my_rank % 4)
        from bleurt import score
        from transformers import cached_path
        import tensorflow as tf
        gpus = tf.config.experimental.list_physical_devices('GPU')
        if gpus:
            # Pin BLEURT's TF graph to one GPU and cap its memory so it can
            # share the device with the torch model.
            this_gpu = gpus[my_rank % 4]
            tf.config.set_visible_devices([this_gpu], 'GPU')
            try:
                tf.config.experimental.set_memory_growth(this_gpu, True)
                tf.config.experimental.set_virtual_device_configuration(
                    this_gpu,
                    [tf.config.experimental.VirtualDeviceConfiguration(
                        memory_limit=2048)])
                logical_devices = tf.config.list_logical_devices('GPU')
                self.logical_device = tf.device(logical_devices[0].name)
                print("num of logical gpus", len(logical_devices))
            except RuntimeError as e:
                print(e)
        # NOTE: assumes at least one visible GPU; self.logical_device is only
        # set in the branch above.
        with self.logical_device:
            self.bleurt_scorer = score.BleurtScorer(
                os.path.join(
                    cached_path(
                        "https://storage.googleapis.com/bleurt-oss/bleurt-base-128.zip",
                        extract_compressed_file=True),
                    "bleurt-base-128"))
def run_bleurt(candidates: list,
               references: list,
               checkpoint: str = "bleurt/bleurt-large-512"):
    scorer = score.BleurtScorer(checkpoint)
    scores = scorer.score(references=references, candidates=candidates)
    return scores
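# The checkpoint argument of run_bleurt above must point at an unzipped
# BLEURT checkpoint. A sketch of fetching one into the default location;
# the bleurt-oss bucket URL pattern follows the one used elsewhere in these
# snippets, and the target directory is an assumption.
import io
import urllib.request
import zipfile

url = "https://storage.googleapis.com/bleurt-oss/bleurt-large-512.zip"
with urllib.request.urlopen(url) as resp:
    zipfile.ZipFile(io.BytesIO(resp.read())).extractall("bleurt/")

scores = run_bleurt(candidates=["the system output"],
                    references=["the gold reference"],
                    checkpoint="bleurt/bleurt-large-512")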
def _scoring_fun(test_df):
    scorer = score.BleurtScorer(export_dir)
    return scorer.score(references=test_df.reference.tolist(),
                        candidates=test_df.candidate.tolist())
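# A sketch of driving _scoring_fun above with a toy DataFrame. export_dir is
# a module-level variable assumed to hold an exported BLEURT checkpoint path.
import pandas as pd

test_df = pd.DataFrame({
    "reference": ["the cat sat on the mat"],
    "candidate": ["a cat sat on a mat"],
})
print(_scoring_fun(test_df))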
def evaluate_metric(metric, stem, remove_stop, prompt='overall'):
    '''
    Compute the correlation between the human eval scores and the scores
    awarded by the eval metric.
    '''
    assert metric in ['ROUGE-1-F', 'ROUGE-2-F', 'ROUGE-L-F', 'bert-human', 'bert-score',
                      'bart-score', 'bleurt-base', 'bleurt-lg', 'mover-1', 'mover-2',
                      'mover-smd', 'bert-avg-score']
    stemmed_str = "_stem" if stem else ""
    stop_str = "_removestop" if remove_stop else ""
    ranks_file_path = os.path.join(
        'learned_eval/outputs',
        'wref_{}{}{}_{}_rank_correlation.csv'.format(metric, stemmed_str, stop_str, prompt))
    print('\n====={}=====\n'.format(ranks_file_path))

    ranks_file = open(ranks_file_path, 'w')
    ranks_file.write('article,summ_id, human_score, metric_score\n')

    sorted_scores = read_sorted_scores()
    input_articles, _ = read_articles()
    corr_data = np.zeros((len(sorted_scores), 3))

    stopwords_list = set(stopwords.words("english"))
    stemmer = PorterStemmer()

    # Init the metric
    if metric == 'bert-human':
        rewarder = Rewarder(os.path.join(MODEL_WEIGHT_DIR, 'sample.model'))
    elif metric.endswith('score'):
        from bert_score import BERTScorer
        if 'bert-score' == metric:
            rewarder = BERTScorer(lang="en", rescale_with_baseline=True,
                                  model_type='roberta-large-mnli')
        elif 'bart-score' == metric:
            rewarder = BERTScorer(lang="en", model_type="facebook/bart-large-mnli",
                                  num_layers=12)
        elif 'bert-avg' in metric:
            r1 = BERTScorer(lang="en", rescale_with_baseline=False, model_type='roberta-large')
            r2 = BERTScorer(lang="en", rescale_with_baseline=False, model_type='albert-xxlarge-v2')
            # Note: the HF model id needs the "facebook/" prefix, as above.
            r3 = BERTScorer(lang="en", rescale_with_baseline=False,
                            model_type='facebook/bart-large-mnli', num_layers=12)
    elif metric.startswith('bleurt'):
        from bleurt import score
        if 'base' in metric:
            checkpoint = "bleurt-base-512"
        elif 'lg' in metric:
            checkpoint = "bleurt-large-512"
        rewarder = score.BleurtScorer(checkpoint)
    elif metric.startswith('mover'):
        from moverscore import get_idf_dict, word_mover_score
        hyps = [s['sys_summ'] for score in sorted_scores.values()
                for s in score if s['sys_name'] != 'reference']
        refs = [s['sys_summ'] for score in sorted_scores.values()
                for s in score if s['sys_name'] == 'reference']
        idf_dict_hyp = get_idf_dict(hyps)
        idf_dict_ref = get_idf_dict(refs)
    elif 'rouge' in metric.lower():
        from rouge_score import rouge_scorer
        from rouge_score.scoring import BootstrapAggregator

    # Loop over each article and compute the correlation between human
    # judgement and the metric scores.
    for i, (article_id, scores) in tqdm(enumerate(sorted_scores.items())):
        scores_list = [s for s in scores if s['sys_name'] != 'reference']
        human_ranks = [s['scores'][prompt] for s in scores_list]
        if len(human_ranks) < 2:
            continue  # Must be at least 2 scores to compute the correlation
        ref_summ = scores_list[0]['ref']
        article = [entry['article'] for entry in input_articles
                   if entry['id'] == article_id][0]

        # Pre-processing (if necessary)
        if stem and remove_stop:
            sys_summs = [" ".join(sent2stokens_wostop(s['sys_summ'], stemmer, stopwords_list, 'english', True))
                         for s in scores_list]
            ref_summ = " ".join(sent2stokens_wostop(ref_summ, stemmer, stopwords_list, 'english', True))
            article = " ".join(sent2stokens_wostop(article, stemmer, stopwords_list, 'english', True))
        elif not stem and remove_stop:
            sys_summs = [" ".join(sent2tokens_wostop(s['sys_summ'], stopwords_list, 'english', True))
                         for s in scores_list]
            ref_summ = " ".join(sent2tokens_wostop(ref_summ, stopwords_list, 'english', True))
            article = " ".join(sent2tokens_wostop(article, stopwords_list, 'english', True))
        elif not remove_stop and stem:
            sys_summs = [" ".join(sent2stokens(s['sys_summ'], stemmer, 'english', True))
                         for s in scores_list]
            ref_summ = " ".join(sent2stokens(ref_summ, stemmer, 'english', True))
            article = " ".join(sent2stokens(article, stemmer, 'english', True))
        else:
            sys_summs = [s['sys_summ'] for s in scores_list]

        # Clean summaries
        summ_ids = [s['summ_id'] for s in scores_list]
        sys_summs = [text_normalization(s) for s in sys_summs]
        ref_summ = text_normalization(ref_summ)
        article = text_normalization(article)

        # Compute metric scores
        if 'rouge' in metric.lower():
            auto_metric_ranks = []
            if '1' in metric:
                rouge_metric = 'rouge1'
            elif '2' in metric:
                rouge_metric = 'rouge2'
            elif 'L' in metric:
                rouge_metric = 'rougeL'
            rew_rouge = rouge_scorer.RougeScorer([rouge_metric], use_stemmer=True)
            for ss in sys_summs:
                ss = ss.replace('. ', '\n')
                ref_summ = ref_summ.replace('. ', '\n')
                score = rew_rouge.score(ref_summ, ss)
                auto_metric_ranks.append(score[rouge_metric].fmeasure)
        if metric == 'bert-human':
            auto_metric_ranks = [rewarder(ref_summ, ss) for ss in sys_summs]
        elif metric.endswith('score'):
            if 'bert-score' == metric:
                auto_metric_ranks = [rewarder.score([ref_summ], [ss])[-1].item() for ss in sys_summs]
            elif 'bart-score' == metric:
                auto_metric_ranks = [rewarder.score([ref_summ], [ss])[-1].item() for ss in sys_summs]
            elif 'bert-avg' in metric:
                rewarder_scores = []
                for rewarder in [r1, r2, r3]:
                    r_scores = np.array([rewarder.score([ref_summ], [ss])[-1].item() for ss in sys_summs])
                    r_scores = (r_scores - np.min(r_scores)) / (np.max(r_scores) - np.min(r_scores))
                    rewarder_scores.append(r_scores)
                auto_metric_ranks = list(np.mean(rewarder_scores, axis=0))
        elif metric.startswith('bleurt'):
            auto_metric_ranks = [rewarder.score(references=[ref_summ], candidates=[ss])[0]
                                 for ss in sys_summs]
        elif metric.startswith('mover'):
            if '1' in metric:
                n_gram = 1
            elif '2' in metric:
                n_gram = 2
            else:
                raise ValueError("smd not implemented currently")
            auto_metric_ranks = [word_mover_score([ref_summ], [ss], idf_dict_ref, idf_dict_hyp,
                                                  stop_words=[], n_gram=n_gram,
                                                  remove_subwords=True)[0]
                                 for ss in sys_summs]

        for sid, amr, hr in zip(summ_ids, auto_metric_ranks, human_ranks):
            ranks_file.write('{},{},{:.2f},{:.4f}\n'.format(article_id, sid, hr, amr))

        # Compute correlations
        spearmanr_result = spearmanr(human_ranks, auto_metric_ranks)
        pearsonr_result = pearsonr(human_ranks, auto_metric_ranks)
        kendalltau_result = kendalltau(human_ranks, auto_metric_ranks)
        corr_data[i, :] = [spearmanr_result[0], pearsonr_result[0], kendalltau_result[0]]

    corr_mean_all = np.nanmean(corr_data, axis=0)
    corr_std_all = np.nanstd(corr_data, axis=0)
    print('\n====={}=====\n'.format(ranks_file_path))
    print("Correlation mean on all data spearman/pearsonr/kendall: {}".format(corr_mean_all))
    print("Correlation std on all data spearman/pearsonr/kendall: {}".format(corr_std_all))

    ranks_file.flush()
    ranks_file.close()
    return ranks_file_path
from bleurt import score

checkpoint = "bleurt/test_checkpoint"
references = []
candidates = []
references.append(input('Enter the reference sentence: '))
candidates.append(input('Enter the candidate sentence: '))

scorer = score.BleurtScorer(checkpoint)
scores = scorer.score(references=references, candidates=candidates)
assert type(scores) == list and len(scores) == 1
print(scores)
def _scoring_fun(test_df):
    scorer = score.BleurtScorer(export_dir)
    # Pass plain lists by keyword; the scorer does not accept pandas Series
    # or positional arguments.
    return scorer.score(references=test_df.reference.tolist(),
                        candidates=test_df.candidate.tolist())
def __init__(self):
    self.scorer = score.BleurtScorer()
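# A sketch of using the wrapper above, assuming the enclosing class is named
# BleurtWrapper (the name is hypothetical; only __init__ is shown). With no
# checkpoint argument, BleurtScorer falls back to the library's bundled test
# checkpoint, so the resulting scores are only useful for smoke-testing.
wrapper = BleurtWrapper()
print(wrapper.scorer.score(references=["a reference"], candidates=["a candidate"]))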
def main(args, checkpoint_name="best"):
    assert args.path is not None, '--path required for generation!'
    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert args.replace_unk is None or args.raw_text, \
        '--replace-unk requires a raw text dataset (--raw-text)'

    if args.max_tokens is None and args.max_sentences is None:
        args.max_tokens = 12000
    print(args)
    use_cuda = torch.cuda.is_available() and not args.cpu
    torch.manual_seed(args.seed)

    # Load dataset splits
    task = tasks.setup_task(args)
    task.load_dataset(args.gen_subset)
    print('| {} {} {} examples'.format(args.data, args.gen_subset,
                                       len(task.dataset(args.gen_subset))))
    args.taskobj = task

    sys.argv = sys.argv[:1]
    import tensorflow as tf
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        try:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
        except RuntimeError as e:
            print(e)
    bleurt_scorer = score.BleurtScorer(os.path.join(
        cached_path(
            "https://storage.googleapis.com/bleurt-oss/bleurt-base-128.zip",
            extract_compressed_file=True
        ), "bleurt-base-128"
    ))

    # Set dictionaries
    # src_dict = task.source_dictionary
    tgt_dict = task.target_dictionary
    dict = tgt_dict

    # Load decoding strategy
    strategy = strategies.setup_strategy(args)

    # Load ensemble
    if args.path.startswith("nsml://"):
        print("| loading nsml checkpoint", args.path)
        import nsml
        session = args.path.replace("nsml://", "")
        model = task.build_model(args)

        def load(dir_path):
            state = torch.load(os.path.join(dir_path, 'best.pt'))
            state_dict = state["model"]
            model.load_state_dict(state_dict)
            print("loaded")

        nsml.load(args.checkpoint_name, load_fn=load, session=session)
        models = [model.cuda()]
    elif args.path == "pretrain":
        from nsml import DATASET_PATH
        from fairseq import checkpoint_utils
        data_token = "en-de"
        pretrained_path = "{}/train/pretrained_models/maskPredict_{}/checkpoint_best.pt".format(
            DATASET_PATH, data_token.split(".")[-1].replace("-", "_"))
        print("| loading", pretrained_path)
        model = task.build_model(args)
        state = checkpoint_utils.load_checkpoint_to_cpu(pretrained_path)
        model.load_state_dict(state["model"], strict=True)
        models = [model.cuda()]
    elif args.path.startswith("wb://"):
        print("| loading wb checkpoint", args.path)
        import wandb
        wandb.restore("best.pt", args.path.replace("wb://", ""), root="/tmp/")
        assert os.path.exists("/tmp/best.pt")
        state = torch.load("/tmp/best.pt")
        model = task.build_model(args)
        model.load_state_dict(state["model"])
        models = [model.cuda()]
    elif args.path.startswith("http://"):
        print("| loading http checkpoint", args.path)
        url = "http://trains.deeplearn.org:8081/{}".format(args.path.replace("http://", ""))
        os.system("curl -o /tmp/model.pt {}".format(url))
        state = torch.load("/tmp/model.pt")
        model = task.build_model(args)
        model.load_state_dict(state["model"])
        models = [model.cuda()]
    else:
        print('| loading model(s) from {}'.format(args.path))
        models, _ = utils.load_ensemble_for_inference(
            args.path.split(':'), task, model_arg_overrides=eval(args.model_overrides))
        models = [model.cuda() for model in models]

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[model.max_positions() for model in models]
        ),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=8,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
    ).next_epoch_itr(shuffle=False)

    results = []
    scorer = pybleu.PyBleuScorer()
    num_sentences = 0
    has_target = True
    timer = TimeMeter()
    with progress_bar.build_progress_bar(args, itr) as t:
        translations = generate_batched_itr(
            t, strategy, models, tgt_dict,
            length_beam_size=args.length_beam,
            use_gold_target_len=args.gold_target_len)
        for sample_id, src_tokens, target_tokens, hypos in translations:
            has_target = target_tokens is not None
            target_tokens = target_tokens.int().cpu() if has_target else None

            # Either retrieve the original sentences or regenerate them from tokens.
            if align_dict is not None:
                src_str = task.dataset(args.gen_subset).src.get_original_text(sample_id)
                target_str = task.dataset(args.gen_subset).tgt.get_original_text(sample_id)
            else:
                src_str = dict.string(src_tokens, args.remove_bpe)
                if args.dehyphenate:
                    src_str = dehyphenate(src_str)
                if has_target:
                    target_str = dict.string(target_tokens, args.remove_bpe, escape_unk=True)
                    if args.dehyphenate:
                        target_str = dehyphenate(target_str)

            # Source/target logging disabled:
            # print('S-{}\t{}'.format(sample_id, src_str))
            # if has_target:
            #     print('T-{}\t{}'.format(sample_id, target_str))

            hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                hypo_tokens=hypos.int().cpu(),
                src_str=src_str,
                alignment=None,
                align_dict=align_dict,
                tgt_dict=dict,
                remove_bpe=args.remove_bpe,
            )
            if args.dehyphenate:
                hypo_str = dehyphenate(hypo_str)

            if not args.quiet:
                print('H-{}\t{}'.format(sample_id, hypo_str))
                if args.print_alignment:
                    print('A-{}\t{}'.format(
                        sample_id,
                        ' '.join(map(lambda x: str(utils.item(x)), alignment))
                    ))

            # Score only the top hypothesis
            if has_target:
                if align_dict is not None or args.remove_bpe is not None:
                    # Convert back to tokens for evaluation with unk replacement and/or without BPE
                    target_tokens = tgt_dict.encode_line(target_str, add_if_not_exist=True)
                results.append((target_str, hypo_str))

            num_sentences += 1

    if has_target:
        print('Time = {}'.format(timer.elapsed_time))
        ref, out = zip(*results)
        from fairseq.criterions.lib_sbleu import smoothed_bleu
        sbleu = np.mean([smoothed_bleu(p[0].split(), p[1].split()) for p in results])
        print("| SBLEU = {:.2f}".format(sbleu))
        bleurt_scores = bleurt_scorer.score(references=[p[0] for p in results],
                                            candidates=[p[1] for p in results])
        print("| BLEURT = {:.4f}".format(np.mean(np.array(bleurt_scores))))
        print('| Generate {} with beam={}: BLEU4 = {:2.2f}, '.format(
            args.gen_subset, args.length_beam, scorer.score(ref, out)))
def get_scores(nrows, metrics=None):
    '''
    Get correlations between metric similarity and label similarity.
    '''
    df = pd.read_csv(QQP_DATA_PATH, nrows=nrows)
    start_time = time()
    if not metrics:
        metrics = [
            'mover-1',
            'mover-2',
            'bleurt',
            'bertscore',
            'bartscore',
            'rouge1',
            'rouge2',
            'rougeLsum',
        ]
    for m in tqdm(metrics):
        if m.startswith('rouge'):
            scorer = rouge_scorer.RougeScorer(
                [met for met in metrics if met.startswith('rouge')],
                use_stemmer=True)
            scores = [
                scorer.score(r, c)[m].fmeasure
                for c, r in zip(df.question1, df.question2)
            ]
        elif m == 'bertscore':
            scorer = BERTScorer(lang="en", rescale_with_baseline=True,
                                model_type='roberta-large-mnli')
            _, _, scores = scorer.score(df.question1.tolist(), df.question2.tolist())
        elif m == 'bartscore':
            scorer = BERTScorer(lang="en", model_type="facebook/bart-large-mnli",
                                num_layers=12)
            _, _, scores = scorer.score(df.question1.tolist(), df.question2.tolist())
        elif m == 'bleurt':
            checkpoint = "bleurt-large-512"
            scorer = score.BleurtScorer(checkpoint)
            scores = scorer.score(references=df.question1.tolist(),
                                  candidates=df.question2.tolist(),
                                  batch_size=50)
        elif m.startswith('mover'):
            # Truncate long questions, else moverscore gets OOM
            q1 = df['question1'].apply(lambda s: s[:300]).tolist()
            q2 = df['question2'].apply(lambda s: s[:300]).tolist()
            idf_dict_hyp = get_idf_dict(q1)
            idf_dict_ref = get_idf_dict(q2)
            n_gram = 1 if '1' in m else 2
            scores = word_mover_score(q2, q1, idf_dict_ref, idf_dict_hyp,
                                      stop_words=[], n_gram=n_gram,
                                      remove_subwords=True, batch_size=64)
        df[m] = scores
        print('\n' * 10, m, '\n' * 10)
    df.to_csv(QQP_OUT_PATH)
# (Fragment: tail of a loop that renames TF checkpoint variables for the
# PyTorch state_dict.)
n = n.replace('word.embeddings', 'word_embeddings')
n = n.replace('position.embeddings', 'position_embeddings')
n = n.replace('token.type.embeddings', 'token_type_embeddings')
n = n + '.weight'
state_dict[n] = torch.from_numpy(data)

config = transformers.BertConfig()
bleurt_model = BleurtModel(config)
# strict=False added, otherwise it crashes. Should be safe, according to
# https://github.com/huggingface/transformers/issues/6882#issuecomment-884730078
bleurt_model.load_state_dict(state_dict, strict=False)
for param in bleurt_model.parameters():
    param.requires_grad = False
bleurt_model.eval()

scorer = bleurt_score.BleurtScorer(checkpoint)
# This is what the answer should be (using BLEURT's Python API)
references = ["a bird chirps by the window and decided to quit"]
candidates = ["a bird chirps by the window but then changed its mind"]
scores = scorer.score(references=references, candidates=candidates)
print(scores)

with open(f'{checkpoint}/bleurt_config.json', 'r') as f:
    bleurt_config = json.load(f)
max_seq_length = bleurt_config["max_seq_length"]
vocab_file = f'{checkpoint}/{bleurt_config["vocab_file"]}'
do_lower_case = bleurt_config["do_lower_case"]
# tokenizer = bleurt.lib.tokenization.FullTokenizer(  # this is what was before
def test_default_bleurt_score(self):
    bleurt = score.BleurtScorer()
    scores = bleurt.score(references=references, candidates=candidates)
    self.assertLen(scores, 2)
    self.assertAllClose(scores, ref_scores)
def test_positional_args_error(self):
    bleurt = score.BleurtScorer()
    with self.assertRaises(AssertionError):
        _ = bleurt.score(references, candidates)
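# The test above pins down the contract that BleurtScorer.score only accepts
# its inputs by keyword; positional calls raise an AssertionError. The
# accepted form (using the same names as in the surrounding tests):
scores = bleurt.score(references=references, candidates=candidates)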
def test_bleurt_score_with_checkpoint(self):
    checkpoint = get_test_checkpoint()
    bleurt = score.BleurtScorer(checkpoint)
    scores = bleurt.score(references=references, candidates=candidates)
    self.assertLen(scores, 2)
    self.assertAllClose(scores, ref_scores)
def main(args):
    global score
    assert args.path is not None, '--path required for generation!'
    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert args.replace_unk is None or args.raw_text, \
        '--replace-unk requires a raw text dataset (--raw-text)'

    utils.import_user_module(args)
    if args.max_tokens is None and args.max_sentences is None:
        args.max_tokens = 12000
    print(args)
    use_cuda = torch.cuda.is_available() and not args.cpu

    if args.reward == "bleurt" or args.eval_bleurt:
        sys.argv = sys.argv[:1]
        import tensorflow as tf
        gpus = tf.config.experimental.list_physical_devices('GPU')
        if gpus:
            try:
                for gpu in gpus:
                    tf.config.experimental.set_memory_growth(gpu, True)
            except RuntimeError as e:
                print(e)
        bleurt_scorer = score.BleurtScorer(
            os.path.join(
                cached_path(
                    "https://storage.googleapis.com/bleurt-oss/bleurt-base-128.zip",
                    extract_compressed_file=True),
                "bleurt-base-128"))

    # Load dataset splits
    task = tasks.setup_task(args)
    task.load_dataset(args.gen_subset)

    # Set dictionaries
    try:
        src_dict = getattr(task, 'source_dictionary', None)
    except NotImplementedError:
        src_dict = None
    tgt_dict = task.target_dictionary

    # Load ensemble
    print('| loading model(s) from {}'.format(args.path))
    if args.path.startswith("nsml://"):
        # NSML
        session = args.path.replace("nsml://", "")
        model = task.build_model(args)
        if ".pt" in session:
            session = session.replace(".pt", "")
            session, checkpoint_name = session.rsplit("/", 1)
        else:
            checkpoint_name = "best"
        if "-" in checkpoint_name:
            start, end = checkpoint_name.replace("epoch", "").split("-")
            checkpoints = ["epoch{}".format(i) for i in range(int(start), int(end) + 1)]
            print("| checkpoint average:", checkpoints)
            state_dict = None

            def load(dir_path):
                nonlocal state_dict, checkpoints
                state = torch.load(os.path.join(dir_path, 'best.pt'))
                model_state = state["model"]
                for k in model_state:
                    model_state[k] = model_state[k] / float(len(checkpoints))
                if state_dict is None:
                    state_dict = model_state
                else:
                    for k in state_dict:
                        state_dict[k] += model_state[k]
                print("checkpoint loaded")

            for checkpoint_name in checkpoints:
                nsml.load(checkpoint_name, load_fn=load, session=session)
            model.load_state_dict(state_dict)
        else:
            def load(dir_path):
                state = torch.load(os.path.join(dir_path, 'best.pt'))
                state_dict = state["model"]
                model.load_state_dict(state_dict)
                print("loaded")

            nsml.load(checkpoint_name, load_fn=load, session=session)
        models = [model.cuda()]
    elif "-" in args.path:
        model = task.build_model(args)
        print("loading model from", args.path)
        state_dict = None
        dir_path = os.path.dirname(args.path)
        fn = os.path.basename(args.path)
        if "-" in fn:
            start, end = fn.replace("epoch", "").replace(".pt", "").split("-")
            checkpoint_fns = ["epoch{}.pt".format(i) for i in range(int(start), int(end) + 1)]
        else:
            checkpoint_fns = [fn]
        for fn in checkpoint_fns:
            state = torch.load(os.path.join(dir_path, fn))
            model_state = state["model"]
            for k in model_state:
                model_state[k] = model_state[k] / float(len(checkpoint_fns))
            if state_dict is None:
                state_dict = model_state
            else:
                for k in state_dict:
                    state_dict[k] += model_state[k]
            print("checkpoint loaded")
        model.load_state_dict(state_dict)
        models = [model.cuda()]
    else:
        model = task.build_model(args)
        state = torch.load(args.path)
        model_state = state["model"]
        model.load_state_dict(model_state)
        models = [model.cuda()]

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[model.max_positions() for model in models]),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=args.required_batch_size_multiple,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)

    # Initialize generator
    gen_timer = StopwatchMeter()
    generator = task.build_generator(args)

    # Generate and compute BLEU score
    # if args.sacrebleu:
    #     scorer = bleu.SacrebleuScorer()
    # else:
    #     scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk())
    scorer = pybleu.PyBleuScorer()
    num_sentences = 0
    has_target = True
    results = []
    best_rank_list = []
    if args.save_path:
        outf = open(args.save_path, "w")
    total_n = 0
    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        for sample in t:
            sample = utils.move_to_cuda(sample) if use_cuda else sample
            if 'net_input' not in sample:
                continue

            prefix_tokens = None
            if args.prefix_size > 0:
                prefix_tokens = sample['target'][:, :args.prefix_size]

            gen_timer.start()
            hypos = task.inference_step(generator, models, sample, prefix_tokens)
            num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos)
            gen_timer.stop(num_generated_tokens)

            hypo_target_pairs = []
            for i, sample_id in enumerate(sample['id'].tolist()):
                total_n += 1
                has_target = sample['target'] is not None

                # Remove padding
                src_tokens = utils.strip_pad(
                    sample['net_input']['src_tokens'][i, :], tgt_dict.pad())
                target_tokens = None
                if has_target:
                    target_tokens = utils.strip_pad(
                        sample['target'][i, :], tgt_dict.pad()).int().cpu()

                # Either retrieve the original sentences or regenerate them from tokens.
                if align_dict is not None:
                    src_str = task.dataset(args.gen_subset).src.get_original_text(sample_id)
                    target_str = task.dataset(args.gen_subset).tgt.get_original_text(sample_id)
                else:
                    if src_dict is not None:
                        src_str = src_dict.string(src_tokens, args.remove_bpe)
                    else:
                        src_str = ""
                    if has_target:
                        target_str = tgt_dict.string(target_tokens, args.remove_bpe,
                                                     escape_unk=True)

                if not args.quiet:
                    if src_dict is not None:
                        print('S-{}\t{}'.format(sample_id, src_str))
                    if has_target:
                        print('T-{}\t{}'.format(sample_id, target_str))

                if args.reward_sample or args.reward_check:
                    # Collect the sampled hypotheses and rank them by reward
                    hypo_strs = []
                    rewards = []
                    for j, hypo in enumerate(hypos[i]):
                        hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                            hypo_tokens=hypo['tokens'].int().cpu(),
                            src_str=src_str,
                            alignment=None,
                            align_dict=align_dict,
                            tgt_dict=tgt_dict,
                            remove_bpe=None,
                        )
                        hypo_strs.append(hypo_str)
                    if args.reward == "sbleu":
                        for hypo_str in hypo_strs:
                            hypo_str_nobpe = hypo_str.replace("@@ ", "")
                            rewards.append(compute_reward(hypo_str_nobpe, target_str))
                        best_idx = np.array(rewards).argmax()
                        if args.reward_check:
                            best_rank_list.append(best_idx)
                        if args.save_path:
                            if args.output_all:
                                for hypo_i in range(len(hypo_strs)):
                                    outf.write("{} | {:.4f} | {}\n".format(
                                        sample_id, rewards[hypo_i], hypo_strs[hypo_i]))
                            else:
                                outf.write("{} | {}\n".format(sample_id, hypo_strs[best_idx]))
                        else:
                            if args.output_all:
                                for hypo_i in range(len(hypo_strs)):
                                    print("{} | {:.4f} | {}".format(
                                        sample_id, rewards[hypo_i], hypo_strs[hypo_i]))
                            else:
                                print("{} | {}".format(sample_id, hypo_strs[best_idx]))
                            sys.stdout.flush()
                    elif args.reward == "bleurt":
                        # Defer BLEURT scoring so it can be batched across samples.
                        hypo_target_pairs.append((sample_id, target_str, hypo_strs))
                else:
                    # Normal translation: process top predictions
                    for j, hypo in enumerate(hypos[i][:args.nbest]):
                        hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                            hypo_tokens=hypo['tokens'].int().cpu(),
                            src_str=src_str,
                            alignment=hypo['alignment'].int().cpu()
                            if hypo['alignment'] is not None else None,
                            align_dict=align_dict,
                            tgt_dict=tgt_dict,
                            remove_bpe=args.remove_bpe,
                        )
                        if not args.quiet:
                            print('H-{}\t{}\t{}'.format(sample_id, hypo['score'], hypo_str))
                            print('P-{}\t{}'.format(
                                sample_id,
                                ' '.join(map(lambda x: '{:.4f}'.format(x),
                                             hypo['positional_scores'].tolist()))))
                            if args.print_alignment:
                                print('A-{}\t{}'.format(
                                    sample_id,
                                    ' '.join(map(lambda x: str(utils.item(x)), alignment))))

                        # Score only the top hypothesis
                        results.append((sample_id, target_str, hypo_str,
                                        float(hypo["positional_scores"].mean())))
                        if has_target and j == 0 and not args.reward_sample:
                            pass
                            # if align_dict is not None or args.remove_bpe is not None:
                            #     # Convert back to tokens for evaluation with unk
                            #     # replacement and/or without BPE
                            #     target_tokens = tgt_dict.encode_line(target_str, add_if_not_exist=True)
                            # if args.save_path:
                            #     outf.write("{} | {}\n".format(sample_id, hypo_str))
                            # if j == 0 and not args.no_eval:
                            #     results.append((sample_id, target_str, hypo_str))
                            # if hasattr(scorer, 'add_string'):
                            #     scorer.add_string(target_str, hypo_str)
                            # else:
                            #     scorer.add(target_tokens, hypo_tokens)

                if args.save_amount > 0 and total_n > args.save_amount:
                    break

            if args.reward_sample and bool(hypo_target_pairs):
                # Batched BLEURT scoring over all deferred pairs
                hypo_batch = []
                target_batch = []
                for _, target, hypo_strs in hypo_target_pairs:
                    hypo_batch.extend([h.replace("@@ ", "") for h in hypo_strs])
                    # Fixed: extend with the loop variable `target`, not the
                    # stale `target_str` from the batch loop above.
                    target_batch.extend([target] * len(hypo_strs))
                rewards = np.array(bleurt_scorer.score(references=target_batch,
                                                       candidates=hypo_batch))
                base_i = 0
                for sample_id, _, hypo_strs in hypo_target_pairs:
                    start = base_i
                    end = base_i + len(hypo_strs)
                    best_idx = rewards[start:end].argmax()
                    if args.save_path:
                        if args.output_all:
                            for idx in range(start, end):
                                outf.write("{} | {:.4f} | {}\n".format(
                                    sample_id, float(rewards[idx]), hypo_strs[idx - start]))
                        else:
                            outf.write("{} | {}\n".format(sample_id, hypo_strs[best_idx]))
                    else:
                        if args.output_all:
                            for idx in range(start, end):
                                print("{} | {:.4f} | {}".format(
                                    sample_id, float(rewards[idx]), hypo_strs[idx - start]))
                        else:
                            print("{} | {}".format(sample_id, hypo_strs[best_idx]))
                        sys.stdout.flush()
                    base_i += len(hypo_strs)

            wps_meter.update(num_generated_tokens)
            t.log({'wps': round(wps_meter.avg)})
            num_sentences += sample['nsentences']

    print('| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'
          .format(num_sentences, gen_timer.n, gen_timer.sum,
                  num_sentences / gen_timer.sum, 1. / gen_timer.avg))

    if args.save_path and not args.reward_check and not args.reward_sample:
        results.sort()
        for sample_id, tgt, hyp, score in results:
            outf.write("{}\t{}\t{}\n".format(sample_id, score, hyp))
        print("results saved to", args.save_path)
    if args.reward_check:
        print("avg ranking of the best sample:", np.array(best_rank_list).mean())
        print("ratio of best sample ranked in the top:",
              (np.array(best_rank_list) == 0).mean())
    if has_target and not args.reward_sample and not args.reward_check and not args.no_eval:
        _, ref, out, _ = zip(*results)
        from fairseq.criterions.lib_sbleu import smoothed_bleu
        sbleu = np.mean([smoothed_bleu(p[1].split(), p[2].split()) for p in results])
        print("| SBLEU = {:.2f}".format(sbleu))
        if args.eval_bleurt:
            bleurt_scores = bleurt_scorer.score(
                references=[p[1] for p in results],
                candidates=[p[2] for p in results])
            print("| BLEURT = {:.4f}".format(np.mean(np.array(bleurt_scores))))
        print('| Generate {} with beam={}: {}'.format(args.gen_subset, args.beam,
                                                      scorer.score(ref, out)))
    return scorer
import tensorflow as tf

tf.compat.v1.flags.DEFINE_integer('batch_size', 1, 'batch_size')
tf.compat.v1.flags.DEFINE_float("lr", 5e-5, "learning rate")
tf.compat.v1.flags.DEFINE_integer("seed", 42, "seed to replicate results")
tf.compat.v1.flags.DEFINE_integer("n_gpu", 1, "no of gpu available")
tf.compat.v1.flags.DEFINE_integer("gradient_accumulation_steps", 32, "gradient_accumulation_steps")
tf.compat.v1.flags.DEFINE_integer("num_workers", 4, "num of cpus available")
tf.compat.v1.flags.DEFINE_integer("device", -1, "torch.device object")
tf.compat.v1.flags.DEFINE_integer("num_train_epochs", 5, "no of epochs of training")
tf.compat.v1.flags.DEFINE_string("output_dir", './output', "path to save evaluation results")
tf.compat.v1.flags.DEFINE_string("model_dir", './weights', "path to save trained model")
tf.compat.v1.flags.DEFINE_float("max_grad_norm", 1.0, "max gradient norm.")
tf.compat.v1.flags.DEFINE_string("root_dir", './CNN-DM/gpt2_1024_data', "location of json dataset.")
tf.compat.v1.flags.DEFINE_string("ids_file", './CNN-DM/ids.json', "location of train, valid and test file indexes")

from bleurt import score
from rouge_score import rouge_scorer

rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
with tf.device('cpu'):
    bleurt_scorer = score.BleurtScorer('../bleurt/bleurt/bleurt-base-512')


def calc_metrics(reference, candidate):
    r_scores = rouge_scorer.score(reference, candidate)
    b_score = bleurt_scorer.score(references=[reference], candidates=[candidate], batch_size=1)
    metrics = {'r1': r_scores['rouge1'][2],
               'r2': r_scores['rouge2'][2],
               'rl': r_scores['rougeL'][2],
               'bleurt': b_score[0]}
    return metrics
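# A sketch of calling calc_metrics above on a single made-up pair. The dict
# values are the ROUGE F-measures (index 2 of each Score tuple) plus the
# single BLEURT score.
metrics = calc_metrics(
    reference="the quick brown fox jumps over the lazy dog",
    candidate="a quick brown fox jumped over a lazy dog",
)
print(metrics)  # {'r1': ..., 'r2': ..., 'rl': ..., 'bleurt': ...}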
def __init__(self, data, bleurt_device):
    self.data = data
    self.bleurt_device = bleurt_device
    with tf.device(self.bleurt_device):
        self.bleurt_scorer = bleurt_score.BleurtScorer(bleurt_model)
    self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'],
                                                 use_stemmer=True)