Example #1
    def __init__(self, language, model: str = "bleurt-base-128"):
        super().__init__(language)
        # HACK TO SILENCE tensorflow and errors related to tf.FLAGS
        from silence_tensorflow import silence_tensorflow

        silence_tensorflow()
        import tensorflow.compat.v1 as tf

        flags = tf.flags
        flags.DEFINE_string("source", "", help="Source segments", required=False)
        flags.DEFINE_string("s", "", help="Source segments", required=False)
        flags.DEFINE_string("hypothesis", "", help="MT segments", required=False)
        flags.DEFINE_string("h", "", help="MT segments", required=False)
        flags.DEFINE_string("reference", "", help="Reference segments", required=False)
        flags.DEFINE_string("r", "", help="Reference segments", required=False)
        flags.DEFINE_string("language", "", help="Language", required=False)
        flags.DEFINE_string("l", "", help="Language", required=False)
        flags.DEFINE_string("metric", "", help="Metric to run.", required=False)
        flags.DEFINE_string("m", "", help="Metric to run.", required=False)

        self.model = model
        if not os.path.isdir(telescope_cache_folder() + model):
            download_file_maybe_extract(
                url=f"https://storage.googleapis.com/bleurt-oss/{model}.zip",
                directory=telescope_cache_folder(),
            )
        self.scorer = score.BleurtScorer(telescope_cache_folder() + model)
        self.system_only = False
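Once built, the scorer above is invoked with keyword arguments and returns one float per segment pair. A minimal companion method, purely illustrative and not part of the original class:

    def score_segments(self, references, hypotheses):
        # Hypothetical helper: one BLEURT score (float) per reference/hypothesis pair.
        return self.scorer.score(references=references, candidates=hypotheses)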
Example #2
def bert_based(gts, res):
    refs, cands = [], []
    for refers in gts.values():
        sub_refs = []
        for ref in refers:
            sub_refs.append(ref + '.')
        refs.append(sub_refs)
    for cand in res.values():
        cands.append(cand[0] + '.')

    scorer = BERTScorer(lang="en", rescale_with_baseline=True)
    P, R, F1 = scorer.score(cands, refs, verbose=True)
    out_file.write('BERTScore = %s' % F1.mean().item() + "\n")
    BERTScore = F1.mean().item()

    total_bleurt_score = []
    scorer = bleurt_sc.BleurtScorer(bleurt_checkpoint)

    for ref_caption, cand in zip(refs, cands):
        bleurt_score_per_img = []
        for ref in ref_caption:
            bleurt_score_per_img.append(
                scorer.score([ref], [cand], batch_size=None)[0])
        total_bleurt_score.append(max(bleurt_score_per_img))
    out_file.write('BLEURT = %s' % statistics.mean(total_bleurt_score) + "\n")
Example #3
def bleurt_eval(candidates, references, verbose=False):
    checkpoint = "bleurt/bleurt-base-128"
    scorer = bleurt_score.BleurtScorer(checkpoint)
    scores = scorer.score(references=references, candidates=candidates)
    if verbose:
        print("BLEURT scores:", scores)
    return mean(scores)
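As written, the function assumes that mean and bleurt_score are already imported and that the checkpoint directory exists on disk. One plausible set of imports that would satisfy it:

from statistics import mean               # mean() over the per-sentence scores
from bleurt import score as bleurt_score  # provides bleurt_score.BleurtScorer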
Example #4
    def _download_and_prepare(self, dl_manager):

        # check that config name specifies a valid BLEURT model
        if self.config_name == "default":
            logger.warning(
                "Using default BLEURT-Base checkpoint for sequence maximum length 128. "
                "You can use a bigger model for better results with e.g.: datasets.load_metric('bleurt', 'bleurt-large-512')."
            )
            self.config_name = "bleurt-base-128"

        if self.config_name.lower() in CHECKPOINT_URLS:
            checkpoint_name = self.config_name.lower()

        elif self.config_name.upper() in CHECKPOINT_URLS:
            checkpoint_name = self.config_name.upper()

        else:
            raise KeyError(
                f"{self.config_name} model not found. You should supply the name of a model checkpoint for bleurt in {CHECKPOINT_URLS.keys()}"
            )

        # download the model checkpoint specified by self.config_name and set up the scorer
        model_path = dl_manager.download_and_extract(
            CHECKPOINT_URLS[checkpoint_name])
        self.scorer = score.BleurtScorer(
            os.path.join(model_path, checkpoint_name))
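CHECKPOINT_URLS here is a module-level mapping from checkpoint names to download URLs. A sketch of its likely shape; only the bleurt-base-128 URL appears elsewhere on this page, and the remaining entries are omitted:

# Illustrative shape of the mapping used above.
CHECKPOINT_URLS = {
    "bleurt-base-128": "https://storage.googleapis.com/bleurt-oss/bleurt-base-128.zip",
    # ... further checkpoints follow the same pattern
}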
Example #5
 def test_bleurt_nulls(self):
     bleurt = score.BleurtScorer()
     test_references = []
     test_candidates = []
     scores = bleurt.score(references=test_references,
                           candidates=test_candidates)
     self.assertLen(scores, 0)
Example #6
 def test_bleurt_empty(self):
     bleurt = score.BleurtScorer()
     test_references = [""]
     test_candidates = [""]
     scores = bleurt.score(references=test_references,
                           candidates=test_candidates)
     self.assertLen(scores, 1)
Example #7
 def __init__(self, model_type, model_path):
     self.config_name = model_type
     if self.config_name not in CHECKPOINT_URLS.keys():
         if self.config_name == 'default':
             logger.warning("Using default BLEURT-Base checkpoint for sequence maximum length 128. "
                            "You can use a bigger model for better results with e.g.: nlp.load_metric('bleurt', 'bleurt-large-512').")
             self.config_name = "bleurt-tiny-128"
         else:
             raise KeyError(
                 f"{self.config_name} model not found. You should supply the name of a model checkpoint for bleurt in {CHECKPOINT_URLS.keys()}")
     try:
         if model_path.endswith(self.config_name):
             self.scorer = score.BleurtScorer(model_path)
         else:
             self.scorer = score.BleurtScorer(os.path.join(model_path, self.config_name))
     except Exception as e:
         raise Exception(str(
             e) + f". You can download the checkpoint for {self.config_name} model from {CHECKPOINT_URLS[self.config_name]}")
Example #8
def score_files(generator, bleurt_checkpoint):
    """Computes BLEURT scores from a sentence pairs generator.

  Requires that a JSONL file containing both candidate and reference
  sentences or two individual candidate and reference text files be specified,
  with the former overriding the latter if both flags are specified.

  Args:
    generator: A generator yielding reference and candidate sentences.
    bleurt_checkpoint: BLEURT checkpoint used for scoring.
  """
    ref_buffer = []
    cand_buffer = []
    scores_buffer = []

    if not FLAGS.batch_same_length:
        scorer = score_lib.BleurtScorer(bleurt_checkpoint)
    else:
        logging.warning(
            "Enabling same length batching. BEWARE: this is an experimental "
            "feature.")
        scorer = score_lib.LengthBatchingBleurtScorer(bleurt_checkpoint)

    def _consume_buffer():
        scores = scorer.score(references=ref_buffer,
                              candidates=cand_buffer,
                              batch_size=FLAGS.bleurt_batch_size)
        del ref_buffer[:]
        del cand_buffer[:]
        scores_buffer.extend(scores)

    logging.info("Computing BLEURT scores...")
    for ref_sentence, cand_sentence in generator:
        ref_buffer.append(ref_sentence)
        cand_buffer.append(cand_sentence)
        if len(ref_buffer) >= FLAGS.read_buffer_size:
            _consume_buffer()
    if ref_buffer:
        _consume_buffer()
    logging.info("BLEURT scores computed.")

    if FLAGS.scores_file:
        logging.info("Writing to disk.")
        with tf.io.gfile.GFile(FLAGS.scores_file, "w+") as score_file:
            for s in scores_buffer:
                score_file.write("{}\n".format(str(s)))
    else:
        for s in scores_buffer:
            print("{}".format(str(s)))
    logging.info("Done.")
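The generator argument is expected to yield (reference, candidate) sentence pairs. A minimal sketch of one way to build such a generator from two parallel text files; the file handling here is illustrative and not taken from the BLEURT code:

def sentence_pairs(reference_path, candidate_path):
    # Yields one (reference, candidate) tuple per line of the two files.
    with open(reference_path) as ref_file, open(candidate_path) as cand_file:
        for ref_line, cand_line in zip(ref_file, cand_file):
            yield ref_line.rstrip("\n"), cand_line.rstrip("\n")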
Example #9
    def __init__(self, decode_fn):
        import tensorflow as tf
        gpus = tf.config.experimental.list_physical_devices('GPU')
        if gpus:
            try:
                for gpu in gpus:
                    tf.config.experimental.set_memory_growth(gpu, True)

            except RuntimeError as e:
                print(e)
        from bleurt import score as bleurt_score
        super(BleurtDiscriminator, self).__init__()
        checkpoint = "bleurt/bleurt-base-128"
        self.scorer = bleurt_score.BleurtScorer(checkpoint)
        self.decode_fn = decode_fn
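The class keeps decode_fn so that token IDs can be turned back into text before scoring. A purely illustrative method showing how the two pieces might be combined; it is not part of the original class:

    def score_batch(self, hyp_token_batches, ref_token_batches):
        # Hypothetical helper: decode token IDs to strings, then score with BLEURT.
        hyps = [self.decode_fn(tokens) for tokens in hyp_token_batches]
        refs = [self.decode_fn(tokens) for tokens in ref_token_batches]
        return self.scorer.score(references=refs, candidates=hyps)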
Example #10
    def run_bleurt(self):
        ''' Computes BLEURT scores between the sets of hypothesis
            and reference summaries.
        '''
        print('\n===== BLEURT =====\n')
        sys.argv = [sys.argv[0]]
        checkpoint = self.bleurt_model
        bleurt = score.BleurtScorer(checkpoint)

        for hyps_path, refs_path in zip(self.hyps_paths, self.refs_paths):
            self.load_summs(hyps_path, refs_path)
            scores = bleurt.score(self.hyps, self.refs, batch_size=64)
            self.df_scores.loc[self.df_scores['hyps_path'] == hyps_path,
                               'bleurt'] = scores
            self.save_temp_csv()
            print(np.mean(scores))

        del bleurt, scores, checkpoint
        torch.cuda.empty_cache()
Example #11
def main(_):
    multi_references = _text_reference_reader(FLAGS.reference_path)
    generations = _text_reader(FLAGS.generation_path)

    # FLAGS.bleurt_checkpoint is defined in the BLEURT library. Importing the
    # BLEURT scoring module automatically imports the flags.
    scorer = score.BleurtScorer(FLAGS.bleurt_checkpoint)
    multi_bleurt_scores = []

    for references in multi_references:
        assert len(references) == len(generations)

        # Maximize parallelism.
        bleurt_scores = scorer.score(references=references,
                                     candidates=generations)
        multi_bleurt_scores.append(bleurt_scores)

    if len(multi_references) == 1:
        avg_bleurt_score = np.mean(multi_bleurt_scores[0])
    else:
        assert len(multi_references) == 3
        avg_bleurt_scores = []
        for i in range(len(generations)):
            # All examples have at least two references but some do not have three.
            assert multi_references[0][i] and multi_references[1][i]
            r2 = multi_references[2][i]
            if r2:
                # Take average over 3 references.
                score_i = (multi_bleurt_scores[0][i] +
                           multi_bleurt_scores[1][i] +
                           multi_bleurt_scores[2][i]) / 3
            else:
                print("only two refs")
                # Take average over two references.
                score_i = (multi_bleurt_scores[0][i] +
                           multi_bleurt_scores[1][i]) / 2
            avg_bleurt_scores.append(score_i)
        avg_bleurt_score = np.mean(avg_bleurt_scores)

    print("Evaluated %d examples." % len(generations))
    print("Average BLEURT score = %.4f" % avg_bleurt_score)
Example #12
 def __init__(self, args, task):
     super().__init__(args, task)
     self.eps = args.label_smoothing
     from fairseq.sequence_generator import SequenceGenerator
     self.gen = SequenceGenerator(task.target_dictionary,
                                  beam_size=args.beam_size)
     if args.reward == "bleurt":
         from fairseq.distributed_utils import get_rank
         sys.argv = sys.argv[:1]
         my_rank = 0 if torch.cuda.device_count() <= 1 else get_rank()
         os.environ["CUDA_VISIBLE_DEVICES"] = str(my_rank % 4)
         from bleurt import score
         from transformers import cached_path
         import tensorflow as tf
         gpus = tf.config.experimental.list_physical_devices('GPU')
         if gpus:
             this_gpu = gpus[my_rank % 4]
             tf.config.set_visible_devices([this_gpu], 'GPU')
             try:
                 tf.config.experimental.set_memory_growth(this_gpu, True)
                 tf.config.experimental.set_virtual_device_configuration(
                     this_gpu, [
                         tf.config.experimental.VirtualDeviceConfiguration(
                             memory_limit=2048)
                     ])
                 logical_devices = tf.config.list_logical_devices('GPU')
                 self.logical_device = tf.device(logical_devices[0].name)
                 print("num of logical gpus", len(logical_devices))
             except RuntimeError as e:
                 print(e)
         with self.logical_device:
             self.bleurt_scorer = score.BleurtScorer(
                 os.path.join(
                     cached_path(
                         "https://storage.googleapis.com/bleurt-oss/bleurt-base-128.zip",
                         extract_compressed_file=True), "bleurt-base-128"))
Example #13
def run_bleurt(candidates: list,
               references: list,
               checkpoint: str = "bleurt/bleurt-large-512"):
    scorer = score.BleurtScorer(checkpoint)
    scores = scorer.score(references, candidates)
    return scores
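A short usage sketch for the helper above; the sentences are made up and the default checkpoint directory must exist locally:

candidates = ["a cat was sitting on the mat"]
references = ["the cat sat on the mat"]
print(run_bleurt(candidates, references))  # a list with one BLEURT score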
Example #14
 def _scoring_fun(test_df):
     scorer = score.BleurtScorer(export_dir)
     return scorer.score(references=test_df.reference.tolist(),
                         candidates=test_df.candidate.tolist())
Example #15
def evaluate_metric(metric, stem, remove_stop, prompt='overall'):
    ''' Compute the correlation between the human eval scores and the scores awarded by the
        eval metric.
    '''
    assert metric in ['ROUGE-1-F', 'ROUGE-2-F', 'ROUGE-L-F', 'bert-human', 'bert-score', 'bart-score', 
        'bleurt-base', 'bleurt-lg', 'mover-1', 'mover-2', 'mover-smd', 'bert-avg-score']
    stemmed_str = "_stem" if stem else ""
    stop_str = "_removestop" if remove_stop else ""
    ranks_file_path = os.path.join('learned_eval/outputs', 'wref_{}{}{}_{}_rank_correlation.csv'.format(metric, stemmed_str, stop_str, prompt))
    print('\n====={}=====\n'.format(ranks_file_path))

    ranks_file = open(ranks_file_path, 'w')
    ranks_file.write('article,summ_id, human_score, metric_score\n')

    sorted_scores = read_sorted_scores()
    input_articles, _ = read_articles()
    corr_data = np.zeros((len(sorted_scores), 3))

    stopwords_list = set(stopwords.words("english"))
    stemmer = PorterStemmer()

    # Init the metric
    if metric == 'bert-human':
        rewarder = Rewarder(os.path.join(MODEL_WEIGHT_DIR, 'sample.model'))
    elif metric.endswith('score'):   
        from bert_score import BERTScorer
        if 'bert-score' == metric:
            rewarder = BERTScorer(lang="en", rescale_with_baseline=True, model_type='roberta-large-mnli')
        elif 'bart-score' == metric:
            rewarder = BERTScorer(lang="en", model_type="facebook/bart-large-mnli", num_layers=12)
        elif 'bert-avg' in metric:
            r1 = BERTScorer(lang="en", rescale_with_baseline=False, model_type='roberta-large')
            r2 = BERTScorer(lang="en", rescale_with_baseline=False, model_type='albert-xxlarge-v2')
            r3 = BERTScorer(lang="en", rescale_with_baseline=False, model_type='bart-large-mnli', num_layers=12)
    elif metric.startswith('bleurt'):
        from bleurt import score
        if 'base' in metric: 
            checkpoint = "bleurt-base-512"
        elif 'lg' in metric: 
            checkpoint = "bleurt-large-512"
        rewarder = score.BleurtScorer(checkpoint)
    elif metric.startswith('mover'):
        from moverscore import get_idf_dict, word_mover_score
        hyps = [s['sys_summ'] for score in sorted_scores.values() for s in score if s['sys_name'] != 'reference']
        refs = [s['sys_summ'] for score in sorted_scores.values() for s in score if s['sys_name'] == 'reference']
        idf_dict_hyp = get_idf_dict(hyps)
        idf_dict_ref = get_idf_dict(refs)
    elif 'rouge' in metric.lower():
        from rouge_score import rouge_scorer
        from rouge_score.scoring import BootstrapAggregator

    # Loop over each article and compute the correlation between human judgement
    # and the metric scores. 
    for i, (article_id, scores) in tqdm(enumerate(sorted_scores.items())):
        scores_list = [s for s in scores if s['sys_name'] != 'reference']
        human_ranks = [s['scores'][prompt] for s in scores_list]
        if len(human_ranks) < 2: 
            continue    # Must be at least 2 scores to compute the correlation
        ref_summ = scores_list[0]['ref']
        article = [entry['article'] for entry in input_articles if entry['id']==article_id][0]

        # Pre-processing (if necessary)
        if stem and remove_stop:
            sys_summs = [" ".join(sent2stokens_wostop(s['sys_summ'], stemmer, stopwords_list, 'english', True)) for s in scores_list]
            ref_summ = " ".join(sent2stokens_wostop(ref_summ, stemmer, stopwords_list, 'english', True))
            article = " ".join(sent2stokens_wostop(article, stemmer, stopwords_list, 'english', True))
        elif not stem and remove_stop:
            sys_summs = [" ".join(sent2tokens_wostop(s['sys_summ'], stopwords_list, 'english', True)) for s in scores_list]
            ref_summ = " ".join(sent2tokens_wostop(ref_summ, stopwords_list, 'english', True))
            article = " ".join(sent2tokens_wostop(article, stopwords_list, 'english', True))
        elif not remove_stop and stem:
            sys_summs = [" ".join(sent2stokens(s['sys_summ'], stemmer, 'english', True)) for s in scores_list]
            ref_summ = " ".join(sent2stokens(ref_summ, stemmer, 'english', True))
            article = " ".join(sent2stokens(article, stemmer, 'english', True))
        else:
            sys_summs = [s['sys_summ'] for s in scores_list]

        # Clean summaries
        summ_ids = [s['summ_id'] for s in scores_list]
        sys_summs = [text_normalization(s) for s in sys_summs]
        ref_summ = text_normalization(ref_summ)
        article = text_normalization(article)

        # Compute metric scores
        if 'rouge' in metric.lower():
            auto_metric_ranks = []
            if '1' in metric:
                rouge_metric = 'rouge1'
            elif '2' in metric:
                rouge_metric = 'rouge2'
            elif 'L' in metric:
                rouge_metric = 'rougeL'
            rew_rouge = rouge_scorer.RougeScorer([rouge_metric], use_stemmer=True)
            for ss in sys_summs:
                ss = ss.replace('. ', '\n')
                ref_summ = ref_summ.replace('. ', '\n')
                score = rew_rouge.score(ref_summ, ss)
                auto_metric_ranks.append(score[rouge_metric].fmeasure)
        if metric == 'bert-human':
            auto_metric_ranks = [rewarder(ref_summ,ss) for ss in sys_summs]
        elif metric.endswith('score'):   
            if 'bert-score' == metric:
                auto_metric_ranks = [rewarder.score([ref_summ], [ss])[-1].item() for ss in sys_summs]
            elif 'bart-score' == metric:
                auto_metric_ranks = [rewarder.score([ref_summ], [ss])[-1].item() for ss in sys_summs]
            elif 'bert-avg' in metric:
                rewarder_scores = []
                for rewarder in [r1, r2, r3]:
                    r_scores = np.array([rewarder.score([ref_summ], [ss])[-1].item() for ss in sys_summs])
                    r_scores = (r_scores - np.min(r_scores)) / (np.max(r_scores) - np.min(r_scores))
                    rewarder_scores.append(r_scores)
                auto_metric_ranks = list(np.mean(rewarder_scores, axis=0))
        elif metric.startswith('bleurt'):
            auto_metric_ranks = [rewarder.score([ref_summ], [ss])[0] for ss in sys_summs]
        elif metric.startswith('mover'):
            if '1' in metric: 
                n_gram = 1
            elif '2' in metric: 
                n_gram = 2
            else: 
                raise ValueError("smd not implemented currently")
            auto_metric_ranks = [word_mover_score([ref_summ], [ss], idf_dict_ref, idf_dict_hyp,
                                stop_words=[], n_gram=n_gram, remove_subwords=True)[0] for ss in sys_summs]
   
        for sid, amr, hr in zip(summ_ids, auto_metric_ranks, human_ranks):
            ranks_file.write('{},{},{:.2f},{:.4f}\n'.format(article_id, sid, hr, amr))

        # Compute correlations
        spearmanr_result = spearmanr(human_ranks, auto_metric_ranks)
        pearsonr_result = pearsonr(human_ranks, auto_metric_ranks)
        kendalltau_result = kendalltau(human_ranks, auto_metric_ranks)
        corr_data[i, :] = [spearmanr_result[0], pearsonr_result[0], kendalltau_result[0]]

    corr_mean_all = np.nanmean(corr_data, axis=0)
    corr_std_all = np.nanstd(corr_data, axis=0)
    print('\n====={}=====\n'.format(ranks_file_path))
    print("Correlation mean on all data spearman/pearsonr/kendall: {}".format(corr_mean_all))
    print("Correlation std on all data spearman/pearsonr/kendall: {}".format(corr_std_all))

    ranks_file.flush()
    ranks_file.close()

    return ranks_file_path
Example #16
from bleurt import score
checkpoint = "bleurt/test_checkpoint"
references = []
candidates = []
references.append(input('Enter the reference sentence: '))
candidates.append(input('Enter the candidate sentence: '))
scorer = score.BleurtScorer(checkpoint)
scores = scorer.score(references=references, candidates=candidates)
assert type(scores) == list and len(scores) == 1
print(scores)
Example #17
 def _scoring_fun(test_df):
     scorer = score.BleurtScorer(export_dir)
     return scorer.score(test_df.reference, test_df.candidate)
Example #18
 def __init__(self):
     self.scorer = score.BleurtScorer()
Example #19
def main(args, checkpoint_name="best"):
    assert args.path is not None, '--path required for generation!'
    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert args.replace_unk is None or args.raw_text, \
        '--replace-unk requires a raw text dataset (--raw-text)'
    
    if args.max_tokens is None and args.max_sentences is None:
        args.max_tokens = 12000
    print(args)
    
    use_cuda = torch.cuda.is_available() and not args.cpu
    torch.manual_seed(args.seed)

    # Load dataset splits
    task = tasks.setup_task(args)
    task.load_dataset(args.gen_subset)
    print('| {} {} {} examples'.format(args.data, args.gen_subset, len(task.dataset(args.gen_subset))))
    args.taskobj = task

    sys.argv = sys.argv[:1]
    import tensorflow as tf
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        try:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
        except RuntimeError as e:
            print(e)
    bleurt_scorer = score.BleurtScorer(os.path.join(
        cached_path(
            "https://storage.googleapis.com/bleurt-oss/bleurt-base-128.zip",
            extract_compressed_file=True
        ), "bleurt-base-128"
    ))
    # Set dictionaries
    #src_dict = task.source_dictionary
    tgt_dict = task.target_dictionary
    dict = tgt_dict
    
    # Load decoding strategy
    strategy = strategies.setup_strategy(args)

    # Load ensemble
    if args.path.startswith("nsml://"):
        print("| loading nsml checkpoint", args.path)
        import nsml
        session = args.path.replace("nsml://", "")
        model = task.build_model(args)
        def load(dir_path):
            state = torch.load(os.path.join(dir_path, 'best.pt'))
            state_dict = state["model"]
            model.load_state_dict(state_dict)
            print("loaded")
        nsml.load(args.checkpoint_name, load_fn=load, session=session)
        models = [model.cuda()]
    elif args.path == "pretrain":
        from nsml import DATASET_PATH
        from fairseq import checkpoint_utils
        data_token = "en-de"
        pretrained_path = "{}/train/pretrained_models/maskPredict_{}/checkpoint_best.pt".format(DATASET_PATH, data_token.split(".")[-1].replace("-", "_"))
        print("| loading", pretrained_path)
        model = task.build_model(args)
        state = checkpoint_utils.load_checkpoint_to_cpu(pretrained_path)
        model.load_state_dict(state["model"], strict=True)
        models = [model.cuda()]
    elif args.path.startswith("wb://"):
        print("| loading wb checkpoint", args.path)
        import wandb
        wandb.restore("best.pt", args.path.replace("wb://", ""), root="/tmp/")
        assert os.path.exists("/tmp/best.pt")
        state = torch.load("/tmp/best.pt")
        model = task.build_model(args)
        model.load_state_dict(state["model"])
        models = [model.cuda()]
    elif args.path.startswith("http://"):
        print("| loading http checkpoint", args.path)
        url = "http://trains.deeplearn.org:8081/{}".format(args.path.replace("http://", ""))
        os.system("curl -o /tmp/model.pt {}".format(url))
        state = torch.load("/tmp/model.pt")
        model = task.build_model(args)
        model.load_state_dict(state["model"])
        models = [model.cuda()]
    else:
        print('| loading model(s) from {}'.format(args.path))
        models, _ = utils.load_ensemble_for_inference(args.path.split(':'), task, model_arg_overrides=eval(args.model_overrides))
        models = [model.cuda() for model in models]

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[model.max_positions() for model in models]
        ),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=8,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
    ).next_epoch_itr(shuffle=False)
    
    results = []
    scorer = pybleu.PyBleuScorer()
    num_sentences = 0
    has_target = True
    timer = TimeMeter()

    with progress_bar.build_progress_bar(args, itr) as t:

        translations = generate_batched_itr(t, strategy, models, tgt_dict, length_beam_size=args.length_beam, use_gold_target_len=args.gold_target_len)
        for sample_id, src_tokens, target_tokens, hypos in translations:
            has_target = target_tokens is not None
            target_tokens = target_tokens.int().cpu() if has_target else None

            # Either retrieve the original sentences or regenerate them from tokens.
            if align_dict is not None:
                src_str = task.dataset(args.gen_subset).src.get_original_text(sample_id)
                target_str = task.dataset(args.gen_subset).tgt.get_original_text(sample_id)
            else:
                src_str = dict.string(src_tokens, args.remove_bpe)
                if args.dehyphenate:
                    src_str = dehyphenate(src_str)
                if has_target:
                    target_str = dict.string(target_tokens, args.remove_bpe, escape_unk=True)
                    if args.dehyphenate:
                        target_str = dehyphenate(target_str)

            if not args.quiet or True:
                # print('S-{}\t{}'.format(sample_id, src_str))
                if has_target:
                    # print('T-{}\t{}'.format(sample_id, target_str))
                    hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                        hypo_tokens=hypos.int().cpu(),
                        src_str=src_str,
                        alignment= None,
                        align_dict=align_dict,
                        tgt_dict=dict,
                        remove_bpe=args.remove_bpe,
                    )
                    if args.dehyphenate:
                        hypo_str = dehyphenate(hypo_str)

                    if not args.quiet:
                        print('H-{}\t{}'.format(sample_id, hypo_str))
                        if args.print_alignment:
                            print('A-{}\t{}'.format(
                                sample_id,
                                ' '.join(map(lambda x: str(utils.item(x)), alignment))
                            ))
                        # print()
                        
                        # Score only the top hypothesis
                        if has_target:
                            if align_dict is not None or args.remove_bpe is not None:
                                # Convert back to tokens for evaluation with unk replacement and/or without BPE
                                target_tokens = tgt_dict.encode_line(target_str, add_if_not_exist=True)

                    results.append((target_str, hypo_str))
                    num_sentences += 1
        if has_target:
            print('Time = {}'.format(timer.elapsed_time))
            ref, out = zip(*results)
            from fairseq.criterions.lib_sbleu import smoothed_bleu
            sbleu = np.mean([smoothed_bleu(p[0].split(), p[1].split()) for p in results])
            print("| SBLEU = {:.2f}".format(sbleu))
            bleurt_scores = bleurt_scorer.score([p[0] for p in results], [p[1] for p in results])
            print("| BLEURT = {:.4f}".format(np.mean((np.array(bleurt_scores)))))
            print('| Generate {} with beam={}: BLEU4 = {:2.2f}, '.format(args.gen_subset, args.length_beam, scorer.score(ref, out)))
Example #20
def get_scores(nrows, metrics=None):
    ''' Get correlations between metric similarity and label similarity '''
    df = pd.read_csv(QQP_DATA_PATH, nrows=nrows)
    start_time = time()
    if not metrics:
        metrics = [
            'mover-1',
            'mover-2',
            'bleurt',
            'bertscore',
            'bartscore',
            'rouge1',
            'rouge2',
            'rougeLsum',
        ]
    for m in tqdm(metrics):
        if m.startswith('rouge'):
            scorer = rouge_scorer.RougeScorer(
                [met for met in metrics if met.startswith('rouge')],
                use_stemmer=True)
            scores = [
                scorer.score(r, c)[m].fmeasure
                for c, r in zip(df.question1, df.question2)
            ]
        elif m == 'bertscore':
            scorer = BERTScorer(lang="en",
                                rescale_with_baseline=True,
                                model_type='roberta-large-mnli')
            _, _, scores = scorer.score(df.question1.tolist(),
                                        df.question2.tolist())
        elif m == 'bartscore':
            scorer = BERTScorer(lang="en",
                                model_type="facebook/bart-large-mnli",
                                num_layers=12)
            _, _, scores = scorer.score(df.question1.tolist(),
                                        df.question2.tolist())
        elif m == 'bleurt':
            checkpoint = "bleurt-large-512"
            scorer = score.BleurtScorer(checkpoint)
            scores = scorer.score(df.question1, df.question2, batch_size=50)
        elif m.startswith('mover'):
            # Truncate long questions else moverscore gets OOM
            q1 = df['question1'].apply(lambda s: s[:300]).tolist()
            q2 = df['question2'].apply(lambda s: s[:300]).tolist()
            idf_dict_hyp = get_idf_dict(q1)
            idf_dict_ref = get_idf_dict(q2)
            if '1' in m:
                n_gram = 1
            else:
                n_gram = 2
            scores = word_mover_score(q2,
                                      q1,
                                      idf_dict_ref,
                                      idf_dict_hyp,
                                      stop_words=[],
                                      n_gram=n_gram,
                                      remove_subwords=True,
                                      batch_size=64)

        df[m] = scores
        print('\n' * 10, m, '\n' * 10)
        df.to_csv(QQP_OUT_PATH)
Example #21
        n = n.replace('word.embeddings', 'word_embeddings')
        n = n.replace('position.embeddings', 'position_embeddings')
        n = n.replace('token.type.embeddings', 'token_type_embeddings')
        n = n + '.weight'
    state_dict[n] = torch.from_numpy(data)

config = transformers.BertConfig()
bleurt_model = BleurtModel(config)
bleurt_model.load_state_dict(
    state_dict, strict=False)  # strict=False added otherwise crashes.
# Should be safe, according to this https://github.com/huggingface/transformers/issues/6882#issuecomment-884730078
for param in bleurt_model.parameters():
    param.requires_grad = False
bleurt_model.eval()

scorer = bleurt_score.BleurtScorer(checkpoint)

## this is what the answer should be (using bleurt's python API)
references = ["a bird chirps by the window and decided to quit"]
candidates = ["a bird chirps by the window but then changed its mind"]
scores = scorer.score(references=references, candidates=candidates)
print(scores)

with open(f'{checkpoint}/bleurt_config.json', 'r') as f:
    bleurt_config = json.load(f)

max_seq_length = bleurt_config["max_seq_length"]
vocab_file = f'{checkpoint}/{bleurt_config["vocab_file"]}'
do_lower_case = bleurt_config["do_lower_case"]

# tokenizer = bleurt.lib.tokenization.FullTokenizer(    # this is what was before
Example #22
 def test_default_bleurt_score(self):
     bleurt = score.BleurtScorer()
     scores = bleurt.score(references=references, candidates=candidates)
     self.assertLen(scores, 2)
     self.assertAllClose(scores, ref_scores)
Example #23
 def test_positional_args_error(self):
     bleurt = score.BleurtScorer()
     with self.assertRaises(AssertionError):
         _ = bleurt.score(references, candidates)
Example #24
 def test_bleurt_score_with_checkpoint(self):
     checkpoint = get_test_checkpoint()
     bleurt = score.BleurtScorer(checkpoint)
     scores = bleurt.score(references=references, candidates=candidates)
     self.assertLen(scores, 2)
     self.assertAllClose(scores, ref_scores)
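The three tests above rely on module-level fixtures defined in the BLEURT test file. A placeholder sketch of their shape; the sentences and expected scores below are made up, not the real regression values:

references = ["An apple a day keeps the doctor away.",
              "An apple a day keeps the doctor away."]
candidates = ["An apple a day keeps the doctor away.",
              "An apple a day keeps doctors away."]
ref_scores = [0.9, 0.8]  # placeholder expected values for assertAllClose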
Example #25
def main(args):
    global score
    assert args.path is not None, '--path required for generation!'
    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert args.replace_unk is None or args.raw_text, \
        '--replace-unk requires a raw text dataset (--raw-text)'

    utils.import_user_module(args)

    if args.max_tokens is None and args.max_sentences is None:
        args.max_tokens = 12000
    print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    if args.reward == "bleurt" or args.eval_bleurt:
        sys.argv = sys.argv[:1]
        import tensorflow as tf
        gpus = tf.config.experimental.list_physical_devices('GPU')
        if gpus:
            try:
                for gpu in gpus:
                    tf.config.experimental.set_memory_growth(gpu, True)
            except RuntimeError as e:
                print(e)
        bleurt_scorer = score.BleurtScorer(
            os.path.join(
                cached_path(
                    "https://storage.googleapis.com/bleurt-oss/bleurt-base-128.zip",
                    extract_compressed_file=True), "bleurt-base-128"))

    # Load dataset splits
    task = tasks.setup_task(args)
    task.load_dataset(args.gen_subset)

    # Set dictionaries
    try:
        src_dict = getattr(task, 'source_dictionary', None)
    except NotImplementedError:
        src_dict = None
    tgt_dict = task.target_dictionary

    # Load ensemble
    print('| loading model(s) from {}'.format(args.path))
    if args.path.startswith("nsml://"):
        # NSML
        session = args.path.replace("nsml://", "")
        model = task.build_model(args)
        if ".pt" in session:
            session = session.replace(".pt", "")
            session, checkpoint_name = session.rsplit("/", 1)
        else:
            checkpoint_name = "best"
        if "-" in checkpoint_name:
            start, end = checkpoint_name.replace("epoch", "").split("-")
            checkpoints = [
                "epoch{}".format(i) for i in range(int(start),
                                                   int(end) + 1)
            ]
            print("| checkpoint average:", checkpoints)
            state_dict = None

            def load(dir_path):
                nonlocal state_dict, checkpoints
                state = torch.load(os.path.join(dir_path, 'best.pt'))
                model_state = state["model"]
                for k in model_state:
                    model_state[k] = model_state[k] / float(len(checkpoints))
                if state_dict is None:
                    state_dict = model_state
                else:
                    for k in state_dict:
                        state_dict[k] += model_state[k]
                print("checkpoint loaded")

            for checkpoint_name in checkpoints:
                nsml.load(checkpoint_name, load_fn=load, session=session)
            model.load_state_dict(state_dict)
        else:

            def load(dir_path):
                state = torch.load(os.path.join(dir_path, 'best.pt'))
                state_dict = state["model"]
                model.load_state_dict(state_dict)
                print("loaded")

            nsml.load(checkpoint_name, load_fn=load, session=session)
        models = [model.cuda()]
    elif "-" in args.path:
        model = task.build_model(args)
        print("loading model from", args.path)
        state_dict = None
        dir_path = os.path.dirname(args.path)
        fn = os.path.basename(args.path)
        if "-" in fn:
            start, end = fn.replace("epoch", "").replace(".pt", "").split("-")
            checkpoint_fns = [
                "epoch{}.pt".format(i) for i in range(int(start),
                                                      int(end) + 1)
            ]
        else:
            checkpoint_fns = [fn]
        for fn in checkpoint_fns:
            state = torch.load(os.path.join(dir_path, fn))
            model_state = state["model"]
            for k in model_state:
                model_state[k] = model_state[k] / float(len(checkpoint_fns))
            if state_dict is None:
                state_dict = model_state
            else:
                for k in state_dict:
                    state_dict[k] += model_state[k]
            print("checkpoint loaded")
        model.load_state_dict(state_dict)
        models = [model.cuda()]
    else:
        model = task.build_model(args)
        state = torch.load(args.path)
        model_state = state["model"]
        model.load_state_dict(model_state)
        models = [model.cuda()]

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
            *[model.max_positions() for model in models]),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=args.required_batch_size_multiple,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)

    # Initialize generator
    gen_timer = StopwatchMeter()
    generator = task.build_generator(args)

    # Generate and compute BLEU score
    # if args.sacrebleu:
    #     scorer = bleu.SacrebleuScorer()
    # else:
    #     scorer = bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk())
    scorer = pybleu.PyBleuScorer()
    num_sentences = 0
    has_target = True
    results = []
    best_rank_list = []
    if args.save_path:
        outf = open(args.save_path, "w")
    total_n = 0
    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        for sample in t:
            sample = utils.move_to_cuda(sample) if use_cuda else sample
            if 'net_input' not in sample:
                continue

            prefix_tokens = None
            if args.prefix_size > 0:
                prefix_tokens = sample['target'][:, :args.prefix_size]

            gen_timer.start()
            hypos = task.inference_step(generator, models, sample,
                                        prefix_tokens)
            num_generated_tokens = sum(len(h[0]['tokens']) for h in hypos)
            gen_timer.stop(num_generated_tokens)

            hypo_target_pairs = []
            for i, sample_id in enumerate(sample['id'].tolist()):
                total_n += 1
                has_target = sample['target'] is not None

                # Remove padding
                src_tokens = utils.strip_pad(
                    sample['net_input']['src_tokens'][i, :], tgt_dict.pad())
                target_tokens = None
                if has_target:
                    target_tokens = utils.strip_pad(
                        sample['target'][i, :], tgt_dict.pad()).int().cpu()

                # Either retrieve the original sentences or regenerate them from tokens.
                if align_dict is not None:
                    src_str = task.dataset(
                        args.gen_subset).src.get_original_text(sample_id)
                    target_str = task.dataset(
                        args.gen_subset).tgt.get_original_text(sample_id)
                else:
                    if src_dict is not None:
                        src_str = src_dict.string(src_tokens, args.remove_bpe)
                    else:
                        src_str = ""
                    if has_target:
                        target_str = tgt_dict.string(target_tokens,
                                                     args.remove_bpe,
                                                     escape_unk=True)

                if not args.quiet:
                    if src_dict is not None:
                        print('S-{}\t{}'.format(sample_id, src_str))
                    if has_target:
                        print('T-{}\t{}'.format(sample_id, target_str))

                if args.reward_sample or args.reward_check:
                    # Get sample
                    hypo_strs = []
                    rewards = []
                    for j, hypo in enumerate(hypos[i]):
                        hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                            hypo_tokens=hypo['tokens'].int().cpu(),
                            src_str=src_str,
                            alignment=None,
                            align_dict=align_dict,
                            tgt_dict=tgt_dict,
                            remove_bpe=None,
                        )
                        hypo_strs.append(hypo_str)
                    if args.reward == "sbleu":
                        for hypo_str in hypo_strs:
                            hypo_str_nobpe = hypo_str.replace("@@ ", "")
                            rewards.append(
                                compute_reward(hypo_str_nobpe, target_str))
                        best_idx = np.array(rewards).argmax()
                        if args.reward_check:
                            best_rank_list.append(best_idx)
                        if args.save_path:
                            if args.output_all:
                                for hypo_i in range(len(hypo_strs)):
                                    outf.write("{} | {:.4f} | {}\n".format(
                                        sample_id, rewards[hypo_i],
                                        hypo_strs[hypo_i]))
                            else:
                                outf.write("{} | {}\n".format(
                                    sample_id, hypo_strs[best_idx]))
                        else:
                            if args.output_all:
                                for hypo_i in range(len(hypo_strs)):
                                    print("{} | {:.4f} | {}".format(
                                        sample_id, rewards[hypo_i],
                                        hypo_strs[hypo_i]))
                            else:
                                print("{} | {}".format(sample_id,
                                                       hypo_strs[best_idx]))
                            sys.stdout.flush()
                    elif args.reward == "bleurt":
                        hypo_target_pairs.append(
                            (sample_id, target_str, hypo_strs))
                else:
                    # Normal translation
                    # Process top predictions
                    for j, hypo in enumerate(hypos[i][:args.nbest]):
                        hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                            hypo_tokens=hypo['tokens'].int().cpu(),
                            src_str=src_str,
                            alignment=hypo['alignment'].int().cpu()
                            if hypo['alignment'] is not None else None,
                            align_dict=align_dict,
                            tgt_dict=tgt_dict,
                            remove_bpe=args.remove_bpe,
                        )

                        if not args.quiet:
                            print('H-{}\t{}\t{}'.format(
                                sample_id, hypo['score'], hypo_str))
                            print('P-{}\t{}'.format(
                                sample_id, ' '.join(
                                    map(
                                        lambda x: '{:.4f}'.format(x),
                                        hypo['positional_scores'].tolist(),
                                    ))))

                            if args.print_alignment:
                                print('A-{}\t{}'.format(
                                    sample_id, ' '.join(
                                        map(lambda x: str(utils.item(x)),
                                            alignment))))

                        # Score only the top hypothesis
                        results.append(
                            (sample_id, target_str, hypo_str,
                             float(hypo["positional_scores"].mean())))
                        if has_target and j == 0 and not args.reward_sample:
                            pass
                            # if align_dict is not None or args.remove_bpe is not None:
                            # Convert back to tokens for evaluation with unk replacement and/or without BPE
                            # target_tokens = tgt_dict.encode_line(target_str, add_if_not_exist=True)
                            # if args.save_path:
                            #     outf.write("{} | {}\n".format(sample_id, hypo_str))
                            # if j == 0 and not args.no_eval:
                            #     results.append((sample_id, target_str, hypo_str))
                            # if hasattr(scorer, 'add_string'):
                            #     scorer.add_string(target_str, hypo_str)
                            # else:
                            #     scorer.add(target_tokens, hypo_tokens)
            if args.save_amount > 0 and total_n > args.save_amount:
                break
            if args.reward_sample and bool(hypo_target_pairs):
                hypo_batch = []
                target_batch = []
                for _, target, hypo_strs in hypo_target_pairs:
                    hypo_batch.extend(
                        [h.replace("@@ ", "") for h in hypo_strs])
                    # Use this pair's own target rather than target_str left
                    # over from the per-sample loop above.
                    target_batch.extend([target] * len(hypo_strs))
                rewards = np.array(
                    bleurt_scorer.score(target_batch, hypo_batch))
                base_i = 0
                for sample_id, _, hypo_strs in hypo_target_pairs:
                    start = base_i
                    end = base_i + len(hypo_strs)
                    best_idx = rewards[start:end].argmax()
                    if args.save_path:
                        if args.output_all:
                            for idx in range(start, end):
                                outf.write("{} | {:.4f} | {}\n".format(
                                    sample_id, float(rewards[idx]),
                                    hypo_strs[idx - start]))
                        else:
                            outf.write("{} | {}\n".format(
                                sample_id, hypo_strs[best_idx]))
                    else:
                        if args.output_all:
                            for idx in range(start, end):
                                print("{} | {:.4f} | {}".format(
                                    sample_id, float(rewards[idx]),
                                    hypo_strs[idx - start]))
                        else:
                            print("{} | {}".format(sample_id,
                                                   hypo_strs[best_idx]))
                        sys.stdout.flush()
                    base_i += len(hypo_strs)
            wps_meter.update(num_generated_tokens)
            t.log({'wps': round(wps_meter.avg)})
            num_sentences += sample['nsentences']

    print(
        '| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)'
        .format(num_sentences, gen_timer.n, gen_timer.sum,
                num_sentences / gen_timer.sum, 1. / gen_timer.avg))
    if args.save_path and not args.reward_check and not args.reward_sample:
        results.sort()
        for sample_id, tgt, hyp, score in results:
            outf.write("{}\t{}\t{}\n".format(sample_id, score, hyp))
        print("results saved to", args.save_path)

    if args.reward_check:
        print("avg ranking of the best sample:",
              np.array(best_rank_list).mean())
        print("ratio of best sample ranked in the top:",
              (np.array(best_rank_list) == 0).mean())
    if has_target and not args.reward_sample and not args.reward_check and not args.no_eval:
        _, ref, out, _ = zip(*results)
        from fairseq.criterions.lib_sbleu import smoothed_bleu
        sbleu = np.mean(
            [smoothed_bleu(p[1].split(), p[2].split()) for p in results])
        print("| SBLEU = {:.2f}".format(sbleu))
        if args.eval_bleurt:
            bleurt_scores = bleurt_scorer.score(
                references=[p[1] for p in results],
                candidates=[p[2] for p in results])
            print("| BLEURT = {:.4f}".format(np.mean(
                (np.array(bleurt_scores)))))
        print('| Generate {} with beam={}: {}'.format(args.gen_subset,
                                                      args.beam,
                                                      scorer.score(ref, out)))
    return scorer
Example #26
import tensorflow as tf
tf.compat.v1.flags.DEFINE_integer('batch_size', 1, 'batch_size')
tf.compat.v1.flags.DEFINE_float("lr", 5e-5, "learning rate")
tf.compat.v1.flags.DEFINE_integer("seed", 42, "seed to replicate results")
tf.compat.v1.flags.DEFINE_integer("n_gpu", 1, "no of gpu available")
tf.compat.v1.flags.DEFINE_integer("gradient_accumulation_steps", 32, "gradient_accumulation_steps")
tf.compat.v1.flags.DEFINE_integer("num_workers", 4, "num of cpus available")
tf.compat.v1.flags.DEFINE_integer("device", -1, "torch.device object")
tf.compat.v1.flags.DEFINE_integer("num_train_epochs", 5, "no of epochs of training")
tf.compat.v1.flags.DEFINE_string("output_dir", './output', "path to save evaluation results")
tf.compat.v1.flags.DEFINE_string("model_dir", './weights', "path to save trained model")
tf.compat.v1.flags.DEFINE_float("max_grad_norm", 1.0, "max gradient norm.")
tf.compat.v1.flags.DEFINE_string("root_dir", './CNN-DM/gpt2_1024_data', "location of json dataset.")
tf.compat.v1.flags.DEFINE_string("ids_file", './CNN-DM/ids.json', "location of train, valid and test file indexes")
from bleurt import score
from rouge_score import rouge_scorer
rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
with tf.device('cpu'):
    bleurt_scorer = score.BleurtScorer('../bleurt/bleurt/bleurt-base-512')


def calc_metrics(reference, candidate):
    r_scores = rouge_scorer.score(reference, candidate)
    b_score = bleurt_scorer.score([reference], [candidate], batch_size=1)
    metrics = {'r1': r_scores['rouge1'][2],
               'r2': r_scores['rouge2'][2],
               'rl': r_scores['rougeL'][2],
               'bleurt': b_score[0]}
    return metrics
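A quick usage sketch for calc_metrics; the strings are made up:

# Returns a dict with rouge1/rouge2/rougeL F-measures and one BLEURT score.
print(calc_metrics("the cat sat on the mat", "a cat was on the mat"))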
Example #27
 def __init__(self, data, bleurt_device):
     self.data = data
     self.bleurt_device = bleurt_device
     with tf.device(self.bleurt_device):
         self.bleurt_scorer = bleurt_score.BleurtScorer(bleurt_model)
     self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)