Example 1: testing metric scores for normality with the Shapiro-Wilk test
def main(args):
    metrics_list = load_metrics(args.input_jsonls)
    metrics_list = merge_metrics(metrics_list)
    for metrics in metrics_list:
        metrics.flatten_keys()
    metrics_list = filter_metrics(metrics_list, 'peer', *args.metrics)
    for metrics in metrics_list:
        metrics.select_metrics(args.metrics)
        metrics.average_values()

    Xs = convert_to_matrices(metrics_list, *args.metrics)
    results = {}
    for name, X in zip(args.metrics, Xs):
        X_global = X.flatten()
        X_system = X.mean(axis=1)
        X_summaries = [X[:, j] for j in range(X.shape[1])]

        # Shapiro-Wilk p-values at each granularity; a p-value <= 0.05 rejects
        # the null hypothesis that the scores are normally distributed.
        p_global = shapiro(X_global)[1]
        p_system = shapiro(X_system)[1]
        p_summaries = [shapiro(X_j)[1] for X_j in X_summaries]

        # Proportion of per-summary columns for which normality is rejected.
        summaries_count = sum(1 for p in p_summaries if p <= 0.05)
        summary_prop = summaries_count / X.shape[1]

        results[name] = {
            'global': p_global,
            'system': p_system,
            'summary_proportion': summary_prop
        }

    os.makedirs(os.path.dirname(args.output_file), exist_ok=True)
    with open(args.output_file, 'w') as out:
        out.write(json.dumps(results, indent=2))
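A minimal sketch of the imports and command-line wiring Example 1 appears to assume. The argument names are taken from the code above; the argparse setup is hypothetical, and the project helpers (load_metrics, merge_metrics, filter_metrics, convert_to_matrices) live elsewhere in the author's codebase, so their import paths are not shown.

# Hypothetical wiring for Example 1. The standard-library and SciPy imports are
# exact; the argparse flags are illustrative guesses that map onto the args.*
# attributes used above.
import argparse
import json
import os

from scipy.stats import shapiro

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--input-jsonls', nargs='+')   # -> args.input_jsonls
    parser.add_argument('--metrics', nargs='+')        # -> args.metrics
    parser.add_argument('--output-file')               # -> args.output_file
    main(parser.parse_args())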
Example 2: estimating the statistical power of correlation significance tests
def main(args):
    metrics_list = load_metrics(args.metrics_jsonls)
    metrics_list = merge_metrics(metrics_list)
    for metrics in metrics_list:
        metrics.flatten_keys()
    metrics_list = filter_metrics(metrics_list, 'peer', args.ground_truth,
                                  'ROUGE-1')
    for metrics in metrics_list:
        metrics.select_metrics([args.ground_truth, 'ROUGE-1'])
        metrics.average_values()
    # /dev/shm is an in-memory tmpfs; the intermediate files below are written
    # there so the parallel workers can reload them without disk I/O.
    metrics_file = '/dev/shm/metrics.jsonl'
    _save_to_jsonl(metrics_list, metrics_file)

    instances = JsonlReader(args.summaries_jsonl).read()
    instances = _filter_instances(instances, metrics_list)
    instances = preprocess_instances_for_rouge_simulation(instances)
    summaries_file = '/dev/shm/summaries.json'
    _save_to_json(instances, summaries_file)

    # Simulation settings: number of simulated datasets, significance level,
    # and the base random seed (advanced after every batch of jobs).
    num_iterations = 1000
    alpha = 0.05
    seed = 5

    for level_name, level in zip(['system_level', 'summary_level'],
                                 [system_level_corr, summary_level_corr]):
        results_dict = defaultdict(lambda: defaultdict(dict))
        for coef_name, coef_func in zip(['pearson'], [pearsonr]):
            corr_func = functools.partial(level, coef_func)
            for proportion in [
                    0.05, 0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90,
                    1.00
            ]:
                for method in [
                        'williams', 'bootstrap-both', 'permutation-both'
                ]:
                    job_results = Parallel(n_jobs=args.num_processes)(
                        delayed(_run_simulation)(summaries_file, metrics_file,
                                                 corr_func, proportion, method,
                                                 args.ground_truth,
                                                 args.rouge_variant, alpha,
                                                 seed + i)
                        for i in range(num_iterations))
                    # Power: the fraction of simulated datasets on which the
                    # significance test rejected the null hypothesis.
                    power = sum(job_results) / len(job_results)
                    seed += len(job_results)
                    print(level_name, coef_name, method, proportion, power)
                    results_dict[coef_name][method][proportion] = power

        os.makedirs(args.output_dir, exist_ok=True)
        with open(f'{args.output_dir}/{level_name}.json', 'w') as out:
            out.write(json.dumps(results_dict, indent=2))
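Example 2's parallel loop relies on joblib. A sketch of the standard-library and third-party imports it needs; the project-specific helpers (load_metrics, merge_metrics, filter_metrics, JsonlReader, system_level_corr, summary_level_corr, _run_simulation, and the _save_* functions) are assumed to be defined elsewhere in the author's package and are not reproduced.

# Imports assumed by Example 2. joblib's Parallel/delayed and scipy.stats.pearsonr
# exist with the signatures used above; everything else referenced in the example
# comes from the author's own modules.
import functools
import json
import os
from collections import defaultdict

from joblib import Parallel, delayed
from scipy.stats import pearsonr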
Example 3: simulating correlation tests across coefficients and resampling methods
def main(args):
    metrics_list = load_metrics(args.metrics_jsonls)
    metrics_list = merge_metrics(metrics_list)

    for metrics in metrics_list:
        metrics.flatten_keys()

    metrics_list = filter_metrics(metrics_list, 'peer', *args.metrics)
    for metrics in metrics_list:
        metrics.select_metrics(args.metrics)
        metrics.average_values()
    X, Y = convert_to_matrices(metrics_list, *args.metrics)

    num_iterations = 1000
    alpha = 0.05
    seed = 10

    results_dict = defaultdict(lambda: defaultdict(dict))
    for coef_name, coef_func in zip(['pearson', 'spearman', 'kendall'],
                                    [pearsonr, spearmanr, kendalltau]):
        for level_name, level in zip(['system_level', 'summary_level'],
                                     [system_level_corr, summary_level_corr]):
            corr_func = functools.partial(level, coef_func)
            for method in [
                    'bootstrap-system', 'bootstrap-input', 'bootstrap-both',
                    'fisher'
            ]:
                results = Parallel(n_jobs=args.num_processes)(
                    delayed(_run_simulation)(X, Y, corr_func, method, alpha,
                                             seed + i)
                    for i in range(num_iterations))
                counts = Counter(results)
                proportions = {
                    key: value / len(results)
                    for key, value in counts.items()
                }
                results_dict[level_name][coef_name][method] = proportions
                print(level_name, coef_name, method, proportions)

    os.makedirs(os.path.dirname(args.output_json), exist_ok=True)
    with open(args.output_json, 'w') as out:
        out.write(json.dumps(results_dict, indent=2))
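The file written by Example 3 nests level -> coefficient -> method -> outcome proportions. A hedged sketch of how it could be inspected afterwards; the path is hypothetical, and the keys of each innermost dict depend on what _run_simulation returns, which is not shown in the example.

# Illustrative only: walk the nested result dictionary and print each
# combination's outcome proportions.
import json

with open('correlation_simulation.json') as f:  # hypothetical output path
    results = json.load(f)

for level_name, by_coef in results.items():
    for coef_name, by_method in by_coef.items():
        for method, proportions in by_method.items():
            print(level_name, coef_name, method, proportions)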
Example 4: mapping dataset-specific metric keys to canonical metric names
def main(args):
    metrics_list = load_metrics(args.metrics_jsonls)
    metrics_list = merge_metrics(metrics_list)
    for metrics in metrics_list:
        metrics.flatten_keys()

    with JsonlWriter(args.output_jsonl) as out:
        for metrics in metrics_list:
            if metrics.summarizer_type != 'peer':
                continue

            if args.dataset == 'tac':
                metrics.metrics = MetricsDict({
                    'Responsiveness': metrics.metrics['overall_responsiveness'],
                    'ROUGE-1': metrics.metrics['rouge-1_recall'],
                    'ROUGE-2': metrics.metrics['rouge-2_recall'],
                    'ROUGE-L': metrics.metrics['rouge-l_recall'],
                    'ROUGE-SU4': metrics.metrics['rouge-su4_recall'],
                    'BEwTE': metrics.metrics['BEwTE_recall'],
                    'QAEval': metrics.metrics['qa-eval_f1'],
                    'AutoSummENG': metrics.metrics['AutoSummENG'],
                    'MeMoG': metrics.metrics['MeMoG'],
                    'NPowER': metrics.metrics['NPowER'],
                    'BERTScore': metrics.metrics['bertscore_recall'],
                    'METEOR': metrics.metrics['METEOR'],
                    'MoverScore': metrics.metrics['MoverScore'],
                    'S3': metrics.metrics['s3_resp']
                })

            elif args.dataset == 'fabbri2020':
                metrics.metrics = MetricsDict({
                    'Responsiveness': metrics.metrics['expert_relevance'],
                    'ROUGE-1': metrics.metrics['rouge-1_f1'],
                    'ROUGE-2': metrics.metrics['rouge-2_f1'],
                    'ROUGE-L': metrics.metrics['rouge-l_f1'],
                    'ROUGE-SU4': metrics.metrics['rouge-su4_f1'],
                    'BEwTE': metrics.metrics['BEwTE_f1'],
                    'QAEval': metrics.metrics['qa-eval_f1'],
                    'AutoSummENG': metrics.metrics['AutoSummENG'],
                    'MeMoG': metrics.metrics['MeMoG'],
                    'NPowER': metrics.metrics['NPowER'],
                    'BERTScore': metrics.metrics['bertscore_recall'],
                    'METEOR': metrics.metrics['METEOR'],
                    'MoverScore': metrics.metrics['MoverScore'],
                    'S3': metrics.metrics['s3_resp']
                })

            elif args.dataset == 'bhandari2020':
                metrics.metrics = MetricsDict({
                    'Responsiveness': metrics.metrics['litepyramid_recall'],
                    'ROUGE-1': metrics.metrics['rouge-1_recall'],
                    'ROUGE-2': metrics.metrics['rouge-2_recall'],
                    'ROUGE-L': metrics.metrics['rouge-l_recall'],
                    'ROUGE-SU4': metrics.metrics['rouge-su4_recall'],
                    'BEwTE': metrics.metrics['BEwTE_recall'],
                    'QAEval': metrics.metrics['qa-eval_f1'],
                    'AutoSummENG': metrics.metrics['AutoSummENG'],
                    'MeMoG': metrics.metrics['MeMoG'],
                    'NPowER': metrics.metrics['NPowER'],
                    'BERTScore': metrics.metrics['bertscore_recall'],
                    'METEOR': metrics.metrics['METEOR'],
                    'MoverScore': metrics.metrics['MoverScore'],
                    'S3': metrics.metrics['s3_resp']
                })
            else:
                raise ValueError(f'Unknown dataset {args.dataset}')

            out.write(metrics)
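The three dataset branches in Example 4 differ only in the source key used for 'Responsiveness' and in whether the lexical metrics are taken as recall or F1. A table-driven rewrite is one possible simplification; this sketch reuses the key names from the branches above but is an illustration, not part of the original script.

# Hypothetical refactor: map each dataset to (responsiveness key, lexical suffix).
# All key names are copied from the branches above; the helper itself is assumed.
DATASET_CONFIG = {
    'tac': ('overall_responsiveness', 'recall'),
    'fabbri2020': ('expert_relevance', 'f1'),
    'bhandari2020': ('litepyramid_recall', 'recall'),
}

def rename_metrics(metrics, dataset):
    resp_key, suffix = DATASET_CONFIG[dataset]
    return MetricsDict({
        'Responsiveness': metrics.metrics[resp_key],
        'ROUGE-1': metrics.metrics[f'rouge-1_{suffix}'],
        'ROUGE-2': metrics.metrics[f'rouge-2_{suffix}'],
        'ROUGE-L': metrics.metrics[f'rouge-l_{suffix}'],
        'ROUGE-SU4': metrics.metrics[f'rouge-su4_{suffix}'],
        'BEwTE': metrics.metrics[f'BEwTE_{suffix}'],
        'QAEval': metrics.metrics['qa-eval_f1'],
        'AutoSummENG': metrics.metrics['AutoSummENG'],
        'MeMoG': metrics.metrics['MeMoG'],
        'NPowER': metrics.metrics['NPowER'],
        'BERTScore': metrics.metrics['bertscore_recall'],
        'METEOR': metrics.metrics['METEOR'],
        'MoverScore': metrics.metrics['MoverScore'],
        'S3': metrics.metrics['s3_resp'],
    })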
Example 5: run_hypothesis_tests, the end-to-end hypothesis-testing entry point
def run_hypothesis_tests(
        metrics_jsonl_files_or_metrics_list: Union[str, List[str], List[Metrics]],
        dependent_metric: str,
        metric_A: str,
        metric_B: str,
        summarizer_type: str,
        test_method: str = 'permutation-both',
        alpha: float = 0.05,
        two_tailed: bool = True,
        skip_summary_level: bool = False,
        skip_system_level: bool = False,
        skip_global: bool = False) -> Dict:
    if isinstance(metrics_jsonl_files_or_metrics_list, str):
        # A single file
        metrics_list = load_metrics([metrics_jsonl_files_or_metrics_list])
    elif isinstance(metrics_jsonl_files_or_metrics_list, list) and all(
            isinstance(item, str)
            for item in metrics_jsonl_files_or_metrics_list):
        # A list of files
        metrics_list = load_metrics(metrics_jsonl_files_or_metrics_list)
    else:
        # A list of metrics
        assert isinstance(metrics_jsonl_files_or_metrics_list, list) and all(
            isinstance(item, Metrics)
            for item in metrics_jsonl_files_or_metrics_list)
        metrics_list = metrics_jsonl_files_or_metrics_list

    # Merge duplicate metrics objects into one
    metrics_list = merge_metrics(metrics_list)

    for metrics in metrics_list:
        metrics.flatten_keys()

    metrics_list = filter_metrics(metrics_list, summarizer_type,
                                  dependent_metric, metric_A, metric_B)
    for metrics in metrics_list:
        metrics.select_metrics([dependent_metric, metric_A, metric_B])
        metrics.average_values()

    # Follow the math in the paper: the dependent metric is Z
    X, Y, Z = convert_to_matrices(metrics_list, metric_A, metric_B,
                                  dependent_metric)

    H0, H1 = _get_hypotheses(two_tailed, dependent_metric, metric_A, metric_B)
    results = {
        'dependent_metric': dependent_metric,
        'metric_A': metric_A,
        'metric_B': metric_B,
        'summarizer_type': summarizer_type,
        'test_method': test_method,
        'alpha': alpha,
        'two_tailed': two_tailed,
        'H0': H0,
        'H1': H1
    }
    if not skip_summary_level:
        results['summary_level'] = _run_test(summary_level_corr, X, Y, Z,
                                             test_method, alpha, two_tailed)

    if not skip_system_level:
        results['system_level'] = _run_test(system_level_corr, X, Y, Z,
                                            test_method, alpha, two_tailed)

    if not skip_global:
        results['global'] = _run_test(global_corr, X, Y, Z, test_method, alpha,
                                      two_tailed)

    return results
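A hedged usage sketch for run_hypothesis_tests. The metric names are illustrative (they match those produced in Example 4), the file path is hypothetical, and the contents of the per-level result entries depend on _run_test, which is not shown here.

# Illustrative call: compare QAEval against ROUGE-2 with Responsiveness as the
# ground truth, using peer summarizers only.
results = run_hypothesis_tests(
    'metrics.jsonl',               # hypothetical path to a metrics JSONL file
    dependent_metric='Responsiveness',
    metric_A='QAEval',
    metric_B='ROUGE-2',
    summarizer_type='peer',
    test_method='permutation-both',
    alpha=0.05)

print(results['H0'])
print(results['H1'])
# results['summary_level'], results['system_level'], and results['global'] hold
# whatever _run_test returns (e.g., a test statistic and a reject decision).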