def main(args):
    metrics_list = load_metrics(args.input_jsonls)
    metrics_list = merge_metrics(metrics_list)
    for metrics in metrics_list:
        metrics.flatten_keys()
    metrics_list = filter_metrics(metrics_list, 'peer', *args.metrics)
    for metrics in metrics_list:
        metrics.select_metrics(args.metrics)
        metrics.average_values()

    Xs = convert_to_matrices(metrics_list, *args.metrics)

    results = {}
    for name, X in zip(args.metrics, Xs):
        # Global: all scores pooled; system: per-system means; summary: one column per input
        X_global = X.flatten()
        X_system = X.mean(axis=1)
        X_summaries = [X[:, j] for j in range(X.shape[1])]

        p_global = shapiro(X_global)[1]
        p_system = shapiro(X_system)[1]
        p_summaries = [shapiro(X_summary)[1] for X_summary in X_summaries]

        # Proportion of summary-level tests that reject normality at 0.05
        summaries_count = sum(1 if p <= 0.05 else 0 for p in p_summaries)
        summary_prop = summaries_count / X.shape[1]

        results[name] = {
            'global': p_global,
            'system': p_system,
            'summary_proportion': summary_prop
        }

    os.makedirs(os.path.dirname(args.output_file), exist_ok=True)
    with open(args.output_file, 'w') as out:
        out.write(json.dumps(results, indent=2))
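# Illustrative sketch (not part of the original script): how the global-,
# system-, and summary-level normality checks above are computed from a
# (num_systems x num_inputs) score matrix, using only numpy and scipy. The
# matrix here is random placeholder data.
def _normality_check_example():
    import numpy as np
    from scipy.stats import shapiro

    X = np.random.rand(10, 25)              # hypothetical: 10 systems scored on 25 inputs
    p_global = shapiro(X.flatten())[1]      # all scores pooled together
    p_system = shapiro(X.mean(axis=1))[1]   # per-system average scores
    p_per_input = [shapiro(X[:, j])[1] for j in range(X.shape[1])]
    # Fraction of inputs whose per-input scores fail the Shapiro-Wilk test at 0.05
    rejected_prop = sum(p <= 0.05 for p in p_per_input) / X.shape[1]
    return p_global, p_system, rejected_prop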
def main(args):
    metrics_list = load_metrics(args.metrics_jsonls)
    metrics_list = merge_metrics(metrics_list)
    for metrics in metrics_list:
        metrics.flatten_keys()
    metrics_list = filter_metrics(metrics_list, 'peer', args.ground_truth, 'ROUGE-1')
    for metrics in metrics_list:
        metrics.select_metrics([args.ground_truth, 'ROUGE-1'])
        metrics.average_values()

    # Cache the preprocessed metrics and summaries in /dev/shm (tmpfs) so the
    # parallel workers can reload them cheaply
    metrics_file = '/dev/shm/metrics.jsonl'
    _save_to_jsonl(metrics_list, metrics_file)

    instances = JsonlReader(args.summaries_jsonl).read()
    instances = _filter_instances(instances, metrics_list)
    instances = preprocess_instances_for_rouge_simulation(instances)
    summaries_file = '/dev/shm/summaries.json'
    _save_to_json(instances, summaries_file)

    num_iterations = 1000
    alpha = 0.05
    seed = 5

    for level_name, level in zip(['system_level', 'summary_level'],
                                 [system_level_corr, summary_level_corr]):
        results_dict = defaultdict(lambda: defaultdict(dict))
        for coef_name, coef_func in zip(['pearson'], [pearsonr]):
            corr_func = functools.partial(level, coef_func)
            for proportion in [0.05, 0.10, 0.20, 0.30, 0.40, 0.50,
                               0.60, 0.70, 0.80, 0.90, 1.00]:
                for method in ['williams', 'bootstrap-both', 'permutation-both']:
                    job_results = Parallel(n_jobs=args.num_processes)(
                        delayed(_run_simulation)(summaries_file, metrics_file, corr_func,
                                                 proportion, method, args.ground_truth,
                                                 args.rouge_variant, alpha, seed + i)
                        for i in range(num_iterations))

                    # Power = proportion of simulations in which the test rejected H0
                    power = sum(job_results) / len(job_results)
                    seed += len(job_results)

                    print(level_name, coef_name, method, proportion, power)
                    results_dict[coef_name][method][proportion] = power

        os.makedirs(args.output_dir, exist_ok=True)
        with open(f'{args.output_dir}/{level_name}.json', 'w') as out:
            out.write(json.dumps(results_dict, indent=2))
def _run_simulation(summaries_file: str,
                    metrics_file: str,
                    corr_func,
                    proportion: float,
                    method: str,
                    ground_truth: str,
                    rouge_variant: str,
                    alpha: float,
                    random_seed: int) -> int:
    random.seed(random_seed)

    with open(summaries_file, 'r') as f:
        instances = json.load(f)
    metrics_list = JsonlReader(metrics_file, Metrics).read()
    metrics_list.extend(
        score_instances_with_ablated_rouge(instances, proportion, rouge_variant))
    metrics_list = merge_metrics(metrics_list)

    X, Y, Z = convert_to_matrices(metrics_list, 'ROUGE-1', 'ablated_rouge', ground_truth)

    pvalue = corr_diff_test(corr_func, X, Y, Z, method, False)
    # Return 1 if the test rejects H0 at the given significance level, else 0
    if pvalue <= alpha:
        return 1
    return 0
def main(args):
    metrics_list = load_metrics(args.metrics_jsonls)
    metrics_list = merge_metrics(metrics_list)
    for metrics in metrics_list:
        metrics.flatten_keys()
    metrics_list = filter_metrics(metrics_list, 'peer', *args.metrics)
    for metrics in metrics_list:
        metrics.select_metrics(args.metrics)
        metrics.average_values()

    X, Y = convert_to_matrices(metrics_list, *args.metrics)

    num_iterations = 1000
    alpha = 0.05
    seed = 10

    results_dict = defaultdict(lambda: defaultdict(dict))
    for coef_name, coef_func in zip(['pearson', 'spearman', 'kendall'],
                                    [pearsonr, spearmanr, kendalltau]):
        for level_name, level in zip(['system_level', 'summary_level'],
                                     [system_level_corr, summary_level_corr]):
            corr_func = functools.partial(level, coef_func)
            for method in ['bootstrap-system', 'bootstrap-input', 'bootstrap-both', 'fisher']:
                results = Parallel(n_jobs=args.num_processes)(
                    delayed(_run_simulation)(X, Y, corr_func, method, alpha, seed + i)
                    for i in range(num_iterations))

                counts = Counter(results)
                proportions = {key: value / len(results) for key, value in counts.items()}

                results_dict[level_name][coef_name][method] = proportions
                print(level_name, coef_name, method, proportions)

    os.makedirs(os.path.dirname(args.output_json), exist_ok=True)
    with open(args.output_json, 'w') as out:
        out.write(json.dumps(results_dict, indent=2))
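# Illustrative sketch (not part of the original script): how the Counter above
# turns per-iteration outcomes into proportions. The outcome labels here are
# placeholders; the actual values depend on what _run_simulation (defined
# elsewhere) returns.
def _proportions_example():
    from collections import Counter
    outcomes = [1, 0, 1, 1, 0]                  # hypothetical per-iteration results
    counts = Counter(outcomes)                  # Counter({1: 3, 0: 2})
    return {key: value / len(outcomes) for key, value in counts.items()}  # {1: 0.6, 0: 0.4}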
def main(args):
    metrics_list = load_metrics(args.metrics_jsonls)
    metrics_list = merge_metrics(metrics_list)
    for metrics in metrics_list:
        metrics.flatten_keys()

    with JsonlWriter(args.output_jsonl) as out:
        for metrics in metrics_list:
            if metrics.summarizer_type != 'peer':
                continue
            if args.dataset == 'tac':
                metrics.metrics = MetricsDict({
                    'Responsiveness': metrics.metrics['overall_responsiveness'],
                    'ROUGE-1': metrics.metrics['rouge-1_recall'],
                    'ROUGE-2': metrics.metrics['rouge-2_recall'],
                    'ROUGE-L': metrics.metrics['rouge-l_recall'],
                    'ROUGE-SU4': metrics.metrics['rouge-su4_recall'],
                    'BEwTE': metrics.metrics['BEwTE_recall'],
                    'QAEval': metrics.metrics['qa-eval_f1'],
                    'AutoSummENG': metrics.metrics['AutoSummENG'],
                    'MeMoG': metrics.metrics['MeMoG'],
                    'NPowER': metrics.metrics['NPowER'],
                    'BERTScore': metrics.metrics['bertscore_recall'],
                    'METEOR': metrics.metrics['METEOR'],
                    'MoverScore': metrics.metrics['MoverScore'],
                    'S3': metrics.metrics['s3_resp']
                })
            elif args.dataset == 'fabbri2020':
                metrics.metrics = MetricsDict({
                    'Responsiveness': metrics.metrics['expert_relevance'],
                    'ROUGE-1': metrics.metrics['rouge-1_f1'],
                    'ROUGE-2': metrics.metrics['rouge-2_f1'],
                    'ROUGE-L': metrics.metrics['rouge-l_f1'],
                    'ROUGE-SU4': metrics.metrics['rouge-su4_f1'],
                    'BEwTE': metrics.metrics['BEwTE_f1'],
                    'QAEval': metrics.metrics['qa-eval_f1'],
                    'AutoSummENG': metrics.metrics['AutoSummENG'],
                    'MeMoG': metrics.metrics['MeMoG'],
                    'NPowER': metrics.metrics['NPowER'],
                    'BERTScore': metrics.metrics['bertscore_recall'],
                    'METEOR': metrics.metrics['METEOR'],
                    'MoverScore': metrics.metrics['MoverScore'],
                    'S3': metrics.metrics['s3_resp']
                })
            elif args.dataset == 'bhandari2020':
                metrics.metrics = MetricsDict({
                    'Responsiveness': metrics.metrics['litepyramid_recall'],
                    'ROUGE-1': metrics.metrics['rouge-1_recall'],
                    'ROUGE-2': metrics.metrics['rouge-2_recall'],
                    'ROUGE-L': metrics.metrics['rouge-l_recall'],
                    'ROUGE-SU4': metrics.metrics['rouge-su4_recall'],
                    'BEwTE': metrics.metrics['BEwTE_recall'],
                    'QAEval': metrics.metrics['qa-eval_f1'],
                    'AutoSummENG': metrics.metrics['AutoSummENG'],
                    'MeMoG': metrics.metrics['MeMoG'],
                    'NPowER': metrics.metrics['NPowER'],
                    'BERTScore': metrics.metrics['bertscore_recall'],
                    'METEOR': metrics.metrics['METEOR'],
                    'MoverScore': metrics.metrics['MoverScore'],
                    'S3': metrics.metrics['s3_resp']
                })
            else:
                raise Exception(f'Unknown dataset {args.dataset}')

            out.write(metrics)
def run_hypothesis_tests(metrics_jsonl_files_or_metrics_list: Union[str, List[str], List[Metrics]],
                         dependent_metric: str,
                         metric_A: str,
                         metric_B: str,
                         summarizer_type: str,
                         test_method: str = 'permutation-both',
                         alpha: float = 0.05,
                         two_tailed: bool = True,
                         skip_summary_level: bool = False,
                         skip_system_level: bool = False,
                         skip_global: bool = False) -> Dict:
    if isinstance(metrics_jsonl_files_or_metrics_list, str):
        # A single file
        metrics_list = load_metrics([metrics_jsonl_files_or_metrics_list])
    elif isinstance(metrics_jsonl_files_or_metrics_list, list) and all(
            isinstance(item, str) for item in metrics_jsonl_files_or_metrics_list):
        # A list of files
        metrics_list = load_metrics(metrics_jsonl_files_or_metrics_list)
    else:
        # A list of metrics
        assert isinstance(metrics_jsonl_files_or_metrics_list, list) and all(
            isinstance(item, Metrics) for item in metrics_jsonl_files_or_metrics_list)
        metrics_list = metrics_jsonl_files_or_metrics_list

    # Merge duplicate metrics objects into one
    metrics_list = merge_metrics(metrics_list)

    for metrics in metrics_list:
        metrics.flatten_keys()

    metrics_list = filter_metrics(metrics_list, summarizer_type,
                                  dependent_metric, metric_A, metric_B)
    for metrics in metrics_list:
        metrics.select_metrics([dependent_metric, metric_A, metric_B])
        metrics.average_values()

    # Follow the math in the paper: the dependent metric is Z
    X, Y, Z = convert_to_matrices(metrics_list, metric_A, metric_B, dependent_metric)

    H0, H1 = _get_hypotheses(two_tailed, dependent_metric, metric_A, metric_B)

    results = {
        'dependent_metric': dependent_metric,
        'metric_A': metric_A,
        'metric_B': metric_B,
        'summarizer_type': summarizer_type,
        'test_method': test_method,
        'alpha': alpha,
        'two_tailed': two_tailed,
        'H0': H0,
        'H1': H1
    }
    if not skip_summary_level:
        results['summary_level'] = _run_test(summary_level_corr, X, Y, Z,
                                             test_method, alpha, two_tailed)
    if not skip_system_level:
        results['system_level'] = _run_test(system_level_corr, X, Y, Z,
                                            test_method, alpha, two_tailed)
    if not skip_global:
        results['global'] = _run_test(global_corr, X, Y, Z,
                                      test_method, alpha, two_tailed)
    return results
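# Usage sketch (illustrative, not part of the original module). The JSONL path
# below is a hypothetical placeholder; the metric names mirror the keys written
# by the dataset setup script above ('Responsiveness', 'QAEval', 'ROUGE-1').
def _example_usage():
    results = run_hypothesis_tests(
        'metrics.jsonl',                  # may also be a list of files or of Metrics objects
        dependent_metric='Responsiveness',
        metric_A='QAEval',
        metric_B='ROUGE-1',
        summarizer_type='peer',
        test_method='permutation-both',
        alpha=0.05,
        two_tailed=True)
    # One entry per correlation level that was not skipped
    for level in ['summary_level', 'system_level', 'global']:
        print(level, results[level])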