def cli_download(
    from_file: click.File, url_list: Tuple[str, ...], verbose: bool = False
) -> None:
    """Hand every requested URL to ``download`` in a single batch.

    Each URL is wrapped as a ``(url, None)`` pair before being passed on.

    Args:
        from_file: Optional open file with one URL per line. Only newline
            characters at the ends of each line are removed; blank lines
            are kept as empty strings.
        url_list: URLs given directly; they come first in the batch.
        verbose: Forwarded unchanged to ``download``.
    """
    targets = []
    for address in url_list:
        targets.append((address, None))

    if from_file:
        for raw_line in from_file.readlines():
            targets.append((raw_line.strip("\n"), None))

    download(targets, verbose)
def cli_simple(
    from_file: click.File,
    author: str,
    title: str,
    url_list: Tuple[str, ...],
    verbose: bool = False,
):
    """Build an ``HTMLStory`` from the given chapter URLs and run it.

    Chapter URLs are taken from ``url_list`` first, followed by the lines
    of ``from_file`` when one is supplied. If no URL is collected, a
    message is echoed and nothing else happens.

    Args:
        from_file: Optional open file with one chapter URL per line. Only
            newline characters at the ends of each line are removed.
        author: Passed to ``HTMLStory`` as ``author``.
        title: Passed to ``HTMLStory`` as ``title``.
        url_list: Chapter URLs given directly.
        verbose: Passed to ``HTMLStory`` as ``verbose``.
    """
    chapter_urls = list(url_list)
    if from_file:
        chapter_urls.extend(
            raw_line.strip("\n") for raw_line in from_file.readlines()
        )

    if not chapter_urls:
        click.echo("You must provide at least one URL to download.")
        return

    # The story-level URL is a fixed address; the chapters carry the
    # actual content locations.
    story_url = furl("http://httpbin.org/status/200")
    html_story = HTMLStory(
        chapters=chapter_urls,
        author=author,
        title=title,
        url=story_url,
        verbose=verbose,
    )
    html_story.run()
def readlines(ctx, param, file: click.File) -> List[str]:
    """Return every line of ``file`` with surrounding whitespace removed.

    Args:
        ctx: Unused; accepted only as part of the three-argument signature.
        param: Unused; accepted only as part of the three-argument signature.
        file: Open file-like object to read from.

    Returns:
        One entry per line, in file order. Blank lines are kept as empty
        strings.
    """
    stripped_lines = []
    for raw_line in file.readlines():
        stripped_lines.append(raw_line.strip())
    return stripped_lines
def _as_tuple(names):
    """Return ``names`` as a tuple, wrapping a lone string instead of splitting it."""
    if not names:
        return ()
    if isinstance(names, str):
        return (names,)
    return tuple(names)


def compare(
    source: click.File,
    system_x: click.File,
    system_y: click.File,
    reference: click.File,
    language: str,
    metric: Union[Tuple[str], str],
    filter: Union[Tuple[str], str],
    length_min_val: float,
    length_max_val: float,
    seg_metric: str,
    output_folder: str,
    bootstrap: bool,
    num_splits: int,
    sample_ratio: float,
):
    """Score two systems' outputs against each other and report the results.

    Builds a ``PairwiseTestset`` from the four input files, optionally
    applies the requested filters, runs every selected metric's pairwise
    comparison, optionally adds bootstrap-resampling statistics, prints the
    resulting table and, when an output folder is given, writes
    ``results.json`` plus the segment-metric plots there.

    Args:
        source: Open file with the source segments, one per line.
        system_x: Open file with the first system's outputs.
        system_y: Open file with the second system's outputs.
        reference: Open file with the reference segments.
        language: Target language; the test set's language pair is
            ``"X-" + language``.
        metric: Metric name(s), looked up in ``available_metrics``. A single
            string is treated as one name.
        filter: Filter name(s), looked up in ``available_filters``. A single
            string is treated as one name; an empty value disables filtering.
        length_min_val: Lower bound for the ``"length"`` filter; multiplied
            by 100 and truncated to ``int`` before being passed on.
        length_max_val: Upper bound for the ``"length"`` filter, converted
            the same way.
        seg_metric: Metric that is always computed, is placed first in the
            results, and feeds the plots.
        output_folder: Destination folder for ``results.json`` and the
            plots; an empty value skips writing.
        bootstrap: Whether to add bootstrap-resampling statistics.
        num_splits: Forwarded to ``bootstrap_resampling``.
        sample_ratio: Forwarded to ``bootstrap_resampling``.
    """
    testset = PairwiseTestset(
        src=[line.strip() for line in source.readlines()],
        system_x=[line.strip() for line in system_x.readlines()],
        system_y=[line.strip() for line in system_y.readlines()],
        ref=[line.strip() for line in reference.readlines()],
        language_pair="X-" + language,
        filenames=[source.name, system_x.name, system_y.name, reference.name],
    )
    corpus_size = len(testset)

    filter_names = _as_tuple(filter)
    if filter_names:
        # The "length" filter takes extra bounds, so it is built separately
        # and applied after all the other requested filters.
        filters = [
            available_filters[name](testset)
            for name in filter_names
            if name != "length"
        ]
        if "length" in filter_names:
            filters.append(
                available_filters["length"](
                    testset,
                    int(length_min_val * 100),
                    int(length_max_val * 100),
                )
            )

        for active_filter in filters:
            testset.apply_filter(active_filter)

        # Checking for an empty test set first also keeps the percentage
        # below from dividing by zero when the corpus was empty to begin with.
        remaining = len(testset)
        if remaining == 0:
            click.secho(
                "The current filters reduce the Corpus on 100%!", fg="red"
            )
            return

        click.secho(
            "Filters Successfully applied. Corpus reduced in {:.2f}%.".format(
                (1 - remaining / corpus_size) * 100
            ),
            fg="green",
        )

    # The segment-level metric always runs and always comes first; only its
    # first occurrence in the requested metrics is moved.
    other_metrics = list(_as_tuple(metric))
    if seg_metric in other_metrics:
        other_metrics.remove(seg_metric)
    metric_names = (seg_metric, *other_metrics)

    results = {
        name: available_metrics[name](
            language=testset.target_language
        ).pairwise_comparison(testset)
        for name in metric_names
    }
    results_df = PairwiseResult.results_to_dataframe(list(results.values()))

    if bootstrap:
        # Resample exactly once per metric and keep only its statistics.
        bootstrap_stats = [
            available_metrics[name]
            .bootstrap_resampling(
                testset, num_splits, sample_ratio, results[name]
            )
            .stats
            for name in metric_names
        ]
        # Turn the per-metric stat dicts into one column per statistic,
        # using the first metric's keys as the column set.
        for stat_name in bootstrap_stats[0]:
            results_df[stat_name] = [
                stats[stat_name] for stats in bootstrap_stats
            ]

    click.secho(str(results_df), fg="yellow")

    if output_folder:
        if not output_folder.endswith("/"):
            output_folder += "/"
        results_df.to_json(
            output_folder + "results.json", orient="index", indent=4
        )
        plot_segment_comparison(results[seg_metric], output_folder)
        plot_pairwise_distributions(results[seg_metric], output_folder)
        plot_bucket_comparison(results[seg_metric], output_folder)