Ejemplo n.º 1
0
def ocr_pdf_merge():
    """OCR every ``page*.jpg`` in the current directory and merge the PDFs.

    Each matching image is passed to ``tesseract_ocr`` and then deleted.
    Afterwards every ``page*.pdf`` file in the current directory is handed to
    ``pdf_merge`` with ``'ocred_document.pdf'`` as the output name and
    ``delete=True``.

    Files are processed in natural order of their names (``page2`` before
    ``page10``). ``os.listdir`` returns entries in arbitrary order, so without
    sorting the page order of the merged document would be unspecified.

    NOTE(review): presumably ``tesseract_ocr`` writes a ``page*.pdf`` next to
    each image and ``delete=True`` removes the per-page PDFs after merging;
    confirm against those helpers.
    """
    import re  # local import: only needed for the natural-sort key below

    def natural_key(name):
        # re.split with a capturing group alternates text and digit runs,
        # always starting with a (possibly empty) text run, so odd positions
        # are the numeric parts and keys stay comparable element by element.
        return [
            int(token) if index % 2 else token
            for index, token in enumerate(re.split(r'(\d+)', name))
        ]

    def page_files(extension):
        # Regular files in the current directory named page*<extension>.
        return sorted(
            (
                f for f in os.listdir('.')
                if os.path.isfile(f) and f.startswith('page') and f.endswith(extension)
            ),
            key=natural_key,
        )

    for jpg_file in page_files('.jpg'):
        tesseract_ocr(jpg_file)
        os.remove(jpg_file)

    # Listed only after the OCR loop so PDFs created by it are included.
    pdf_merge(page_files('.pdf'), 'ocred_document.pdf', delete=True)
Ejemplo n.º 2
0
    )
    # input
    parser.add_argument(
        "inputs", type=str, default=None, nargs="+", help="list of input files"
    )

    # output
    parser.add_argument(
        "-o",
        "--output",
        type=str,
        default=None,
        help="filename of the output file",
        required=True,
    )

    # delete
    parser.add_argument(
        "-d",
        "--delete",
        action="store_true",
        help="delete input files after merge",
    )

    return parser.parse_args(args)


if __name__ == "__main__":
    # Script entry point: parse the command-line arguments (without the
    # program name) and merge the given input files into the output file.
    args = process_arguments(sys.argv[1:])
    pdf_merge(args.inputs, args.output, args.delete)
Ejemplo n.º 3
0
def _save_and_merge_figures(figures, figure_path):
    """Save each figure as PDF and SVG, then merge the PDFs into ``merged.pdf``.

    Args:
        figures: List of ``(figure, name)`` pairs; ``name`` is the output file
            name without extension.
        figure_path: Directory (already existing) that receives the files.
    """
    for fig, name in figures:
        fig.savefig(path.join(figure_path, name + ".pdf"))
        fig.savefig(path.join(figure_path, name + ".svg"))

    # Close every open figure once it has been written to disk
    # (plt is presumably matplotlib.pyplot; confirm against the imports).
    plt.close("all")

    merged_path = path.join(figure_path, "merged.pdf")
    # Remove a merged file left over from an earlier run before re-creating it.
    if path.exists(merged_path):
        remove(merged_path)
    pdftools.pdf_merge([path.join(figure_path, name + ".pdf") for _, name in figures], merged_path)


def main():
    """Create the plots for the incremental results and the memory benchmarks.

    For every CSV file under ``DATA_DIR/results/incremental/<language>/`` a set
    of timing plots is written to
    ``FIGURES_DIR/incremental/<language>/<csv basename>/``. For every language
    directory under ``DATA_DIR/memoryBenchmarks/`` the memory plots built from
    ``batch.csv`` and ``incremental.csv`` are written to
    ``FIGURES_DIR/memoryBenchmarks/<language>/``. Each figure is saved as PDF
    and SVG, and the PDFs of one directory are combined into ``merged.pdf``.
    A section is skipped entirely when its input directory does not exist.
    """
    incremental_results_dir = path.join(DATA_DIR, "results", "incremental")

    if path.isdir(incremental_results_dir):
        print("Creating plots for incremental...")
        for language in scandir(incremental_results_dir):
            print(f" {language.name}")
            for result_file in scandir(language.path):
                # File name without its last extension.
                csv_basename = ".".join(result_file.name.split(".")[:-1])
                print(f"  {csv_basename}")

                result_data = read_csv(result_file.path)
                # Guard the first-row access so an empty CSV falls through to
                # the "No data found" branch instead of raising IndexError.
                if len(result_data) > 0:
                    # NOTE(review): presumably "Added" is not meaningful for
                    # the first row; confirm with the benchmark format.
                    result_data[0]["Added"] = None
                result_data_except_first = result_data[1:]

                # Skip files in which no row has a truthy value for any parser type.
                has_data = any(
                    any(row[parser_type] for parser_type in parser_types())
                    for row in result_data
                )
                if not has_data:
                    print("    No data found, skipping")
                    continue

                figure_path = path.join(FIGURES_DIR, "incremental", language.name, csv_basename)
                makedirs(figure_path, exist_ok=True)

                is_implode = "implode" in csv_basename
                figures = [
                    (plot_times(result_data, parser_types(True, is_implode)), "report"),
                    (plot_times(result_data_except_first, parser_types(False, is_implode)), "report-except-first"),
                    (plot_times_vs_changes(result_data_except_first, "bytes", "Added", "Removed"),
                     "report-time-vs-bytes"),
                    (plot_times_vs_changes(result_data_except_first, "chunks", "Changes"), "report-time-vs-changes"),
                    (plot_times_vs_changes_3d(result_data_except_first), "report-time-vs-changes-3D"),
                ]

                _save_and_merge_figures(figures, figure_path)

    memory_benchmarks_dir = path.join(DATA_DIR, "memoryBenchmarks")

    if path.isdir(memory_benchmarks_dir):
        print("Creating plots for memory benchmarks...")
        for language in scandir(memory_benchmarks_dir):
            print(f"  {language.name}")
            result_data_batch = read_csv(path.join(language.path, "batch.csv"))
            result_data_incremental = read_csv(path.join(language.path, "incremental.csv"))

            figure_path = path.join(FIGURES_DIR, "memoryBenchmarks", language.name)
            makedirs(figure_path, exist_ok=True)

            figures = [
                (plot_memory_batch(result_data_batch, "incl"), "report-allocations-batch"),
                (plot_memory_batch(result_data_batch, "excl"), "report-cache-size-batch"),
                (plot_memory_incremental(result_data_incremental, "incl"), "report-allocations-incremental"),
                (plot_memory_incremental(result_data_incremental, "excl"), "report-cache-size-incremental"),
            ]

            _save_and_merge_figures(figures, figure_path)
Ejemplo n.º 4
0
    parser = argparse.ArgumentParser(
        parents=[parentparser],
        description="Merge the pages of multiple input files in one output file.")
    # input
    parser.add_argument('inputs',
                        type=str,
                        default=None,
                        nargs='+',
                        help='list of input files')

    # output
    parser.add_argument('-o',
                        '--output',
                        type=str,
                        default=None,
                        help='filename of the output file',
                        required=True)

    # delete
    parser.add_argument('-d',
                        '--delete',
                        action='store_true',
                        help='delete input files after merge')

    return parser.parse_args(args)


if __name__ == "__main__":
    # Script entry point: parse the command-line arguments (without the
    # program name) and merge the given input files into the output file.
    args = process_arguments(sys.argv[1:])
    pdf_merge(args.inputs, args.output, args.delete)