Exemple #1
0
def main(argv=None):
    """Parse command-line options, import metadata, and export it as CSV.

    The positional ``filename`` is handed to ``importxl`` to obtain the
    title, page numbers, author and section values, which are then forwarded
    to ``exportcsvnew`` together with the verbose, debug and test flags.

    Args:
        argv: Optional list of argument strings to parse instead of
            ``sys.argv[1:]``. The default ``None`` makes argparse read the
            process arguments, which is what the no-argument call did before.

    Raises:
        SystemExit: Raised by argparse for ``--help`` or invalid arguments.
    """
    parser = argparse.ArgumentParser()
    # NOTE(review): the help text says "PDF file", but the value is passed to
    # importxl() below, which the comment there describes as an Excel import.
    # Confirm which wording is intended before changing the string.
    parser.add_argument(
        'filename',
        help='PDF file to process.',
        type=str,
    )
    parser.add_argument(
        '-v',
        '--verbose',
        action='store_true',
        dest='verbose',
        help='Print status messages.',
    )
    parser.add_argument(
        '-t',
        '--test',
        action='store_true',
        help=
        "Test only. Don't output any files. Use with debug options to see test output.",
    )
    # default=0 is kept on purpose: the flag is True when given and 0 when
    # absent, and that exact value is forwarded to exportcsvnew().
    parser.add_argument(
        '-d',
        '--debug',
        action='store_true',
        dest='debug',
        help="Show debug output.",
        default=0,
    )
    parser.add_argument(
        '-o',
        '--output-file',
        dest='destination',
        type=str,
        help='Use supplied filename as filename template for output files.',
    )
    args = parser.parse_args(argv)

    # Strip the extension to get the output filename template. Fall back to
    # the input filename when no output filename was supplied.
    output_file = os.path.splitext(args.destination or args.filename)[0]

    # Fetch metadata from the import file.
    title, start_page, start_pdf_page, end_pdf_page, author, section = importxl(
        args.filename)
    # Export CSV file, or show what output would be if test flag is set.
    exportcsvnew(output_file,
                 args.verbose,
                 args.debug,
                 args.test,
                 title,
                 start_page,
                 start_pdf_page,
                 end_pdf_page,
                 author,
                 section=section)
Exemple #2
0
def main(argv=None):
    """Extract article metadata from a PDF, export it as CSV, and split the PDF.

    When ``--input-file`` is given, the start and end PDF pages are read from
    that CSV via ``journaltools.importcsv``. Otherwise the page text is
    fetched with ``journaltools.getpdf``, processed by ``processpdfnew``, and
    the result is passed to ``journaltools.exportcsvnew``. Finally the PDF is
    split with ``journaltools.splitpdf`` unless ``--test`` or
    ``--write-csv-only`` is set.

    Args:
        argv: Optional list of argument strings to parse instead of
            ``sys.argv[1:]``. The default ``None`` makes argparse read the
            process arguments, which is what the no-argument call did before.

    Raises:
        SystemExit: Raised by argparse for ``--help`` or invalid arguments.
    """
    parser = argparse.ArgumentParser(
        description='Function to extract metadata and split a PDF containing multiple journal articles. Set up for '
                    'Buffalo Law Review book reviews.'
    )
    parser.add_argument('filename',
                        help='PDF file to analyze and split.',
                        type=str,
                        )
    parser.add_argument('-v', '--verbose',
                        action='store_true',
                        dest='verbose',
                        help='Print status messages.',
                        )
    parser.add_argument('-t', '--test',
                        action='store_true',
                        help="Test only. Don't output any files. Use with debug options to see test output.",
                        )
    parser.add_argument('--write-csv-only',
                        action="store_true",
                        dest="csvOnly",
                        help="Write CSV file, but don't split PDFs. Test flag takes precedence.",
                        )
    parser.add_argument('-d', '--debug',
                        dest='debug',
                        type=int,
                        help="Set debug level (1-6). Levels 1-4 offer increasing levels of output. Level 5 displays "
                             "the PDF text. Level 6 prints all of the records.",
                        default=0,
                        )
    parser.add_argument('-o', '--output-file',
                        dest='destination',
                        type=str,
                        help='Use supplied filename as filename template for output files.',
                        )
    parser.add_argument('-i', '--input-file',
                        dest='input_file',
                        type=str,
                        help="Import CSV file to be used for PDF splitting. Must be in same format as export.")
    args = parser.parse_args(argv)

    # Strip the extension to get the output filename template. Fall back to
    # the input filename when no output filename was supplied.
    output_file = os.path.splitext(args.destination or args.filename)[0]

    # If --input-file is given, read the page ranges from that CSV instead of
    # analyzing the PDF; no CSV is exported in that case.
    if args.input_file:
        start_pdf_page, end_pdf_page = journaltools.importcsv(args.input_file, args.debug)
    else:
        # Fetch OCR page text from PDF file
        page_text = journaltools.getpdf(args.filename, 0, args.verbose, args.debug)
        # Process pages
        title, start_page, start_pdf_page, end_pdf_page, author = processpdfnew(
            args.verbose, args.debug, page_text)
        # Export CSV file, or show what output would be if test flag is set
        journaltools.exportcsvnew(output_file, args.verbose, args.debug, args.test, title, start_page,
                                  start_pdf_page, end_pdf_page, author)

    # Split the original PDF into one document per piece, unless the test or
    # csvOnly flag is set.
    if not args.test and not args.csvOnly:
        journaltools.splitpdf(args.filename, args.verbose, args.debug, start_pdf_page, end_pdf_page, output_file)