Exemple #1
0
def main():
    """Command-line entry point: derive article split points and split a PDF.

    Parses the command-line options, then obtains the starting and ending
    PDF pages of each piece in one of two ways:

    * with ``--input-file``, they are read from a CSV file via
      ``journaltools.importcsv``;
    * otherwise the PDF page text is fetched with ``journaltools.getpdf``,
      analyzed by ``processpdfnew``, and the resulting records are handed to
      ``journaltools.exportcsvnew`` (which also receives the test flag).

    Finally, unless ``--test`` or ``--write-csv-only`` was given, the source
    PDF is split with ``journaltools.splitpdf``.
    """
    cli = argparse.ArgumentParser(
        description='Function to extract metadata and split a PDF containing multiple journal articles. Set up for '
                    'Buffalo Law Review book reviews.'
    )

    # (names, keyword options) for every argument, listed in registration order so the help output keeps its layout.
    argument_specs = (
        (('filename',), {
            'help': 'PDF file to analyze and split.',
            'type': str,
        }),
        (('-v', '--verbose'), {
            'action': 'store_true',
            'dest': 'verbose',
            'help': 'Print status messages.',
        }),
        (('-t', '--test'), {
            'action': 'store_true',
            'help': "Test only. Don't output any files. Use with debug options to see test output.",
        }),
        (('--write-csv-only',), {
            'action': "store_true",
            'dest': "csvOnly",
            'help': "Write CSV file, but don't split PDFs. Test flag takes precedence.",
        }),
        (('-d', '--debug'), {
            'dest': 'debug',
            'type': int,
            'help': "Set debug level (1-6). Levels 1-4 offer increasing levels of output. Level 5 displays "
                    "the PDF text. Level 6 prints all of the records.",
            'default': 0,
        }),
        (('-o', '--output-file'), {
            'dest': 'destination',
            'type': str,
            'help': 'Use supplied filename as filename template for output files.',
        }),
        (('-i', '--input-file'), {
            'dest': 'input_file',
            'type': str,
            'help': "Import CSV file to be used for PDF splitting. Must be in same format as export.",
        }),
    )
    for names, options in argument_specs:
        cli.add_argument(*names, **options)
    args = cli.parse_args()

    # The output filename template is the requested destination, or the input PDF name when none was given,
    # with its extension stripped off.
    output_file = os.path.splitext(args.destination or args.filename)[0]

    if not args.input_file:
        # No CSV supplied: fetch the page text from the PDF and work out the records from it.
        page_text = journaltools.getpdf(args.filename, 0, args.verbose, args.debug)
        title, start_page, start_pdf_page, end_pdf_page, author = processpdfnew(
            args.verbose, args.debug, page_text)
        # Hand the records to the CSV exporter; the test flag is passed through to it.
        journaltools.exportcsvnew(
            output_file, args.verbose, args.debug, args.test,
            title, start_page, start_pdf_page, end_pdf_page, author,
        )
    else:
        # A CSV was supplied: take the split points straight from it.
        start_pdf_page, end_pdf_page = journaltools.importcsv(args.input_file, args.debug)

    # Test runs and CSV-only runs stop here without producing any split PDFs.
    if args.test or args.csvOnly:
        return

    journaltools.splitpdf(args.filename, args.verbose, args.debug, start_pdf_page, end_pdf_page, output_file)
Exemple #2
0
                        )
    parser.add_argument('-i', '--input-file',
                        dest='input_file',
                        type=str,
                        help="Import CSV file to be used for PDF splitting. Must be in same format as export. "
                             "Default is filename with .csv extension.")
    args = parser.parse_args()

    # Split filename from extension before passing to the various functions. Use input filename for template
    # if no output filename specified.
    if args.destination:
        output_file, output_extension = os.path.splitext(args.destination)
    else:
        output_file, output_extension = os.path.splitext(args.filename)

    # Set input CSV filename. If no filename provided, use the input filename with CSV extension.
    if args.input_file:
        input_file = args.input_file
    else:
        input_file, input_extension = os.path.splitext(args.filename)
        input_file = input_file + '.csv'

    # Read CSVfile and get starting and ending PDF pages to pass to splitpdf
    if os.path.exists(input_file):
        start_pdf_page, end_pdf_page = journaltools.importcsv(input_file, args.debug)
        # Split Original PDF into separate documents for each piece, unless test flag is set
        if not args.test:
            journaltools.splitpdf(args.filename, args.verbose, args.debug, start_pdf_page, end_pdf_page, output_file)
    else:
        print(f'{input_file} not present. Please specify a valid CSV file to use for the split points.')