Exemple #1
0
def job(x):
    """Convert one PDF to a text file and bump the shared progress counter.

    Reads the module-level settings ``pdf2txt``, ``output_dir``, ``count``,
    ``total``, ``MIN_BYTES``, ``resolution`` and ``tmp``.

    Args:
        x: Path of the PDF to convert.

    Returns:
        Path of the text file for ``x``: the input path with a trailing
        ``.pdf`` extension swapped for ``.txt``, relocated into
        ``output_dir`` when that setting is truthy.
    """
    # Local imports keep this block self-contained; the file's import
    # section is not visible from here.
    import shlex
    import subprocess

    # Swap only the final extension so that a ".pdf" appearing in a
    # directory name is left alone.
    stem, ext = os.path.splitext(x)
    y = stem + '.txt' if ext == '.pdf' else x
    if output_dir:
        # os.path.join supplies the separator when output_dir has no
        # trailing slash (plain concatenation glued the names together).
        y = os.path.join(output_dir, os.path.basename(y))

    if os.path.isfile(y):
        logging.info('{} already a converted file'.format(x))
    else:
        # pdf2txt is word-split the way the shell used to split it, while
        # the file names are passed as single argv entries so spaces and
        # shell metacharacters in them are harmless.
        command = ['python3'] + shlex.split(pdf2txt) + [x]
        try:
            if output_dir:
                with open(y, 'wb') as sink:
                    subprocess.call(command, stdout=sink)
            else:
                # NOTE(review): without output_dir the converter output is
                # not redirected; presumably the script writes y itself --
                # confirm against the pdf2txt script in use.
                subprocess.call(command)
        except OSError:
            # Best effort, like the os.system call this replaces: a failed
            # launch falls through to the OCR fallback below.
            logging.exception('%s: could not run the converter', x)

        # A missing output is handled like an undersized one instead of
        # letting os.stat raise FileNotFoundError.
        if not os.path.isfile(y) or os.stat(y).st_size <= MIN_BYTES:
            logging.info('{}: File Size unsatisfactory. Performing OCR'.format(x))
            ocr.pdfocr2txt(x, y, resolution=resolution, tmp=tmp)

        logging.info('{} Done'.format(x))

    count.value += 1
    logging.info('Complete {} out of {}'.format(int(count.value), total))

    return y
Exemple #2
0
def pdf_to_txt(input_pdf, output_txt):
    """Convert ``input_pdf`` to text at ``output_txt``.

    Runs the module-level ``pdf2txt`` script with ``python3`` and stores its
    standard output in ``output_txt``. When the result is no larger than
    ``MIN_BYTES``, ``ocr.pdfocr2txt`` is called on the same pair of paths.

    Args:
        input_pdf: Path of the PDF to convert.
        output_txt: Path of the text file to create.

    Returns:
        True when a conversion was performed. False when ``output_txt``
        already exists, when ``input_pdf`` is missing, or when any exception
        occurred (it is logged, not propagated).
    """
    # Local imports keep this block self-contained; the file's import
    # section is not visible from here.
    import shlex
    import subprocess

    try:
        if os.path.isfile(output_txt):
            logging.info('{} already a converted file'.format(input_pdf))
            return False
        if not os.path.isfile(input_pdf):
            logging.info('{}: File not found!'.format(input_pdf))
            return False

        # pdf2txt is word-split the way the shell used to split it, while
        # the file names are passed as single argv entries so spaces and
        # shell metacharacters in them are harmless.
        command = ['python3'] + shlex.split(pdf2txt) + [input_pdf]
        with open(output_txt, 'wb') as sink:
            try:
                subprocess.call(command, stdout=sink)
            except OSError:
                # Best effort, like the os.system call this replaces: the
                # empty output then triggers the OCR fallback below.
                logging.exception('%s: could not run the converter', input_pdf)

        if os.stat(output_txt).st_size <= MIN_BYTES:
            logging.info(
                '{}: File Size unsatisfactory. Performing OCR'.format(input_pdf))
            ocr.pdfocr2txt(input_pdf, output_txt,
                           resolution=resolution, tmp=tmp)

        logging.info('{} Done'.format(input_pdf))
        return True
    except Exception:
        # Boundary handler: callers get False rather than an exception.
        logging.error(
            "Exception occurred while converting file %s to txt", input_pdf, exc_info=True)
        return False