def fetch(self):
     """Assemble the book PDF from its page images and OCR it in place.

     Calls ``self.fetch_pages()`` first. If the book file does not exist
     yet, the numbered page JPEGs from ``self.get_dir_path()`` are combined
     with the disclaimer and blank asset pages and written as a PDF. The
     PDF is then OCR-ed onto itself; on success ``self.cleanup()`` runs and
     the final location is printed.

     Returns:
         None. An OCR failure is reported on stdout and the method returns
         early without calling ``self.cleanup()``.
     """
     self.fetch_pages()
     book_path = self.get_book_file_path()
     if not os.path.exists(book_path):
         pages = [
             os.path.join(self.get_dir_path(), f"{page:03}.jpg")
             for page in range(self.get_page_count())
         ]
         pages.insert(
             1, os.path.join(WORKSPACE, "assets", "disclaimer.jpg"))
         pages.insert(2, os.path.join(WORKSPACE, "assets", "blank.jpg"))
         # Render the PDF before creating the output file: if conversion
         # fails, no empty file is left behind to make the next run skip
         # generation because "the book already exists".
         pdf_bytes = img2pdf.convert(
             pages,
             title=self.name,
             creationdate=datetime(int(self.year), 1, 1),
             author="Comisión Nacional de Libros de Texto Gratuitos",
         )
         with open(book_path, "wb") as f:
             f.write(pdf_bytes)
     try:
         ocrmypdf.ocr(book_path,
                      book_path,
                      language='spa',
                      clean=True,
                      jobs=12,
                      max_image_mpixels=900)
     except ocrmypdf.exceptions.PriorOcrFoundError:
         # The PDF already carries a text layer; nothing left to index.
         pass
     except Exception as e:
         print(f"Error este libro no pudo ser indexado: {e}")
         return
     self.cleanup()
     print(f"Tu libro está listo en: {self.get_book_file_path()}\n")
Exemple #2
0
def process(file,
            failure_path=None,
            ocr_output_dir=None,
            gcs_bucket=None,
            **kwargs):
    """Build a ``Screenplay`` from ``file``, OCR-ing it first if it is a PDF.

    Args:
        file: Path of the input document. Paths ending in ``.pdf`` are run
            through ``ocrmypdf`` and the OCR output is used as the source.
        failure_path: Forwarded unchanged to ``Screenplay``.
        ocr_output_dir: Directory for OCR output; defaults to
            ``'ocr_output'`` when falsy. Created if missing.
        gcs_bucket: When truthy, ``scr.data`` is uploaded to this bucket
            as ``"<title>.json"`` via ``gcs_upload``.
        **kwargs: Extra keyword arguments forwarded to ``Screenplay``.

    Returns:
        The constructed ``Screenplay`` instance.
    """
    if file.endswith('.pdf'):
        if not ocr_output_dir:
            ocr_output_dir = 'ocr_output'
        # makedirs(exist_ok=True) also handles nested output paths and
        # avoids the exists()/mkdir() race when several files are
        # processed concurrently.
        os.makedirs(ocr_output_dir, exist_ok=True)
        ocr_output_path = os.path.join(ocr_output_dir, os.path.basename(file))
        ocrmypdf.ocr(
            input_file=file,
            output_file=ocr_output_path,
            deskew=True,
            use_threads=True,
            skip_text=True,
        )
        source = ocr_output_path
    else:
        source = file
    scr = Screenplay(source=source, failure_path=failure_path, **kwargs)
    if gcs_bucket:
        gcs_filename = "{}.json".format(scr.title)
        gcs_upload(scr.data, gcs_bucket, gcs_filename)
    return scr
Exemple #3
0
def convert2images(unique_id, meet, seconds, custom_coordinates, ocr):
    """Turn a stored video into slide frames and a PDF, optionally OCR-ed.

    Args:
        unique_id: Name of the video under ``videos_dir``; also used as the
            slide sub-directory name and the PDF base name.
        meet: Forwarded to ``video_to_frames``.
        seconds: Forwarded to ``video_to_frames``.
        custom_coordinates: Forwarded to ``video_to_frames``.
        ocr: When truthy, the generated PDF is OCR-ed in place.

    Raises:
        OSError: Re-raised after a diagnostic message is printed.
    """
    file = f"./{videos_dir}/{unique_id}"
    directory = unique_id  # Directory
    parent_dir = f"./{slides_dir}"  # Parent Directory path
    path = os.path.join(parent_dir, directory)  # Path
    try:
        os.makedirs(path, exist_ok=True)
    except OSError:
        # Report before re-raising: statements placed after a `raise`
        # never run, so the messages have to come first.
        print("Directory '{0}' can not be created".format(unique_id))
        print(file, " didn't complete successfully.")
        raise
    print("Directory '%s' created successfully" % directory)
    try:
        frames = video_to_frames(
            video_path=file,
            frames_dir=parent_dir,
            seconds=seconds,
            meet=meet,
            custom_coordinates=custom_coordinates,
        )
        if frames:  # Skip PDF generation when no frames were produced
            convert2pdf(unique_id)
            if ocr:
                # Disadvantage: the PDF grows in size and image quality
                # deteriorates.
                ocrmypdf.ocr(
                    f"./{pdfs_dir}/{unique_id}.pdf",
                    f"./{pdfs_dir}/{unique_id}.pdf",
                    deskew=True,
                    pdf_renderer="hocr",
                )
        freeUpSpace(unique_id)
    except OSError:
        print(file, " didn't complete successfully.")
        # Bare raise keeps the original traceback intact.
        raise
Exemple #4
0
def test_stream_api(resources):
    """OCR from an open binary stream into a BytesIO and check for a PDF header."""
    out = BytesIO()

    # Close the input stream deterministically instead of leaking the handle.
    with (resources / 'graph.pdf').open('rb') as in_:
        ocrmypdf.ocr(in_, out, tesseract_timeout=0.0)
    out.seek(0)
    assert b'%PDF' in out.read(1024)
Exemple #5
0
def pdf_extraction_func(pdf_file_path: pathlib.Path, output_folder_path: pathlib.Path):
    """OCR a PDF in a temporary directory and save its text as ``<stem>.txt``.

    The PDF is copied under a sanitised name into a temporary directory,
    OCR-ed with a sidecar text file, and the sidecar lines that hold more
    than one non-whitespace-trimmed character are written to
    ``output_folder_path / (pdf_file_path.stem + '.txt')``.

    Args:
        pdf_file_path: Source PDF.
        output_folder_path: Existing directory that receives the text file.
    """
    # Replace whitespace, dots and dashes by underscores, then collapse any
    # run of underscores into one. This name is only used for the temp copy.
    filename_no_suffix = re.sub(
        r'_{2,}', '_', re.sub(r'[\s.\-]', '_', pdf_file_path.stem))

    # The context manager removes the temp directory even when OCR fails.
    with tempfile.TemporaryDirectory() as temp_foldername:
        temp_dir = pathlib.Path(temp_foldername)
        pdf_file_temp_path = temp_dir / (filename_no_suffix + pdf_file_path.suffix)
        shutil.copy(pdf_file_path, pdf_file_temp_path)

        pdf_txt_temp_path = temp_dir / (filename_no_suffix + '.txt')
        pdf_modified_file_path = temp_dir / (filename_no_suffix + '_modified.pdf')

        ocrmypdf.ocr(
            input_file=pdf_file_temp_path,
            sidecar=pdf_txt_temp_path,
            output_file=pdf_modified_file_path,
            language='eng+chi_tra',
            optimize=3,
            deskew=True,
            force_ocr=True,
            progress_bar=True,
            image_dpi=1200,
            tesseract_oem=1,
            tesseract_pagesegmode=3,
        )

        with open(pdf_txt_temp_path, 'r', encoding='utf-8') as f:
            lines = [line for line in f if len(line.strip()) > 1]

    pdf_txt_path = output_folder_path / (pdf_file_path.stem + '.txt')

    with open(pdf_txt_path, 'w', encoding='utf-8') as g:
        g.writelines(lines)
Exemple #6
0
    def convert(self, pdf_filename):
        """Create a searchable copy of a PDF located in ``PDF_DIR``.

        Args:
            pdf_filename: File name (not a path) of the PDF inside ``PDF_DIR``.

        Returns:
            A ``(success, new_filename, error_message)`` tuple. On success
            ``new_filename`` is the timestamped ``*_searchable.pdf`` name and
            ``error_message`` is empty; on failure ``new_filename`` is empty.
        """
        pdf_filepath = os.path.join(PDF_DIR, pdf_filename)
        if not os.path.isfile(pdf_filepath):
            return False, "", "File not found"

        now = datetime.now().strftime("%d%m%Y-%H%M%S")

        # splitext removes whatever extension is present, rather than
        # blindly cutting the last four characters of the name.
        stem = os.path.splitext(pdf_filename)[0]
        new_pdf_filename = stem + now + "_searchable.pdf"
        new_pdf_filepath = os.path.join(PDF_DIR, new_pdf_filename)

        try:
            ocr.ocr(input_file=pdf_filepath,
                    output_file=new_pdf_filepath,
                    skip_text=True)
        except Exception:
            return False, "", "OCR exception occured"

        if os.path.isfile(new_pdf_filepath):
            return True, new_pdf_filename, ""

        return False, "", "File not found"
Exemple #7
0
def ocr():
    '''
    OCR a hard-coded sample invoice PDF and write a text sidecar next to it.

    inputFile: full path of the scanned PDF
    outputFile: full path of the OCR'd file (may equal inputFile, in which
    case the scanned file is replaced)
    textFile: full path of the text file to create

    Options passed to ocrmypdf:
    force_ocr: OCR again even if the file already has a text layer,
    replacing the previous one
    deskew: analyse and correct the rotation of each page
    tesseract_pagesegmode: Tesseract psm mode (0-13); None is passed here
    sidecar: location of the text file to create
    rotate_pages: further improves accuracy on rotated scans and outputs
    the pages rotated to the right orientation

    :return: None
    '''

    # Raw strings: the paths contain backslashes that must not be read as
    # escape sequences ("\w" is an invalid escape in a normal literal).
    inputFile = r"C:\D_Drive\wordpress-pdf-invoice-plugin-sample.pdf"
    outputFile = r"C:\D_Drive\wordpress-pdf-invoice-plugin-sample_ocr.pdf"
    textFile = r"C:\D_Drive\wordpress-pdf-invoice-plugin-sample.txt"

    psm = None

    ocrmypdf.ocr(input_file=inputFile,
                 output_file=outputFile,
                 force_ocr=True,
                 deskew=True,
                 tesseract_pagesegmode=psm,
                 sidecar=textFile,
                 rotate_pages=True)
Exemple #8
0
def add_ocr_to_pdf(update, context):
    """Add an OCR text layer to the user's stored PDF and send it back.

    Args:
        update: Incoming update; its effective message is used for replies.
        context: Callback context providing ``bot`` and ``user_data``.

    Returns:
        ``ConversationHandler.END`` in every case.
    """
    if not check_user_data(update, context, PDF_INFO):
        return ConversationHandler.END

    _ = set_lang(update, context)
    update.effective_message.reply_text(
        _("Adding an OCR text layer to your PDF file"),
        reply_markup=ReplyKeyboardRemove(),
    )

    user_data = context.user_data
    file_id, file_name = user_data[PDF_INFO]

    with tempfile.NamedTemporaryFile() as tf:
        pdf_file = context.bot.get_file(file_id)
        pdf_file.download(custom_path=tf.name)

        with tempfile.TemporaryDirectory() as dir_name:
            out_fn = os.path.join(dir_name,
                                  f"OCR_{os.path.splitext(file_name)[0]}.pdf")
            try:
                # logging.getLogger("ocrmypdf").setLevel(logging.WARNING)
                ocrmypdf.ocr(tf.name, out_fn, deskew=True, progress_bar=False)
                send_result_file(update, context, out_fn, "ocr")
            except PriorOcrFoundError:
                update.effective_message.reply_text(
                    _("Your PDF file already has a text layer"))

    # Clean up memory. The stored value is the (file_id, file_name) pair
    # unpacked above, so compare its first element rather than the whole
    # pair, and tolerate the entry having been removed in the meantime.
    if PDF_INFO in user_data and user_data[PDF_INFO][0] == file_id:
        del user_data[PDF_INFO]

    return ConversationHandler.END
Exemple #9
0
def pdf2txt(doc_path):
    """Extract the text of a PDF into a sibling ``.txt`` file.

    Plain extraction is tried first via ``P.extract_text``. If the resulting
    text file is too small, ``ocr_pdf`` is tried, and if that raises while
    the text file is still too small, ``ocrmypdf`` with a sidecar is the
    last resort. An existing text file is not a reason to skip: the
    document is always processed again.

    Args:
        doc_path: ``pathlib.Path``-like path of the PDF (must provide
            ``with_suffix`` and ``parents``).

    Returns:
        1 when a text file was produced, 0 when the document was skipped
        for being too big or every conversion attempt failed.
    """
    txt_path = doc_path.with_suffix('.txt')
    if file_is_too_big(doc_path):
        tqdm.write(f"File {doc_path} is too big. Skipping...")
        return 0
    try:
        P.extract_text(doc_path)  # writes text to /path/to/my_file.txt
        if not file_is_too_small(txt_path):
            # Plain extraction was enough; report success explicitly
            # instead of falling off the end and returning None.
            return 1
        # Text file is very small, PDF probably holds images: try OCR.
        try:
            ocr_txt = ocr_pdf(doc_path)
            with open(txt_path, "w") as filo:
                filo.write(ocr_txt)
            return 1
        except Exception:
            if not file_is_too_small(txt_path):
                # Enough text is on disk despite the error.
                return 1
            try:
                ocrmypdf.ocr(doc_path,
                             doc_path.parents[0] / "result.pdf",
                             sidecar=txt_path)
                return 1
            except Exception as e:
                print(
                    f"Could not ocr convert to txt file {doc_path}: {str(e)}"
                )
                return 0
    except Exception as e:
        print(f"Could not convert to txt file {doc_path}: {str(e)}")
        return 0
 def ocr(self, pdf_file, destination):
     """Run ocrmypdf on ``pdf_file`` and write the result to ``destination``.

     Uses deskewing, the German language pack and an oversample value
     of 500; the source is logged at debug level first.
     """
     self.log.debug(f'ocr for source={pdf_file}')
     ocr_options = {
         "deskew": True,
         "language": "deu",
         "oversample": 500,
     }
     ocrmypdf.ocr(pdf_file, destination, **ocr_options)
Exemple #11
0
def ocr(book_title):
    """OCR ``<book_title>_IMG.pdf`` into ``<book_title>_IMG_OCR.pdf``.

    Does nothing unless ``config['Do_OCR']`` equals the string ``"True"``.
    """
    ocr_enabled = config['Do_OCR'] == "True"
    if not ocr_enabled:
        return
    image_pdf_stem = book_title + '_IMG'
    source_pdf = image_pdf_stem + '.pdf'
    target_pdf = image_pdf_stem + "_OCR.pdf"
    ocrmypdf.ocr(source_pdf, target_pdf, use_threads=True)
Exemple #12
0
def test_links(resources, outpdf):
    """After redoing OCR, each page's first link must still target the other page."""
    ocrmypdf.ocr(
        resources / 'link.pdf', outpdf, redo_ocr=True, oversample=200, output_type='pdf'
    )
    # Open the result as a context manager so the file handle is released.
    with pikepdf.open(outpdf) as pdf:
        p1 = pdf.pages[0]
        p2 = pdf.pages[1]
        assert p1.Annots[0].A.D[0].objgen == p2.objgen
        assert p2.Annots[0].A.D[0].objgen == p1.objgen
Exemple #13
0
def handle(req):
    """OCR a base64-encoded PDF request body and print the extracted text.

    The payload is stored through ``save_image_from_base64``, OCR-ed into
    ``./tmp/output.pdf`` and the text read back with ``textract``.
    """
    ocr_output = './tmp/output.pdf'

    # The decoded bytes are not used below; the call is kept because it
    # runs before anything is saved (presumably as an early validity
    # check -- TODO confirm).
    _decoded = base64.decodebytes(bytes(req, 'utf-8'))
    stored_path = save_image_from_base64(req, 'pdf')

    ocrmypdf.ocr(stored_path, ocr_output, deskew=True)

    extracted = textract.process(ocr_output)
    print(extracted)
Exemple #14
0
 def extractText(filename):
     """Force-OCR ``filename`` into ``/tmp`` and return the output path.

     Returns:
         The path of the OCR-ed copy in ``/tmp``, or ``None`` when OCR
         failed (the failure is logged with its traceback).
     """
     outputFilename = f"/tmp/{path.basename(filename)}"
     try:
         ocr(input_file=filename,
             output_file=outputFilename,
             force_ocr=True,
             progress_bar=False)
     except Exception:
         # Catch Exception rather than using a bare except so that
         # KeyboardInterrupt / SystemExit still propagate.
         # logging.exception already attaches the traceback.
         logging.exception(f"Cannot ocr {path.basename(filename)}.")
         return None
     return outputFilename
Exemple #15
0
def test_no_glyphless_graft(resources, outdir):
    """OCR a PDF merged from three sources with MAX_REPLACE_PAGES patched to 2."""
    # Keep all three source PDFs open until the merged file is saved, then
    # release the handles instead of leaking them.
    with pikepdf.open(resources / 'francais.pdf') as pdf, \
            pikepdf.open(resources / 'aspect.pdf') as pdf_aspect, \
            pikepdf.open(resources / 'cmyk.pdf') as pdf_cmyk:
        pdf.pages.extend(pdf_aspect.pages)
        pdf.pages.extend(pdf_cmyk.pages)
        pdf.save(outdir / 'test.pdf')

    with patch('ocrmypdf._graft.MAX_REPLACE_PAGES', 2):
        ocrmypdf.ocr(outdir / 'test.pdf',
                     outdir / 'out.pdf',
                     deskew=True,
                     tesseract_timeout=0)
Exemple #16
0
def execute_ocrmypdf(file_path):
    """OCR ``file_path`` into the configured output directory.

    With ``OUTPUT_DIRECTORY_YEAR_MONTH`` enabled the output goes into a
    ``<OUTPUT_DIRECTORY>/<year>/<month>`` sub-directory for today's date,
    which is created on demand; otherwise straight into
    ``OUTPUT_DIRECTORY``. The input file name is kept.
    """
    filename = Path(file_path).name
    if not OUTPUT_DIRECTORY_YEAR_MONTH:
        target_dir = Path(OUTPUT_DIRECTORY)
    else:
        now = datetime.today()
        target_dir = Path(f'{OUTPUT_DIRECTORY}/{now.year}/{now.month}')
        if not target_dir.exists():
            target_dir.mkdir(parents=True, exist_ok=True)
    output_path = target_dir / filename
    print(f'New file: {file_path}.\nAttempting to OCRmyPDF to: {output_path}')
    ocrmypdf.ocr(file_path, output_path)
Exemple #17
0
def createWOFileTypeKeys():
    """Derive keywords that identify each report type and save them as CSV.

    Every sub-directory of the hard-coded scoring directory is treated as a
    report type. For each PDF in it, the text is extracted with
    ``pdftotext``, spaces, colons and dots are stripped, and the distinct
    tokens among the first 200 non-empty lines are collected. A keyword is
    kept for a type when it occurs in every usable PDF of that type, does
    not occur among the keywords of the other known report types, and is
    between 5 and 22 characters long. The result is written to
    ``filetypekeywords.csv`` with columns ``pdftype`` and ``keyword``.

    PDFs that yield at most one token are OCR-ed in place and left out of
    the current run.
    """
    base_dir = '/media/andrew/F08C9B848C9B444E/analysis/tv/orderscoring/'
    report_types = ('contracts', 'invoices', 'orders')
    strip_chars = str.maketrans('', '', ' :.')

    keys = {}
    for ftype in list(os.scandir(base_dir)):
        o = list(os.scandir(base_dir + ftype.name + '/'))
        op = []
        # Convert all pdfs to text, string process them and turn them into
        # an array of strings
        for f in o:
            res = subprocess.run(['pdftotext', f.path, '-'],
                                 stdout=subprocess.PIPE).stdout.decode()
            res = [x.translate(strip_chars) for x in res.split('\n')]
            res = [x for x in res if x != ''][0:200]
            res = list(set(res))
            if len(res) > 1:
                op.append(res)
            else:
                ocrmypdf.ocr(f.path, f.path, deskew=True, rotate_pages=True)
        # Keep only keys that exist in all files of the given report
        # format. Track the first file by position: restarting whenever the
        # running intersection is empty would wrongly revive keys after the
        # intersection had already become empty.
        curSet = []
        print(len(op))
        for position, tokens in enumerate(op):
            if position == 0:
                curSet = tokens
            else:
                present = set(tokens)
                curSet = [x for x in curSet if x in present]
        keys[ftype.name] = curSet

    # Filter out keys that are non-unique to that report type. Compare
    # against a snapshot taken before any filtering: otherwise a key shared
    # by two types is dropped from the first one processed and then
    # survives in the second.
    snapshot = {name: set(values) for name, values in keys.items()}
    for k in keys:
        shared = set()
        for t in report_types:
            if t != k:
                # .get: a report type may have no directory at all.
                shared |= snapshot.get(t, set())
        # Also get rid of nonspecific (too short / too long) keys.
        keys[k] = [x for x in keys[k] if x not in shared and 4 < len(x) < 23]
        print(keys[k])

    rows = [(k, vv) for k, v in keys.items() for vv in v]
    keys = pd.DataFrame(rows, columns=['pdftype', 'keyword'])
    keys.to_csv('filetypekeywords.csv', index=False)
Exemple #18
0
def test_no_glyphless_graft(resources, outdir):
    """OCR a PDF merged from three sources with the max-replace-pages env override set to 2."""
    # Keep all three source PDFs open until the merged file is saved, then
    # release the handles instead of leaking them.
    with pikepdf.open(resources / 'francais.pdf') as pdf, \
            pikepdf.open(resources / 'aspect.pdf') as pdf_aspect, \
            pikepdf.open(resources / 'cmyk.pdf') as pdf_cmyk:
        pdf.pages.extend(pdf_aspect.pages)
        pdf.pages.extend(pdf_cmyk.pages)
        pdf.save(outdir / 'test.pdf')

    env = os.environ.copy()
    env['_OCRMYPDF_MAX_REPLACE_PAGES'] = '2'
    with os_environ(env):
        ocrmypdf.ocr(
            outdir / 'test.pdf', outdir / 'out.pdf', deskew=True, tesseract_timeout=0
        )
Exemple #19
0
    def create_searchable_pdf(self, input_pdf, output_pdf):
        """OCR ``input_pdf`` into ``output_pdf`` with rotation, deskew and cleaning.

        Any exception raised by ocrmypdf is logged at error level and
        swallowed; the method always returns ``None``.
        """
        logging.debug("Working directory: %s" % os.getcwd())

        ocr_options = {
            "rotate_pages": True,
            "rotate_pages_threshold": 13,
            "deskew": True,
            "clean": True,
        }
        try:
            ocrmypdf.ocr(input_pdf, output_pdf, **ocr_options)
        except Exception as err:
            logging.error(err)
Exemple #20
0
def test_limited_pages(resources, outpdf, spoof_tesseract_cache):
    """With ``pages='5-6'`` only the fifth and sixth pages receive text."""
    ocr_options = dict(
        pages='5-6',
        optimize=0,
        output_type='pdf',
        tesseract_env=spoof_tesseract_cache,
    )
    ocrmypdf.ocr(resources / 'multipage.pdf', outpdf, **ocr_options)

    info = PdfInfo(outpdf)
    # First page is outside the requested range.
    assert not info.pages[0].has_text
    # Zero-based indices 4 and 5 correspond to pages 5 and 6.
    assert info.pages[4].has_text
    assert info.pages[5].has_text
def test_limited_pages(resources, outpdf):
    """With ``pages='5-6'`` only the fifth and sixth pages receive text."""
    ocr_options = dict(
        pages='5-6',
        optimize=0,
        output_type='pdf',
        plugins=['tests/plugins/tesseract_cache.py'],
    )
    ocrmypdf.ocr(resources / 'multipage.pdf', outpdf, **ocr_options)

    info = PdfInfo(outpdf)
    # First page is outside the requested range.
    assert not info.pages[0].has_text
    # Zero-based indices 4 and 5 correspond to pages 5 and 6.
    assert info.pages[4].has_text
    assert info.pages[5].has_text
Exemple #22
0
def _get_text(inpdf, sesspath, language, unpaper_args, minwords):
    """OCR ``inpdf`` inside ``sesspath`` and return its sidecar text.

    ``_need_ocr`` decides whether a full forced OCR is required. If so,
    deskew, page rotation, background removal and cleaning are switched on
    with ``force_ocr``; otherwise only ``redo_ocr`` is requested.

    Returns:
        A ``(sidecar_text, prelim_text)`` tuple, where ``prelim_text`` is
        the second value returned by ``_need_ocr``.
    """
    force_ocr, prelim_text = _need_ocr(inpdf, minwords)
    # Every image-preprocessing switch simply follows the force_ocr decision.
    options = {
        "sidecar": f"{sesspath}/tmp.txt",
        "language": language,
        "deskew": force_ocr,
        "rotate_pages": force_ocr,
        "remove_background": force_ocr,
        "clean": force_ocr,
        "unpaper_args": unpaper_args,
        "redo_ocr": (not force_ocr),
        "force_ocr": force_ocr,
    }
    ocr(inpdf, f"{sesspath}/tmp.pdf", **options)
    with open(f"{sesspath}/tmp.txt", "rt") as sidecar_file:
        sidecar_text = sidecar_file.read()
    return sidecar_text, prelim_text
Exemple #23
0
def test_masks(spoof_tesseract_noop, resources, outpdf):
    """OCR of ``masks.pdf`` must finish with ``ExitCode.ok``."""
    exit_code = ocrmypdf.ocr(
        resources / 'masks.pdf', outpdf, tesseract_env=spoof_tesseract_noop
    )
    assert exit_code == ExitCode.ok
Exemple #24
0
def execute_ocrmypdf(file_path):
    """Wait for ``file_path`` to stop growing, OCR it, optionally delete it.

    The output goes to ``OUTPUT_DIRECTORY`` or, when
    ``OUTPUT_DIRECTORY_YEAR_MONTH`` is set, to a ``<year>/<month>``
    sub-directory for today's date that is created on demand. The source
    file is removed only when ocrmypdf returns 0 and ``ON_SUCCESS_DELETE``
    is truthy.
    """
    new_file = Path(file_path)
    filename = new_file.name
    if not OUTPUT_DIRECTORY_YEAR_MONTH:
        output_path = Path(OUTPUT_DIRECTORY) / filename
    else:
        today = datetime.today()
        dated_dir = Path(f'{OUTPUT_DIRECTORY}/{today.year}/{today.month}')
        if not dated_dir.exists():
            dated_dir.mkdir(parents=True, exist_ok=True)
        output_path = dated_dir / filename
    logger.info(f'New file: {file_path}. Waiting until fully loaded...')
    # Poll the size until two consecutive readings agree, so the file is
    # completely on disk before it is read. Docker sometimes publishes the
    # watchdog event before the file is fully written, which makes pikepdf
    # fail.
    current_size = None
    while current_size != new_file.stat().st_size:
        current_size = new_file.stat().st_size
        logger.debug(f'new_file current_size: {current_size}')
        time.sleep(POLL_NEW_FILE_SECONDS)
    logger.info(f'Attempting to OCRmyPDF to: {output_path}')
    exit_code = ocrmypdf.ocr(input_file=file_path,
                             output_file=output_path,
                             deskew=DESKEW)
    delete_source = exit_code == 0 and ON_SUCCESS_DELETE
    if not delete_source:
        logger.info('Done')
        return
    logger.info(f'Done. Deleting: {file_path}')
    new_file.unlink()
Exemple #25
0
def test_masks(resources, outpdf):
    """OCR of ``masks.pdf`` with the no-op Tesseract plugin must return ``ExitCode.ok``."""
    exit_code = ocrmypdf.ocr(
        resources / 'masks.pdf', outpdf, plugins=['tests/plugins/tesseract_noop.py']
    )
    assert exit_code == ExitCode.ok
    def process(self, pdfData, outputName=None, modificationTime=None):
        """Store ``pdfData``, OCR it and move the result into ``self.destPath``.

        Args:
            pdfData: Passed to ``self.store``, which returns the input,
                output and sidecar file paths.
            outputName: Destination name relative to ``self.destPath``. When
                ``None``, a ``"<year> <month> <name>"`` name is built from
                ``self.guess(sidecar)`` and given a numeric suffix until it
                is unused.
            modificationTime: If given and a file with a newer mtime already
                exists at the destination, OCR is skipped.

        PDFs that already contain OCR text or are encrypted are copied to
        the destination unprocessed. The stored input file is removed at the
        end; other exceptions propagate before that happens.
        """
        inf, outf, sidef = self.store(pdfData)
        #self.log.info("Creating file: %s" % outf)
        try:
            if outputName is None:
                yr, mt, name = self.guess(sidef)
                destName = os.path.join(self.destPath,
                                        "%s %s %s" % (yr, mt, name))
                # Append " 02", " 03", ... until the name is free.
                idx = 2
                orgName = destName
                while os.path.exists(destName):
                    destName = "%s %02d" % (orgName, idx)
                    idx += 1
            else:
                destName = os.path.join(self.destPath, outputName)
                os.makedirs(Path(destName).parent, exist_ok=True)
            if modificationTime is not None and os.path.exists(
                    destName
            ) and os.path.getmtime(destName) > modificationTime:
                self.log.info(
                    "Skipping processing, because newer file (%s) with same name exists"
                    % destName)
            else:
                ocrmypdf.ocr(inf,
                             outf,
                             deskew=self.deskew,
                             sidecar=sidef,
                             remove_background=self.removeBackground,
                             language=self.language)
                shutil.move(outf, destName)
                os.remove(sidef)
                self.log.info("Created & processed document %s" % destName)
        except ocrmypdf.exceptions.PriorOcrFoundError:
            # ok - we skip the document, but write a message to the log file.
            self.log.info(
                "Skipping processing (copy only), because of existing ocr: %s"
                % (inf if outputName is None else outputName))
            shutil.copyfile(inf, destName)
        except ocrmypdf.exceptions.EncryptedPdfError:
            # ok - we skip the document, but write a message to the log file.
            # Logger.warn is a deprecated alias; use warning().
            self.log.warning(
                "Skipping processing (copy only), because PDF is encrypted: %s"
                % (inf if outputName is None else outputName))
            shutil.copyfile(inf, destName)

        os.remove(inf)
Exemple #27
0
    def generate_searchable_pdf(self, pdf, tmp_path, separator):
        """
        Start from standard PDF, with no OCR, and create a searchable PDF, with OCR. Thanks to ocrmypdf python lib

        The bytes of the resulting file are stored in ``self.searchablePdf``.
        When ``separator.convert_to_pdfa`` equals ``"True"``, the file
        produced by ``separator.convert_to_pdfa_function`` is read instead.

        :param pdf: Path to original pdf (not searchable, without OCR)
        :param tmp_path: Path to store the final pdf, searchable with OCR
        :param separator: Class Separator instance
        """
        try:
            output_file = tmp_path + '/result.pdf'
            ocrmypdf.ocr(pdf, output_file, language=self.lang, skip_text=True, progress_bar=False, jobs=int(self.Config.cfg['GLOBAL']['nbthreads']))
            if separator.convert_to_pdfa == "True":
                output_file = tmp_path + '/result-pdfa.pdf'
                separator.convert_to_pdfa_function(output_file, tmp_path + '/result.pdf', self.Log)

            # Read through a context manager so the handle is always closed.
            with open(output_file, 'rb') as searchable_file:
                self.searchablePdf = searchable_file.read()
        except ocrmypdf.exceptions.PriorOcrFoundError as e:
            self.Log.error(e)
        def do_POST(self):
            """OCR the PDF sent in the request body and reply with the result.

            Query parameters ``rotate_pages``, ``deskew``,
            ``remove_background`` and ``force_ocr`` accept
            yes/true/1 or no/false/0; ``dpi`` must consist of digits only.
            A missing, repeated or unrecognised value falls back to its
            default.

            Responses: 200 with the OCR-ed PDF, 400 when the document
            already contains OCR text, 500 for any other error.
            """
            try:
                content_length = int(self.headers['Content-Length'])
                post_data = self.rfile.read(content_length)
                query = parse_qs(urlparse(self.path).query)

                def query_string(name, default):
                    # A parameter only counts when given exactly once.
                    raw = query.get(name, [])
                    if len(raw) == 1:
                        return raw[0]
                    return default

                def query_boolean(name, default):
                    raw = query.get(name, [])
                    if len(raw) == 1:
                        if raw[0] in ('yes', 'true', '1'):
                            return True
                        if raw[0] in ('no', 'false', '0'):
                            return False
                    return default

                # The dpi value comes straight from the URL and ends up in
                # the unpaper argument string; accept plain digits only so a
                # request cannot smuggle in extra unpaper options.
                dpi = query_string('dpi', '200')
                if not dpi or any(ch not in '0123456789' for ch in dpi):
                    dpi = '200'

                with tempfile.NamedTemporaryFile() as temp:
                    with tempfile.NamedTemporaryFile() as tempOut:
                        temp.write(post_data)
                        temp.seek(0)

                        ocrmypdf.ocr(
                            temp.name,
                            tempOut.name,
                            language='deu+eng',
                            rotate_pages=query_boolean('rotate_pages', True),
                            deskew=query_boolean('deskew', True),
                            remove_background=query_boolean('remove_background', True),
                            clean_final=True,
                            force_ocr=query_boolean('force_ocr', False),
                            unpaper_args='--dpi %s --post-size a4' % dpi,
                            progress_bar=False
                        )

                        self.send_response(200)
                        self.send_header('Content-type', 'application/pdf')
                        self.end_headers()
                        self.wfile.write(tempOut.read())
            except ocrmypdf.exceptions.PriorOcrFoundError:
                self.send_response(400)
                self.send_header('Content-type', 'text/plain')
                self.end_headers()
                self.wfile.write(bytes('Document already has been OCRed', 'utf-8'))
            except Exception:
                # Exception rather than a bare except, so KeyboardInterrupt
                # and SystemExit can still stop the server.
                self.send_response(500)
                self.send_header('Content-type', 'text/plain')
                self.end_headers()
                self.wfile.write(bytes('Unknown error', 'utf-8'))
                traceback.print_exc(file=sys.stdout)
Exemple #29
0
def ocr_pdf_if_not_searchable(filepath):
    """OCR ``filepath`` when its text percentage is below 0.01.

    The output is written to ``<name without last 4 chars>_OCR.pdf``, a
    path relative to the current working directory.
    """
    # NOTE(review): the bookkeeping values below are updated but never
    # returned in the code visible here -- confirm against callers.
    processed_names = ""
    ocr_count = 0
    _head, tail = os.path.split(filepath)
    text_perc = get_text_percentage(filepath)
    if not text_perc < 0.01:
        return
    ocr_count += 1
    output_name = filepath.rsplit("/", 1)[-1][:-4] + "_OCR.pdf"
    _result = ocrmypdf.ocr(filepath, output_name, redo_ocr=True)
    processed_names += tail
def OCR(choice, language):
    """Force-OCR the current book PDF and report the outcome on the status label.

    Args:
        choice: When truthy the result is saved as ``<book_title>_ocr.pdf``;
            otherwise the original ``<book_title>.pdf`` is overwritten.
        language: Object whose ``get()`` returns the OCR language string.
    """
    try:
        output_name = f'{book_title}_ocr.pdf' if choice else f'{book_title}.pdf'
        save_path = os.path.join(directory, output_name)
        pdf_path = os.path.join(directory, f'{book_title}.pdf')

        ocr_options = dict(
            rotate_pages=True,
            remove_background=True,
            language=language.get(),
            deskew=True,
            force_ocr=True,
        )
        ocrmypdf.ocr(pdf_path, save_path, **ocr_options)

        lbl_output_2.config(fg='green')
        output_2_text.set("OCR completed")

    except Exception as err:
        print(err)
        lbl_output_2.config(fg='red')
        output_2_text.set("Failed to perform OCR")