Example #1
0
def pages_from_rasterize(docid):
    """Rasterize a PDF document into pages and queue each for binarization.

    Looks up the document via ``util.is_valid_doc``, rasterizes it with
    ``util.rasterize_pdf``, stores the resulting page count on the document
    when it differs from ``doc.num_pages``, then saves one ``DocumentPage``
    per rasterized file and hands it to ``binarize_page``.

    Args:
        docid: Document identifier, passed unchanged to
            ``util.is_valid_doc``.
    """
    doc = util.is_valid_doc(docid)
    # Single-argument parenthesised form prints the same text on Python 2
    # and remains valid syntax on Python 3.
    print("Rasterizing pages...")
    page_files = util.rasterize_pdf(doc)

    # The page count stage may not have determined the count; the number of
    # rasterized files is the value that gets stored.
    page_count = len(page_files)
    if doc.num_pages != page_count:
        doc.num_pages = page_count
        doc.save()

    # Each entry is read as [0] = files prefix, [1] = output extension.
    # Page numbers are the zero-based positions within page_files.
    for page_number, page_file in enumerate(page_files):
        doc_page = DocumentPage(document=doc,
                                files_prefix=page_file[0],
                                stage_output_extension=page_file[1],
                                page_number=page_number,
                                start_process_date=datetime.now(),
                                status='w')
        doc_page.save()

        # Docs already guaranteed converted, move to binarization.
        binarize_page.delay(doc_page)
Example #2
0
def pages_from_rasterize(docid):
    """Rasterize a PDF document into pages and queue each for binarization.

    Looks up the document via ``util.is_valid_doc``, rasterizes it with
    ``util.rasterize_pdf``, stores the resulting page count on the document
    when it differs from ``doc.num_pages``, then saves one ``DocumentPage``
    per rasterized file and hands it to ``binarize_page``.

    Args:
        docid: Document identifier, passed unchanged to
            ``util.is_valid_doc``.
    """
    doc = util.is_valid_doc(docid)
    # Single-argument parenthesised form prints the same text on Python 2
    # and remains valid syntax on Python 3.
    print("Rasterizing pages...")
    page_files = util.rasterize_pdf(doc)

    # The page count stage may not have determined the count; the number of
    # rasterized files is the value that gets stored.
    page_count = len(page_files)
    if doc.num_pages != page_count:
        doc.num_pages = page_count
        doc.save()

    # Each entry is read as [0] = files prefix, [1] = output extension.
    # Page numbers are the zero-based positions within page_files.
    for page_number, page_file in enumerate(page_files):
        doc_page = DocumentPage(document=doc,
                                files_prefix=page_file[0],
                                stage_output_extension=page_file[1],
                                page_number=page_number,
                                start_process_date=datetime.now(),
                                status='w')
        doc_page.save()

        # Docs already guaranteed converted, move to binarization.
        binarize_page.delay(doc_page)
Example #3
0
def pages_from_images(docid):
    """Split an image document into pages and queue each for conversion.

    Looks up the document via ``util.is_valid_doc``, splits it with
    ``util.split_to_files``, stores the resulting page count on the document
    when it differs from ``doc.num_pages``, then saves one ``DocumentPage``
    per returned file and hands it to ``convert_page``.

    Args:
        docid: Document identifier, passed unchanged to
            ``util.is_valid_doc``.
    """
    doc = util.is_valid_doc(docid)
    # Single-argument parenthesised form prints the same text on Python 2
    # and remains valid syntax on Python 3.
    print("Constructing pages from images...")
    # TODO: Consider splitting to multi-page TIFF so tesseract can learn

    page_files = util.split_to_files(doc)

    # The page count stage may not have determined the count; the number of
    # split files is the value that gets stored.
    page_count = len(page_files)
    if doc.num_pages != page_count:
        doc.num_pages = page_count
        doc.save()

    # Creates DocumentPages for each file returned by the split function,
    # then launches conversion, etc. tasks for each DocumentPage.
    # Each entry is read as [0] = files prefix, [1] = output extension.
    # Page numbers are the zero-based positions within page_files.
    for page_number, page_file in enumerate(page_files):
        doc_page = DocumentPage(document=doc,
                                files_prefix=page_file[0],
                                stage_output_extension=page_file[1],
                                page_number=page_number,
                                start_process_date=datetime.now(),
                                status='w')
        doc_page.save()

        convert_page.delay(doc_page)
Example #4
0
def pages_from_images(docid):
    """Split an image document into pages and queue each for conversion.

    Looks up the document via ``util.is_valid_doc``, splits it with
    ``util.split_to_files``, stores the resulting page count on the document
    when it differs from ``doc.num_pages``, then saves one ``DocumentPage``
    per returned file and hands it to ``convert_page``.

    Args:
        docid: Document identifier, passed unchanged to
            ``util.is_valid_doc``.
    """
    doc = util.is_valid_doc(docid)
    # Single-argument parenthesised form prints the same text on Python 2
    # and remains valid syntax on Python 3.
    print("Constructing pages from images...")
    # TODO: Consider splitting to multi-page TIFF so tesseract can learn

    page_files = util.split_to_files(doc)

    # The page count stage may not have determined the count; the number of
    # split files is the value that gets stored.
    page_count = len(page_files)
    if doc.num_pages != page_count:
        doc.num_pages = page_count
        doc.save()

    # Creates DocumentPages for each file returned by the split function,
    # then launches conversion, etc. tasks for each DocumentPage.
    # Each entry is read as [0] = files prefix, [1] = output extension.
    # Page numbers are the zero-based positions within page_files.
    for page_number, page_file in enumerate(page_files):
        doc_page = DocumentPage(document=doc,
                                files_prefix=page_file[0],
                                stage_output_extension=page_file[1],
                                page_number=page_number,
                                start_process_date=datetime.now(),
                                status='w')
        doc_page.save()

        convert_page.delay(doc_page)