Ejemplo n.º 1
0
def iter_images_and_pages(images):
    """Iterate over the given image files and every page they contain.

    OpenCV is not able to handle multipage TIFF files, so TIFFs are loaded
    page by page with the SDAPS internal loading method and converted into a
    ``(height, width, 3)`` ``uint8`` array in BGR channel order. Every other
    file is handed to ``cv2.imread`` and treated as a single page.

    :param images: Iterable of image file names.
    :returns: Generator yielding ``(img, filename, page)`` tuples; ``page`` is
        the zero based page index inside ``filename``.
    """

    for filename in images:
        try:
            # Check whether this is a TIFF file (ie. try to retrieve the page count)
            pages = image.get_tiff_page_count(filename)
            is_tiff = True
        except AssertionError:
            # Not a TIFF file, fall back to a single page read by OpenCV.
            pages = 1
            is_tiff = False

        # range() instead of xrange() so that this runs on Python 2 and 3.
        for page in range(pages):
            if not is_tiff:
                img = cv2.imread(filename)

            else:
                # TIFF pages are zero based
                surf = image.get_rgb24_from_tiff(filename, page, False)

                width = surf.get_width()
                height = surf.get_height()
                stride = surf.get_stride()

                # We need to ensure a sane stride! The stride is given in
                # bytes, each pixel takes four of them. Floor division keeps
                # the array shape an integer even with true division.
                np_width = stride // 4

                # This converts by doing a copy; first create target numpy array
                # We need a dummy alpha channel ...
                target = np.empty((height, np_width), dtype=np.uint32)

                tmp_surf = cairo.ImageSurface.create_for_data(
                    target.data, cairo.FORMAT_RGB24, width, height, stride)
                cr = cairo.Context(tmp_surf)
                cr.set_source_surface(surf)
                cr.paint()
                del cr
                tmp_surf.flush()
                del tmp_surf

                # Now, we need a bit of reshaping
                img = np.empty((height, width, 3), dtype=np.uint8)

                # Drop any padding pixels at the end of each row so that the
                # shapes match even if the stride is larger than the width.
                pixels = target[:, :width]

                # order should be BGR
                img[:, :, 2] = 0xff & (pixels >> 16)
                img[:, :, 1] = 0xff & (pixels >> 8)
                img[:, :, 0] = 0xff & pixels

            yield img, filename, page
Ejemplo n.º 2
0
def iter_images_and_pages(images):
    """Yield every page of every file in ``images`` as an OpenCV style array.

    OpenCV is not able to handle multipage TIFF files, so those are read with
    the SDAPS internal loading method, one page at a time, and copied into a
    ``(height, width, 3)`` ``uint8`` array with BGR channel order. Files that
    are not TIFFs are read with ``cv2.imread`` as a single page.

    :param images: Iterable of image file names.
    :returns: Generator yielding ``(img, filename, page)`` tuples; ``page`` is
        the zero based page index inside ``filename``.
    """

    for filename in images:
        try:
            # Check whether this is a TIFF file (ie. try to retrieve the page count)
            pages = image.get_tiff_page_count(filename)
            is_tiff = True
        except AssertionError:
            # Not a TIFF file; OpenCV reads it as one single page.
            pages = 1
            is_tiff = False

        # range() exists on Python 2 and 3, xrange() only on Python 2.
        for page in range(pages):
            if not is_tiff:
                img = cv2.imread(filename)

            else:
                # TIFF pages are zero based
                surf = image.get_rgb24_from_tiff(filename, page, False)

                width = surf.get_width()
                height = surf.get_height()
                stride = surf.get_stride()

                # We need to ensure a sane stride! It is measured in bytes and
                # every pixel occupies four bytes. "//" guarantees an integer
                # shape for numpy regardless of the division semantics.
                np_width = stride // 4

                # This converts by doing a copy; first create target numpy array
                # We need a dummy alpha channel ...
                target = np.empty((height, np_width), dtype=np.uint32)

                tmp_surf = cairo.ImageSurface.create_for_data(target.data, cairo.FORMAT_RGB24, width, height, stride)
                cr = cairo.Context(tmp_surf)
                cr.set_source_surface(surf)
                cr.paint()
                del cr
                tmp_surf.flush()
                del tmp_surf

                # Now, we need a bit of reshaping
                img = np.empty((height, width, 3), dtype=np.uint8)

                # Ignore row padding (if any) so the shapes always agree.
                pixels = target[:, :width]

                # order should be BGR
                img[:, :, 2] = 0xff & (pixels >> 16)
                img[:, :, 1] = 0xff & (pixels >> 8)
                img[:, :, 0] = 0xff & pixels

            yield img, filename, page
Ejemplo n.º 3
0
def add_image(survey, file, duplex_scan=False, force=False, copy=True):
    """Add the pages of the TIFF ``file`` to ``survey`` as new sheets.

    Nothing is added if ``check_image`` rejects the file. With ``copy`` set,
    the file is copied to a new path requested from the survey and referenced
    by its base name; otherwise it is referenced by its path relative to
    ``survey.survey_dir``. If dummy pages are required, an ignored ``"DUMMY"``
    image is added after every real page.
    """
    insert_dummy_pages, image_count_factor = _insert_dummy_pages(survey, duplex_scan)

    if not check_image(survey, file, duplex_scan, force, message=True):
        return

    num_pages = image.get_tiff_page_count(file)

    # Number of images (dummy pages included) that make up one sheet. The
    # factor is applied exactly once, with or without dummy pages.
    images_per_sheet = survey.questionnaire.page_count * image_count_factor

    if copy:
        copied = survey.new_path('%i.tif')
        shutil.copyfile(file, copied)
        tiff = os.path.basename(copied)
    else:
        tiff = os.path.relpath(os.path.abspath(file), survey.survey_dir)

    remaining = list(range(num_pages))
    while remaining:
        sheet = model.sheet.Sheet()
        survey.add_sheet(sheet)

        # Fill the sheet until it is complete or the file has no pages left.
        while remaining and len(sheet.images) < images_per_sheet:
            page_img = model.sheet.Image()
            sheet.add_image(page_img)
            page_img.filename = tiff
            page_img.tiff_page = remaining.pop(0)

            if not insert_dummy_pages:
                continue

            # And a dummy page if required
            dummy = model.sheet.Image()
            sheet.add_image(dummy)

            dummy.filename = "DUMMY"
            dummy.tiff_page = -1
            dummy.ignored = True
Ejemplo n.º 4
0
def add_image(survey, file, duplex_scan=False, force=False, copy=True):
    """Add the pages of the TIFF ``file`` to ``survey`` as new sheets.

    The file is silently skipped (apart from the message printed by
    ``check_image``) when ``check_image`` rejects it.

    :param survey: Survey that receives the new sheets.
    :param file: Path of the TIFF file to add.
    :param duplex_scan: Passed on to ``_insert_dummy_pages`` and
        ``check_image``.
    :param force: Passed on to ``check_image``.
    :param copy: If true, copy the file to a new path requested from the
        survey and reference it by base name; otherwise reference the
        original by its path relative to ``survey.survey_dir``.
    """

    insert_dummy_pages, image_count_factor = _insert_dummy_pages(survey, duplex_scan)

    if not check_image(survey, file, duplex_scan, force, message=True):
        return

    num_pages = image.get_tiff_page_count(file)

    # Images per sheet, dummy pages included. The factor applies in both the
    # dummy and the non-dummy case, so no condition is needed here.
    c = survey.questionnaire.page_count * image_count_factor

    if copy:
        tiff = survey.new_path("%i.tif")
        shutil.copyfile(file, tiff)
        tiff = os.path.basename(tiff)
    else:
        tiff = os.path.relpath(os.path.abspath(file), survey.survey_dir)

    # A real list is required: range objects have no pop() on Python 3.
    pages = list(range(num_pages))
    while len(pages) > 0:
        sheet = model.sheet.Sheet()
        survey.add_sheet(sheet)
        while len(pages) > 0 and len(sheet.images) < c:
            img = model.sheet.Image()
            sheet.add_image(img)
            img.filename = tiff
            img.tiff_page = pages.pop(0)

            # And a dummy page if required
            if insert_dummy_pages:
                img = model.sheet.Image()
                sheet.add_image(img)

                img.filename = "DUMMY"
                img.tiff_page = -1
                img.ignored = True
Ejemplo n.º 5
0
def check_image(survey, file, duplex_scan=False, force=False, message=False):
    """Tell whether ``file`` may be added to ``survey``.

    The file has to pass ``image.check_tiff_monochrome`` and, unless ``force``
    is set, its page count has to be a multiple of the number of images
    expected per sheet. With ``message`` set, the reason for a rejection is
    printed.

    Returns ``True`` if the file is acceptable and ``False`` otherwise.
    """
    insert_dummy_pages, image_count_factor = _insert_dummy_pages(survey, duplex_scan)

    if not image.check_tiff_monochrome(file):
        if message:
            print(_('Invalid input file %s. You need to specify a (multipage) monochrome TIFF as input.') % (file,))
        return False

    num_pages = image.get_tiff_page_count(file)

    # This test is on the image count that needs to come from the file; dummy
    # pages are not part of the file, so the factor is left out for them.
    page_count = survey.questionnaire.page_count
    if insert_dummy_pages:
        expected = page_count
    else:
        expected = page_count * image_count_factor

    if num_pages % expected == 0 or force:
        return True

    if message:
        print(_('Not adding %s because it has a wrong page count (needs to be a mulitple of %i).') % (file, expected))
    return False
Ejemplo n.º 6
0
def check_image(survey, file, duplex_scan=False, force=False, message=False):
    """Check whether ``file`` can be added to ``survey``.

    :param survey: Survey the file is meant for.
    :param file: Path of the TIFF file to check.
    :param duplex_scan: Passed on to ``_insert_dummy_pages``.
    :param force: If true, accept the file even when its page count does not
        fit.
    :param message: If true, print the reason whenever the file is rejected.
    :returns: ``True`` if the file passes ``image.check_tiff_monochrome`` and
        its page count is acceptable, ``False`` otherwise.
    """

    insert_dummy_pages, image_count_factor = _insert_dummy_pages(survey, duplex_scan)

    if not image.check_tiff_monochrome(file):
        if message:
            # print() with a single argument behaves the same on Python 2 and 3.
            print(_("Invalid input file %s. You need to specify a (multipage) monochrome TIFF as input.") % (file,))
        return False

    num_pages = image.get_tiff_page_count(file)

    c = survey.questionnaire.page_count
    if not insert_dummy_pages:
        c = c * image_count_factor

    # This test is on the image count that needs to come from the file
    if num_pages % c != 0 and not force:
        if message:
            print(_("Not adding %s because it has a wrong page count (needs to be a mulitple of %i).") % (file, c))
        return False

    return True
Ejemplo n.º 7
0
def add_image(survey, file, duplex_scan=False, force=False, copy=True):
    """Add the pages of the monochrome TIFF ``file`` to ``survey`` as sheets.

    :param survey: Survey that receives the new sheets.
    :param file: Path of the TIFF file to add.
    :param duplex_scan: Only relevant for surveys that are not duplex: if
        true, no dummy pages are inserted.
    :param force: If true, add the file even when its page count is not a
        multiple of the expected image count.
    :param copy: If true, copy the file to a new path requested from the
        survey and reference it by base name; otherwise reference the
        original by its path relative to ``survey.survey_dir``.
    :raises AssertionError: If ``image.check_tiff_monochrome`` rejects the
        file.
    """

    from sdaps import image
    import shutil

    # Insert dummy pages if the survey is not duplex and the duplex option
    # was not passed
    if survey.defs.duplex:
        # One image per questionnaire page in duplex mode
        image_count_factor = 1
        # No dummy pages in duplex mode
        insert_dummy_pages = False
    else:
        # Two images per questionnaire page otherwise (NOTE(review):
        # presumably front and back side of every sheet of paper - confirm).
        image_count_factor = 2

        # In simplex mode insertion of dummy pages depends on the command line
        # option (default is True)
        insert_dummy_pages = not duplex_scan

    if not image.check_tiff_monochrome(file):
        # print() with a single argument behaves the same on Python 2 and 3.
        print(_('Invalid input file %s. You need to specify a (multipage) monochrome TIFF as input.') % (file,))
        raise AssertionError()

    num_pages = image.get_tiff_page_count(file)

    c = survey.questionnaire.page_count
    if not insert_dummy_pages:
        c = c * image_count_factor

    # This test is on the image count that needs to come from the file
    if num_pages % c != 0 and not force:
        print(_('Not adding %s because it has a wrong page count (needs to be a mulitple of %i).') % (file, c))
        return

    # From here on c is the number of images per sheet, dummies included.
    if insert_dummy_pages:
        c = c * image_count_factor

    if copy:
        tiff = survey.new_path('%i.tif')
        shutil.copyfile(file, tiff)
        tiff = os.path.basename(tiff)
    else:
        tiff = os.path.relpath(os.path.abspath(file), survey.survey_dir)

    # A real list is required: range objects have no pop() on Python 3.
    pages = list(range(num_pages))
    while len(pages) > 0:
        sheet = model.sheet.Sheet()
        survey.add_sheet(sheet)
        while len(pages) > 0 and len(sheet.images) < c:
            img = model.sheet.Image()
            sheet.add_image(img)
            img.filename = tiff
            img.tiff_page = pages.pop(0)

            # And a dummy page if required
            if insert_dummy_pages:
                img = model.sheet.Image()
                sheet.add_image(img)

                img.filename = "DUMMY"
                img.tiff_page = -1
                img.ignored = True
Ejemplo n.º 8
0
def iter_images_and_pages(images):
    """Iterate over the given files and every page they contain.

    OpenCV is not able to handle multipage TIFF files, so those are loaded
    with the SDAPS internal loading method. Files that are not TIFFs are
    tried as PDF documents through Poppler; everything else is handed to
    ``cv2.imread`` as a single page image.

    :param images: Iterable of file names.
    :returns: Generator yielding ``(img, filename, page)`` tuples; ``page`` is
        the zero based page index inside ``filename``.
    """

    # Pixels per PDF point when rendering at 300dpi. The float literal
    # matters: on Python 2 "300 / 72" is integer division and gives 4.
    render_scale = 300.0 / 72

    for filename in images:
        pages = 1
        is_tiff = False
        is_pdf = False

        try:
            # Check whether this is a TIFF file (ie. try to retrieve the page count)
            pages = image.get_tiff_page_count(filename)
            is_tiff = True
        except AssertionError:
            pass

        if not is_tiff:
            try:
                gfile = Gio.File.new_for_path(filename)
                pdf_doc = Poppler.Document.new_from_gfile(gfile, None, None)
                pages = pdf_doc.get_n_pages()
                is_pdf = True
            except Exception:
                # Either not PDF/damaged or poppler not installed properly.
                # KeyboardInterrupt and SystemExit are deliberately not
                # swallowed here.
                pass

        # range() instead of xrange() so that this runs on Python 2 and 3.
        for page in range(pages):
            if is_tiff:
                # TIFF pages are zero based
                surf = image.get_rgb24_from_tiff(filename, page, False)

                img = to_opencv(surf)

            elif is_pdf:
                # Try to retrieve a single fullpage image, if that fails, render
                # document at 300dpi.

                THRESH = 10  # pt

                pdfpage = pdf_doc.get_page(page)
                page_width, page_height = pdfpage.get_size()

                # Named differently from the "images" parameter on purpose,
                # so that the argument is not rebound while iterating it.
                mappings = pdfpage.get_image_mapping()
                if len(mappings) == 1 and (
                        abs(mappings[0].area.x1) < THRESH and
                        abs(mappings[0].area.y1) < THRESH and
                        abs(mappings[0].area.x2 - page_width) < THRESH and
                        abs(mappings[0].area.y2 - page_height) < THRESH):
                    # Assume one full page image, and simply use that.
                    surf = pdfpage.get_image(mappings[0].image_id)

                else:
                    # Render page at 300dpi
                    surf = cairo.ImageSurface(cairo.FORMAT_RGB24,
                                              int(render_scale * page_width),
                                              int(render_scale * page_height))
                    cr = cairo.Context(surf)
                    cr.scale(render_scale, render_scale)
                    # White background
                    cr.set_source_rgb(1, 1, 1)
                    cr.paint()

                    pdfpage.render_for_printing(cr)

                    del cr

                img = to_opencv(surf)

            else:
                img = cv2.imread(filename)

            yield img, filename, page
Ejemplo n.º 9
0
from sdaps import image

# Assume the first argument is a survey
survey = model.survey.Survey.load(sys.argv[1])

# We need the recognize buddies, as they are able to identify the data
from sdaps.recognize import buddies

# A sheet object to attach the images to
sheet = model.sheet.Sheet()
survey.add_sheet(sheet)

# One (file, page) tuple for every page of every file that was passed as an
# argument after the survey.
images = []

for file in sys.argv[2:]:
    num_pages = image.get_tiff_page_count(file)
    # range() instead of xrange() so that this runs on Python 2 and 3.
    for page in range(num_pages):
        images.append((file, page))

if not images:
    # No images, simply exit again.
    sys.exit(1)


def add_image(survey, tiff, page):
    """Attach page ``page`` of the file ``tiff`` to ``survey.sheet``.

    A new image object is added to the sheet first and then filled in with
    the file name, the name as it was passed in and the page index.
    """
    new_image = model.sheet.Image()
    survey.sheet.add_image(new_image)

    # SDAPS assumes a relative path from the survey directory
    absolute = os.path.abspath(tiff)
    new_image.filename = os.path.relpath(absolute, survey.survey_dir)

    # Keep the name exactly as it was handed in, too.
    new_image.orig_name = tiff
    new_image.tiff_page = page
Ejemplo n.º 10
0
def iter_images_and_pages(images):
    """Yield every page of every file in ``images`` as an OpenCV image.

    OpenCV is not able to handle multipage TIFF files, so those are loaded
    with the SDAPS internal loading method. A file that is not a TIFF is
    tried as a PDF document through Poppler, and if that fails as well it is
    read with ``cv2.imread`` as a single page.

    :param images: Iterable of file names.
    :returns: Generator yielding ``(img, filename, page)`` tuples; ``page`` is
        the zero based page index inside ``filename``.
    """

    # Resolution used when a PDF page has to be rendered.
    render_dpi = 300
    # Pixels per PDF point at that resolution. Dividing by a float keeps this
    # exact on Python 2 as well, where "300 / 72" would be truncated to 4.
    render_scale = render_dpi / 72.0

    for filename in images:
        pages = 1
        is_tiff = False
        is_pdf = False

        try:
            # Check whether this is a TIFF file (ie. try to retrieve the page count)
            pages = image.get_tiff_page_count(filename)
            is_tiff = True
        except AssertionError:
            pass

        if not is_tiff:
            try:
                gfile = Gio.File.new_for_path(filename)
                pdf_doc = Poppler.Document.new_from_gfile(gfile, None, None)
                pages = pdf_doc.get_n_pages()
                is_pdf = True
            except Exception:
                # Either not PDF/damaged or poppler not installed properly.
                # Catching Exception (not everything) lets Ctrl-C through.
                pass

        # range() exists on Python 2 and 3, xrange() only on Python 2.
        for page in range(pages):
            if is_tiff:
                # TIFF pages are zero based
                surf = image.get_rgb24_from_tiff(filename, page, False)

                img = to_opencv(surf)

            elif is_pdf:
                # Try to retrieve a single fullpage image, if that fails, render
                # document at 300dpi.

                THRESH = 10  # pt

                pdfpage = pdf_doc.get_page(page)
                page_width, page_height = pdfpage.get_size()

                # Do not reuse the name "images" here; it is the parameter
                # that is still being iterated by the outer loop.
                page_images = pdfpage.get_image_mapping()
                if len(page_images) == 1 and (
                        abs(page_images[0].area.x1) < THRESH and
                        abs(page_images[0].area.y1) < THRESH and
                        abs(page_images[0].area.x2 - page_width) < THRESH and
                        abs(page_images[0].area.y2 - page_height) < THRESH):
                    # Assume one full page image, and simply use that.
                    surf = pdfpage.get_image(page_images[0].image_id)

                else:
                    # Render page at 300dpi
                    surf = cairo.ImageSurface(cairo.FORMAT_RGB24,
                                              int(render_scale * page_width),
                                              int(render_scale * page_height))
                    cr = cairo.Context(surf)
                    cr.scale(render_scale, render_scale)
                    # White background
                    cr.set_source_rgb(1, 1, 1)
                    cr.paint()

                    pdfpage.render_for_printing(cr)

                    del cr

                img = to_opencv(surf)

            else:
                img = cv2.imread(filename)

            yield img, filename, page
Ejemplo n.º 11
0
def add_image(survey, file, duplex_scan=False, force=False, copy=True):
    """Split the monochrome TIFF ``file`` into sheets and add them to ``survey``.

    :param survey: Survey that receives the new sheets.
    :param file: Path of the TIFF file to add.
    :param duplex_scan: Only relevant for surveys that are not duplex: if
        true, no dummy pages are inserted.
    :param force: If true, add the file even when its page count is not a
        multiple of the expected image count.
    :param copy: If true, copy the file to a new path requested from the
        survey and reference it by base name; otherwise reference the
        original by its path relative to ``survey.survey_dir``.
    :raises AssertionError: If ``image.check_tiff_monochrome`` rejects the
        file.
    """

    from sdaps import image
    import shutil

    # Insert dummy pages if the survey is not duplex and the duplex option
    # was not passed
    if survey.defs.duplex:
        # One image per questionnaire page in duplex mode
        image_count_factor = 1
        # No dummy pages in duplex mode
        insert_dummy_pages = False
    else:
        # Two images per questionnaire page otherwise (NOTE(review):
        # presumably front and back side of every sheet of paper - confirm).
        image_count_factor = 2

        # In simplex mode insertion of dummy pages depends on the command line
        # option (default is True)
        insert_dummy_pages = not duplex_scan

    if not image.check_tiff_monochrome(file):
        # Parenthesised single argument: valid on both Python 2 and 3.
        print(_('Invalid input file %s. You need to specify a (multipage) monochrome TIFF as input.') % (file,))
        raise AssertionError()

    num_pages = image.get_tiff_page_count(file)

    c = survey.questionnaire.page_count
    if not insert_dummy_pages:
        c = c * image_count_factor

    # This test is on the image count that needs to come from the file
    if num_pages % c != 0 and not force:
        print(_('Not adding %s because it has a wrong page count (needs to be a mulitple of %i).') % (file, c))
        return

    # The dummy pages count towards the images of a sheet from now on.
    if insert_dummy_pages:
        c = c * image_count_factor

    if copy:
        tiff = survey.new_path('%i.tif')
        shutil.copyfile(file, tiff)
        tiff = os.path.basename(tiff)
    else:
        tiff = os.path.relpath(os.path.abspath(file), survey.survey_dir)

    # Materialise the page numbers; a Python 3 range object cannot pop().
    pages = list(range(num_pages))
    while len(pages) > 0:
        sheet = model.sheet.Sheet()
        survey.add_sheet(sheet)
        while len(pages) > 0 and len(sheet.images) < c:
            img = model.sheet.Image()
            sheet.add_image(img)
            img.filename = tiff
            img.tiff_page = pages.pop(0)

            # And a dummy page if required
            if insert_dummy_pages:
                img = model.sheet.Image()
                sheet.add_image(img)

                img.filename = "DUMMY"
                img.tiff_page = -1
                img.ignored = True
Ejemplo n.º 12
0
def watch(cmdline):
    """Identify scanned pages and copy their files under a descriptive name.

    The function works in four steps:

    1. Make sure the helper project ``WATCH`` exists (it is created from
       ``./watch.tex`` with ``sdaps setup`` otherwise) and load it.
    2. Collect the ``survey_id`` of every project found below
       ``cmdline['projectsFolder']`` and write the id/directory pairs to
       ``surveyList.csv``.
    3. Copy the scans from ``cmdline['scanFolder']`` into a temporary
       directory, converting PDF files to TIFF on the way.
    4. Run the recognition for every page and copy the TIFF file the page
       came from into the current directory, named after a timestamp and the
       recognized IDs.

    The process exits with status 1 if there is no page to work on.

    :param cmdline: Mapping with the keys ``projectsFolder``, ``scanFolder``
        and ``renamedFolder``.
    """

    # We need a survey that has the correct definitions (paper size, duplex mode)
    if os.path.exists('./WATCH/info'):
        print('WATCH project found')
    else:
        print('Creating WATCH project')
        subprocess.call(['sdaps', 'setup', 'WATCH', './watch.tex'])

    # Loading dummy survey
    print('Loading WATCH project')
    survey = model.survey.Survey.load('WATCH')

    # A sheet object to attach the images to
    sheet = model.sheet.Sheet()
    survey.add_sheet(sheet)

    print('Listing all projects in ProjectsFolder')

    # Project dictionary: survey id -> directory of the project
    surveyIdList = {}

    # Visit all subfolders containing 'info'
    for file in Path(cmdline['projectsFolder']).walkfiles('info'):
        s = file.dirname()
        with open(s + '/info', "r") as infoFile:
            # Look for the survey id and add it to the dictionary
            for l in infoFile.read().split('\n'):
                words = l.split(' = ')
                if words[0] == 'survey_id':
                    print('DETECT ! : ' + words[1])
                    surveyIdList[words[1]] = s
    with open('surveyList.csv', 'w') as f:
        for key, project_dir in surveyIdList.items():
            f.write("%s,%s\n" % (key, project_dir))

    # File retrieval
    print('Listing scanned files')
    scans = os.listdir(cmdline['scanFolder'])

    print(scans)

    # Temp folder creation
    tempd = tempfile.mkdtemp()
    print('Temp folder :' + tempd)

    # Folder with already processed scans. It is not used below, but the
    # lookup is kept so that a missing key is still reported right here.
    renamedFolder = cmdline['renamedFolder']

    def is_tiff(scanned):
        """Tell whether the file name ends in ``.tif`` or ``.tiff``."""
        return os.path.splitext(scanned)[1] in ('.tif', '.tiff')

    def is_pdf(scanned):
        """Tell whether the file name ends in ``.pdf``."""
        return os.path.splitext(scanned)[1] == '.pdf'

    # Convert and copy
    for scan in scans:
        scan_title, scan_extension = os.path.splitext(scan)
        print(scan_title, scan_extension)
        scan_path = cmdline['scanFolder'] + '/' + scan
        if is_pdf(scan):
            print('PDF file found')
            print('Scan title ' + scan_title,
                  'Scan extension ' + scan_extension)
            # NOTE: mktemp() only picks a name and is prone to races in
            # general; the names are created inside the directory that
            # mkdtemp() returned above.
            tempscanpdf = tempfile.mktemp(suffix='.pdf', dir=tempd)
            tempscantif = tempfile.mktemp(suffix='.tif', dir=tempd)
            print('File', str(scan_path),
                  'found, trying to convert to ' + tempscantif)
            subprocess.call(['cp', scan_path, tempscanpdf])
            print('Copied' + str(scan_path) + 'to ' + tempscanpdf)
            convert.convert_images([tempscanpdf], tempscantif,
                                   survey.defs.paper_width,
                                   survey.defs.paper_height)
        elif is_tiff(scan):
            print('TIFF file found')
            tempscantif = tempfile.mktemp(suffix='.tif', dir=tempd)
            subprocess.call(['cp', scan_path, tempscantif])
        else:
            print('Wrong image format for file ' + scan)

    # We retrieve all tiff to be processed. This has to be a real list:
    # filter() is lazy on Python 3 and the message below would only show
    # "<filter object ...>" instead of the file names.
    tiffscans = [name for name in os.listdir(tempd) if is_tiff(name)]

    images = []

    print('Files to be processed :' + str(tiffscans))

    for file in tiffscans:
        tiff_path = tempd + '/' + file
        num_pages = image.get_tiff_page_count(tiff_path)
        print(num_pages)
        for page in range(num_pages):
            images.append((tiff_path, page))

    if len(images) == 0:
        # No images, simply exit again.
        sys.exit(1)

    def add_image(survey, tiff, page):
        """Attach one page plus an ignored dummy page to ``survey.sheet``."""
        img = model.sheet.Image()
        survey.sheet.add_image(img)
        # SDAPS assumes a relative path from the survey directory
        img.filename = os.path.relpath(os.path.abspath(tiff),
                                       survey.survey_dir)
        img.orig_name = tiff
        img.tiff_page = page

        imgdummy = model.sheet.Image()
        survey.sheet.add_image(imgdummy)
        imgdummy.orig_name = "DUMMY"
        imgdummy.filename = "DUMMY"
        imgdummy.tiff_page = -1
        imgdummy.ignored = True

    while images:
        # Simply drop the list of images again.
        sheet.images = []

        add_image(survey, *images.pop(0))
        print('Adding image simplex mode')

        if survey.defs.duplex:
            print('Adding image duplex mode')
            add_image(survey, *images.pop(0))

        sheet.recognize.recognize()

        for img in sheet.images:
            # Dummy pages carry the page index -1 and are skipped.
            if img.tiff_page != -1:
                print(img.orig_name, img.tiff_page)
                print('\tPage:', img.page_number)
                print('\tRotated:', img.rotated)
                print('\tMatrix (px to mm):', img.raw_matrix)
                print('\tSurvey-ID:', sheet.survey_id)
                print('\tGlobal-ID:', sheet.global_id)
                print('\tBarcode-ID:', sheet.barcode_id)
                print('\tQuestionnaire-ID:', sheet.questionnaire_id)
                now = datetime.datetime.now()
                datestamp = now.strftime('%Y%m%d%H%M%S%f')
                tiffname = str(datestamp) + str(
                    sheet.questionnaire_id) + '_' + str(
                        sheet.survey_id) + '_' + str(sheet.barcode_id)
                subprocess.call(['cp', img.orig_name, tiffname + ".tif"])
Ejemplo n.º 13
0
def iter_images_and_pages(images):
    """Iterate over the given files and every page they contain.

    OpenCV is not able to handle multipage TIFF files, so those are loaded
    with the SDAPS internal loading method. Files that are not TIFFs are
    tried as PDF documents through Poppler; everything else is handed to
    ``cv2.imread`` as a single page image.

    :param images: Iterable of file names.
    :returns: Generator yielding ``(img, filename, page)`` tuples; ``page`` is
        the zero based page index inside ``filename``.
    :raises IOError: With ``errno.ENOENT`` as soon as a file name is reached
        that does not exist.
    """

    for filename in images:
        if not os.path.exists(filename):
            raise IOError(errno.ENOENT, _("File does not exist"), filename)

        pages = 1
        is_tiff = False
        is_pdf = False

        try:
            # Check whether this is a TIFF file (ie. try to retrieve the page count)
            pages = image.get_tiff_page_count(filename)
            is_tiff = True
        except AssertionError:
            pass

        if not is_tiff:
            try:
                gfile = Gio.File.new_for_path(filename)
                pdf_doc = Poppler.Document.new_from_gfile(gfile, None, None)
                pages = pdf_doc.get_n_pages()
                is_pdf = True
            except Exception:
                # Either not PDF/damaged or poppler not installed properly.
                # KeyboardInterrupt and SystemExit are deliberately not
                # swallowed here.
                pass

        for page in range(pages):
            if is_tiff:
                # TIFF pages are zero based
                surf = image.get_rgb24_from_tiff(filename, page, False)

                img = to_opencv(surf)

            elif is_pdf:
                # Try to retrieve a single fullpage image, if that fails, render
                # the document at the resolution of the embedded scan (or at
                # 300dpi if that cannot be determined).

                THRESH = 10  # pt

                pdfpage = pdf_doc.get_page(page)
                page_width, page_height = pdfpage.get_size()

                # Named differently from the "images" parameter on purpose,
                # so that the argument is not rebound while iterating it.
                mappings = pdfpage.get_image_mapping()
                if len(mappings) == 1 and (
                        abs(mappings[0].area.x1) < THRESH
                        and abs(mappings[0].area.y1) < THRESH
                        and abs(mappings[0].area.x2 - page_width) < THRESH
                        and abs(mappings[0].area.y2 - page_height) < THRESH):
                    # Assume one full page image, and simply use that.
                    surf = pdfpage.get_image(mappings[0].image_id)

                else:
                    dpi = 0
                    # Try to detect the DPI of the scan
                    for mapping in mappings:
                        # Only images covering at least half of the page
                        # height are considered.
                        if mapping.area.y2 - mapping.area.y1 < page_height / 2:
                            continue

                        embedded = pdfpage.get_image(mapping.image_id)
                        # Pixels per point, times 72 points per inch
                        dpi_from_height = round(
                            embedded.get_height() /
                            (mapping.area.y2 - mapping.area.y1) * 72)
                        dpi_from_width = round(
                            embedded.get_width() /
                            (mapping.area.x2 - mapping.area.x1) * 72)
                        # Only trust the value if both directions agree
                        if abs(dpi_from_height - dpi_from_width) <= 1:
                            dpi = max(dpi, dpi_from_height, dpi_from_width)

                    # Fall back to 300dpi for odd values
                    if dpi < 199 or dpi > 601:
                        dpi = 300

                    # Pixels per PDF point at the chosen resolution
                    scale = dpi / 72.0

                    surf = cairo.ImageSurface(cairo.FORMAT_RGB24,
                                              int(scale * page_width),
                                              int(scale * page_height))
                    cr = cairo.Context(surf)
                    cr.scale(scale, scale)
                    # White background
                    cr.set_source_rgb(1, 1, 1)
                    cr.paint()

                    pdfpage.render_for_printing(cr)

                    del cr

                img = to_opencv(surf)

            else:
                img = cv2.imread(filename)

            yield img, filename, page
Ejemplo n.º 14
0
def iter_images_and_pages(images):
    """Yield every page of every file in ``images`` as an OpenCV image.

    OpenCV is not able to handle multipage TIFF files, so those are loaded
    with the SDAPS internal loading method. A file that is not a TIFF is
    tried as a PDF document through Poppler, and if that fails as well it is
    read with ``cv2.imread`` as a single page.

    :param images: Iterable of file names.
    :returns: Generator yielding ``(img, filename, page)`` tuples; ``page`` is
        the zero based page index inside ``filename``.
    :raises IOError: With ``errno.ENOENT`` as soon as a file name is reached
        that does not exist.
    """

    for filename in images:
        if not os.path.exists(filename):
            raise IOError(errno.ENOENT, _("File does not exist"), filename)

        pages = 1
        is_tiff = False
        is_pdf = False

        try:
            # Check whether this is a TIFF file (ie. try to retrieve the page count)
            pages = image.get_tiff_page_count(filename)
            is_tiff = True
        except AssertionError:
            pass

        if not is_tiff:
            try:
                gfile = Gio.File.new_for_path(filename)
                pdf_doc = Poppler.Document.new_from_gfile(gfile, None, None)
                pages = pdf_doc.get_n_pages()
                is_pdf = True
            except Exception:
                # Either not PDF/damaged or poppler not installed properly.
                # Catching Exception (not everything) lets Ctrl-C through.
                pass

        for page in range(pages):
            if is_tiff:
                # TIFF pages are zero based
                surf = image.get_rgb24_from_tiff(filename, page, False)

                img = to_opencv(surf)

            elif is_pdf:
                # Try to retrieve a single fullpage image, if that fails, render
                # the document at the detected scan resolution, or at 300dpi
                # if none could be detected.

                THRESH = 10  # pt

                pdfpage = pdf_doc.get_page(page)
                page_width, page_height = pdfpage.get_size()

                # Do not reuse the names "images" and "img" below; the first
                # is the parameter still being iterated by the outer loop,
                # the second is the result that gets yielded.
                page_images = pdfpage.get_image_mapping()
                if len(page_images) == 1 and (
                        abs(page_images[0].area.x1) < THRESH and
                        abs(page_images[0].area.y1) < THRESH and
                        abs(page_images[0].area.x2 - page_width) < THRESH and
                        abs(page_images[0].area.y2 - page_height) < THRESH):
                    # Assume one full page image, and simply use that.
                    surf = pdfpage.get_image(page_images[0].image_id)

                else:
                    dpi = 0
                    # Try to detect the DPI of the scan
                    for entry in page_images:
                        # Skip images that are less than half a page high.
                        if entry.area.y2 - entry.area.y1 < page_height / 2:
                            continue

                        scan = pdfpage.get_image(entry.image_id)
                        # Pixels per point, times 72 points per inch
                        dpi_vertical = round(scan.get_height() / (entry.area.y2 - entry.area.y1) * 72)
                        dpi_horizontal = round(scan.get_width() / (entry.area.x2 - entry.area.x1) * 72)
                        # Only trust the value if both directions agree
                        if abs(dpi_vertical - dpi_horizontal) <= 1:
                            dpi = max(dpi, dpi_vertical, dpi_horizontal)

                    # Fall back to 300dpi for odd values
                    if dpi < 199 or dpi > 601:
                        dpi = 300

                    # Pixels per PDF point at the chosen resolution
                    scale = dpi / 72.0

                    surf = cairo.ImageSurface(cairo.FORMAT_RGB24, int(scale * page_width), int(scale * page_height))
                    cr = cairo.Context(surf)
                    cr.scale(scale, scale)
                    # White background
                    cr.set_source_rgb(1, 1, 1)
                    cr.paint()

                    pdfpage.render_for_printing(cr)

                    del cr

                img = to_opencv(surf)

            else:
                img = cv2.imread(filename)

            yield img, filename, page