Ejemplos de wordboxes_page_iterator en Python

Lenguaje de programación: Python

Namespace/Package Name: uplib.plibUtil

Método / Función: wordboxes_page_iterator

Ejemplos en hotexamples.com: 3

Python wordboxes_page_iterator - 3 ejemplos encontrados. Estos son los ejemplos en Python del mundo real mejor valorados de uplib.plibUtil.wordboxes_page_iterator extraídos de proyectos de código abierto. Puedes valorar ejemplos para ayudarnos a mejorar la calidad de los ejemplos.

Ejemplo n.º 1

Mostrar archivo

Archivo: ePub.py Proyecto: project-renard-survey/xerox-parc-uplib-mirror

def get_svg_version (repo, response, params):
    """Reply with an SVG rendering of one page of a document.

    Reads "doc_id" and "page" from `params`.  Any problem with either
    parameter is reported through `response.error` with
    HTTPCodes.BAD_REQUEST and the function returns without replying.
    Otherwise an SVG document is built for every page yielded by
    `wordboxes_page_iterator` (illustrations, "uri" links and word
    boxes), and the one for the requested page is sent with
    `response.reply` as "image/svg+xml".

    :param repo: object providing `valid_doc_id(doc_id)` and
        `get_document(doc_id)`
    :param response: object providing `error(code, message)` and
        `reply(content, content_type)`
    :param params: mapping with a `get` method holding the request
        parameters
    """

    doc_id = params.get("doc_id")
    if not doc_id:
        response.error(HTTPCodes.BAD_REQUEST, "No doc specified.\n")
        return
    if not repo.valid_doc_id(doc_id):
        response.error(HTTPCodes.BAD_REQUEST, "Invalid doc ID %s specified.\n" % doc_id)
        return

    note("doc_id is %s", doc_id)
    doc = repo.get_document(doc_id)

    page = params.get("page")
    if not page:
        response.error(HTTPCodes.BAD_REQUEST, "No page index specified.")
        return
    try:
        page = int(page)
    except (TypeError, ValueError):
        # a non-numeric page parameter is a client error, not a server crash
        response.error(HTTPCodes.BAD_REQUEST, "Invalid page index %s specified.\n" % page)
        return
    note("page is %s", page)
    page_count = int(doc.get_metadata("page-count") or doc.get_metadata("pagecount") or "0")
    if (page < 0) or (page >= page_count):
        response.error(HTTPCodes.BAD_REQUEST, "No such page %d." % page)
        return

    dpi = int(doc.get_metadata('images-dpi') or doc.get_metadata('tiff-dpi') or doc.get_metadata("dpi") or 300)
    # page size metadata is scaled by 72/dpi before being used as "pt" values
    # NOTE(review): raises AttributeError if neither size key is present -- confirm
    # whether that can happen for a valid document
    size_metadata = doc.get_metadata("images-size") or doc.get_metadata("tiff-size")
    page_image_size = tuple([(float(x.strip())*72/float(dpi))
                             for x in size_metadata.split(",")])

    pages = {}
    illustrations = {}
    links = {}

    # group the illustrations by page, ignoring those with an area under 100
    imd = read_illustrations_metadata(doc.folder(), True)
    for (left, top, width, height, imtype, bits, pageno) in imd:
        if ((width * height) < 100):
            continue
        illustrations.setdefault(pageno, []).append((left, top, width, height, bits, pageno))

    # group the "uri" links that have a source page by that page
    for link in doc.links().values():
        if hasattr(link, "from_page") and (link.typename == "uri"):
            links.setdefault(link.from_page, []).append(link)

    note("links are %s", links)

    # the header is the same for every page, so format it only once
    svg_header = (u'''<?xml version="1.0" standalone="no"?>
                         <!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
                                   "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
                            <svg width="%spt" height="%spt" version="1.1"
                                 xmlns="http://www.w3.org/2000/svg"
                                 xmlns:xlink="http://www.w3.org/1999/xlink">
                     ''' % page_image_size)

    # NOTE(review): every page is rendered although only one is returned;
    # kept so the per-page size notes below are unchanged
    for page_index, bboxes in wordboxes_page_iterator(doc.folder()):

        # collect the fragments and join once instead of repeated "+="
        parts = [svg_header]

        for left, top, width, height, image, junk in illustrations.get(page_index, ()):
            parts.append(u'<image x="%spt" y="%spt" width="%spt" height="%spt" xlink:href="%s" />\n' % (
                left, top, width, height, _form_data_url(image)))

        page_links = links.get(page_index)
        if page_links:
            note("links for %s are %s", page_index, page_links)
            for link in page_links:
                # only "from_page" was checked above, so "from_rect" may be absent
                fr = getattr(link, "from_rect", None)
                if fr:
                    left, top, width, height = fr
                    # NOTE(review): quote_plus also escapes ":" and "/" in the
                    # target URI -- confirm this is what the viewer expects
                    uri = urllib.quote_plus(link.to_uri)
                    parts.append(u'<a xlink:href="%s"><rect x="%spt" y="%spt" ' % (uri, left, top) +
                                 u'width="%spt" height="%spt" fill="none" stroke="none" /></a>\n' % (
                                     width, height))

        for bbox in bboxes:
            face = (bbox.is_italic() and "Italic") or "Regular"
            family = (bbox.is_fixedwidth() and "Monospace") or (bbox.is_serif() and "Serif") or "Sans-Serif"
            weight = (bbox.is_bold() and "Bold") or "Regular"
            parts.append(u'<text x="%spt" y="%spt" font-family="%s" font-size="%spt" font-style="%s" font-weight="%s">%s</text>' % (
                bbox.left(), bbox.top(), family, bbox.font_size() * 0.9, face, weight, htmlescape(bbox.text())))
            if bbox.ends_word():
                parts.append(u"\n")

        parts.append(u"</svg>\n")
        pages[page_index] = u"".join(parts)

    for pageno in pages:
        note("%s: %s\n", pageno, len(pages.get(pageno)))

    response.reply(pages.get(page), "image/svg+xml")

Ejemplo n.º 2

Mostrar archivo

Archivo: createPageBboxes.py Proyecto: project-renard-survey/xerox-parc-uplib-mirror

def do_page_bounding_boxes (dirpath):
    """Write per-page bounding-box data and word statistics for a document folder.

    For every page yielded by `wordboxes_page_iterator(dirpath)` this
    pairs each word box with its part-of-speech tag (when "contents.ind"
    exists) and a paragraph-start flag (from "paragraphs.txt"), converts
    the box corners with the thumbnail translation and scaling, reads the
    page's text from "contents.txt" and hands everything to `flush_page`.
    Page-wise and document-wise statistics are then stored in
    "metadata.txt" under "wordbbox-stats-pagewise" and
    "wordbbox-stats-docwise".

    :param dirpath: path of the document folder
    :raises ValueError: if the charset of "contents.txt" is not one of
        UTF8_ALIASES (a file without a charset header counts as "latin_1")
    """

    def new_stats ():
        # one accumulator row; the positions are relied on by update_stats
        return [
            0,              # number of words
            0,              # total length (in characters)
            0,              # number of bold words
            0,              # number of italic words
            0,              # number of bold-italic words
            0,              # number of fixed-width words
            0.0,            # total font sizes
            ]

    def tally (stats_row, bbox):
        # add one word box to an accumulator row
        bold = bbox.is_bold()
        italic = bbox.is_italic()
        stats_row[0] += 1
        stats_row[1] += bbox.nchars()
        if bold:
            stats_row[2] += 1
        if italic:
            stats_row[3] += 1
        if bold and italic:
            stats_row[4] += 1
        if bbox.is_fixedwidth():
            stats_row[5] += 1
        stats_row[6] += bbox.font_size()

    def update_stats (stats, page_stats):
        # append one formatted row to the comma-separated stats string;
        # the two averages are 0.0 when the row has no words
        word_count = page_stats[0]
        if word_count > 0:
            mean_length = float(page_stats[1])/float(word_count)
            mean_font_size = float(page_stats[6])/float(word_count)
        else:
            mean_length = 0.0
            mean_font_size = 0.0
        if stats:
            stats += ", "
        stats += "%d:%.3f:%d:%d:%d:%d:%.3f" % (word_count, mean_length,
                                               page_stats[2], page_stats[3], page_stats[4], page_stats[5],
                                               mean_font_size)
        return stats

    textfilepath = os.path.join(dirpath, "contents.txt")
    pos_filepath = os.path.join(dirpath, "contents.ind")
    para_filepath = os.path.join(dirpath, "paragraphs.txt")

    last_cindex = 0
    postags_index = 0
    para_index = 0
    stats = ""
    doc_stats = new_stats()

    # This handle is never read here (the boxes come from
    # wordboxes_page_iterator); it is still opened first and held for the
    # whole run, as before, so a missing file fails immediately.
    wordbox_file = open(os.path.join(dirpath, "wordbboxes"), 'rb')
    try:

        note ("doing page bboxes for %s...", dirpath)

        if os.path.exists(pos_filepath):
            fp = open(pos_filepath, 'r')
            try:
                postags = POSTag.parse_parseinfo(fp)
            finally:
                fp.close()
        else:
            postags = None

        bbox_iterator = wordboxes_page_iterator(dirpath)

        text_file = open(textfilepath, 'rb')
        try:
            # an optional charset header on the first line is followed by
            # one more line that is skipped before the text proper
            firstline = text_file.readline()
            charsetmatch = CHARSETPATTERN.match(firstline)
            if charsetmatch:
                charsetname = charsetmatch.group(1)
                text_file.readline()
                first_byte = text_file.tell()
            else:
                charsetname = "latin_1"
                first_byte = 0
            if charsetname not in UTF8_ALIASES:
                raise ValueError("Charset in contents.txt must be UTF-8 for page bounding boxes to be created.  Apparently it's %s, instead." % charsetname)
            text_file.seek(first_byte)

            paras = read_paragraphs_file(para_filepath)
            if paras: paras.sort(key=lambda x: x.first_byte)

            from createThumbnails import thumbnail_translation_and_scaling
            translation, scaling = thumbnail_translation_and_scaling (dirpath)
            note(4, "   translation and scaling are %s and %s...", translation, scaling)

            for page_index, bboxes in bbox_iterator:

                page_stats = new_stats()
                adjusted_bboxes = []

                for bbox in bboxes:

                    char_count = bbox.nchars()
                    tally(doc_stats, bbox)
                    tally(page_stats, bbox)

                    cindex = bbox.contents_offset()

                    tag = None
                    if postags:
                        # advance to first POS tag which might apply to cindex
                        while ((postags_index < len(postags)) and
                               (cindex >= (postags[postags_index].start + postags[postags_index].length))):
                            postags_index = postags_index + 1
                        # there may be cindex positions for which we have no tag -- check for that
                        if ((postags_index < len(postags)) and (cindex >= postags[postags_index].start) and
                            (cindex < (postags[postags_index].start + postags[postags_index].length))):
                            tag = postags[postags_index]

                    # paragraphs are consumed in first_byte order; an index
                    # replaces re-slicing the list for every paragraph
                    if (paras and (para_index < len(paras)) and
                        (paras[para_index].first_byte <= (cindex + char_count)) and
                        (paras[para_index].first_byte_not >= cindex)):
                        # starts this paragraph
                        if tag is None:
                            tag = POSTag(cindex, char_count, None, "",
                                         True, False, False)
                        else:
                            tag.starts_paragraph = True
                        para_index += 1

                    # original author's note: "again, add back in the 20-pixel
                    # border on the page"; + 0.5 before trunc rounds to nearest
                    ulx = trunc((bbox.left() + translation[0]) * scaling[0] + 0.5)
                    uly = trunc((bbox.top() + translation[1]) * scaling[1] + 0.5)
                    lrx = trunc((bbox.right() + translation[0]) * scaling[0] + 0.5)
                    lry = trunc((bbox.bottom() + translation[1]) * scaling[1] + 0.5)

                    adjusted_bboxes.append((bbox, tag, ulx, uly, lrx, lry))
                    last_cindex = cindex

                if adjusted_bboxes:
                    first_box = adjusted_bboxes[0][0]
                    last_box = adjusted_bboxes[-1][0]
                    startpoint = first_box.contents_offset()
                    # read up to 4 bytes per character of the last word
                    # (presumably the UTF-8 worst case)
                    endpoint = last_box.contents_offset() + (last_box.nchars() * 4)
                    text_file.seek(startpoint + first_byte)
                    pagetext = text_file.read(endpoint - startpoint)
                    pagestart = startpoint
                else:
                    pagetext = ""
                    pagestart = last_cindex

                flush_page (dirpath, page_index, adjusted_bboxes, pagetext, pagestart)

                stats = update_stats(stats, page_stats)

        finally:
            text_file.close()
    finally:
        wordbox_file.close()

    dstats = update_stats("", doc_stats)

    update_metadata(os.path.join(dirpath, "metadata.txt"), { "wordbbox-stats-pagewise": stats, "wordbbox-stats-docwise": dstats})

Ejemplo n.º 3

Mostrar archivo

Archivo: images.py Proyecto: project-renard-survey/xerox-parc-uplib-mirror

def findimages(folder, debug=None):
    """Run FINDIMAGES_PROGRAM over a document's page images and collect the hits.

    Each "page%05d.png" file in the folder's "page-images" directory is
    passed to the external program, together with that page's word boxes
    (written to a temporary file in Leptonica BOXA format) when the
    folder has a "wordbboxes" file.  Only output lines starting with
    "halftone " are kept, and of those only images whose pixel area
    exceeds (dpi/72) squared.

    :param folder: path of the document folder
    :param debug: if true, "--debug" is passed to the program
    :return: list of (pageno, imtype, x, y, width, height) tuples of
        strings; empty when FINDIMAGES_PROGRAM is unset or there is no
        "page-images" directory
    """

    images = []

    if not FINDIMAGES_PROGRAM:
        note(3, "FINDIMAGES_PROGRAM not defined")
        return images

    images_dir = os.path.join(folder, "page-images")
    if not os.path.isdir(images_dir):
        note(3, "No page images in %s", images_dir)
        return images

    md = read_metadata(os.path.join(folder, "metadata.txt"))
    dpi = int(md.get("images-dpi") or md.get("dpi") or md.get("tiff-dpi") or 300)
    scaling_factor = float(dpi)/72

    def get_images_for_page (page_index, wordboxes, dpi, images_dir):
        # returns the "halftone" lines reported for one page image, each
        # split into fields and prefixed with the page index
        pageimages = []
        filepath = os.path.join(images_dir, "page%05d.png" % (page_index + 1))
        if not os.path.exists(filepath):
            return pageimages

        # mkstemp creates the file atomically, avoiding the race inherent
        # in mktemp; only the name is needed here, so close the descriptor
        fd, wordboxes_file = tempfile.mkstemp()
        os.close(fd)
        try:
            boxlist = []
            # use the wordboxes argument, not a variable of the enclosing scope
            for box in (wordboxes or ()):
                x, y, w, h = (int(box.left() * dpi / 72.0), int(box.top() * dpi / 72.0),
                              int(box.width() * dpi / 72.0), int(box.height() * dpi / 72.0))
                if (w > 0) and (h > 0):
                    boxlist.append((x, y, w, h))
            if boxlist:
                # write out list of wordboxes, in Leptonica BOXA format
                fp = open(wordboxes_file, "wb")
                try:
                    fp.write("\nBoxa Version 2\nNumber of boxes = %d\n" % len(boxlist))
                    for i, dims in enumerate(boxlist):
                        fp.write("  Box[%d]: " % i + "x = %d, y = %d, w = %d, h = %d\n" % dims)
                finally:
                    fp.close()
            # now, run the finder on the page image plus the list of wordboxes
            # ("-" stands in for the box file when there are no usable boxes)
            # NOTE(review): paths are interpolated unquoted into a command
            # string -- confirm how subproc treats spaces in them
            debug_arg = (debug and "--debug") or " "
            cmd = "%s %s %s %s %s" % (FINDIMAGES_PROGRAM, debug_arg, dpi, filepath, (boxlist and wordboxes_file) or "-")
            note(4, "findimages cmd is <<%s>>", cmd)
            status, output, tsignal = subproc(cmd)
            if status == 0:
                for line in output.split('\n'):
                    line = line.strip()
                    if line.startswith("halftone "):
                        pageimages.append((str(page_index) + " " + line).split())
            else:
                # pass the values as arguments, like the other note() calls
                note(3, "findimages command <%s> returns bad status %s:\n%s\n", cmd, status, output)
        finally:
            # remove the temp file
            if os.path.exists(wordboxes_file):
                os.unlink(wordboxes_file)
        return pageimages

    if os.path.exists(os.path.join(folder, "wordbboxes")):
        for page_index, boxes in wordboxes_page_iterator(folder):
            images += get_images_for_page (page_index, boxes, dpi, images_dir)
    else:
        # handle case where there's no text for the image
        for filename in os.listdir(images_dir):
            m = PAGE_IMAGE_FILENAME_PATTERN.match(filename)
            if m:
                # page indices are one less than the number in the file name
                images += get_images_for_page(int(m.group(1))-1, None, dpi, images_dir)

    # drop anything whose area is not larger than one square point
    point_squared = scaling_factor * scaling_factor
    images = [(pageno, imtype, x, y, width, height)
              for (pageno, imtype, x, y, width, height) in images
              if ((int(height) * int(width)) > point_squared)]
    note(3, "images for %s are %s", folder, images)
    return images