def _new_wordbox_stats ():
    """Return a zeroed word-box statistics accumulator.

    Slots, in order: number of words, total length in characters, number of
    bold words, number of italic words, number of bold-italic words, number
    of fixed-width words, and total of font sizes.
    """
    return [0, 0, 0, 0, 0, 0, 0.0]


def _tally_wordbox (stats, nchars, bold, italic, fixedwidth, font_size):
    """Add one word box's measurements to the accumulator `stats`, in place."""
    stats[0] += 1
    stats[1] += nchars
    if bold:
        stats[2] += 1
    if italic:
        stats[3] += 1
    if bold and italic:
        stats[4] += 1
    if fixedwidth:
        stats[5] += 1
    stats[6] += font_size


def _format_wordbox_stats (stats):
    """Render an accumulator as one colon-separated statistics record.

    The record is "words:mean-chars-per-word:bold:italic:bold-italic:
    fixed-width:mean-font-size"; both means are 0.0 when there are no words.
    """
    words = stats[0]
    if words > 0:
        mean_length = float(stats[1]) / float(words)
        mean_font_size = float(stats[6]) / float(words)
    else:
        mean_length = 0.0
        mean_font_size = 0.0
    return "%d:%.3f:%d:%d:%d:%d:%.3f" % (words, mean_length,
                                         stats[2], stats[3], stats[4], stats[5],
                                         mean_font_size)


def _skip_charset_header (text_file):
    """Position `text_file` at the start of the text body and return that offset.

    The first line is matched against CHARSETPATTERN.  On a match the charset
    named by group 1 is used, one further line is consumed, and the body
    starts at the resulting file position.  Without a match the charset is
    taken to be "latin_1" and the body starts at offset 0.

    Raises ValueError if the resulting charset name is not in UTF8_ALIASES.
    """
    firstline = text_file.readline()
    charsetmatch = CHARSETPATTERN.match(firstline)
    if charsetmatch:
        charsetname = charsetmatch.group(1)
        text_file.readline()
        first_byte = text_file.tell()
    else:
        charsetname = "latin_1"
        first_byte = 0
    if charsetname not in UTF8_ALIASES:
        raise ValueError("Charset in contents.txt must be UTF-8 for page bounding boxes to be created.  Apparently it's %s, instead." % charsetname)
    text_file.seek(first_byte)
    return first_byte


def do_page_bounding_boxes (dirpath):
    """Write per-page word bounding boxes and word statistics for `dirpath`.

    For every page yielded by wordboxes_page_iterator(dirpath), each word box
    is paired with its part-of-speech tag (when "contents.ind" exists and has
    a tag covering the box's contents offset), marked as a paragraph start
    when it overlaps the next unconsumed entry of "paragraphs.txt", and
    converted to thumbnail coordinates using the translation and scaling from
    createThumbnails.thumbnail_translation_and_scaling.  The page's boxes and
    the matching slice of "contents.txt" are handed to flush_page.

    Finally, "metadata.txt" is updated with "wordbbox-stats-pagewise" (one
    record per page, joined with ", ") and "wordbbox-stats-docwise" (a single
    record for the whole document).

    Raises ValueError if the charset of "contents.txt" is not in UTF8_ALIASES.
    Errors from opening "wordbboxes" or "contents.txt" propagate unchanged.
    """
    textfilepath = os.path.join(dirpath, "contents.txt")
    pos_filepath = os.path.join(dirpath, "contents.ind")
    para_filepath = os.path.join(dirpath, "paragraphs.txt")

    # NOTE(review): this handle is never read in this function (the boxes come
    # from wordboxes_page_iterator); it is kept so that a missing "wordbboxes"
    # file still fails here, before any other work is done.
    wordbox_file = open(os.path.join(dirpath, "wordbboxes"), 'rb')
    text_file = None
    try:
        note ("doing page bboxes for %s...", dirpath)

        if os.path.exists(pos_filepath):
            fp = open(pos_filepath, 'r')
            try:
                postags = POSTag.parse_parseinfo(fp)
            finally:
                fp.close()
        else:
            postags = None

        bbox_iterator = wordboxes_page_iterator(dirpath)

        text_file = open(textfilepath, 'rb')
        first_byte = _skip_charset_header(text_file)

        paras = read_paragraphs_file(para_filepath)
        if paras:
            paras.sort(key=lambda x: x.first_byte)
            para_count = len(paras)
        else:
            para_count = 0
        # index of the next paragraph that has not yet been matched to a box
        para_index = 0

        from createThumbnails import thumbnail_translation_and_scaling
        translation, scaling = thumbnail_translation_and_scaling (dirpath)
        note(4, "   translation and scaling are %s and %s...", translation, scaling)

        last_cindex = 0
        postags_index = 0
        page_records = []
        doc_stats = _new_wordbox_stats()

        for page_index, bboxes in bbox_iterator:

            page_stats = _new_wordbox_stats()
            adjusted_bboxes = []

            for bbox in bboxes:

                char_count = bbox.nchars()
                bold = bbox.is_bold()
                italic = bbox.is_italic()
                fixedwidth = bbox.is_fixedwidth()
                font_size = bbox.font_size()

                _tally_wordbox(doc_stats, char_count, bold, italic, fixedwidth, font_size)
                _tally_wordbox(page_stats, char_count, bold, italic, fixedwidth, font_size)

                cindex = bbox.contents_offset()

                tag = None
                if postags:
                    tag_count = len(postags)
                    # advance to first POS tag which might apply to cindex
                    while ((postags_index < tag_count) and
                           (cindex >= (postags[postags_index].start + postags[postags_index].length))):
                        postags_index = postags_index + 1
                    # there might be cindex positions for which we have no tags -- check for that
                    if postags_index < tag_count:
                        candidate = postags[postags_index]
                        if candidate.start <= cindex < (candidate.start + candidate.length):
                            tag = candidate

                if para_index < para_count:
                    para = paras[para_index]
                    if (para.first_byte <= (cindex + char_count)) and (para.first_byte_not >= cindex):
                        # this box starts the paragraph
                        if tag is None:
                            tag = POSTag(cindex, char_count, None, "",
                                         True, False, False)
                        else:
                            tag.starts_paragraph = True
                        para_index += 1

                # again, add back in the 20-pixel border on the page
                ulx = trunc((bbox.left() + translation[0]) * scaling[0] + 0.5)
                uly = trunc((bbox.top() + translation[1]) * scaling[1] + 0.5)
                lrx = trunc((bbox.right() + translation[0]) * scaling[0] + 0.5)
                lry = trunc((bbox.bottom() + translation[1]) * scaling[1] + 0.5)

                adjusted_bboxes.append((bbox, tag, ulx, uly, lrx, lry))
                last_cindex = cindex

            if adjusted_bboxes:
                first_box = adjusted_bboxes[0][0]
                last_box = adjusted_bboxes[-1][0]
                startpoint = first_box.contents_offset()
                # NOTE(review): the "* 4" presumably allows for up to four
                # bytes per UTF-8 character in the last word -- confirm.
                endpoint = last_box.contents_offset() + (last_box.nchars() * 4)
                text_file.seek(startpoint + first_byte)
                pagetext = text_file.read(endpoint - startpoint)
                pagestart = startpoint
            else:
                # page without words: no text, positioned at the last offset seen
                pagetext = ""
                pagestart = last_cindex

            flush_page (dirpath, page_index, adjusted_bboxes, pagetext, pagestart)

            page_records.append(_format_wordbox_stats(page_stats))

    finally:
        # release the handles on error paths too, including the charset ValueError
        if text_file is not None:
            text_file.close()
        wordbox_file.close()

    stats = ", ".join(page_records)
    dstats = _format_wordbox_stats(doc_stats)

    update_metadata(os.path.join(dirpath, "metadata.txt"), { "wordbbox-stats-pagewise": stats, "wordbbox-stats-docwise": dstats})
# Code example #2
# 0
 def thumbnail_translation_and_scaling (self):
     """Return the thumbnail translation and scaling for this object's folder.

     Delegates to createThumbnails.thumbnail_translation_and_scaling, passing
     it the value of self.folder(), and returns its result unchanged.
     """
     # createThumbnails is imported at call time rather than at module load
     # (presumably to avoid a circular import -- confirm).
     import createThumbnails
     compute = createThumbnails.thumbnail_translation_and_scaling
     folder_path = self.folder()
     return compute(folder_path)