def get_svg_version (repo, response, params):
    """Reply with an SVG rendering of one page of a document.

    Reads "doc_id" and "page" from `params`.  The page index must satisfy
    0 <= page < page count, where the page count comes from the document's
    "page-count" (or "pagecount") metadata.  Any problem with the request
    is reported through `response.error` with HTTPCodes.BAD_REQUEST, after
    which the function returns; otherwise the SVG text is sent through
    `response.reply` with content type "image/svg+xml".

    The generated SVG contains, in this order:
      * the page's illustrations, as <image> elements with data URLs;
      * the page's links of type "uri", as invisible <rect> elements
        wrapped in <a> elements;
      * one <text> element per word box on the page.

    Only the requested page is rendered.

    NOTE(review): if the document has neither "images-size" nor "tiff-size"
    metadata, the page-size computation below fails with an exception
    (presumably AttributeError on None); that is unchanged from the
    previous behavior.
    """

    # --- validate the request ---------------------------------------------

    doc_id = params.get("doc_id")
    if not doc_id:
        response.error(HTTPCodes.BAD_REQUEST, "No doc specified.\n")
        return
    elif not repo.valid_doc_id(doc_id):
        response.error(HTTPCodes.BAD_REQUEST, "Invalid doc ID %s specified.\n" % doc_id)
        return
    note("doc_id is %s", doc_id)
    doc = repo.get_document(doc_id)

    page = params.get("page")
    if not page:
        response.error(HTTPCodes.BAD_REQUEST, "No page index specified.")
        return
    try:
        page = int(page)
    except (TypeError, ValueError):
        # a malformed index is the client's mistake, not a server failure
        response.error(HTTPCodes.BAD_REQUEST, "Invalid page index %s specified." % (page,))
        return
    note("page is %s", page)

    page_count = int(doc.get_metadata("page-count") or doc.get_metadata("pagecount") or "0")
    if (page < 0) or (page >= page_count):
        response.error(HTTPCodes.BAD_REQUEST, "No such page %d." % page)
        return

    # --- page geometry ----------------------------------------------------

    dpi = int(doc.get_metadata('images-dpi') or doc.get_metadata('tiff-dpi')
              or doc.get_metadata("dpi") or 300)
    size_metadata = doc.get_metadata("images-size") or doc.get_metadata("tiff-size")
    # the size metadata is a comma-separated list; each value is scaled by 72/dpi
    page_image_size = tuple([(float(x.strip()) * 72 / float(dpi))
                             for x in size_metadata.split(",")])

    # --- illustrations and links belonging to the requested page ----------

    page_illustrations = []
    for (left, top, width, height, imtype, bits, pageno) in read_illustrations_metadata(doc.folder(), True):
        # skip illustrations on other pages, and those with an area under 100
        if (pageno != page) or ((width * height) < 100):
            continue
        page_illustrations.append((left, top, width, height, bits))

    page_links = [link for link in doc.links().values()
                  if (hasattr(link, "from_page") and (link.typename == "uri")
                      and (link.from_page == page))]
    note("links for %s are %s", page, page_links)

    # --- render -----------------------------------------------------------

    page_svg = None
    for page_index, bboxes in wordboxes_page_iterator(doc.folder()):
        if page_index != page:
            continue

        # Collect the fragments in a list and join once at the end, rather
        # than growing one string with "+=" for every word on the page.
        # Whitespace between the declarations below is not significant in XML.
        parts = [(u'<?xml version="1.0" standalone="no"?> '
                  u'<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" '
                  u'"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"> '
                  u'<svg width="%spt" height="%spt" version="1.1" '
                  u'xmlns="http://www.w3.org/2000/svg" '
                  u'xmlns:xlink="http://www.w3.org/1999/xlink"> ') % page_image_size]

        for left, top, width, height, image in page_illustrations:
            parts.append(u'<image x="%spt" y="%spt" width="%spt" height="%spt" xlink:href="%s" />\n' % (
                left, top, width, height, _form_data_url(image)))

        for link in page_links:
            # not every link carries a source rectangle
            fr = getattr(link, "from_rect", None)
            if fr:
                left, top, width, height = fr
                # NOTE(review): quote_plus() also encodes ":" and "/", which
                # looks wrong for an xlink:href value -- confirm what clients
                # of this output expect before changing it.
                uri = urllib.quote_plus(link.to_uri)
                parts.append(u'<a xlink:href="%s"><rect x="%spt" y="%spt" ' % (uri, left, top) +
                             u'width="%spt" height="%spt" fill="none" stroke="none" /></a>\n' % (
                                 width, height))

        for bbox in bboxes:
            face = (bbox.is_italic() and "Italic") or "Regular"
            family = ((bbox.is_fixedwidth() and "Monospace") or
                      (bbox.is_serif() and "Serif") or "Sans-Serif")
            weight = (bbox.is_bold() and "Bold") or "Regular"
            parts.append(u'<text x="%spt" y="%spt" font-family="%s" font-size="%spt" font-style="%s" font-weight="%s">%s</text>' % (
                bbox.left(), bbox.top(), family, bbox.font_size() * 0.9,
                face, weight, htmlescape(bbox.text())))
            if bbox.ends_word():
                parts.append(u"\n")

        parts.append(u"</svg>\n")
        page_svg = u"".join(parts)
        break

    if page_svg is None:
        # the word-box iterator never produced the requested page
        response.error(HTTPCodes.BAD_REQUEST, "No such page %d." % page)
        return

    note("%s: %s\n", page, len(page_svg))
    response.reply(page_svg, "image/svg+xml")
def do_page_bounding_boxes (dirpath):
    """Write per-page bounding-box data and word statistics for a document folder.

    For every (page_index, bboxes) pair produced by
    wordboxes_page_iterator(dirpath) this:

      * pairs each word box with the POS tag covering its contents offset,
        if "contents.ind" exists and has such a tag;
      * marks the box as starting a paragraph when the next unconsumed
        entry from "paragraphs.txt" overlaps the box's byte range;
      * converts the box corners with the translation and scaling returned
        by thumbnail_translation_and_scaling(dirpath);
      * reads the page's text from "contents.txt" and hands everything to
        flush_page().

    Finally the page-wise and document-wise statistics strings are stored
    in "metadata.txt" under "wordbbox-stats-pagewise" and
    "wordbbox-stats-docwise".

    Raises ValueError if the charset of "contents.txt" is not one of
    UTF8_ALIASES (a file without a charset header is treated as "latin_1").
    """

    def new_stats ():
        return [0,     # number of words
                0,     # total length (in characters)
                0,     # number of bold words
                0,     # number of italic words
                0,     # number of bold-italic words
                0,     # number of fixed-width words
                0.0,   # total font sizes
                ]

    def count_word (stats, bbox):
        # add one word box to a stats accumulator made by new_stats()
        bold = bbox.is_bold()
        italic = bbox.is_italic()
        stats[0] += 1
        stats[1] += bbox.nchars()
        if bold:
            stats[2] += 1
        if italic:
            stats[3] += 1
        if bold and italic:
            stats[4] += 1
        if bbox.is_fixedwidth():
            stats[5] += 1
        stats[6] += bbox.font_size()

    def update_stats (stats, page_stats):
        # append one accumulator, formatted, to the comma-separated string `stats`
        nwords = page_stats[0]
        if nwords > 0:
            average_length = float(page_stats[1]) / float(nwords)
            average_font_size = float(page_stats[6]) / float(nwords)
        else:
            average_length = 0.0
            average_font_size = 0.0
        entry = "%d:%.3f:%d:%d:%d:%d:%.3f" % (
            nwords, average_length, page_stats[2], page_stats[3],
            page_stats[4], page_stats[5], average_font_size)
        if stats:
            return stats + ", " + entry
        return entry

    textfilepath = os.path.join(dirpath, "contents.txt")
    pos_filepath = os.path.join(dirpath, "contents.ind")
    para_filepath = os.path.join(dirpath, "paragraphs.txt")

    # This handle is never read here; opening it up front means a missing
    # "wordbboxes" file fails immediately, before any other work is done.
    wordbox_file = open(os.path.join(dirpath, "wordbboxes"), 'rb')
    text_file = None
    try:
        note ("doing page bboxes for %s...", dirpath)

        if os.path.exists(pos_filepath):
            fp = open(pos_filepath, 'r')
            try:
                postags = POSTag.parse_parseinfo(fp)
            finally:
                fp.close()
        else:
            postags = None

        bbox_iterator = wordboxes_page_iterator(dirpath)

        text_file = open(textfilepath, 'rb')
        firstline = text_file.readline()
        charsetmatch = CHARSETPATTERN.match(firstline)
        if charsetmatch:
            charsetname = charsetmatch.group(1)
            # skip the second header line; the text proper starts after it
            text_file.readline()
            first_byte = text_file.tell()
        else:
            charsetname = "latin_1"
            first_byte = 0
        if charsetname not in UTF8_ALIASES:
            raise ValueError("Charset in contents.txt must be UTF-8 for page bounding boxes to be created. Apparently it's %s, instead." % charsetname)
        text_file.seek(first_byte)

        paras = read_paragraphs_file(para_filepath) or []
        paras.sort(key=lambda x: x.first_byte)
        # index of the next paragraph not yet matched to a word box
        para_index = 0

        from createThumbnails import thumbnail_translation_and_scaling
        translation, scaling = thumbnail_translation_and_scaling (dirpath)
        note(4, " translation and scaling are %s and %s...", translation, scaling)

        last_cindex = 0
        postags_index = 0
        stats = ""
        doc_stats = new_stats()

        for page_index, bboxes in bbox_iterator:

            page_stats = new_stats()
            adjusted_bboxes = []

            for bbox in bboxes:

                char_count = bbox.nchars()
                count_word(doc_stats, bbox)
                count_word(page_stats, bbox)

                cindex = bbox.contents_offset()

                tag = None
                if postags:
                    # advance to first POS tag which might apply to cindex
                    while ((postags_index < len(postags)) and
                           (cindex >= (postags[postags_index].start + postags[postags_index].length))):
                        postags_index = postags_index + 1
                    # there may be cindex positions for which we have no tag -- check for that
                    if ((postags_index < len(postags)) and
                        (cindex >= postags[postags_index].start) and
                        (cindex < (postags[postags_index].start + postags[postags_index].length))):
                        tag = postags[postags_index]

                if ((para_index < len(paras)) and
                    (paras[para_index].first_byte <= (cindex + char_count)) and
                    (paras[para_index].first_byte_not >= cindex)):
                    # this word starts the next paragraph
                    if tag is None:
                        tag = POSTag(cindex, char_count, None, "", True, False, False)
                    else:
                        tag.starts_paragraph = True
                    para_index += 1

                # again, add back in the 20-pixel border on the page
                ulx = trunc((bbox.left() + translation[0]) * scaling[0] + 0.5)
                uly = trunc((bbox.top() + translation[1]) * scaling[1] + 0.5)
                lrx = trunc((bbox.right() + translation[0]) * scaling[0] + 0.5)
                lry = trunc((bbox.bottom() + translation[1]) * scaling[1] + 0.5)

                adjusted_bboxes.append((bbox, tag, ulx, uly, lrx, lry))
                last_cindex = cindex

            if adjusted_bboxes:
                startpoint = adjusted_bboxes[0][0].contents_offset()
                # allow up to 4 bytes for each character of the last word
                endpoint = (adjusted_bboxes[-1][0].contents_offset() +
                            (adjusted_bboxes[-1][0].nchars() * 4))
                text_file.seek(startpoint + first_byte)
                pagetext = text_file.read(endpoint - startpoint)
                pagestart = startpoint
            else:
                pagetext = ""
                pagestart = last_cindex
            flush_page (dirpath, page_index, adjusted_bboxes, pagetext, pagestart)

            stats = update_stats(stats, page_stats)

    finally:
        # release both files even when a page fails part-way through
        if text_file is not None:
            text_file.close()
        wordbox_file.close()

    dstats = update_stats("", doc_stats)
    update_metadata(os.path.join(dirpath, "metadata.txt"),
                    { "wordbbox-stats-pagewise": stats,
                      "wordbbox-stats-docwise": dstats})
def findimages(folder, debug=None):
    """Run FINDIMAGES_PROGRAM over a folder's page images and collect its "halftone" results.

    `folder` is a document folder containing a "page-images" subdirectory
    and a "metadata.txt" file.  If a "wordbboxes" file is present, the
    pages produced by wordboxes_page_iterator(folder) are processed and
    each page's word boxes are handed to the program; otherwise every file
    in "page-images" matching PAGE_IMAGE_FILENAME_PATTERN is processed
    without word boxes.

    `debug`, if true, adds "--debug" to the program's command line.

    Returns a list of (pageno, imtype, x, y, width, height) tuples whose
    fields are the strings parsed from the program's output, with `pageno`
    being str() of the zero-based page index.  Only entries whose
    width * height exceeds (dpi / 72) squared are kept.  Returns an empty
    list when FINDIMAGES_PROGRAM is not set or there is no "page-images"
    directory.
    """

    images = []
    if not FINDIMAGES_PROGRAM:
        note(3, "FINDIMAGES_PROGRAM not defined")
        return images
    images_dir = os.path.join(folder, "page-images")
    if not os.path.isdir(images_dir):
        note(3, "No page images in %s", images_dir)
        return images
    md = read_metadata(os.path.join(folder, "metadata.txt"))
    dpi = int(md.get("images-dpi") or md.get("dpi") or md.get("tiff-dpi") or 300)
    scaling_factor = float(dpi)/72

    def get_images_for_page (page_index, wordboxes, dpi, images_dir):
        # Run the finder on one page image; `wordboxes` may be None or empty.
        pageimages = []
        filepath = os.path.join(images_dir, "page%05d.png" % (page_index + 1))
        if not os.path.exists(filepath):
            return pageimages

        # mkstemp() creates the file itself, so nobody else can claim the
        # name between choosing it and writing to it
        fd, wordboxes_file = tempfile.mkstemp()
        try:
            os.close(fd)

            # scale each word box by dpi/72, dropping degenerate boxes
            boxlist = []
            for box in (wordboxes or ()):
                x, y, w, h = (int(box.left() * dpi / 72.0),
                              int(box.top() * dpi / 72.0),
                              int(box.width() * dpi / 72.0),
                              int(box.height() * dpi / 72.0))
                if (w > 0) and (h > 0):
                    boxlist.append((x, y, w, h))

            if boxlist:
                # write out the list of wordboxes, in Leptonica BOXA format
                fp = open(wordboxes_file, "wb")
                try:
                    fp.write("\nBoxa Version 2\nNumber of boxes = %d\n" % len(boxlist))
                    for i, dims in enumerate(boxlist):
                        fp.write(" Box[%d]: " % i + "x = %d, y = %d, w = %d, h = %d\n" % dims)
                finally:
                    fp.close()

            # now, run the finder on the page image plus the list of wordboxes
            # ("-" in place of the box file when there are no boxes)
            debug_arg = (debug and "--debug") or " "
            # NOTE(review): the paths are interpolated unquoted into one
            # command string; if subproc() hands this to a shell, paths with
            # spaces or metacharacters will break it -- confirm.
            cmd = "%s %s %s %s %s" % (FINDIMAGES_PROGRAM, debug_arg, dpi, filepath,
                                      (boxlist and wordboxes_file) or "-")
            note(4, "findimages cmd is <<%s>>", cmd)
            status, output, tsignal = subproc(cmd)
            if status == 0:
                for line in output.split('\n'):
                    line = line.strip()
                    if not line.startswith("halftone "):
                        continue
                    pageimages.append((str(page_index) + " " + line).split())
            else:
                # pass the values as arguments so that a "%" in the
                # program's output cannot be taken for a format directive
                note(3, "findimages command <%s> returns bad status %s:\n%s\n",
                     cmd, status, output)
        finally:
            # remove the temp file
            if os.path.exists(wordboxes_file):
                os.unlink(wordboxes_file)
        return pageimages

    if os.path.exists(os.path.join(folder, "wordbboxes")):
        for page_index, boxes in wordboxes_page_iterator(folder):
            images += get_images_for_page (page_index, boxes, dpi, images_dir)
    else:
        # handle case where there's no text for the image
        for filename in os.listdir(images_dir):
            m = PAGE_IMAGE_FILENAME_PATTERN.match(filename)
            if m:
                # the number captured from the file name is one-based
                images += get_images_for_page(int(m.group(1))-1, None, dpi, images_dir)

    # keep only images whose area exceeds scaling_factor squared
    point_squared = scaling_factor * scaling_factor
    images = [(pageno, imtype, x, y, width, height)
              for (pageno, imtype, x, y, width, height) in images
              if ((int(height) * int(width)) > point_squared)]
    note(3, "images for %s are %s", folder, images)
    return images