def create_html_pages(images_input_dir, blocks_input_dir, output_dir):
    """Render all page images into a single ``index.html`` in *output_dir*.

    Every image file found in *images_input_dir* is paired with its block
    data (loaded through ``upload_page_data``).  Image and formula blocks
    are cut out of the page via ``extract_block_as_image`` and turned into
    HTML with ``convert2html_block``; paragraphs go through
    ``convert_parag2html_block``.  One ``<div id="page">`` is emitted per
    input image, in sorted file-name order.

    Args:
        images_input_dir: directory that holds the page images.
        blocks_input_dir: prefix that is concatenated directly with each
            image file name to locate the page's block data (so a
            directory must end with a path separator); it is also passed
            to ``assign_page_numbers``.
        output_dir: directory receiving ``index.html`` and the ``imgs/``
            sub-directory.  It is created when missing; an existing
            ``imgs/`` directory is emptied first.

    Raises:
        SystemExit: when no image files are found or when
            ``assign_page_numbers`` reports gaps.
    """
    # Function-scope import so the block does not depend on the file header.
    import sys

    # Matched against the end of the file name; "peg" covers "*.jpeg".
    image_suffixes = ("bmp", "tif", "png", "svg", "jpg", "peg")

    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    # The trailing slash is kept: the value is handed to the conversion
    # helpers unchanged.
    imgs_dir = output_dir + "/imgs/"
    img_rel_dir = "imgs/"
    if not os.path.isdir(imgs_dir):
        os.makedirs(imgs_dir)
    else:
        # Drop images left over from a previous run.
        for img_fname in os.listdir(imgs_dir):
            os.remove(os.path.join(imgs_dir, img_fname))

    files = sorted(
        fname for fname in os.listdir(images_input_dir)
        if not fname.startswith(".") and fname.endswith(image_suffixes)
    )
    if not files:
        print("FUCKUP no files in images_input_dir")
        # sys.exit with a non-zero status instead of the interactive-only
        # exit() helper, so callers and shells can see the failure.
        sys.exit(1)

    _, gaps = assign_page_numbers(files, blocks_input_dir)
    if gaps:
        print(gaps)
        print("FUCKUP with page numbers. TERMINATE.")
        sys.exit(1)

    html_page_width = 800
    # Collect fragments and join once instead of repeated string +=.
    html_parts = [
        """<html><head><title></title><meta charset="UTF-8"/></head><body>\n"""
    ]

    for processed, fname in enumerate(files, 1):
        if processed % 20 == 0:
            print("..processed %d" % processed)

        paragraphs, letters, formulas, images, block2text = upload_page_data(
            blocks_input_dir + fname)
        page_image = Image.open(os.path.join(images_input_dir, fname))
        page_image = page_image.convert('LA')
        page_draw = ImageDraw.Draw(page_image)
        draw_mockup(page_image, paragraphs, letters, formulas, images,
                    block2text)

        all_blocks = ([line for parag in paragraphs for line in parag]
                      + formulas + images)
        # Fallback extent for a page without any blocks.
        page_block = ((0, 100), (0, 100))
        if all_blocks:
            page_block = all_blocks[0]
            for block in all_blocks:
                page_block = merge_blocks(block, page_block)
            # First dimension spans the whole image (size[1]); the second
            # is the merged extent of all blocks with a 10-unit margin.
            page_block = (
                (0, page_image.size[1]),
                (page_block[1][0] - 10, page_block[1][1] + 10),
            )

        # Scale the first-dimension extent to the fixed 800px page width.
        html_page_height = html_page_width * (
            page_block[0][1] - page_block[0][0]) / (
            page_block[1][1] - page_block[1][0])

        page_parts = []

        for img_block in images:
            img_image = extract_block_as_image(img_block, page_image,
                                               page_draw)
            page_parts.append(convert2html_block(
                img_image, img_block, page_block, html_page_height, 5,
                imgs_dir, img_rel_dir))

        for paragraph in paragraphs:
            if not paragraph:
                # Nothing to render; previously this raised IndexError.
                continue
            # Order lines by their first coordinate; ties fall back to
            # comparing the line blocks themselves, as before.
            paragraph = sorted(paragraph,
                               key=lambda line: (line[0][0], line))
            paragraph_block = paragraph[0]
            for line_block in paragraph:
                paragraph_block = merge_blocks(line_block, paragraph_block)
            page_parts.append(convert_parag2html_block(
                paragraph, block2text, paragraph_block, page_block,
                html_page_height, 10, imgs_dir, img_rel_dir))

        for formula_block in formulas:
            formula_img = extract_block_as_image(formula_block, page_image,
                                                 page_draw)
            page_parts.append(convert2html_block(
                formula_img, formula_block, page_block, html_page_height, 5,
                imgs_dir, img_rel_dir))

        del page_draw

        html_parts.append(
            """\n<div id="page" style="width: %dpx; height: %dpx; border: 1px solid black; position: relative; " >\n%s\n</div>\n""" %
            (html_page_width, html_page_height, "".join(page_parts)))

    html_parts.append("</body></html>")
    html = "".join(html_parts)

    # Context manager guarantees the file is flushed and closed.
    with open(os.path.join(output_dir, "index.html"), "w") as index_file:
        index_file.write(html.encode("utf8"))
def do_GET(self):
    """Serve one page's block layout as a JSONP-style response.

    The request path selects the action:

    * ``/next_page`` / ``/prev_page`` with a ``page`` parameter move to
      the neighbouring entry of the module-level ``pages`` list (staying
      on the current page at either end).
    * ``/page_send`` with a ``page`` parameter stores the blocks sent in
      the ``p`` (paragraphs), ``f`` (formulas) and ``i`` (images)
      parameters through ``save_corrected_blocks``.  Each block is four
      comma-separated integers ``x1,x2,y1,y2``.

    In every case the reply is
    ``callback(["<img_source + page>", [paragraphs], [images], [formulas]])``
    where each list is the flattened block coordinates.  Corrected blocks
    are preferred; when ``load_corrected_blocks`` returns a false value the
    original data from ``upload_page_data`` is used, with each paragraph
    reduced to the merged block of its lines.
    """
    # NOTE(review): `callback` and `page` come straight from the request
    # and are reflected into the response (and `page` into file paths)
    # without escaping or validation against `pages` - confirm this
    # handler is only reachable by trusted clients.
    full_query = self.path.replace("?callback=", "&callback=")
    query = urlparse.parse_qs(urlparse.urlparse(full_query).query)
    query_type = full_query.split("?")[0]
    has_page = "page" in query
    page = pages[0]

    if "/next_page" in query_type and has_page:
        page = query["page"][0].split("/")[-1]
        if page in pages:
            cur_index = pages.index(page)
            # The old test `cur_index < len(pages)` was always true and
            # raised IndexError on the last page.
            if cur_index + 1 < len(pages):
                page = pages[cur_index + 1]

    if "/prev_page" in query_type and has_page:
        page = query["page"][0].split("/")[-1]
        if page in pages:
            cur_index = pages.index(page)
            if cur_index > 0:
                page = pages[cur_index - 1]

    if "/page_send" in query_type and has_page:
        page = query["page"][0].split("/")[-1]
        paragraphs = []
        formulas = []
        images = []
        for field, target in (("p", paragraphs), ("f", formulas),
                              ("i", images)):
            for block in query.get(field, []):
                try:
                    x1, x2, y1, y2 = [int(chunk)
                                      for chunk in block.split(',')]
                except ValueError:
                    # Non-numeric chunk or wrong number of coordinates.
                    print("fuckup: %s" % block)
                else:
                    target.append(((x1, x2), (y1, y2)))
        save_corrected_blocks(paragraphs, images, formulas, page)
        print("saved")

    print(page)
    paragraphs_blocks = []
    formulas = []
    images = []
    if not load_corrected_blocks(paragraphs_blocks, images, formulas, page):
        print("load orig")
        paragraphs, _, formulas, images, _ = upload_page_data(
            blocks_source + page)
        paragraphs_blocks = []
        for paragraph in paragraphs:
            if not paragraph:
                # An empty paragraph has no bounding block to report.
                continue
            # Order lines by first coordinate; ties compare the blocks.
            paragraph = sorted(paragraph,
                               key=lambda line: (line[0][0], line))
            paragraph_block = paragraph[0]
            for line_block in paragraph:
                paragraph_block = merge_blocks(line_block, paragraph_block)
            paragraphs_blocks.append(paragraph_block)

    parags_str = ",".join(
        str(coord) for block in paragraphs_blocks
        for dim in block for coord in dim)
    formulas_str = ",".join(
        str(coord) for block in formulas for dim in block for coord in dim)
    images_str = ",".join(
        str(coord) for block in images for dim in block for coord in dim)
    # Payload order is paragraphs, images, formulas.
    response = "[\"%s\", [%s], [%s], [%s]]" % (
        img_source + page, parags_str, images_str, formulas_str)

    function_name = query.get("callback", [""])[0]
    response = function_name + "(" + response + ")"
    # Encode only text: calling .encode() on a byte string makes
    # Python 2 decode it as ASCII first, which fails on non-ASCII names.
    if not isinstance(response, bytes):
        response = response.encode("utf8")

    request_headers = str(self.headers).replace(chr(10), " ").replace(
        chr(13), " ")
    print("[STAT]\tclient: %s \theaders: %s \tquery: %s" % (
        self.client_address, request_headers, full_query))
    sys.stdout.flush()

    self.send_response(200)
    self.send_header("Content-type", "text/plain")
    self.send_header("Content-Length", str(len(response)))
    self.end_headers()
    self.wfile.write(response)
    print(response)