def textreplace(bookname):
    """Write posted text replacements into one layer of a book's PageXML.

    Expects JSON of the form {"layer": <index>, "replacements":
    [{"page": ..., "line": ..., "text": ..., "comment": ...}, ...]}.
    Returns a short status string with the number of lines written.
    """
    payload = request.get_json()

    # Group the requested edits by page so each page is parsed only once.
    per_page = {}
    for item in payload["replacements"]:
        per_page.setdefault(item["page"], []).append(
            (item["line"], item["text"].strip(), item["comment"].strip()))

    layer = payload["layer"]
    book = Book.query.filter_by(name=bookname).one()
    written = 0
    for pagename, edits in per_page.items():
        page = Page.query.filter_by(book_id=book.id, name=pagename).one()
        root = etree.fromstring(page.data)
        ns = {"ns": root.nsmap[None]}
        for line_id, text, comment in edits:
            unicode_el = root.xpath(
                '//ns:TextLine[@id="{}"]'.format(line_id)
                + '/ns:TextEquiv[@index="{}"]'.format(layer)
                + '/ns:Unicode', namespaces=ns)[0]
            unicode_el.text = text
            # Unicode -> TextEquiv -> TextLine: comments live on the line.
            unicode_el.getparent().getparent().attrib["comments"] = comment
            written += 1
        page.data = etree.tounicode(root.getroottree())
    db_session.commit()
    return "Wrote {} lines to layer {}.".format(written, layer)
def textlayers(bookname):
    """List, copy or delete text layers (TextEquiv indices) of a book.

    GET returns the available layers. POST expects JSON with an "action"
    of "copy" (duplicate layer "layer" onto "target") or "delete"
    (remove layer "layer"); both return the number of affected elements.
    """
    b = Book.query.filter_by(name=bookname).one()
    if request.method == "GET":
        # Parse the body only for POST actions: a GET request carries no
        # JSON payload, so calling get_json() up front could fail before
        # the layer listing is reached.
        return jsonify(layers=getlayers(b))
    data = request.get_json()

    def _refresh_page(p, root, ns):
        # Recompute the cached line statistics and serialize the tree back.
        p.no_lines_gt = int(root.xpath('count(//ns:TextEquiv'
                                       '[@index="0"])', namespaces=ns))
        p.no_lines_ocr = int(root.xpath('count(//ns:TextLine'
                                        '[count(./ns:TextEquiv'
                                        '[@index>0])>0])', namespaces=ns))
        p.data = etree.tounicode(root.getroottree())

    if data["action"] == "copy":
        source = data["layer"]
        target = data["target"]
        ct = 0
        for p in Page.query.filter_by(book_id=b.id):
            root = etree.fromstring(p.data)
            ns = {"ns": root.nsmap[None]}
            for e in root.xpath('//ns:TextEquiv[@index="{}"]'.format(source),
                                namespaces=ns):
                new = deepcopy(e)
                new.attrib["index"] = target
                # Replace any TextEquiv already present on the target layer.
                old = e.xpath('../ns:TextEquiv[@index="{}"]'.format(target),
                              namespaces=ns)
                if old:
                    e.getparent().remove(old[0])
                e.getparent().append(new)
                ct += 1
            _refresh_page(p, root, ns)
        db_session.commit()
        return jsonify(copied=ct)
    elif data["action"] == "delete":
        layer = int(data["layer"])
        ct = 0
        for p in Page.query.filter_by(book_id=b.id):
            root = etree.fromstring(p.data)
            ns = {"ns": root.nsmap[None]}
            for e in root.xpath('//ns:TextEquiv[@index="{}"]'.format(layer),
                                namespaces=ns):
                e.getparent().remove(e)
                ct += 1
            _refresh_page(p, root, ns)
        db_session.commit()
        return jsonify(deleted=ct)
def import_folder(bookpath, bookname="", pages="*.xml"):
    """Import (or update) the PageXML files of a folder into the database.

    bookname defaults to the folder name; pages is a glob pattern
    selecting which XML files to import.
    """
    if not bookname:
        bookname = path.split(bookpath)[1]
    no_pages_total = len(glob(bookpath + "/*.xml"))
    try:
        book = Book.query.filter_by(name=bookname).one()
    except NoResultFound:
        book = Book(name=bookname, no_pages_total=no_pages_total)
    book.no_pages_total = no_pages_total
    print('Importing book "{}"...'.format(bookname))
    cnt = 0
    for xmlfile in sorted(glob(bookpath + "/" + pages)):
        pagename = path.split(xmlfile)[1].split(".")[0]
        print("Importing page {}...".format(pagename))
        try:
            page = Page.query.filter_by(book_id=book.id,
                                        name=pagename).one()
        except NoResultFound:
            page = Page(book=book, name=pagename)
        root = etree.parse(xmlfile).getroot()
        ns = {"ns": root.nsmap[None]}
        # Convert point notation from PageXML version 2013: fold the
        # child <Point> elements into a single "points" attribute.
        for c in root.xpath("//ns:Coords[not(@points)]", namespaces=ns):
            cc = []
            for point in c.xpath("./ns:Point", namespaces=ns):
                cc.append(point.attrib["x"] + "," + point.attrib["y"])
                c.remove(point)
            c.attrib["points"] = " ".join(cc)
        page.no_lines_segm = int(
            root.xpath("count(//ns:TextLine)", namespaces=ns))
        page.no_lines_gt = int(
            root.xpath('count(//ns:TextLine/ns:TextEquiv[@index="0"])',
                       namespaces=ns))
        page.no_lines_ocr = int(
            root.xpath(
                'count(//ns:TextLine[count'
                '(./ns:TextEquiv[@index>0])>0])', namespaces=ns))
        # Normalise the old 2010 namespace to the 2017 PageXML schema.
        page.data = etree.tounicode(root.getroottree()).replace(
            "http://schema.primaresearch.org/PAGE/gts/pagecontent/2010-03-19",
            "http://schema.primaresearch.org/PAGE/gts/pagecontent/2017-07-15")
        cnt += 1
    db_session.add(book)
    db_session.commit()
    print('{} pages imported for book {}.'.format(cnt, bookname))
def bookdelete():
    """CLI entry point: delete a book and all of its pages by name."""
    parser = argparse.ArgumentParser()
    parser.add_argument("bookname", type=str, help="The name of the book.")
    args = parser.parse_args()
    try:
        book = Book.query.filter_by(name=args.bookname).one()
    except NoResultFound:
        print("Book {} not in database!".format(args.bookname))
        return
    # Delete the pages explicitly before removing the book itself.
    for page in book.pages:
        db_session.delete(page)
    db_session.delete(book)
    db_session.commit()
    print("Deleted book {} from database.".format(args.bookname))
def scan_bookfolder(bookfolder, img_subdir):
    """ Scan bookfolder and write book info to database. """
    for bookpath in glob(bookfolder + "//*/"):
        # bookpath ends with a separator; strip it to get the folder name.
        bookname = path.split(bookpath[:-1])[1]
        # One page per distinct basename among the PNG images.
        stems = {f.split(sep=".")[0]
                 for f in glob(bookpath + img_subdir + "*.png")}
        page_count = len(stems)
        book = Book.query.filter_by(name=bookname).first()
        if book:
            book.no_pages_total = page_count
        else:
            db_session.add(Book(name=bookname, no_pages_total=page_count))
    db_session.commit()
def ownership(bookname):
    """Claim, return or report ownership (the access lock) of a book."""
    book = Book.query.filter_by(name=bookname).one()
    if current_user.is_anonymous:
        flash("You are anonymous, ownership of {} cannot be changed."
              .format(bookname))
    elif current_user.id == book.access:
        # The current owner hands the book back.
        book.access = None
        flash("Ownership of {} returned.".format(bookname))
    elif book.access and current_user.id != book.access:
        # Someone else holds the book: report the owner's address.
        owner_email = User.query.filter_by(id=book.access).one().email
        flash("Book {} is owned by {}.".format(bookname, owner_email))
    else:
        book.access = current_user.id
        flash("Ownership of {} claimed successfully.".format(bookname))
    db_session.commit()
    return redirect(url_for("index"))
def libedit(bookname, action=""):
    """Perform a library action on a book.

    Supported actions: "delete", "toggle_archive", "copy_to_larex",
    "select_from_larex", "import_from_larex". Returns a JSON status.
    """
    # .first() instead of .one(): .one() raises NoResultFound for a
    # missing book, which made the success=0 guard below unreachable.
    book = Book.query.filter_by(name=bookname).first()
    if not book:
        return jsonify(success=0)
    if action == "delete":
        db_session.delete(book)
        db_session.commit()
        return jsonify(success=1)
    if action == "toggle_archive":
        book.archive = not book.archive
        db_session.commit()
        return jsonify(success=1)
    if action == "copy_to_larex":
        count = copy_to_larex(bookname, app.config["BOOKS_DIR"],
                              app.config["IMAGE_SUBDIR"],
                              app.config["LAREX_DIR"],
                              app.config["LAREX_GRP"])
        flash("Copied {} files to LAREX.".format(count))
        return jsonify(files_copied=count)
    if action == "select_from_larex":
        ps = glob("{}/{}/*.xml".format(app.config["LAREX_DIR"], bookname))
        pages = []
        for p in ps:
            pname = path.split(p)[1].split(".")[0]
            existing = Page.query.filter_by(book_id=book.id,
                                            name=pname).first()
            # -1 marks pages that are not yet present in the database.
            lines_ex = existing.no_lines_gt if existing else -1
            pages.append((pname, lines_ex))
        return jsonify(res=render_template('selectlarex.html', pages=pages))
    if action == "import_from_larex":
        data = request.json
        pages = [
            "{}/{}/{}.xml".format(app.config["LAREX_DIR"], bookname, p)
            for p in data["pages"]
        ]
        # LAREX may store segmentation results as "<page>.bin.xml";
        # enumerate instead of pages.index(p) so duplicates cannot
        # rewrite the wrong entry.
        for i, p in enumerate(pages):
            if not path.isfile(p):
                pages[i] = p[:-3] + "bin.xml"
        task = lareximport.apply_async(args=[bookname],
                                       kwargs={"pages": pages})
        return jsonify({'Location': url_for('taskstatus', task_id=task.id)})
def upload_pagexml(file):
    """Import PageXML files from an uploaded zip archive.

    Archive entries must be named <BOOKNAME>/<PAGENAME>.xml; the books
    have to exist already, missing pages are created. Returns a
    human-readable status message.
    """
    try:
        zf = zipfile.ZipFile(file._file)
    except zipfile.BadZipFile:
        return "Upload failed. Please upload a valid zip file."
    result = {}
    for fn in zf.namelist():
        try:
            bookname, filename = fn.split("/")
        except ValueError:
            return "Upload failed. The files inside the zip file have to be" +\
                " named <BOOKNAME>/<PAGENAME>.xml."
        if not filename:
            # Directory entry ("<BOOKNAME>/") - nothing to import.
            continue
        try:
            book = Book.query.filter_by(name=bookname).one()
        except NoResultFound:
            return "Import aborted. Book {} is not in your library."\
                .format(bookname)
        result.setdefault(bookname, 0)
        pagename = path.splitext(path.split(filename)[1])[0]
        try:
            page = Page.query.filter_by(book_id=book.id, name=pagename).one()
        except NoResultFound:
            page = Page(book=book, name=pagename)
        # Parse the raw bytes: lxml rejects decoded str input that still
        # carries an XML encoding declaration (ValueError).
        root = etree.fromstring(zf.read(fn))
        ns = {"ns": root.nsmap[None]}
        page.no_lines_segm = int(root.xpath("count(//ns:TextLine)",
                                            namespaces=ns))
        page.no_lines_gt = int(root.xpath(
            'count(//ns:TextLine/ns:TextEquiv[@index="0"])', namespaces=ns))
        page.no_lines_ocr = int(root.xpath('count(//ns:TextLine'
                                           '[count(./ns:TextEquiv'
                                           '[@index>0])>0])', namespaces=ns))
        page.data = etree.tounicode(root.getroottree())
        result[bookname] += 1
    db_session.commit()
    return "Import successful: {}.".format(", ".join(
        "{}: {}".format(name, count) for name, count in result.items()))
def ocrdata():
    """Import OCR text posted as JSON (optionally gzip-compressed).

    Expected payload: {"index": <layer>, "ocrdata":
    {bookname: {pagename: {line_id: text}}}}. Missing TextEquiv/Unicode
    elements are created; returns a status string with the line count.
    """
    if request.headers.get("Content-Encoding") == "gzip":
        data = json.loads(gzip.decompress(request.data).decode("utf-8"))
    else:
        data = request.get_json()
    cnt = 0
    for bname, bdict in data["ocrdata"].items():
        b = Book.query.filter_by(name=bname).one()
        for pname, pdict in bdict.items():
            p = Page.query.filter_by(book_id=b.id, name=pname).one()
            root = etree.fromstring(p.data)
            ns = {"ns": root.nsmap[None]}
            for lid, text in pdict.items():
                linexml = root.find('.//ns:TextLine[@id="' + lid + '"]',
                                    namespaces=ns)
                if linexml is None:
                    # Unknown line id: skip it silently.
                    continue
                textequivxml = linexml.find(
                    './ns:TextEquiv[@index="{}"]'.format(data["index"]),
                    namespaces=ns)
                if textequivxml is None:
                    textequivxml = etree.SubElement(
                        linexml, "{{{}}}TextEquiv".format(ns["ns"]),
                        attrib={"index": str(data["index"])})
                unicodexml = textequivxml.find('./ns:Unicode',
                                               namespaces=ns)
                if unicodexml is None:
                    unicodexml = etree.SubElement(
                        textequivxml, "{{{}}}Unicode".format(ns["ns"]))
                unicodexml.text = text
                cnt += 1
            p.no_lines_ocr = int(root.xpath('count(//ns:TextLine'
                                            '[count(./ns:TextEquiv'
                                            '[@index>0])>0])',
                                            namespaces=ns))
            p.data = etree.tounicode(root.getroottree())
    db_session.commit()
    return "Imported {} lines.".format(cnt)
def add_page(book, xmlfile, commit=True, scale=None):
    """ Add page from PageXML file to book (segment lines, update version...).
    scale may either be int, {"rtype": int, ... "other": int} or None
    TODO: add reading order
    """
    # Books whose name ends in "_ar" are written right-to-left.
    if book.name.endswith("_ar"):
        text_direction = 'horizontal-rl'
    else:
        text_direction = 'horizontal-lr'
    bookpath, xmlname = path.split(xmlfile)
    pagename = xmlname.split(".")[0]
    page = Page.query.filter_by(book_id=book.id, name=pagename).first()
    if page is None:
        page = Page(book=book, name=pagename)
    page.data, page.no_lines_segm = pagexmllineseg(
        xmlfile, bookpath, text_direction=text_direction, scale=scale)
    if commit:
        db_session.add(book)
        db_session.commit()
    return page.no_lines_segm
def editorsettings():
    """Load (GET) or store (POST) the per-user editor settings as JSON."""
    # Anonymous visitors share one pseudo account.
    email = "user@nashi" if current_user.is_anonymous else current_user.email
    if request.method == "GET":
        try:
            record = EditorSettings.query.filter_by(email=email).one()
        except NoResultFound:
            return jsonify(status="fail")
        return jsonify(status="success",
                       settings=json.loads(record.settings))
    if request.method == "POST":
        try:
            record = EditorSettings.query.filter_by(email=email).one()
        except NoResultFound:
            record = EditorSettings(email=email)
        record.settings = json.dumps(request.get_json())
        db_session.add(record)
        db_session.commit()
        return jsonify(status="success")
""" text_direction = 'horizontal-rl'\ if book.name.endswith("_ar") else 'horizontal-lr' bookpath = path.split(xmlfile)[0] pagename = path.split(xmlfile)[1].split(".")[0] page = Page.query.filter_by(book_id=book.id, name=pagename).first() if not page: page = Page(book=book, name=pagename) page.data, page.no_lines_segm = \ pagexmllineseg(xmlfile, bookpath, text_direction=text_direction, scale=scale) if commit: db_session.add(book) db_session.commit() return page.no_lines_segm if __name__ == '__main__': for bookpath in [ x for x in glob(app.config["LAREX_DIR"] + "/*") if path.isdir(x) ]: book = create_book(bookpath) for xmlfile in sorted(glob(bookpath + "/*.xml")): add_page(book, xmlfile, commit=False) db_session.add(book) db_session.commit()
def pagedata(bookname, pagename):
    """GET: return the page's geometry and text as JSON (or the raw XML
    when "?download=xml" is given). POST: apply line edits ("create",
    "delete", "change") from the request body to the PageXML.

    pagename may carry a navigation suffix "+first", "+next" or "+prev".

    NOTE(review): parts of this function were corrupted in the source
    (censored string literals, a statement split across lines); the
    affected statements were reconstructed and are marked below —
    confirm against version control.
    """
    pnamesplits = pagename.split("+")
    command = ""
    if len(pnamesplits) == 2:
        pagename, command = pnamesplits
    book = Book.query.filter_by(name=bookname).first()
    if command:
        # Resolve the navigation command against the sorted page list.
        plist = sorted([x.name for x in
                        Book.query.filter_by(name=bookname).first().pages])
        if command == "first":
            pagename = min(plist)
        elif command == "next":
            pagename = plist[(plist.index(pagename) + 1) % len(plist)]
        elif command == "prev":
            pagename = plist[(plist.index(pagename) - 1) % len(plist)]
    page = Page.query.filter_by(book_id=book.id, name=pagename).first()
    root = etree.fromstring(page.data)
    ns = {"ns": root.nsmap[None]}
    if request.method == "GET":
        if request.args.get('download', '', type=str) == 'xml':
            response = make_response(page.data)
            response.headers['Cache-Control'] = 'no-cache'
            response.headers['Content-Type'] = 'text/xml'
            response.headers['Content-Disposition'] =\
                "attachment; filename={}_{}.xml".format(bookname, pagename)
            return response
        pageattr = root.find(".//ns:Page", namespaces=ns).attrib
        image = {"file": pageattr["imageFilename"],
                 "image_x": pageattr["imageWidth"],
                 "image_y": pageattr["imageHeight"]}
        direction = "rtl" if bookname.endswith("_ar") else "ltr"
        regionmap = {}
        for r in root.findall(".//ns:TextRegion/ns:Coords[@points]",
                              namespaces=ns):
            textregion = r.getparent()
            regionmap[textregion.attrib["id"]] = {
                "points": r.attrib["points"]}
        pagemap = {}
        for l in root.findall(".//ns:TextLine/ns:Coords[@points]",
                              namespaces=ns):
            textline = l.getparent()
            l_id = textline.attrib["id"]
            pagemap[l_id] = {
                "points": l.attrib["points"],
                "region": textline.getparent().attrib["id"],
                "comments": textline.attrib.get("comments", "")}
            indz = textline.xpath('./ns:TextEquiv/@index', namespaces=ns)
            # Compare the layer indices numerically: a plain min() on
            # strings is lexicographic, so "10" would sort before "2".
            lowestindex = min(indz, key=int) if indz else ""
            textcontent = textline.find(
                './ns:TextEquiv[@index="{}"]/ns:Unicode'.format(lowestindex),
                namespaces=ns).text if lowestindex else ""
            status = "empty"
            if textcontent:
                # Layer 0 is ground truth; every other layer is OCR output.
                status = "ocr" if int(lowestindex) else "gt"
            if textcontent is None:
                textcontent = ""
            pagemap[l_id]["text"] = {"status": status,
                                     "content": textcontent}
        return jsonify(page=pagename, image=image, lines=pagemap,
                       regions=regionmap, direction=direction)
    if request.method == "POST":
        data = request.json
        # NOTE(review): the anonymous user name below was censored in the
        # corrupted source; "anonymous" is a reconstruction.
        if current_user.is_anonymous:
            user = "anonymous"
        else:
            user = current_user.email
        for l in [l for l in data["edits"] if l["action"] == "delete"]:
            line = root.find('.//ns:TextLine[@id="' + l["id"] + '"]',
                             namespaces=ns)
            line.getparent().remove(line)
        for l in [l for l in data["edits"] if l["action"] == "create"]:
            region = root.find('.//ns:TextRegion[@id="{}"]'.format(
                l["input"]["region"]), namespaces=ns)
            line = etree.SubElement(region, "{{{}}}TextLine".format(
                ns["ns"]), attrib={"id": l["id"]})
            etree.SubElement(line, "{{{}}}Coords".format(ns["ns"]),
                             attrib={"points": l["input"]["points"]})
        for l in [l for l in data["edits"] if l["action"] == "change"]:
            cur = l["id"]
            text = l["input"]["text"]["content"].strip()
            textstatus = l["input"]["text"]["status"]
            comments = l["input"]["comments"]
            points = l["input"]["points"]
            line = root.find('.//ns:TextLine[@id="' + cur + '"]',
                             namespaces=ns)
            line.attrib["comments"] = comments
            if textstatus == "edit":
                tequiv = line.find('.//ns:TextEquiv[@index="0"]',
                                   namespaces=ns)
                if tequiv is None:
                    tequiv = etree.SubElement(line, "{{{}}}TextEquiv"
                                              .format(ns["ns"]),
                                              attrib={"index": "0"})
                    unicodexml = etree.SubElement(tequiv, "Unicode")
                else:
                    unicodexml = tequiv.find('./ns:Unicode', namespaces=ns)
                # NOTE(review): reconstructed from a censored span — the
                # editing user is recorded on the TextEquiv and the text
                # is written into the Unicode element.
                tequiv.attrib["comments"] = "User: " + user
                unicodexml.text = text
            line.find('./ns:Coords',
                      namespaces=ns).attrib["points"] = points
        # Count both namespaced and non-namespaced elements: newly created
        # plain "Unicode"/"TextEquiv" children may lack the PageXML
        # namespace until the page is re-serialized.
        page.no_lines_gt = int(root.xpath('count(//TextEquiv[@index="0"])')) +\
            int(root.xpath('count(//ns:TextEquiv[@index="0"])',
                           namespaces=ns))
        page.no_lines_segm = int(root.xpath('count(//TextLine)')) + \
            int(root.xpath('count(//ns:TextLine)', namespaces=ns))
        page.data = etree.tounicode(root.getroottree())
        db_session.commit()
        return jsonify(lineinfo=str(page.no_lines_gt) + "/" +
                       str(page.no_lines_segm))