コード例 #1
0
def textreplace(bookname):
    """Apply text replacements posted as JSON to pages of the given book.

    Expects a JSON body with "layer" (the TextEquiv index to write) and
    "replacements", a list of {"page", "line", "text", "comment"} records.
    Returns a short status string with the number of lines written.
    """
    payload = request.get_json()
    layer = payload["layer"]

    # Group the requested edits by page name so each page is parsed once.
    per_page = {}
    for entry in payload["replacements"]:
        per_page.setdefault(entry["page"], []).append(
            (entry["line"], entry["text"].strip(), entry["comment"].strip()))

    book = Book.query.filter_by(name=bookname).one()
    written = 0
    for pagename, edits in per_page.items():
        page = Page.query.filter_by(book_id=book.id, name=pagename).one()
        root = etree.fromstring(page.data)
        ns = {"ns": root.nsmap[None]}
        for line_id, text, comment in edits:
            unicode_el = root.xpath(
                '//ns:TextLine[@id="{}"]'.format(line_id) +
                '/ns:TextEquiv[@index="{}"]'.format(layer) +
                '/ns:Unicode', namespaces=ns)[0]
            unicode_el.text = text
            # The comment is stored on the enclosing TextLine element.
            unicode_el.getparent().getparent().attrib["comments"] = comment
            written += 1
        page.data = etree.tounicode(root.getroottree())
    db_session.commit()
    return "Wrote {} lines to layer {}.".format(written, layer)
コード例 #2
0
def textlayers(bookname):
    """GET: list the text layers of a book. POST: copy or delete a layer.

    The POST body must contain "action" ("copy" or "delete") and "layer";
    "copy" additionally needs "target". After either operation the cached
    per-page line statistics are refreshed.
    """
    data = request.get_json()
    b = Book.query.filter_by(name=bookname).one()

    def _refresh_line_counts(page, root, ns):
        # Ground-truth lines carry a TextEquiv with index 0; OCR lines
        # carry at least one TextEquiv with a positive index.
        page.no_lines_gt = int(root.xpath(
            'count(//ns:TextEquiv[@index="0"])', namespaces=ns))
        page.no_lines_ocr = int(root.xpath(
            'count(//ns:TextLine[count(./ns:TextEquiv[@index>0])>0])',
            namespaces=ns))

    if request.method == "GET":
        return jsonify(layers=getlayers(b))

    elif data["action"] == "copy":
        source = data["layer"]
        target = data["target"]
        ct = 0
        for p in Page.query.filter_by(book_id=b.id):
            root = etree.fromstring(p.data)
            ns = {"ns": root.nsmap[None]}
            for e in root.xpath('//ns:TextEquiv[@index="{}"]'.format(source),
                                namespaces=ns):
                new = deepcopy(e)
                # NOTE(review): lxml attribute values must be strings;
                # "target" is used as-is here, matching previous behaviour —
                # confirm callers always send it as a string.
                new.attrib["index"] = target
                # Replace any existing TextEquiv of the target layer.
                old = e.xpath('../ns:TextEquiv[@index="{}"]'.format(target),
                              namespaces=ns)
                if old:
                    e.getparent().remove(old[0])
                e.getparent().append(new)
                ct += 1
            _refresh_line_counts(p, root, ns)
            p.data = etree.tounicode(root.getroottree())
        db_session.commit()
        return jsonify(copied=ct)

    elif data["action"] == "delete":
        layer = int(data["layer"])
        ct = 0
        for p in Page.query.filter_by(book_id=b.id):
            root = etree.fromstring(p.data)
            ns = {"ns": root.nsmap[None]}
            for e in root.xpath('//ns:TextEquiv[@index="{}"]'.format(layer),
                                namespaces=ns):
                e.getparent().remove(e)
                ct += 1
            _refresh_line_counts(p, root, ns)
            p.data = etree.tounicode(root.getroottree())
        db_session.commit()
        return jsonify(deleted=ct)
コード例 #3
0
ファイル: import_from_pagexml.py プロジェクト: stweil/nashi
def import_folder(bookpath, bookname="", pages="*.xml"):
    """Import PageXML files from a folder into a (new or existing) book.

    bookpath -- folder containing the PageXML files
    bookname -- name of the book; defaults to the folder's base name
    pages    -- glob pattern selecting the page files to import
    """
    if not bookname:
        bookname = path.split(bookpath)[1]
    # NOTE: the page total always counts every *.xml file, even when a
    # narrower "pages" pattern restricts what is actually imported.
    no_pages_total = len(glob(bookpath + "/*.xml"))
    try:
        book = Book.query.filter_by(name=bookname).one()
        # Refresh the page count of an already known book.
        book.no_pages_total = no_pages_total
    except NoResultFound:
        book = Book(name=bookname, no_pages_total=no_pages_total)

    print('Importing book "{}"...'.format(bookname))
    cnt = 0
    for xmlfile in sorted(glob(bookpath + "/" + pages)):
        pagename = path.split(xmlfile)[1].split(".")[0]
        print("Importing page {}...".format(pagename))

        try:
            page = Page.query.filter_by(book_id=book.id, name=pagename).one()
        except NoResultFound:
            page = Page(book=book, name=pagename)

        root = etree.parse(xmlfile).getroot()
        ns = {"ns": root.nsmap[None]}

        # Convert point notation from PageXML version 2013: fold the
        # child <Point x="…" y="…"/> elements into one "points" attribute.
        for c in root.xpath("//ns:Coords[not(@points)]", namespaces=ns):
            cc = []
            for point in c.xpath("./ns:Point", namespaces=ns):
                cc.append(point.attrib["x"] + "," + point.attrib["y"])
                c.remove(point)
            c.attrib["points"] = " ".join(cc)

        # Cache line statistics: total, ground truth (index 0), OCR (>0).
        page.no_lines_segm = int(
            root.xpath("count(//ns:TextLine)", namespaces=ns))
        page.no_lines_gt = int(
            root.xpath('count(//ns:TextLine/ns:TextEquiv[@index="0"])',
                       namespaces=ns))
        page.no_lines_ocr = int(
            root.xpath(
                'count(//ns:TextLine[count'
                '(./ns:TextEquiv[@index>0])>0])',
                namespaces=ns))
        # Normalize the old 2010 PAGE namespace to the 2017 schema.
        page.data = etree.tounicode(root.getroottree()).replace(
            "http://schema.primaresearch.org/PAGE/gts/pagecontent/2010-03-19",
            "http://schema.primaresearch.org/PAGE/gts/pagecontent/2017-07-15")
        cnt += 1

    db_session.add(book)
    db_session.commit()
    print('{} pages imported for book {}.'.format(cnt, bookname))
コード例 #4
0
ファイル: import_from_pagexml.py プロジェクト: stweil/nashi
def bookdelete():
    """CLI entry point: remove a book and all of its pages from the database."""
    argp = argparse.ArgumentParser()
    argp.add_argument("bookname", type=str, help="The name of the book.")
    opts = argp.parse_args()
    try:
        book = Book.query.filter_by(name=opts.bookname).one()
    except NoResultFound:
        print("Book {} not in database!".format(opts.bookname))
        return
    # Delete the pages first, then the book record itself.
    for page in book.pages:
        db_session.delete(page)
    db_session.delete(book)
    db_session.commit()
    print("Deleted book {} from database.".format(opts.bookname))
コード例 #5
0
def scan_bookfolder(bookfolder, img_subdir):
    """Scan the book folder and record each book's page count in the database.

    A "page" is a *.png image inside the book's image subdirectory; images
    sharing a stem (before the first dot) count once.
    """
    for bookpath in glob(bookfolder + "//*/"):
        bookname = path.split(bookpath[:-1])[1]
        pngs = glob(bookpath + img_subdir + "*.png")
        stems = {f.split(sep=".")[0] for f in pngs}
        page_count = len(stems)
        book = Book.query.filter_by(name=bookname).first()
        if book:
            book.no_pages_total = page_count
        else:
            db_session.add(Book(name=bookname, no_pages_total=page_count))
    db_session.commit()
コード例 #6
0
def ownership(bookname):
    """Claim or return ownership of a book for the current user."""
    book = Book.query.filter_by(name=bookname).one()
    if current_user.is_anonymous:
        flash("You are anonymous, ownership of {} cannot be changed."
              .format(bookname))
    elif book.access == current_user.id:
        # The current owner gives the book back.
        book.access = None
        flash("Ownership of {} returned.".format(bookname))
    elif book.access and book.access != current_user.id:
        # Somebody else holds the book — report the owner's address.
        owner_mail = User.query.filter_by(id=book.access).one().email
        flash("Book {} is owned by {}.".format(bookname, owner_mail))
    else:
        book.access = current_user.id
        flash("Ownership of {} claimed successfully.".format(bookname))
    db_session.commit()
    return redirect(url_for("index"))
コード例 #7
0
def libedit(bookname, action=""):
    """Run a library management action on a book.

    Supported actions: "delete", "toggle_archive", "copy_to_larex",
    "select_from_larex", "import_from_larex". An unknown action falls
    through and returns None (unchanged behaviour).
    """
    # first() yields None for unknown books so the guard below works;
    # .one() would raise NoResultFound and never reach it.
    book = Book.query.filter_by(name=bookname).first()
    if not book:
        return jsonify(success=0)

    if action == "delete":
        db_session.delete(book)
        db_session.commit()
        return jsonify(success=1)

    if action == "toggle_archive":
        book.archive = not book.archive
        db_session.commit()
        return jsonify(success=1)

    if action == "copy_to_larex":
        count = copy_to_larex(bookname, app.config["BOOKS_DIR"],
                              app.config["IMAGE_SUBDIR"],
                              app.config["LAREX_DIR"], app.config["LAREX_GRP"])
        flash("Copied {} files to LAREX.".format(count))
        return jsonify(files_copied=count)

    if action == "select_from_larex":
        ps = glob("{}/{}/*.xml".format(app.config["LAREX_DIR"], bookname))
        pages = []
        for p in ps:
            pname = path.split(p)[1].split(".")[0]
            existing = Page.query.filter_by(book_id=book.id,
                                            name=pname).first()
            # -1 marks pages that are not in the database yet.
            lines_ex = existing.no_lines_gt if existing else -1
            pages.append((pname, lines_ex))
        return jsonify(res=render_template('selectlarex.html', pages=pages))

    if action == "import_from_larex":
        data = request.json
        pages = [
            "{}/{}/{}.xml".format(app.config["LAREX_DIR"], bookname, p)
            for p in data["pages"]
        ]
        # Fall back to the binarized "…bin.xml" variant when the plain file
        # is missing (avoids mutating the list while iterating over it).
        pages = [p if path.isfile(p) else p[:-3] + "bin.xml" for p in pages]
        task = lareximport.apply_async(args=[bookname],
                                       kwargs={"pages": pages})
        return jsonify({'Location': url_for('taskstatus', task_id=task.id)})
コード例 #8
0
def upload_pagexml(file):
    """Import PageXML pages from an uploaded zip file.

    The archive must contain files named <BOOKNAME>/<PAGENAME>.xml for books
    that already exist in the library. Returns a human-readable status
    message with a per-book import count.
    """
    try:
        zf = zipfile.ZipFile(file._file)
    except zipfile.BadZipFile:
        return "Upload failed. Please upload a valid zip file."
    result = {}
    for fn in zf.namelist():
        try:
            bookname, filename = fn.split("/")
        except ValueError:
            return "Upload failed. The files inside the zip file have to be" +\
                   " named <BOOKNAME>/<PAGENAME>.xml."
        if not filename:
            # Directory entry — nothing to import.
            continue
        try:
            book = Book.query.filter_by(name=bookname).one()
        except NoResultFound:
            return "Import aborted. Book {} is not in your library."\
                    .format(bookname)
        if bookname not in result:
            result[bookname] = 0
        pagename = path.splitext(path.split(filename)[1])[0]
        try:
            page = Page.query.filter_by(book_id=book.id, name=pagename).one()
        except NoResultFound:
            page = Page(book=book, name=pagename)
        # Parse the raw bytes: lxml rejects decoded str input when the XML
        # carries an encoding declaration.
        root = etree.fromstring(zf.read(fn))
        ns = {"ns": root.nsmap[None]}
        # Cache line statistics: total, ground truth (index 0), OCR (>0).
        page.no_lines_segm = int(root.xpath("count(//ns:TextLine)",
                                            namespaces=ns))
        page.no_lines_gt = int(root.xpath(
            'count(//ns:TextLine/ns:TextEquiv[@index="0"])', namespaces=ns))
        page.no_lines_ocr = int(root.xpath('count(//ns:TextLine'
                                           '[count(./ns:TextEquiv'
                                           '[@index>0])>0])', namespaces=ns))
        page.data = etree.tounicode(root.getroottree())
        result[bookname] += 1
    db_session.commit()
    res = "Import successful: {}.".format(", ".join(
        [": ".join([i[0], str(i[1])]) for i in result.items()]))
    return res
コード例 #9
0
def ocrdata():
    """Store OCR results posted as (optionally gzipped) JSON.

    The payload maps book name -> page name -> line id -> text; "index"
    names the TextEquiv layer to write into. Missing TextEquiv/Unicode
    elements are created on the fly. Returns a summary string.
    """
    if "Content-Encoding" in request.headers and \
            request.headers["Content-Encoding"] == "gzip":
        payload = json.loads(gzip.decompress(request.data).decode("utf-8"))
    else:
        payload = request.get_json()
    layer = payload["index"]
    imported = 0
    for bookname, pagedict in payload["ocrdata"].items():
        book = Book.query.filter_by(name=bookname).one()
        for pagename, linedict in pagedict.items():
            page = Page.query.filter_by(book_id=book.id, name=pagename).one()
            root = etree.fromstring(page.data)
            ns = {"ns": root.nsmap[None]}
            nsuri = ns["ns"]
            for line_id, text in linedict.items():
                line = root.find('.//ns:TextLine[@id="' + line_id + '"]',
                                 namespaces=ns)
                if line is None:
                    # Unknown line ids are silently skipped.
                    continue
                tequiv = line.find(
                    './ns:TextEquiv[@index="{}"]'.format(layer),
                    namespaces=ns)
                if tequiv is None:
                    tequiv = etree.SubElement(
                        line, "{{{}}}TextEquiv".format(nsuri),
                        attrib={"index": str(layer)})
                unicode_el = tequiv.find('./ns:Unicode', namespaces=ns)
                if unicode_el is None:
                    unicode_el = etree.SubElement(
                        tequiv, "{{{}}}Unicode".format(nsuri))
                unicode_el.text = text
                imported += 1
            # A line counts as OCRed when any TextEquiv index is positive.
            page.no_lines_ocr = int(root.xpath(
                'count(//ns:TextLine[count(./ns:TextEquiv[@index>0])>0])',
                namespaces=ns))
            page.data = etree.tounicode(root.getroottree())
    db_session.commit()
    return "Imported {} lines.".format(imported)
コード例 #10
0
ファイル: import_from_larex.py プロジェクト: stweil/nashi
def add_page(book, xmlfile, commit=True, scale=None):
    """Segment a PageXML file into lines and attach the page to *book*.

    scale may be an int, a {"rtype": int, ..., "other": int} mapping or
    None. Returns the number of segmented lines.
    TODO: add reading order
    """
    # Books whose name ends in "_ar" are Arabic, hence right-to-left.
    direction = ('horizontal-rl' if book.name.endswith("_ar")
                 else 'horizontal-lr')
    folder, fname = path.split(xmlfile)
    pagename = fname.split(".")[0]
    page = Page.query.filter_by(book_id=book.id, name=pagename).first()
    if page is None:
        page = Page(book=book, name=pagename)

    page.data, page.no_lines_segm = pagexmllineseg(
        xmlfile, folder, text_direction=direction, scale=scale)
    if commit:
        db_session.add(book)
        db_session.commit()
    return page.no_lines_segm
コード例 #11
0
def editorsettings():
    """GET: return the current user's editor settings. POST: store them.

    Anonymous users share the pseudo-address "user@nashi". Settings are
    persisted as a JSON string keyed by e-mail address.
    """
    email = "user@nashi" if current_user.is_anonymous else current_user.email

    if request.method == "GET":
        try:
            record = EditorSettings.query.filter_by(email=email).one()
        except NoResultFound:
            return jsonify(status="fail")
        return jsonify(status="success", settings=json.loads(record.settings))

    if request.method == "POST":
        try:
            record = EditorSettings.query.filter_by(email=email).one()
        except NoResultFound:
            record = EditorSettings(email=email)
        record.settings = json.dumps(request.get_json())
        db_session.add(record)
        db_session.commit()
        return jsonify(status="success")
コード例 #12
0
ファイル: import_from_larex.py プロジェクト: stweil/nashi
    """
    text_direction = 'horizontal-rl'\
        if book.name.endswith("_ar") else 'horizontal-lr'
    bookpath = path.split(xmlfile)[0]
    pagename = path.split(xmlfile)[1].split(".")[0]
    page = Page.query.filter_by(book_id=book.id, name=pagename).first()
    if not page:
        page = Page(book=book, name=pagename)

    page.data, page.no_lines_segm = \
        pagexmllineseg(xmlfile, bookpath, text_direction=text_direction,
                       scale=scale)
    if commit:
        db_session.add(book)
        db_session.commit()
    return page.no_lines_segm


if __name__ == '__main__':
    # Import every LAREX book folder and its PageXML pages into the database.
    candidates = glob(app.config["LAREX_DIR"] + "/*")
    for bookpath in filter(path.isdir, candidates):
        book = create_book(bookpath)
        for xmlfile in sorted(glob(bookpath + "/*.xml")):
            add_page(book, xmlfile, commit=False)
        db_session.add(book)
        db_session.commit()
コード例 #13
0
def pagedata(bookname, pagename):
    """Read (GET) or edit (POST) a single page of a book.

    The page name may carry a "+first", "+next" or "+prev" suffix to
    navigate relative to the sorted list of the book's pages. GET returns
    page geometry and text as JSON (or the stored XML when ?download=xml
    is given); POST applies line edits ("delete", "create", "change")
    from the JSON request body and returns a gt/segmented line summary.
    """
    # Split an optional navigation command off the page name.
    pnamesplits = pagename.split("+")
    command = ""
    if len(pnamesplits) == 2:
        pagename, command = pnamesplits
    book = Book.query.filter_by(name=bookname).first()
    if command:
        plist = sorted([x.name for x in
                        Book.query.filter_by(name=bookname).first().pages])
        if command == "first":
            pagename = min(plist)
        elif command == "next":
            # Modulo wraps navigation around at either end of the book.
            pagename = plist[(plist.index(pagename) + 1) % len(plist)]
        elif command == "prev":
            pagename = plist[(plist.index(pagename) - 1) % len(plist)]

    page = Page.query.filter_by(book_id=book.id, name=pagename).first()
    root = etree.fromstring(page.data)
    ns = {"ns": root.nsmap[None]}

    if request.method == "GET":
        if (request.args.get('download', '', type=str) == 'xml'):
            # Serve the stored PageXML verbatim as a file download.
            response = make_response(page.data)
            response.headers['Cache-Control'] = 'no-cache'
            response.headers['Content-Type'] = 'text/xml'
            response.headers['Content-Disposition'] =\
                "attachment; filename={}_{}.xml".format(bookname, pagename)
            return response

        # Page-level image metadata for the editor viewport.
        pageattr = root.find(".//ns:Page", namespaces=ns).attrib
        image = {"file": pageattr["imageFilename"],
                 "image_x": pageattr["imageWidth"],
                 "image_y": pageattr["imageHeight"]}
        # Books with the "_ar" suffix are Arabic, hence right-to-left.
        direction = "rtl" if bookname.endswith("_ar") else "ltr"
        # Region id -> polygon points.
        regionmap = {}
        for r in root.findall(".//ns:TextRegion/ns:Coords[@points]",
                              namespaces=ns):
            textregion = r.getparent()
            r_id = textregion.attrib["id"]
            regionmap[r_id] = {}
            regionmap[r_id]["points"] = r.attrib["points"]
        # Line id -> points, owning region, comments and best text.
        pagemap = {}
        for l in root.findall(".//ns:TextLine/ns:Coords[@points]",
                              namespaces=ns):
            textline = l.getparent()
            l_id = textline.attrib["id"]
            pagemap[l_id] = {}
            pagemap[l_id]["points"] = l.attrib["points"]
            pagemap[l_id]["region"] = textline.getparent().attrib["id"]
            pagemap[l_id]["comments"] = textline.attrib["comments"] if \
                "comments" in textline.attrib else ""
            # The lowest TextEquiv index wins (0 = ground truth, >0 = OCR).
            # NOTE(review): min() compares the index values as strings here,
            # which assumes single-digit layer indices — confirm upstream.
            indz = textline.xpath('./ns:TextEquiv/@index', namespaces=ns)
            lowestindex = min(indz) if indz else ""
            textcontent = textline.find(
                './ns:TextEquiv[@index="{}"]/ns:Unicode'.format(lowestindex),
                namespaces=ns).text if lowestindex else ""
            status = "empty"
            if textcontent:
                status = "ocr" if int(lowestindex) else "gt"
            if textcontent is None:
                textcontent = ""
            pagemap[l_id]["text"] = {"status": status, "content": textcontent}
        return jsonify(page=pagename, image=image, lines=pagemap,
                       regions=regionmap, direction=direction)

    if request.method == "POST":
        data = request.json
        if current_user.is_anonymous:
            user = "******"
        else:
            user = current_user.email

        # Apply deletions first, then creations, then content changes.
        for l in [l for l in data["edits"] if l["action"] == "delete"]:
            cur = l["id"]
            ldata = l
            line = root.find('.//ns:TextLine[@id="'+cur+'"]', namespaces=ns)
            line.getparent().remove(line)

        for l in [l for l in data["edits"] if l["action"] == "create"]:
            cur = l["id"]
            ldata = l
            region = root.find('.//ns:TextRegion[@id="{}"]'.format(
                ldata["input"]["region"]), namespaces=ns)
            line = etree.SubElement(region, "{{{}}}TextLine".format(
                ns["ns"]), attrib={"id": cur})
            coords = etree.SubElement(line, "{{{}}}Coords".format(ns["ns"]),
                                      attrib={"points":
                                              ldata["input"]["points"]})

        for l in [l for l in data["edits"] if l["action"] == "change"]:
            cur = l["id"]
            ldata = l
            text = ldata["input"]["text"]["content"].strip()
            textstatus = ldata["input"]["text"]["status"]
            comments = ldata["input"]["comments"]
            points = ldata["input"]["points"]
            rid = ldata["input"]["region"]
            line = root.find('.//ns:TextLine[@id="'+cur+'"]',
                             namespaces=ns)
            line.attrib["comments"] = comments
            if textstatus == "edit":
                tequiv = line.find('.//ns:TextEquiv[@index="0"]',
                                   namespaces=ns)
                if tequiv is None:
                    tequiv = etree.SubElement(line, "{{{}}}TextEquiv"
                                              .format(ns["ns"]),
                                              attrib={"index": "0"})
                    unicodexml = etree.SubElement(tequiv, "Unicode")
                else:
                    unicodexml = tequiv.find('./ns:Unicode', namespaces=ns)
                # NOTE(review): the next line is corrupted/redacted in this
                # copy of the source ("******") and is not valid Python.
                # Upstream it presumably recorded the editing user in the
                # TextEquiv "comments", wrote `text` into `unicodexml`, and
                # updated the line's Coords "points" attribute — restore
                # this section from the original repository before running.
                tequiv.attrib["comments"] = "User: "******"points"] = points

        # Counts both un-namespaced and namespaced elements — presumably to
        # tolerate legacy files without a default namespace; verify upstream.
        page.no_lines_gt = int(root.xpath('count(//TextEquiv[@index="0"])')) +\
            int(root.xpath('count(//ns:TextEquiv[@index="0"])', namespaces=ns))
        page.no_lines_segm = int(root.xpath('count(//TextLine)')) + \
            int(root.xpath('count(//ns:TextLine)', namespaces=ns))
        page.data = etree.tounicode(root.getroottree())
        db_session.commit()
        return jsonify(lineinfo=str(page.no_lines_gt)
                       + "/" + str(page.no_lines_segm))