Example #1
def extract_json_for_tkbs_from_toc_file(
        toc_folder_path="resources_for_tests\\1914-11-06",
        images_and_xmls_folder_path="resources_for_tests\\output\\1914-11-06",
        author="test_user",
        description="pipeline"):
    """Build the Transkribus upload payload from a legacy folder that contains a
    TOC.xml: returns the metadata JSON string and a per-page list of image and
    PAGE-XML file tuples (name, bytes, content type)."""
    p = Document()
    p.load_legacy_data(toc_folder_path)
    page_images, page_xmls = p.img_names_by_pgnum(), p.pxml_names_by_pgnum()
    title = extract_title_from_TOC_xml(os.path.join(toc_folder_path,
                                                    "TOC.xml"))
    # Read each page image as raw bytes, keyed by page number
    img_objects = {}

    for key, value in page_images.items():
        with open(os.path.join(images_and_xmls_folder_path, value),
                  'rb') as file:
            img_objects[key] = file.read()

    # Read each PAGE-XML file as raw bytes, keyed by page number
    xml_objects = {}
    for key, value in page_xmls.items():
        with open(os.path.join(images_and_xmls_folder_path, value),
                  'rb') as file:
            xml_objects[key] = file.read()

    # Document metadata and page list for the Transkribus upload request
    d = {
        "md": {
            "title": title,
            "author": author,
            "description": description
        },
        "pageList": {
            "pages": [{
                "fileName": value,
                "pageXmlName": page_xmls[key],
                "pageNr": int(key)
            } for key, value in page_images.items()]
        }
    }

    json_as_str = json.dumps(d)

    # One entry per page: the image and its matching PAGE-XML as
    # (filename, raw bytes, content type) tuples
    img_and_xml_list = [{
        'img': (value, img_objects[key], 'application/octet-stream'),
        'xml': (page_xmls[key], xml_objects[key], 'application/octet-stream')
    } for key, value in page_images.items()]
    return json_as_str, img_and_xml_list
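A minimal usage sketch (a hypothetical call reusing the test-default folder paths from the signature; it assumes the same module context as above, i.e. Document, extract_title_from_TOC_xml and the os/json imports are available):

json_as_str, img_and_xml_list = extract_json_for_tkbs_from_toc_file(
    toc_folder_path="resources_for_tests\\1914-11-06",
    images_and_xmls_folder_path="resources_for_tests\\output\\1914-11-06")
# json_as_str carries the document metadata and page list;
# img_and_xml_list holds one {'img': ..., 'xml': ...} dict per page,
# in the (name, bytes, content-type) tuple form used for multipart uploads.
print(str(len(img_and_xml_list)) + " pages prepared for upload")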
Example #2
def upload_a_folder(sfolder):
    """Upload one legacy folder (identified by its TOC.xml) to Transkribus, run
    line detection and HTR on it, download the result, and return an
    OK/ERROR status string."""
    user = config.username
    outfolder = os.path.join(config.src_path, tkbs_subfolder)
    prep_dir(outfolder)
    legacy_output = os.path.join(config.src_path, "legacy_output")
    collec = config.collection_id
    HTRmodelid = config.htr_model_id
    infolder = sfolder
    OkayMessage = "Done OKAY " + infolder
    ErrorMessage = "Done with ERRORs " + infolder

    try:
        if not os.path.isfile(os.path.join(sfolder, 'TOC.xml')):
            return (ErrorMessage)

        start = str(datetime.datetime.now().strftime("%y-%m-%d-%H-%M"))
        print(start + " - " + infolder)
        v and print("---   CREATING DATA to upload  ---")
        p = Document()
        p.load_legacy_data(infolder)
        uniquename = p.doc_title + "_" + start
        firstexportdir = sfolder.replace(config.src_path, legacy_output)
        if not os.path.isdir(firstexportdir):
            print(
                p.doc_title + " Skipping... TKBS output missing under " +
                firstexportdir +
                "\nRun the stage-1 script first, to convert legacy to Transkribus format."
            )
            return (OkayMessage)
        v and print(p.doc_title +
                    "---   UPLOADING data to server       --- from " +
                    firstexportdir)
        docid = upload(collec, firstexportdir, p.img_names_by_pgnum(),
                       p.pxml_names_by_pgnum(), p.title, user, "pipeline test",
                       tkbs)
        if docid <= 0:
            print(p.doc_title + " ERROR - document failed to upload " +
                  p.title)
            return (ErrorMessage)

        v and print(p.doc_title + "---   GETTING page ids       ---")
        docjson = get_doc(collec, docid, tkbs)
        pageids = p.load_tkbs_page_ids(docjson)

        if (config.line_detection is not None
                and config.line_detection.upper() == "SKIP"):
            v and print(p.doc_title + "Skipping from Line Detection and on...")
            return (OkayMessage)

        v and print(p.doc_title + "---   LINE DETECTION          ---")
        detection_status = line_detect(collec, docid, pageids, tkbs)
        if not detection_status:
            print(p.doc_title + " ERROR - document failed line detection " +
                  p.title)
            return (ErrorMessage)

        if len(HTRmodelid) < 2:  # no usable HTR model id configured
            v and print(p.doc_title + "Skipping from Htr and on...")
            return (OkayMessage)

        v and print(p.doc_title + "---   RUNNING OCR          ---")
        #            ocr_status = run_ocr_with_options(collec, HTRmodelid, "", str(446788), {}, tkbs)
        dictionary = ""
        if config.htr_lang_model:
            dictionary = "trainDataLanguageModel"
            v and print(p.doc_title + "Using trainDataLanguageModel")
        ocr_status = run_ocr(collec, HTRmodelid, dictionary, str(docid),
                             pageids, tkbs)
        if not ocr_status:
            print(p.doc_title + " ERROR - document failed ocr " + p.title +
                  " with status " + str(ocr_status))
            return (ErrorMessage)

        v and print(p.doc_title +
                    "---   FINAL DOWNLOAD after OCR for TEI export        ---")
        otarget_dir = os.path.join(
            outfolder, uniquename + "_" + str(collec) + "_" + str(docid))
        ocrdocjson = download(collec, str(docid), otarget_dir, tkbs,
                              p.tkbs_meta_filename)
        pageids = p.load_tkbs_page_ids(ocrdocjson)

        # Garbage-line width: use the user override when it parses as an int,
        # otherwise fall back to the default
        width = config.default_garbage_line_width
        try:
            width = int(config.user_garbage_line_width)
        except (AttributeError, TypeError, ValueError):
            width = config.default_garbage_line_width
        if width > 0:
            v and print(p.doc_title +
                        "---   DELETING GARBAGE TEXT         ---")
            for num, fname in p.pxml_names_by_pgnum().items():
                fullname = os.path.join(otarget_dir, fname)
                delete_garbage_text(fullname, width)

        return (OkayMessage)
    except Exception as e:
        # p.doc_title may not exist if the failure happened early, so report the folder
        print("ERROR in upload_a_folder " + infolder)
        print(e)
        print("END ERROR \n\n")
        return (ErrorMessage)
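A hedged driver sketch for this helper (find_sub_folders_with_toc_file is the discovery function used by upload_pipeline in Example #3; the module-level config, tkbs client and verbosity flag v are assumed to be initialised already):

# Upload every legacy folder that contains a TOC.xml and report its status
for folder in find_sub_folders_with_toc_file(config.src_path):
    print(upload_a_folder(folder))

Because each call returns a plain status string, the same loop could be handed to a multiprocessing pool for parallel uploads, provided the Transkribus session is set up in each worker.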
Example #3
def upload_pipeline(config):
    """End-to-end upload stage: log in to Transkribus, upload every legacy
    folder under config.src_path that has a TOC.xml, run line detection and
    HTR, and download the recognised documents into a transkribus_output
    folder."""
    folders_to_be_uploaded = find_sub_folders_with_toc_file(config.src_path)
    outfolder = os.path.join(config.src_path, "transkribus_output")
    prep_dir(outfolder)
    legacy_output = os.path.join(config.src_path, "legacy_output")
    collec = config.collection_id
    user = config.username
    key = config.password
    HTRmodelid = config.htr_model_id
    disable_warnings(InsecureRequestWarning)
    tkbs = TranskribusClient(sServerUrl="https://transkribus.eu/TrpServer")
    tkbs.auth_login(user, key, True)

    for sfolder in folders_to_be_uploaded:
        try:
            if not os.path.isfile(os.path.join(sfolder, 'TOC.xml')):
                continue
            infolder = sfolder

            start = str(datetime.datetime.now().strftime("%y-%m-%d-%H-%M"))
            print(start + " - " + infolder)
            v and print("---   CREATING DATA to upload  ---")
            p = Document()
            p.load_legacy_data(infolder)
            uniquename = p.doc_title + "_" + start
            firstexportdir = sfolder.replace(config.src_path, legacy_output)
            if not os.path.isdir(firstexportdir):
                print(
                    "Skipping... TKBS output missing under " + firstexportdir +
                    "\nRun the stage-1 script first, to convert legacy to Transkribus format."
                )
                continue
            v and print("---   UPLOADING data to server       ---")
            v and print("from " + firstexportdir)
            docid = upload(collec, firstexportdir, p.img_names_by_pgnum(),
                           p.pxml_names_by_pgnum(), p.title, user,
                           "pipeline test", tkbs)
            if docid <= 0:
                print("ERROR - document failed to upload " + p.title)
                continue

            v and print("---   GETTING page ids       ---")
            docjson = get_doc(collec, docid, tkbs)
            pageids = p.load_tkbs_page_ids(docjson)

            if (config.line_detection is not None
                    and config.line_detection.upper() == "SKIP"):
                v and print("Skipping from Line Detection and on...")
                continue

            v and print("---   LINE DETECTION          ---")
            detection_status = line_detect(collec, docid, pageids, tkbs)
            if not detection_status:
                print("ERROR - document failed line detection " + p.title)
                continue

            if len(HTRmodelid) < 2:  # no usable HTR model id configured
                v and print("Skipping from Htr and on...")
                continue

            v and print("---   RUNNING OCR          ---")
            #            ocr_status = run_ocr_with_options(collec, HTRmodelid, "", str(446788), {}, tkbs)
            dictionary = ""
            if config.htr_lang_model:
                dictionary = "trainDataLanguageModel"
                v and print("Using trainDataLanguageModel")
            ocr_status = run_ocr(collec, HTRmodelid, dictionary, str(docid),
                                 pageids, tkbs)
            if not ocr_status:
                print("ERROR - document failed ocr " + p.title +
                      " with status " + str(ocr_status))
                continue

            v and print(
                "---   FINAL DOWNLOAD after OCR for TEI export        ---")
            otarget_dir = os.path.join(
                outfolder, uniquename + "_" + str(collec) + "_" + str(docid))
            ocrdocjson = download(collec, str(docid), otarget_dir, tkbs,
                                  p.tkbs_meta_filename)
            pageids = p.load_tkbs_page_ids(ocrdocjson)

            # Garbage-line width: use the user override when it parses as an
            # int, otherwise fall back to the default
            width = config.default_garbage_line_width
            try:
                width = int(config.user_garbage_line_width)
            except (AttributeError, TypeError, ValueError):
                width = config.default_garbage_line_width
            if width > 0:
                v and print("---   DELETING GARBAGE TEXT         ---")
                for num, fname in p.pxml_names_by_pgnum().items():
                    fullname = os.path.join(otarget_dir, fname)
                    delete_garbage_text(fullname, width)

        except Exception as e:
            print("ERROR in upload_pipeline main loop ")
            print(e)
            print("END ERROR \n\n")

    print("DONE. Output is under " + outfolder)
    tkbs.auth_logout()
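A minimal configuration sketch for driving upload_pipeline (the attribute names are exactly those the function reads; every value below is a placeholder, not a real credential, collection id or model id):

from types import SimpleNamespace

config = SimpleNamespace(
    src_path="resources_for_tests",   # root folder holding the legacy sub-folders
    collection_id="17989",            # Transkribus collection to upload into (placeholder)
    username="user@example.com",      # Transkribus account (placeholder)
    password="<password>",            # Transkribus password (placeholder)
    htr_model_id="10168",             # HTR model id; "" skips HTR (placeholder)
    line_detection=None,              # set to "SKIP" to stop after upload
    htr_lang_model=False,             # True switches OCR to trainDataLanguageModel
    default_garbage_line_width=0,     # 0 disables garbage-text deletion
    user_garbage_line_width="")       # optional override, parsed as int
upload_pipeline(config)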
Example #4
p.export_tkbs_format(exportdir)

v and print("---   CONNECTING to server    ---")
user = "******"  #CHANGE THIS
key = "<password>"  #CHANGE THIS
collec = "17989"  #CHANGE THIS
tkbs = TranskribusClient(sServerUrl="https://transkribus.eu/TrpServer")
tkbs.auth_login(user, key, True)
#HTRmodelname = 'Test'
HTRmodelid = "10168"  #CHANGE THIS
#dictName =  "Hebrew_Test.dict" #CHANGE THIS
#print("session id: " + tkbs.getSessionId() + "\n=================")

v and print("---   UPLOADING data to server       ---")
docid = upload(collec, exportdir, p.img_names_by_pgnum(),
               p.pxml_names_by_pgnum(), p.title, user, "pipeline test", tkbs)
if docid <= 0:
    print("ERROR - document failed to upload " + p.title)
    sys.exit(1)

v and print("---   DOWNLOADING-1 doc for page ids       ---")
tempdowndir = os.path.join(outfolder, "tempdowndir")
prep_dir(tempdowndir)
target_dir = os.path.join(tempdowndir,
                          p.title + "_" + str(collec) + "_" + str(docid))
docjson = download(collec, str(docid), target_dir, tkbs, p.tkbs_meta_filename)
pageids = p.load_tkbs_page_ids(docjson)

v and print("---   LINE DETECTION          ---")
detection_status = line_detect(collec, docid, pageids, tkbs)
if not detection_status: