Exemple #1
0
def get_page_ids_from_document_id(collection_id, document_id, tkbs_client):
    """Return the page ids of a document by downloading it temporarily.

    The document is downloaded into a temporary folder, its ``trp.json``
    file is read, and the folder is removed again before returning, even
    when one of the steps raises.

    Args:
        collection_id: Collection id, passed through to ``download``.
        document_id: Document id, passed through to ``download``.
        tkbs_client: Client object, passed through to ``download``.

    Returns:
        The value returned by ``Document.load_tkbs_page_ids`` for the data
        read from ``trp.json``.
    """
    # This function is slow because it requires downloading the file from Transkribus.
    # I couldn't find a way to extract the page ids without downloading the file.
    # If there is such a way - it will surely improve the running speed of the code.
    current_time = datetime.now().strftime("%H-%M-%S")
    temp_folder_name = "temp_folder_for_page_id_" + current_time
    try:
        download(collection_id, document_id, temp_folder_name, tkbs_client)
        trp_json_path = os.path.join(temp_folder_name, "trp.json")
        data = read_tkbs_json_file(trp_json_path)
        return Document().load_tkbs_page_ids(data)
    finally:
        # Clean up on failure too; the folder may not exist if the download
        # failed before creating it, so only delete what is actually there.
        if os.path.isdir(temp_folder_name):
            delete_directory(temp_folder_name)
Exemple #2
0
def upload_a_folder(sfolder):
    """Upload one legacy folder and run the processing steps on it.

    The steps are: upload the stage-1 export, fetch page ids, line
    detection, OCR, final download, and optional garbage-text removal.
    Several of them can end the run early. Settings are read from the
    module-level ``config``, the client is the module-level ``tkbs``, and
    progress messages are printed only when the module-level ``v`` is truthy.

    Args:
        sfolder: Path of the source folder; it must contain ``TOC.xml``.

    Returns:
        ``"Done OKAY " + sfolder`` when the run completes or is skipped on
        purpose (missing stage-1 output, line detection set to "SKIP",
        HTR model id shorter than two characters).
        ``"Done with ERRORs " + sfolder`` when ``TOC.xml`` is missing, a step
        reports failure, or an exception is raised inside the pipeline.
    """
    user = config.username
    outfolder = os.path.join(config.src_path, tkbs_subfolder)
    prep_dir(outfolder)
    legacy_output = os.path.join(config.src_path, "legacy_output")
    collec = config.collection_id
    HTRmodelid = config.htr_model_id
    infolder = sfolder
    OkayMessage = "Done OKAY " + infolder
    ErrorMessage = "Done with ERRORs " + infolder
    # Bound before the try block so the exception handler below can always
    # build its message, even when the failure happens before loading.
    p = None

    try:
        if not os.path.isfile(os.path.join(sfolder, 'TOC.xml')):
            return ErrorMessage

        start = datetime.datetime.now().strftime("%y-%m-%d-%H-%M")
        print(start + " - " + infolder)
        v and print("---   CREATING DATA to upload  ---")
        p = Document()
        p.load_legacy_data(infolder)
        uniquename = p.doc_title + "_" + start
        # The stage-1 export mirrors the source tree under "legacy_output".
        firstexportdir = sfolder.replace(config.src_path, legacy_output)
        if not os.path.isdir(firstexportdir):
            print(
                p.doc_title + " Skipping... TKBS output missing under " +
                firstexportdir +
                "\nRun stage-1 script  first, to convert legacy to transkribus format."
            )
            return OkayMessage
        v and print(p.doc_title +
                    "---   UPLOADING data to server       --- from " +
                    firstexportdir)
        docid = upload(collec, firstexportdir, p.img_names_by_pgnum(),
                       p.pxml_names_by_pgnum(), p.title, user, "pipeline test",
                       tkbs)
        if docid <= 0:
            print(p.doc_title + "ERROR - document failed to upload " + p.title)
            return ErrorMessage

        v and print(p.doc_title + "---   GETTING page ids       ---")
        docjson = get_doc(collec, docid, tkbs)
        pageids = p.load_tkbs_page_ids(docjson)

        if (config.line_detection is not None
                and config.line_detection.upper() == "SKIP"):
            v and print(p.doc_title + "Skipping from Line Detection and on...")
            return OkayMessage

        v and print(p.doc_title + "---   LINE DETECTION          ---")
        detection_status = line_detect(collec, docid, pageids, tkbs)
        if not detection_status:
            print(p.doc_title + "ERROR - document failed line detection " +
                  p.title)
            return ErrorMessage

        if len(HTRmodelid) < 2:
            v and print(p.doc_title + "Skipping from Htr and on...")
            return OkayMessage

        v and print(p.doc_title + "---   RUNNING OCR          ---")
        dictionary = ""
        if config.htr_lang_model:
            dictionary = "trainDataLanguageModel"
            v and print(p.doc_title + "Using trainDataLanguageModel")
        ocr_status = run_ocr(collec, HTRmodelid, dictionary, str(docid),
                             pageids, tkbs)
        if not ocr_status:
            print(p.doc_title + "ERROR - document failed ocr " + p.title +
                  " with status " + str(ocr_status))
            return ErrorMessage

        v and print(p.doc_title +
                    "---   FINAL DOWNLOAD after OCR for TEI export        ---")
        otarget_dir = os.path.join(
            outfolder, uniquename + "_" + str(collec) + "_" + str(docid))
        ocrdocjson = download(collec, str(docid), otarget_dir, tkbs,
                              p.tkbs_meta_filename)
        # The result is not used below; the call is kept in case it updates
        # state on ``p`` (not visible from this function).
        p.load_tkbs_page_ids(ocrdocjson)

        try:
            width = int(config.user_garbage_line_width)
        except Exception:
            # Missing or non-numeric user setting: fall back to the default.
            # Unlike a bare ``except:``, this lets KeyboardInterrupt through.
            width = config.default_garbage_line_width
        if width > 0:
            v and print(p.doc_title +
                        "---   DELETING GARBAGE TEXT         ---")
            for _, fname in p.pxml_names_by_pgnum().items():
                fullname = os.path.join(otarget_dir, fname)
                delete_garbage_text(fullname, width)

        return OkayMessage
    except Exception as e:
        # ``p`` may still be None, or may not have ``doc_title`` yet, when
        # the failure happened before or during loading; fall back to the
        # folder name so reporting the error cannot itself raise.
        title = getattr(p, "doc_title", None)
        label = infolder if title is None else str(title)
        print(label + "ERROR in upload_a_folder ")
        print(e)
        print(label + "END ERROR \n\n")
        return ErrorMessage
Exemple #3
0
def upload_pipeline(config):
    """Upload every legacy folder under ``config.src_path`` and process it.

    Logs in once, then for each folder returned by
    ``find_sub_folders_with_toc_file`` runs: upload of the stage-1 export,
    page-id lookup, line detection, OCR, final download into
    ``<src_path>/transkribus_output`` and optional garbage-text removal.
    A failure in one folder is printed and the loop moves on to the next.
    Progress messages are printed only when the module-level ``v`` is truthy.

    Args:
        config: Settings object; the attributes read here are ``src_path``,
            ``collection_id``, ``username``, ``password``, ``htr_model_id``,
            ``line_detection``, ``htr_lang_model``,
            ``default_garbage_line_width`` and ``user_garbage_line_width``.

    Returns:
        None.
    """
    folders_to_be_uploaded = find_sub_folders_with_toc_file(config.src_path)
    outfolder = os.path.join(config.src_path, "transkribus_output")
    prep_dir(outfolder)
    legacy_output = os.path.join(config.src_path, "legacy_output")
    collec = config.collection_id
    user = config.username
    key = config.password
    HTRmodelid = config.htr_model_id
    # NOTE(review): this silences InsecureRequestWarning for the whole
    # process - confirm that unverified HTTPS requests are really intended.
    disable_warnings(InsecureRequestWarning)
    tkbs = TranskribusClient(sServerUrl="https://transkribus.eu/TrpServer")
    tkbs.auth_login(user, key, True)

    try:
        for sfolder in folders_to_be_uploaded:
            try:
                if not os.path.isfile(os.path.join(sfolder, 'TOC.xml')):
                    continue

                start = datetime.datetime.now().strftime("%y-%m-%d-%H-%M")
                print(start + " - " + sfolder)
                v and print("---   CREATING DATA to upload  ---")
                p = Document()
                p.load_legacy_data(sfolder)
                uniquename = p.doc_title + "_" + start
                # The stage-1 export mirrors the source tree under
                # "legacy_output".
                firstexportdir = sfolder.replace(config.src_path,
                                                 legacy_output)
                if not os.path.isdir(firstexportdir):
                    print(
                        "Skipping... TKBS output missing under " + firstexportdir +
                        "\nRun stage-1 script  first, to convert legacy to transkribus format."
                    )
                    continue
                v and print("---   UPLOADING data to server       ---")
                v and print("from " + firstexportdir)
                docid = upload(collec, firstexportdir, p.img_names_by_pgnum(),
                               p.pxml_names_by_pgnum(), p.title, user,
                               "pipeline test", tkbs)
                if docid <= 0:
                    print("ERROR - document failed to upload " + p.title)
                    continue

                v and print("---   GETTING page ids       ---")
                docjson = get_doc(collec, docid, tkbs)
                pageids = p.load_tkbs_page_ids(docjson)

                if (config.line_detection is not None
                        and config.line_detection.upper() == "SKIP"):
                    v and print("Skipping from Line Detection and on...")
                    continue

                v and print("---   LINE DETECTION          ---")
                detection_status = line_detect(collec, docid, pageids, tkbs)
                if not detection_status:
                    print("ERROR - document failed line detection " + p.title)
                    continue

                if len(HTRmodelid) < 2:
                    v and print("Skipping from Htr and on...")
                    continue

                v and print("---   RUNNING OCR          ---")
                dictionary = ""
                if config.htr_lang_model:
                    dictionary = "trainDataLanguageModel"
                    v and print("Using trainDataLanguageModel")
                ocr_status = run_ocr(collec, HTRmodelid, dictionary,
                                     str(docid), pageids, tkbs)
                if not ocr_status:
                    print("ERROR - document failed ocr " + p.title +
                          " with status " + str(ocr_status))
                    continue

                v and print(
                    "---   FINAL DOWNLOAD after OCR for TEI export        ---")
                otarget_dir = os.path.join(
                    outfolder,
                    uniquename + "_" + str(collec) + "_" + str(docid))
                ocrdocjson = download(collec, str(docid), otarget_dir, tkbs,
                                      p.tkbs_meta_filename)
                # The result is not used below; the call is kept in case it
                # updates state on ``p`` (not visible from this function).
                p.load_tkbs_page_ids(ocrdocjson)

                try:
                    width = int(config.user_garbage_line_width)
                except Exception:
                    # Missing or non-numeric user setting: use the default.
                    # Unlike a bare ``except:``, KeyboardInterrupt passes.
                    width = config.default_garbage_line_width
                if width > 0:
                    v and print("---   DELETING GARBAGE TEXT         ---")
                    for _, fname in p.pxml_names_by_pgnum().items():
                        fullname = os.path.join(otarget_dir, fname)
                        delete_garbage_text(fullname, width)

            except Exception as e:
                # Best effort: report the failure and go on to the next folder.
                print("ERROR in upload_pipeline main loop ")
                print(e)
                print("END ERROR \n\n")

        print("DONE. Output is under " + outfolder)
    finally:
        # Log out even when something escapes the loop (e.g. Ctrl-C), so the
        # session opened by auth_login is not left behind.
        tkbs.auth_logout()
Exemple #4
0
# Script fragment: upload a document, fetch its page ids, run line detection,
# then download the document again for a later "baseline extention" step.
# The names collec, exportdir, p, user, tkbs, outfolder and v are not defined
# in this fragment; they are expected to be set earlier in the script.
# `v and print(...)` prints only when `v` is truthy (presumably a verbose
# flag - confirm where `v` is defined).
#print("session id: " + tkbs.getSessionId() + "\n=================")

v and print("---   UPLOADING data to server       ---")
docid = upload(collec, exportdir, p.img_names_by_pgnum(),
               p.pxml_names_by_pgnum(), p.title, user, "pipeline test", tkbs)
# A non-positive document id is treated as a failed upload; stop the script.
if docid <= 0:
    print("ERROR - document failed to upload " + p.title)
    sys.exit(1)

v and print("---   DOWNLOADING-1 doc for page ids       ---")
# First download: goes to <outfolder>/tempdowndir/<title>_<collec>_<docid>;
# the value returned by download() is used to read the page ids.
tempdowndir = os.path.join(outfolder, "tempdowndir")
prep_dir(tempdowndir)
target_dir = os.path.join(tempdowndir,
                          p.title + "_" + str(collec) + "_" + str(docid))
docjson = download(collec, str(docid), target_dir, tkbs, p.tkbs_meta_filename)
pageids = p.load_tkbs_page_ids(docjson)

v and print("---   LINE DETECTION          ---")
# A falsy status from line_detect() is treated as failure; stop the script.
detection_status = line_detect(collec, docid, pageids, tkbs)
if not detection_status:
    print("ERROR - document failed line detection " + p.title)
    sys.exit(1)

v and print("---   DOWNLOADING-2 doc for baseline extention      ---")
# Second download, made after line detection, into its own directory:
# <outfolder>/extentiondowndir/<title>_<collec>_<docid>.
extentiondowndir = os.path.join(outfolder, "extentiondowndir")
prep_dir(extentiondowndir)
xtarget_dir = os.path.join(extentiondowndir,
                           p.title + "_" + str(collec) + "_" + str(docid))
xdocjson = download(collec, str(docid), xtarget_dir, tkbs,
                    p.tkbs_meta_filename)
xpageids = p.load_tkbs_page_ids(xdocjson)