def get_page_ids_from_document_id(collection_id, document_id, tkbs_client):
    """Return the Transkribus page ids of a document.

    Downloads the document into a throw-away folder, reads its ``trp.json``
    metadata, and extracts the page ids from it.

    NOTE: This function is slow because it requires downloading the file from
    Transkribus. I couldn't find a way to extract the page ids without
    downloading the file. If there is such a way - it will surely improve the
    running speed of the code.

    :param collection_id: Transkribus collection id the document lives in.
    :param document_id: Transkribus document id to inspect.
    :param tkbs_client: authenticated Transkribus client used for the download.
    :return: list of page ids as produced by ``Document.load_tkbs_page_ids``.
    """
    # Time-stamped folder name keeps concurrent/repeated runs from colliding
    # (second-level resolution only — TODO confirm that is good enough here).
    current_time = datetime.now().strftime("%H-%M-%S")
    temp_folder_name = "temp_folder_for_page_id_" + current_time
    download(collection_id, document_id, temp_folder_name, tkbs_client)
    try:
        # trp.json is the document metadata file Transkribus ships with a download.
        trp_json_path = os.path.join(temp_folder_name, "trp.json")
        data = read_tkbs_json_file(trp_json_path)
        p = Document()
        page_ids = p.load_tkbs_page_ids(data)
    finally:
        # Fix: previously the temp folder leaked when parsing/loading raised;
        # now it is removed on both the success and the failure path.
        delete_directory(temp_folder_name)
    return page_ids
def upload_a_folder(sfolder):
    """Run the full Transkribus pipeline for one legacy document folder.

    Steps: validate TOC.xml exists -> load legacy data -> upload the stage-1
    export to Transkribus -> fetch page ids -> line detection -> HTR/OCR ->
    final download for TEI export -> optional garbage-text cleanup.

    :param sfolder: path to a source folder that should contain ``TOC.xml``.
    :return: ``"Done OKAY <folder>"`` on success or a deliberate skip,
             ``"Done with ERRORs <folder>"`` on any failure.

    NOTE(review): relies on module-level names (``config``, ``tkbs``,
    ``tkbs_subfolder``, ``v`` — presumably a verbosity flag gating the
    ``v and print(...)`` lines) defined elsewhere in this file.
    """
    user = config.username
    outfolder = os.path.join(config.src_path, tkbs_subfolder)
    prep_dir(outfolder)
    legacy_output = os.path.join(config.src_path, "legacy_output")
    collec = config.collection_id
    HTRmodelid = config.htr_model_id
    infolder = sfolder
    # Pre-built result strings returned from the many early-exit branches below.
    OkayMessage = "Done OKAY " + infolder
    ErrorMessage = "Done with ERRORs " + infolder
    try:
        # A folder without TOC.xml is not a legacy document — report as error.
        if not os.path.isfile(os.path.join(sfolder, 'TOC.xml')):
            return (ErrorMessage)
        start = str(datetime.datetime.now().strftime("%y-%m-%d-%H-%M"))
        print(start + " - " + infolder)
        v and print("--- CREATING DATA to upload ---")
        p = Document()
        p.load_legacy_data(infolder)
        # Timestamp suffix makes the output folder name unique per run.
        uniquename = p.doc_title + "_" + start
        # Stage-1 output mirrors the source tree under legacy_output.
        firstexportdir = sfolder.replace(config.src_path, legacy_output)
        if not os.path.isdir(firstexportdir):
            print(
                p.doc_title + " Skipping... TKBS output missing under " + firstexportdir +
                "\nRun stage-1 script first, to convert legacy to transkribus format."
            )
            # Missing stage-1 output is treated as a skip, not an error.
            return (OkayMessage)
        v and print(p.doc_title + "--- UPLOADING data to server --- from " + firstexportdir)
        docid = upload(collec, firstexportdir, p.img_names_by_pgnum(),
                       p.pxml_names_by_pgnum(), p.title, user, "pipeline test", tkbs)
        # upload() signals failure with a non-positive document id.
        if docid <= 0:
            print(p.doc_title + "ERROR - document failed to upload " + p.title)
            return (ErrorMessage)
        v and print(p.doc_title + "--- GETTING page ids ---")
        docjson = get_doc(collec, docid, tkbs)
        pageids = p.load_tkbs_page_ids(docjson)
        # Config switch: "SKIP" (any case) stops the pipeline after upload.
        if config.line_detection != None and config.line_detection.upper() == "SKIP":
            v and print(p.doc_title + "Skipping from Line Detection and on...")
            return (OkayMessage)
        v and print(p.doc_title + "--- LINE DETECTION ---")
        detection_status = line_detect(collec, docid, pageids, tkbs)
        if not detection_status:
            print(p.doc_title + "ERROR - document failed line detection " + p.title)
            return (ErrorMessage)
        # An HTR model id shorter than 2 chars is treated as "no model configured".
        if len(HTRmodelid) < 2:
            v and print(p.doc_title + "Skipping from Htr and on...")
            return (OkayMessage)
        v and print(p.doc_title + "--- RUNNING OCR ---")
        # ocr_status = run_ocr_with_options(collec, HTRmodelid, "", str(446788), {}, tkbs)
        dictionary = ""
        if config.htr_lang_model != None and config.htr_lang_model:
            dictionary = "trainDataLanguageModel"
            v and print(p.doc_title + "Using trainDataLanguageModel")
        ocr_status = run_ocr(collec, HTRmodelid, dictionary, str(docid), pageids, tkbs)
        if not ocr_status:
            print(p.doc_title + "ERROR - document failed ocr " + p.title +
                  " with status " + str(ocr_status))
            return (ErrorMessage)
        v and print(p.doc_title + "--- FINAL DOWNLOAD after OCR for TEI export ---")
        otarget_dir = os.path.join(
            outfolder, uniquename + "_" + str(collec) + "_" + str(docid))
        ocrdocjson = download(collec, str(docid), otarget_dir, tkbs, p.tkbs_meta_filename)
        pageids = p.load_tkbs_page_ids(ocrdocjson)
        # Garbage-line width: user override wins; fall back to the default on
        # any conversion failure (bare except kept as-is — deliberate best-effort).
        width = config.default_garbage_line_width
        try:
            width = int(config.user_garbage_line_width)
        except:
            width = config.default_garbage_line_width
        if width > 0:
            v and print(p.doc_title + "--- DELETING GARBAGE TEXT ---")
            # Strip too-narrow text lines from every downloaded page XML.
            for num, fname in p.pxml_names_by_pgnum().items():
                fullname = os.path.join(otarget_dir, fname)
                delete_garbage_text(fullname, width)
        return (OkayMessage)
    except Exception as e:
        # Catch-all boundary: log the failure and report it via the return value.
        print(p.doc_title + "ERROR in upload_a_folder ")
        print(e)
        print(p.doc_title + "END ERROR \n\n")
        return (ErrorMessage)
def upload_pipeline(config):
    """Run the Transkribus upload pipeline over every TOC folder under ``config.src_path``.

    Sequential, per-folder variant of :func:`upload_a_folder`: logs in once,
    processes each folder (upload -> line detection -> OCR -> final download ->
    garbage cleanup), and logs out at the end. A failure in one folder is
    logged and the loop continues with the next folder.

    :param config: settings object providing src_path, credentials,
                   collection_id, htr_model_id, line_detection,
                   htr_lang_model and garbage-line widths.
    :return: None. Output is written under ``<src_path>/transkribus_output``.

    NOTE(review): uses module-level ``v`` (presumably a verbosity flag) and
    helper functions defined elsewhere in this file.
    """
    folders_to_be_uploaded = find_sub_folders_with_toc_file(config.src_path)
    outfolder = os.path.join(config.src_path, "transkribus_output")
    prep_dir(outfolder)
    legacy_output = os.path.join(config.src_path, "legacy_output")
    collec = config.collection_id
    user = config.username
    key = config.password
    HTRmodelid = config.htr_model_id
    # Transkribus is reached over HTTPS without certificate verification here,
    # so silence urllib3's InsecureRequestWarning spam.
    disable_warnings(InsecureRequestWarning)
    tkbs = TranskribusClient(sServerUrl="https://transkribus.eu/TrpServer")
    tkbs.auth_login(user, key, True)
    for sfolder in folders_to_be_uploaded:
        try:
            # Defensive re-check; find_sub_folders_with_toc_file should already
            # have filtered, but skip anything without a TOC.xml.
            if not os.path.isfile(os.path.join(sfolder, 'TOC.xml')):
                continue
            infolder = sfolder
            start = str(datetime.datetime.now().strftime("%y-%m-%d-%H-%M"))
            print(start + " - " + infolder)
            v and print("--- CREATING DATA to upload ---")
            p = Document()
            p.load_legacy_data(infolder)
            # Timestamp suffix makes the output folder name unique per run.
            uniquename = p.doc_title + "_" + start
            # Stage-1 output mirrors the source tree under legacy_output.
            firstexportdir = sfolder.replace(config.src_path, legacy_output)
            if not os.path.isdir(firstexportdir):
                print(
                    "Skipping... TKBS output missing under " + firstexportdir +
                    "\nRun stage-1 script first, to convert legacy to transkribus format."
                )
                continue
            v and print("--- UPLOADING data to server ---")
            v and print("from " + firstexportdir)
            docid = upload(collec, firstexportdir, p.img_names_by_pgnum(),
                           p.pxml_names_by_pgnum(), p.title, user, "pipeline test", tkbs)
            # upload() signals failure with a non-positive document id.
            if docid <= 0:
                print("ERROR - document failed to upload " + p.title)
                continue
            v and print("--- GETTING page ids ---")
            docjson = get_doc(collec, docid, tkbs)
            pageids = p.load_tkbs_page_ids(docjson)
            # Config switch: "SKIP" (any case) stops this folder after upload.
            if config.line_detection != None and config.line_detection.upper() == "SKIP":
                v and print("Skipping from Line Detection and on...")
                continue
            v and print("--- LINE DETECTION ---")
            detection_status = line_detect(collec, docid, pageids, tkbs)
            if not detection_status:
                print("ERROR - document failed line detection " + p.title)
                continue
            # An HTR model id shorter than 2 chars means "no model configured".
            if len(HTRmodelid) < 2:
                v and print("Skipping from Htr and on...")
                continue
            v and print("--- RUNNING OCR ---")
            # ocr_status = run_ocr_with_options(collec, HTRmodelid, "", str(446788), {}, tkbs)
            dictionary = ""
            if config.htr_lang_model != None and config.htr_lang_model:
                dictionary = "trainDataLanguageModel"
                v and print("Using trainDataLanguageModel")
            ocr_status = run_ocr(collec, HTRmodelid, dictionary, str(docid), pageids, tkbs)
            if not ocr_status:
                print("ERROR - document failed ocr " + p.title +
                      " with status " + str(ocr_status))
                continue
            v and print(
                "--- FINAL DOWNLOAD after OCR for TEI export ---")
            otarget_dir = os.path.join(
                outfolder, uniquename + "_" + str(collec) + "_" + str(docid))
            ocrdocjson = download(collec, str(docid), otarget_dir, tkbs, p.tkbs_meta_filename)
            pageids = p.load_tkbs_page_ids(ocrdocjson)
            # Garbage-line width: user override wins; fall back to the default on
            # any conversion failure (bare except kept as-is — deliberate best-effort).
            width = config.default_garbage_line_width
            try:
                width = int(config.user_garbage_line_width)
            except:
                width = config.default_garbage_line_width
            if width > 0:
                v and print("--- DELETING GARBAGE TEXT ---")
                # Strip too-narrow text lines from every downloaded page XML.
                for num, fname in p.pxml_names_by_pgnum().items():
                    fullname = os.path.join(otarget_dir, fname)
                    delete_garbage_text(fullname, width)
        except Exception as e:
            # Per-folder catch-all: log and move on so one bad folder
            # does not abort the whole batch.
            print("ERROR in upload_pipeline main loop ")
            print(e)
            print("END ERROR \n\n")
            pass
    print("DONE. Output is under " + outfolder)
    tkbs.auth_logout()
#print("session id: " + tkbs.getSessionId() + "\n=================") v and print("--- UPLOADING data to server ---") docid = upload(collec, exportdir, p.img_names_by_pgnum(), p.pxml_names_by_pgnum(), p.title, user, "pipeline test", tkbs) if docid <= 0: print("ERROR - document failed to upload " + p.title) sys.exit(1) v and print("--- DOWNLOADING-1 doc for page ids ---") tempdowndir = os.path.join(outfolder, "tempdowndir") prep_dir(tempdowndir) target_dir = os.path.join(tempdowndir, p.title + "_" + str(collec) + "_" + str(docid)) docjson = download(collec, str(docid), target_dir, tkbs, p.tkbs_meta_filename) pageids = p.load_tkbs_page_ids(docjson) v and print("--- LINE DETECTION ---") detection_status = line_detect(collec, docid, pageids, tkbs) if not detection_status: print("ERROR - document failed line detection " + p.title) sys.exit(1) v and print("--- DOWNLOADING-2 doc for baseline extention ---") extentiondowndir = os.path.join(outfolder, "extentiondowndir") prep_dir(extentiondowndir) xtarget_dir = os.path.join(extentiondowndir, p.title + "_" + str(collec) + "_" + str(docid)) xdocjson = download(collec, str(docid), xtarget_dir, tkbs, p.tkbs_meta_filename) xpageids = p.load_tkbs_page_ids(xdocjson)