def make_pxml(res=None, f1=None, f2=None):
    log(0, "pxml")
    p = Document()
    # apply the globally configured scaling factors
    for f in factors:
        p.set_factors(f[0], f[1], f[2])
    # explicit arguments override the configured factors
    if res is not None:
        p.set_factors(res, f1, f2)
    # 'paper' is the directory containing TOC.xml
    p.load_legacy_data(paper)
    p.export_tkbs_format(os.path.join(paper, config['pxml_dir']))
    log(1, "pxml")
def get_page_ids_from_document_id(collection_id, document_id, tkbs_client):
    # This function is slow because it requires downloading the file from Transkribus.
    # I couldn't find a way to extract the page ids without downloading the file.
    # If there is such a way, it would surely improve the running speed of the code.
    now = datetime.now()
    current_time = now.strftime("%H-%M-%S")
    temp_folder_name = "temp_folder_for_page_id_" + current_time
    download(collection_id, document_id, temp_folder_name, tkbs_client)
    trp_json_path = os.path.join(temp_folder_name, "trp.json")
    data = read_tkbs_json_file(trp_json_path)
    p = Document()
    page_ids = p.load_tkbs_page_ids(data)
    delete_directory(temp_folder_name)
    return page_ids
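# A minimal usage sketch, assuming an authenticated tkbs_client; the collection
# and document ids below are the sample ids that appear elsewhere in this file,
# not real targets:
#
#   page_ids = get_page_ids_from_document_id(17989, 446788, tkbs_client)
#   print(page_ids)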
def convert_legacy_folder_to_tkbs_format(src_path, dst_path):
    try:
        p = Document()
        p.load_legacy_data(src_path)
        p.export_tkbs_format(dst_path)
    except Exception as e:
        print("ERROR in convert_legacy_folder_to_tkbs_format with src_path " + src_path)
        print(e)
def upload_pipeline(config):
    p = Document()  # note: never loaded here; p.title in the error message below stays empty
    folders_to_be_uploaded = find_sub_folders_with_toc_file(config.src_path)
    #print(folders_to_be_uploaded)
    for folder in folders_to_be_uploaded:
        tkbs_client = connect_to_tkbs(config)
        # converter_output_folder is where legacy_to_tkbs_converter saved this folder's output
        converter_output_folder = os.path.join(
            config.src_path, "output",
            os.path.basename(os.path.normpath(folder)))
        print(converter_output_folder)
        json_as_str, img_and_xml_list = extract_json_for_tkbs_from_toc_file(
            toc_folder_path=folder,
            images_and_xmls_folder_path=converter_output_folder,
            author=config.username,
            description="pipeline")
        document_id = upload(config.collection_id, tkbs_client, json_as_str,
                             img_and_xml_list, config)
        print("** Document uploaded **")
        if True:  # TODO: add a condition to check whether line detection is needed
            detection_status = line_detection(config.collection_id, document_id,
                                              tkbs_client, config)
            if not detection_status:
                print("ERROR - document failed line detection " + str(p.title))
                continue
            print("Line detection done...")
        if config.htr_model_id != "":
            run_ocr(config.collection_id, config.htr_model_id, "", document_id,
                    tkbs_client, config)
            print("OCR done...")
        if config.dst_path != "":
            dest_folder = os.path.join(
                config.dst_path, "output",
                os.path.basename(os.path.normpath(folder)))
            print(dest_folder)
            download(config.collection_id, document_id, dest_folder,
                     tkbs_client, config)
            print("** Document downloaded **")
        time.sleep(40)
def extract_json_for_tkbs_from_toc_file(
        toc_folder_path="resources_for_tests\\1914-11-06",
        images_and_xmls_folder_path="resources_for_tests\\output\\1914-11-06",
        author="test_user",
        description="pipeline"):
    p = Document()
    p.load_legacy_data(os.path.join(toc_folder_path))
    page_images, page_xmls = p.img_names_by_pgnum(), p.pxml_names_by_pgnum()
    title = extract_title_from_TOC_xml(os.path.join(toc_folder_path, "TOC.xml"))
    # read every page image and page XML into memory, keyed by page number
    img_objects = {}
    for key, value in page_images.items():
        with open(os.path.join(images_and_xmls_folder_path, value), 'rb') as file:
            img_objects[key] = file.read()
    xml_objects = {}
    for key, value in page_xmls.items():
        with open(os.path.join(images_and_xmls_folder_path, value), 'rb') as file:
            xml_objects[key] = file.read()
    d = {
        "md": {
            "title": title,
            "author": author,
            "description": description
        },
        "pageList": {
            "pages": [{
                "fileName": value,
                "pageXmlName": page_xmls[key],
                "pageNr": int(key)
            } for key, value in page_images.items()]
        }
    }
    json_as_str = json.dumps(d)
    img_and_xml_list = [{
        'img': (value, img_objects[key], 'application/octet-stream'),
        'xml': (page_xmls[key], xml_objects[key], 'application/octet-stream')
    } for key, value in page_images.items()]
    return json_as_str, img_and_xml_list
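# Usage sketch: the returned pair feeds straight into upload(), as in
# upload_pipeline() above. The paths are this module's test-resource defaults;
# config and tkbs_client are assumed to already exist:
#
#   json_as_str, img_and_xml_list = extract_json_for_tkbs_from_toc_file(
#       toc_folder_path="resources_for_tests\\1914-11-06",
#       images_and_xmls_folder_path="resources_for_tests\\output\\1914-11-06",
#       author=config.username,
#       description="pipeline")
#   document_id = upload(config.collection_id, tkbs_client, json_as_str,
#                        img_and_xml_list, config)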
def upload_a_folder(sfolder):
    user = config.username
    outfolder = os.path.join(config.src_path, tkbs_subfolder)
    prep_dir(outfolder)
    legacy_output = os.path.join(config.src_path, "legacy_output")
    collec = config.collection_id
    HTRmodelid = config.htr_model_id
    infolder = sfolder
    OkayMessage = "Done OKAY " + infolder
    ErrorMessage = "Done with ERRORs " + infolder
    p = Document()  # created before the try block so the except clause can reference it
    try:
        if not os.path.isfile(os.path.join(sfolder, 'TOC.xml')):
            return ErrorMessage
        start = str(datetime.datetime.now().strftime("%y-%m-%d-%H-%M"))
        print(start + " - " + infolder)
        v and print("--- CREATING DATA to upload ---")
        p.load_legacy_data(infolder)
        uniquename = p.doc_title + "_" + start
        firstexportdir = sfolder.replace(config.src_path, legacy_output)
        if not os.path.isdir(firstexportdir):
            print(p.doc_title + " Skipping... TKBS output missing under " + firstexportdir +
                  "\nRun stage-1 script first, to convert legacy to transkribus format.")
            return OkayMessage
        v and print(p.doc_title + " --- UPLOADING data to server --- from " + firstexportdir)
        docid = upload(collec, firstexportdir, p.img_names_by_pgnum(),
                       p.pxml_names_by_pgnum(), p.title, user, "pipeline test", tkbs)
        if docid <= 0:
            print(p.doc_title + " ERROR - document failed to upload " + p.title)
            return ErrorMessage
        v and print(p.doc_title + " --- GETTING page ids ---")
        docjson = get_doc(collec, docid, tkbs)
        pageids = p.load_tkbs_page_ids(docjson)
        if config.line_detection is not None and config.line_detection.upper() == "SKIP":
            v and print(p.doc_title + " Skipping line detection and everything after it...")
            return OkayMessage
        v and print(p.doc_title + " --- LINE DETECTION ---")
        detection_status = line_detect(collec, docid, pageids, tkbs)
        if not detection_status:
            print(p.doc_title + " ERROR - document failed line detection " + p.title)
            return ErrorMessage
        if len(HTRmodelid) < 2:
            v and print(p.doc_title + " Skipping HTR and everything after it...")
            return OkayMessage
        v and print(p.doc_title + " --- RUNNING OCR ---")
        # ocr_status = run_ocr_with_options(collec, HTRmodelid, "", str(446788), {}, tkbs)
        dictionary = ""
        if config.htr_lang_model:
            dictionary = "trainDataLanguageModel"
            v and print(p.doc_title + " Using trainDataLanguageModel")
        ocr_status = run_ocr(collec, HTRmodelid, dictionary, str(docid), pageids, tkbs)
        if not ocr_status:
            print(p.doc_title + " ERROR - document failed ocr " + p.title +
                  " with status " + str(ocr_status))
            return ErrorMessage
        v and print(p.doc_title + " --- FINAL DOWNLOAD after OCR for TEI export ---")
        otarget_dir = os.path.join(
            outfolder, uniquename + "_" + str(collec) + "_" + str(docid))
        ocrdocjson = download(collec, str(docid), otarget_dir, tkbs, p.tkbs_meta_filename)
        pageids = p.load_tkbs_page_ids(ocrdocjson)
        try:
            width = int(config.user_garbage_line_width)
        except (AttributeError, TypeError, ValueError):
            width = config.default_garbage_line_width
        if width > 0:
            v and print(p.doc_title + " --- DELETING GARBAGE TEXT ---")
            for num, fname in p.pxml_names_by_pgnum().items():
                fullname = os.path.join(otarget_dir, fname)
                delete_garbage_text(fullname, width)
        return OkayMessage
    except Exception as e:
        print(p.doc_title + " ERROR in upload_a_folder ")
        print(e)
        print(p.doc_title + " END ERROR \n\n")
        return ErrorMessage
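# A minimal driver sketch for upload_a_folder(), assuming the module-level
# config, tkbs, tkbs_subfolder and v it reads are already initialized:
#
#   for sfolder in find_sub_folders_with_toc_file(config.src_path):
#       print(upload_a_folder(sfolder))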
def upload_pipeline(config):
    folders_to_be_uploaded = find_sub_folders_with_toc_file(config.src_path)
    outfolder = os.path.join(config.src_path, "transkribus_output")
    prep_dir(outfolder)
    legacy_output = os.path.join(config.src_path, "legacy_output")
    collec = config.collection_id
    user = config.username
    key = config.password
    HTRmodelid = config.htr_model_id
    disable_warnings(InsecureRequestWarning)
    tkbs = TranskribusClient(sServerUrl="https://transkribus.eu/TrpServer")
    tkbs.auth_login(user, key, True)
    for sfolder in folders_to_be_uploaded:
        try:
            if not os.path.isfile(os.path.join(sfolder, 'TOC.xml')):
                continue
            infolder = sfolder
            start = str(datetime.datetime.now().strftime("%y-%m-%d-%H-%M"))
            print(start + " - " + infolder)
            v and print("--- CREATING DATA to upload ---")
            p = Document()
            p.load_legacy_data(infolder)
            uniquename = p.doc_title + "_" + start
            firstexportdir = sfolder.replace(config.src_path, legacy_output)
            if not os.path.isdir(firstexportdir):
                print("Skipping... TKBS output missing under " + firstexportdir +
                      "\nRun stage-1 script first, to convert legacy to transkribus format.")
                continue
            v and print("--- UPLOADING data to server ---")
            v and print("from " + firstexportdir)
            docid = upload(collec, firstexportdir, p.img_names_by_pgnum(),
                           p.pxml_names_by_pgnum(), p.title, user, "pipeline test", tkbs)
            if docid <= 0:
                print("ERROR - document failed to upload " + p.title)
                continue
            v and print("--- GETTING page ids ---")
            docjson = get_doc(collec, docid, tkbs)
            pageids = p.load_tkbs_page_ids(docjson)
            if config.line_detection is not None and config.line_detection.upper() == "SKIP":
                v and print("Skipping line detection and everything after it...")
                continue
            v and print("--- LINE DETECTION ---")
            detection_status = line_detect(collec, docid, pageids, tkbs)
            if not detection_status:
                print("ERROR - document failed line detection " + p.title)
                continue
            if len(HTRmodelid) < 2:
                v and print("Skipping HTR and everything after it...")
                continue
            v and print("--- RUNNING OCR ---")
            # ocr_status = run_ocr_with_options(collec, HTRmodelid, "", str(446788), {}, tkbs)
            dictionary = ""
            if config.htr_lang_model:
                dictionary = "trainDataLanguageModel"
                v and print("Using trainDataLanguageModel")
            ocr_status = run_ocr(collec, HTRmodelid, dictionary, str(docid), pageids, tkbs)
            if not ocr_status:
                print("ERROR - document failed ocr " + p.title +
                      " with status " + str(ocr_status))
                continue
            v and print("--- FINAL DOWNLOAD after OCR for TEI export ---")
            otarget_dir = os.path.join(
                outfolder, uniquename + "_" + str(collec) + "_" + str(docid))
            ocrdocjson = download(collec, str(docid), otarget_dir, tkbs, p.tkbs_meta_filename)
            pageids = p.load_tkbs_page_ids(ocrdocjson)
            try:
                width = int(config.user_garbage_line_width)
            except (AttributeError, TypeError, ValueError):
                width = config.default_garbage_line_width
            if width > 0:
                v and print("--- DELETING GARBAGE TEXT ---")
                for num, fname in p.pxml_names_by_pgnum().items():
                    fullname = os.path.join(otarget_dir, fname)
                    delete_garbage_text(fullname, width)
        except Exception as e:
            print("ERROR in upload_pipeline main loop ")
            print(e)
            print("END ERROR \n\n")
    print("DONE. Output is under " + outfolder)
    tkbs.auth_logout()
        # tail of run_ocr: allow ~80 seconds per page for the OCR job to finish
        seconds = 80 * len(pids)
        return wait_for_jobstatus(jobid, seconds, mytkbs)
    except Exception as e:
        print("ERROR in run_ocr for docid " + str(mydocid))
        print(e)
        print("END ERROR \n\n")


v = True
infolder = r'C:\_test_\in_0105'  # CHANGE THIS
outfolder = r'C:\_test_\out'  # CHANGE THIS

v and print("--- CREATING DATA to upload ---")
p = Document()
#p.set_factors(150, 1.7238, 0.67)
p.load_legacy_data(infolder)
exportdir = os.path.join(outfolder, "pagexml_for_upload")
prep_dir(exportdir)
p.export_tkbs_format(exportdir)

v and print("--- CONNECTING to server ---")
user = "******"  # CHANGE THIS
key = "<password>"  # CHANGE THIS
collec = "17989"  # CHANGE THIS
tkbs = TranskribusClient(sServerUrl="https://transkribus.eu/TrpServer")
tkbs.auth_login(user, key, True)

#HTRmodelname = 'Test'
HTRmodelid = "10168"  # CHANGE THIS
def export_pipeline(config):
    folders_to_be_exported = find_sub_folders_with_toc_file(config.src_path)
    tkbs_topfolder = os.path.join(config.src_path, "transkribus_output")
    exportfolder = prep_dir(os.path.join(config.src_path, "transkribus_export"))
    if config.export_csv:
        csvfolder_byregion = prep_dir(os.path.join(exportfolder, 'csv_by_region'))
        csvfolder_byarticle = prep_dir(os.path.join(exportfolder, 'csv_by_article'))
    if config.export_plaintext:
        plaintextfolder = prep_dir(os.path.join(exportfolder, 'plaintext'))
        plaintextfolder_byarticle = prep_dir(os.path.join(exportfolder, 'plaintext_by_article'))
    if config.export_tei:
        teifolder = prep_dir(os.path.join(exportfolder, 'tei'))
    for sfolder in folders_to_be_exported:
        try:
            if not os.path.isfile(os.path.join(sfolder, 'TOC.xml')):
                continue
            infolder = sfolder
            start = str(datetime.datetime.now().strftime("%y-%m-%d-%H-%M"))
            print(start + " - " + infolder)  # + "\n==============")
            v and print("--- LOADING Legacy data ---")
            p = Document()
            p.load_legacy_data(infolder)
            tkbsfolder = find_latest_folder(tkbs_topfolder, p.doc_title)
            p.load_tkbs_data(tkbsfolder)  # FIX
            p.load_legacy_articles(p.legacy_metafile)
            p.match_legacy_articles()
            if config.export_tei:
                v and print("--- TEI export ---")
                p.export_tei(teifolder)
            if config.export_plaintext:
                v and print("--- PLAINTEXT export ---")
                p.export_plaintext(plaintextfolder)
                p.export_plaintext_articles(plaintextfolder_byarticle)
            if config.export_csv:
                v and print("--- CSV export ---")
                p.export_csv_articles(csvfolder_byarticle)
                p.export_csv_regions(csvfolder_byregion)
        except Exception as e:
            print("ERROR in export_pipeline main loop ")
            print(e)
            print("END ERROR \n\n")
    print("DONE. Output is under " + exportfolder)
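# A minimal sketch of the config object export_pipeline() expects; the
# attribute names come from the code above, the values are hypothetical:
#
#   class ExportConfig:
#       src_path = r'C:\_test_\in_0105'
#       export_csv = True
#       export_plaintext = True
#       export_tei = True
#
#   export_pipeline(ExportConfig())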
def convert_legacy_folder_to_tkbs_format(src_path, dst_path):
    p = Document()
    p.load_legacy_data(src_path)
    p.export_tkbs_format(dst_path)
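# Usage sketch with hypothetical paths (this variant, unlike the one earlier
# in the file, lets exceptions propagate to the caller):
#
#   convert_legacy_folder_to_tkbs_format(r'C:\_test_\in_0105',
#                                        r'C:\_test_\out\legacy_output')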