def export_pipeline(config):
    """Export every legacy source folder to the formats enabled in *config*.

    Sub-folders of ``config.src_path`` are taken from
    ``find_sub_folders_with_toc_file``; any that has no ``TOC.xml`` file is
    skipped. For each remaining folder a ``Document`` is loaded from the
    legacy data, combined with the Transkribus folder that
    ``find_latest_folder`` returns for the document title (searched under
    ``<src_path>/transkribus_output``), matched against the legacy articles
    and written out under ``<src_path>/transkribus_export``.

    Args:
        config: object providing ``src_path`` plus the boolean switches
            ``export_csv``, ``export_plaintext`` and ``export_tei``.

    An exception raised while handling one folder is printed and the loop
    moves on to the next folder; nothing is returned.
    """
    source_folders = find_sub_folders_with_toc_file(config.src_path)
    tkbs_topfolder = os.path.join(config.src_path, "transkribus_output")
    exportfolder = prep_dir(
        os.path.join(config.src_path, "transkribus_export"))

    def _export_subdir(name):
        # Prepare one output directory below the export root.
        return prep_dir(os.path.join(exportfolder, name))

    # Output directories are prepared only for the enabled formats; each
    # name below is read later under the very same config switch.
    if config.export_csv:
        csv_region_dir = _export_subdir('csv_by_region')
        csv_article_dir = _export_subdir('csv_by_article')
    if config.export_plaintext:
        plaintext_dir = _export_subdir('plaintext')
        plaintext_article_dir = _export_subdir('plaintext_by_article')
    if config.export_tei:
        tei_dir = _export_subdir('tei')

    for sfolder in source_folders:
        try:
            toc_path = os.path.join(sfolder, 'TOC.xml')
            if not os.path.isfile(toc_path):
                continue

            stamp = datetime.datetime.now().strftime("%y-%m-%d-%H-%M")
            print(stamp + " - " + sfolder)

            if v:
                print("--- LOADING Legacy data ---")
            doc = Document()
            doc.load_legacy_data(sfolder)
            latest_tkbs = find_latest_folder(tkbs_topfolder, doc.doc_title)
            # NOTE: the original code tags the following step with "FIX".
            doc.load_tkbs_data(latest_tkbs)
            doc.load_legacy_articles(doc.legacy_metafile)
            doc.match_legacy_articles()

            if config.export_tei:
                if v:
                    print("--- TEI export ---")
                doc.export_tei(tei_dir)

            if config.export_plaintext:
                if v:
                    print("--- PLAINTEXT export ---")
                doc.export_plaintext(plaintext_dir)
                doc.export_plaintext_articles(plaintext_article_dir)

            if config.export_csv:
                if v:
                    print("--- CSV export ---")
                doc.export_csv_articles(csv_article_dir)
                doc.export_csv_regions(csv_region_dir)
        except Exception as err:
            # Best effort: report the failure and carry on with the next
            # folder instead of aborting the whole export run.
            print("ERROR in export_pipeline main loop ")
            print(err)
            print("END ERROR \n\n")

    print("DONE. Output is under " + exportfolder)
ocr_status = run_ocr(collec, HTRmodelid, "", str(xdocid), ppageids, tkbs) if not ocr_status: print("ERROR - document failed ocr " + p.title) sys.exit(1) v and print("--- FINAL DOWNLOAD after OCR for TEI export ---") ocrdowndir = os.path.join(outfolder, "ocrdowndir") prep_dir(ocrdowndir) otarget_dir = os.path.join(ocrdowndir, p.title + "_" + str(collec) + "_" + str(xdocid)) ocrdocjson = download(collec, str(xdocid), otarget_dir, tkbs, p.tkbs_meta_filename) pageids = p.load_tkbs_page_ids(ocrdocjson) tkbs.auth_logout() v and print("--- TEI export ---") tkbsfolder = otarget_dir p.load_tkbs_data(tkbsfolder) p.load_legacy_articles(p.legacy_metafile) p.match_legacy_articles() teifolder = os.path.join(outfolder, 'tei') prep_dir(teifolder) p.export_tei(teifolder) v and print("--- PLAINTEXT export ---") plaintextfolder = os.path.join(outfolder, 'plaintext') prep_dir(teifolder) p.export_plaintext(plaintextfolder)