예제 #1
0
def export_pipeline(config):
    """Run the export stage for every document folder under ``config.src_path``.

    Candidate folders come from ``find_sub_folders_with_toc_file``; a folder
    without a ``TOC.xml`` file is skipped. For each remaining folder a
    ``Document`` is loaded from the legacy data, combined with the folder
    chosen by ``find_latest_folder`` under ``<src_path>/transkribus_output``,
    matched against the legacy articles, and written out in every format
    enabled on ``config``.

    Args:
        config: Object providing ``src_path`` plus the switches
            ``export_csv``, ``export_plaintext`` and ``export_tei``.

    Output is written below ``<src_path>/transkribus_export`` in the
    sub-directories ``csv_by_region``, ``csv_by_article``, ``plaintext``,
    ``plaintext_by_article`` and ``tei``; each one is prepared only when its
    switch is set. An exception raised while handling one folder is printed
    and the loop carries on with the next folder.
    """
    source_root = config.src_path
    candidate_dirs = find_sub_folders_with_toc_file(source_root)
    transkribus_root = os.path.join(source_root, "transkribus_output")
    export_root = prep_dir(os.path.join(source_root, "transkribus_export"))

    # Target directories are prepared only for the enabled formats; the same
    # switches guard every later use of these names inside the loop.
    if config.export_csv:
        region_csv_dir = prep_dir(os.path.join(export_root, 'csv_by_region'))
        article_csv_dir = prep_dir(os.path.join(export_root, 'csv_by_article'))
    if config.export_plaintext:
        text_dir = prep_dir(os.path.join(export_root, 'plaintext'))
        article_text_dir = prep_dir(
            os.path.join(export_root, 'plaintext_by_article'))
    if config.export_tei:
        tei_dir = prep_dir(os.path.join(export_root, 'tei'))

    for doc_dir in candidate_dirs:
        try:
            toc_path = os.path.join(doc_dir, 'TOC.xml')
            if not os.path.isfile(toc_path):
                continue

            # Progress line: minute-resolution timestamp followed by the folder.
            stamp = datetime.datetime.now().strftime("%y-%m-%d-%H-%M")
            print(stamp + " - " + doc_dir)
            if v:
                print("---   LOADING Legacy data ---")
            doc = Document()
            doc.load_legacy_data(doc_dir)
            latest_tkbs_dir = find_latest_folder(transkribus_root,
                                                 doc.doc_title)
            # The original author tagged this call with "FIX"; the reason is
            # not recorded here.
            doc.load_tkbs_data(latest_tkbs_dir)
            doc.load_legacy_articles(doc.legacy_metafile)
            doc.match_legacy_articles()

            if config.export_tei:
                if v:
                    print("---   TEI export           ---")
                doc.export_tei(tei_dir)

            if config.export_plaintext:
                if v:
                    print("---   PLAINTEXT export     ---")
                doc.export_plaintext(text_dir)
                doc.export_plaintext_articles(article_text_dir)

            if config.export_csv:
                if v:
                    print("---   CSV export           ---")
                doc.export_csv_articles(article_csv_dir)
                doc.export_csv_regions(region_csv_dir)

        except Exception as err:
            # Best effort: report the failure and continue with the next folder.
            print("ERROR in export_pipeline main loop ")
            print(err)
            print("END ERROR \n\n")

    print("DONE. Output is under " + export_root)
예제 #2
0
# Run OCR on the uploaded pages; a falsy status aborts the whole script.
ocr_status = run_ocr(collec, HTRmodelid, "", str(xdocid), ppageids, tkbs)
if not ocr_status:
    print("ERROR - document failed ocr " + p.title)
    sys.exit(1)

if v:
    print("---   FINAL DOWNLOAD after OCR for TEI export        ---")
ocrdowndir = os.path.join(outfolder, "ocrdowndir")
prep_dir(ocrdowndir)
# Download target is named <title>_<collection id>_<document id>.
otarget_dir = os.path.join(ocrdowndir,
                           p.title + "_" + str(collec) + "_" + str(xdocid))
ocrdocjson = download(collec, str(xdocid), otarget_dir, tkbs,
                      p.tkbs_meta_filename)
pageids = p.load_tkbs_page_ids(ocrdocjson)

# Nothing below uses the tkbs session, so it is closed here.
tkbs.auth_logout()

if v:
    print("---   TEI export           ---")
# The freshly downloaded folder is the data source for both exports below.
tkbsfolder = otarget_dir
p.load_tkbs_data(tkbsfolder)
p.load_legacy_articles(p.legacy_metafile)
p.match_legacy_articles()
teifolder = os.path.join(outfolder, 'tei')
prep_dir(teifolder)
p.export_tei(teifolder)

if v:
    print("---   PLAINTEXT export     ---")

plaintextfolder = os.path.join(outfolder, 'plaintext')
# Fix: the original prepared `teifolder` a second time here, so the plaintext
# folder was never prepared before being written to.
prep_dir(plaintextfolder)
p.export_plaintext(plaintextfolder)