def parse_files(in_dir, out_dir, htids, language):
    """Build one JSON file per recognised volume found under ``in_dir``.

    Walks ``in_dir`` and looks only at directories that contain no
    subdirectories. For every ``.xml`` file there, ``test_file_htid`` is asked
    whether the file corresponds to an entry in ``htids``. When it does, a
    ``Parsed`` object is filled with the metadata from ``htids``, the text of
    every ``.txt`` member of every ``.zip`` archive in the same directory is
    fed to ``add_content``, and the result of ``build_json`` is written to
    ``<out_dir>/<htid with "." replaced by "_">.json``.

    Args:
        in_dir: Root directory to walk.
        out_dir: Directory the JSON files are written to. A trailing path
            separator is optional.
        htids: Mapping from HTID to a sequence whose first three items are
            stored as the volume's author, title and date.
        language: Passed through unchanged to ``add_content``.
    """
    # NOTE(review): the nesting below was reconstructed from a copy of this
    # function that had lost its line breaks -- confirm against the history.
    for folder, subfolders, files in os.walk(in_dir):
        # Only leaf directories are processed.
        if subfolders:
            continue

        for xml_file in files:
            if not xml_file.endswith(".xml"):
                continue

            # test if htid in set of htids, store it and build file if true
            htid_test = test_file_htid(htids, folder, xml_file)
            if not htid_test[0]:
                continue
            htid = htid_test[1]

            obj = Parsed()
            # replace periods for file-naming
            obj.h = htid.replace(".", "_")
            try:
                obj.a = htids[htid][0]
                obj.t = htids[htid][1]
                obj.y = htids[htid][2]
            except KeyError:
                # Best effort: report the gap and still emit the volume with
                # whatever metadata Parsed() starts out with.
                print("File with HTID {0} not found in CSV reference file.".format(htid))

            for zip_file in files:
                if not zip_file.endswith(".zip"):
                    continue
                with zipfile.ZipFile(os.path.join(folder, zip_file), 'r') as zf:
                    for txt_file in zf.namelist():
                        if txt_file.endswith(".txt"):
                            text = zf.read(txt_file).decode('utf-8')
                            add_content(text, obj, language)

            # os.path.join instead of plain concatenation, so an out_dir
            # given without a trailing separator still writes inside it.
            out_path = os.path.join(out_dir, str(obj.h) + ".json")
            with open(out_path, 'w', encoding='utf-8') as out:
                out.write(build_json(obj))
def build_json(file: Parsed):
    """
    Serialize a parsed volume of a corpus to a JSON string.

    Metadata fields that are still unset are replaced by placeholder text,
    newlines in the title, author, publisher and document type are turned
    into spaces, and the chapter list is passed through ``filter_chapters``.
    These changes are made on ``file`` itself, not on a copy.

    Returns the JSON text with sorted keys, 4-space indentation and
    non-ASCII characters left unescaped.
    """
    # Placeholder text for attributes that are still None.
    placeholders = (
        ("t", "No title listed"),
        ("a", "No author listed"),
        ("p", "No publisher listed"),
        ("d", "No document type"),
        ("h", "No HTID for this file"),
    )
    for attr, fallback in placeholders:
        if getattr(file, attr) is None:
            setattr(file, attr, fallback)

    # The ISBN is tested against the empty string, not None.
    if file.i == '':
        file.i = "No ISBN listed"

    # Keep these metadata values on a single line.
    for attr in ("t", "a", "p", "d"):
        setattr(file, attr, getattr(file, attr).replace("\n", " "))

    file.ch = filter_chapters(file.ch)

    record = {
        'Title': file.t,
        'Author': file.a,
        'Publisher': file.p,
        'Date': file.y,
        'ISBN': file.i,
        'Document Type': file.d,
        'List of chapters': file.ch,
        'HTID': file.h,
        'Text': file.c,
        'Stemmed': file.cstem,
        'Filtered': file.tx,
        'Filtered Stemmed': file.txstem,
        'Full Sentences': file.c_sent,
        'Filtered Sentences': file.tx_sent,
        'Stemmed Sentences': file.cstem_sent,
        'Filtered Stemmed Sentences': file.txstem_sent,
        'URL': file.url,
    }
    return json.dumps(
        record,
        sort_keys=True,
        indent=4,
        separators=(',', ': '),
        ensure_ascii=False,
    )