Exemple #1
0
def parse_files(in_dir, out_dir, htids, language):
    """
    Walk in_dir and write one JSON file per recognised volume into out_dir.

    Only directories without subdirectories are examined. For every ".xml"
    file found there, test_file_htid decides whether the file belongs to a
    known HTID. When it does, a Parsed object is filled in, the text of every
    ".txt" member of every ".zip" archive in the same directory is passed to
    add_content, and the output of build_json is written to
    "<HTID with periods replaced by underscores>.json" in out_dir.

    in_dir:   root directory to walk.
    out_dir:  directory the JSON files are written to (with or without a
              trailing path separator).
    htids:    lookup passed to test_file_htid and indexed by HTID; the first
              three items of each entry are stored on obj.a, obj.t and obj.y
              (presumably author, title and year -- confirm against the code
              that loads the CSV reference file).
    language: passed through unchanged to add_content.
    """
    for folder, subfolders, files in os.walk(in_dir):
        # Volumes are only looked for in leaf directories.
        if subfolders:
            continue

        for xml_file in files:
            if not xml_file.endswith(".xml"):
                continue

            # test if htid in set of htids, store it and build file if true
            htid_test = test_file_htid(htids, folder, xml_file)
            if not htid_test[0]:
                continue
            htid = htid_test[1]
            obj = Parsed()

            # replace periods for file-naming
            obj.h = htid.replace(".", "_")

            try:
                obj.a = htids[htid][0]
                obj.t = htids[htid][1]
                obj.y = htids[htid][2]
            except KeyError:
                # Best effort: the volume is still written, just without
                # the metadata from the reference file.
                print("File with HTID {0} not found in CSV reference file.".format(htid))

            for zip_file in files:
                if not zip_file.endswith(".zip"):
                    continue
                with zipfile.ZipFile(os.path.join(folder, zip_file), 'r') as zf:
                    for txt_file in zf.namelist():
                        if txt_file.endswith(".txt"):
                            text = zf.read(txt_file).decode('utf-8')
                            add_content(text, obj, language)

            # Write once, after every archive has been read, so build_json
            # runs a single time on the fully populated object.
            out_path = os.path.join(out_dir, str(obj.h) + ".json")
            with open(out_path, 'w', encoding='utf-8') as out:
                out.write(build_json(obj))
Exemple #2
0
def build_json(file: Parsed):
    """
    Serialise a parsed volume to a pretty-printed JSON string.

    The object is updated in place before serialisation: missing title,
    author, publisher, document type and HTID (None) and a missing ISBN
    (empty string) are replaced by placeholder text, newlines in the title,
    author, publisher and document type become spaces, and the chapter list
    is replaced by the result of filter_chapters.

    Returns the JSON text with sorted keys, 4-space indentation and
    non-ASCII characters left unescaped.
    """

    def _is_none(value):
        return value is None

    def _is_blank(value):
        return value == ''

    # (attribute, "is missing" test, placeholder), checked in this order.
    placeholders = (
        ("t", _is_none, "No title listed"),
        ("a", _is_none, "No author listed"),
        ("p", _is_none, "No publisher listed"),
        ("i", _is_blank, "No ISBN listed"),
        ("d", _is_none, "No document type"),
        ("h", _is_none, "No HTID for this file"),
    )
    for attr, is_missing, placeholder in placeholders:
        if is_missing(getattr(file, attr)):
            setattr(file, attr, placeholder)

    # Collapse line breaks in the single-line metadata fields.
    for attr in ("t", "a", "p", "d"):
        flattened = getattr(file, attr).replace("\n", " ")
        setattr(file, attr, flattened)

    file.ch = filter_chapters(file.ch)

    # JSON key -> attribute holding its value.
    layout = (
        ('Title', "t"),
        ('Author', "a"),
        ('Publisher', "p"),
        ('Date', "y"),
        ('ISBN', "i"),
        ('Document Type', "d"),
        ('List of chapters', "ch"),
        ('HTID', "h"),
        ('Text', "c"),
        ('Stemmed', "cstem"),
        ('Filtered', "tx"),
        ('Filtered Stemmed', "txstem"),
        ('Full Sentences', "c_sent"),
        ('Filtered Sentences', "tx_sent"),
        ('Stemmed Sentences', "cstem_sent"),
        ('Filtered Stemmed Sentences', "txstem_sent"),
        ('URL', "url"),
    )
    payload = {key: getattr(file, attr) for key, attr in layout}

    return json.dumps(
        payload,
        sort_keys=True,
        indent=4,
        separators=(',', ': '),
        ensure_ascii=False,
    )