Example #1
0
def build_json(file: Parsed):
    """
    Construct a pretty-printed JSON string representing a volume in a corpus.

    Missing bibliographic fields are replaced with human-readable
    placeholders, embedded newlines are stripped from the display fields,
    and the chapter list is normalized via filter_chapters().

    :param file: parsed volume record; its fields are mutated in place.
    :return: JSON string (sorted keys, 4-space indent, non-ASCII preserved).
    """

    # Placeholder text for each bibliographic field that may be absent.
    # NOTE(review): the original treated a missing ISBN as '' but every
    # other missing field as None; both cases are handled uniformly here.
    placeholders = (
        ('t', "No title listed"),
        ('a', "No author listed"),
        ('p', "No publisher listed"),
        ('i', "No ISBN listed"),
        ('d', "No document type"),
        ('h', "No HTID for this file"),
    )
    for attr, placeholder in placeholders:
        if getattr(file, attr) in (None, ''):
            setattr(file, attr, placeholder)

    # Embedded newlines would garble these single-line display fields.
    for attr in ('t', 'a', 'p', 'd'):
        setattr(file, attr, getattr(file, attr).replace("\n", " "))

    file.ch = filter_chapters(file.ch)

    jfile = json.dumps(
        {
            'Title': file.t,
            'Author': file.a,
            'Publisher': file.p,
            'Date': file.y,
            'ISBN': file.i,
            'Document Type': file.d,
            'List of chapters': file.ch,
            'HTID': file.h,
            'Text': file.c,
            'Stemmed': file.cstem,
            'Filtered': file.tx,
            'Filtered Stemmed': file.txstem,
            'Full Sentences': file.c_sent,
            'Filtered Sentences': file.tx_sent,
            'Stemmed Sentences': file.cstem_sent,
            'Filtered Stemmed Sentences': file.txstem_sent,
            'URL': file.url
        },
        sort_keys=True,
        indent=4,
        separators=(',', ': '),
        ensure_ascii=False)
    return jfile
Example #2
0
def parse_txt(in_dir, ids, out_dir):
    """
    Iterate over a directory of Gutenberg text files and write each
    volume out as a JSON object.

    :param in_dir: directory containing the .txt volumes.
    :param ids: reference table used by match_pub_info() to resolve
        author / title / year from a volume's ID number.
    :param out_dir: directory receiving one <name>.json file per volume.
    """

    for subdir, dirs, files in os.walk(in_dir):
        for txt_f in tqdm.tqdm(files):

            # Skip hidden files such as .DS_Store.
            if txt_f[0] == ".":
                continue

            reading = False
            obj = Parsed()

            # os.path.join(subdir, ...) is correct for files found in
            # nested directories by os.walk (the original concatenated
            # in_dir + txt_f, which also required a trailing slash).
            with open(os.path.join(subdir, txt_f), 'r',
                      encoding='utf-8') as txt_in:
                for line in txt_in:
                    # Publication metadata precedes the body text.
                    if 'Posting Date' in line:
                        idno = get_idno(line)
                        pub_info = match_pub_info(idno, ids)
                        obj.a, obj.t, obj.y = (pub_info[1], pub_info[2],
                                               pub_info[3])
                    # Gutenberg delimits the body with START/END sentinels.
                    if 'START OF THIS PROJECT GUTENBERG EBOOK' in line:
                        reading = True
                    if 'END OF THIS PROJECT GUTENBERG EBOOK' in line:
                        reading = False
                    # Do not ingest the START sentinel line itself.
                    if reading and 'START OF THIS PROJECT GUTENBERG EBOOK' not in line:
                        add_content(line, obj, 'german')

            # 'with' closes the handle; the explicit close() was redundant.
            with open(os.path.join(out_dir, txt_f[:-4] + '.json'),
                      'w',
                      encoding='utf-8') as out:
                out.write(build_json(obj))
Example #3
0
def parse_txt(in_dir, mappings, out_dir):
    """
    Iterate over a directory of Banken text files and write each volume
    out as a JSON object. Volumes without an entry in *mappings* are
    skipped silently.

    :param in_dir: directory (with trailing slash) of .txt volumes.
    :param mappings: dict of volume ID -> {"AUTHOR", "TITLE", "PUBDATE"}.
    :param out_dir: directory (with trailing slash) for .json output.
    """

    for subdir, dirs, files in os.walk(in_dir):
        for txt_f in tqdm.tqdm(files):

            # Skip hidden files such as .DS_Store.
            if txt_f[0] == ".":
                continue

            id_str = txt_f[:-4]

            # Narrow try: only the metadata lookups may legitimately
            # raise KeyError. The original wrapped parsing and writing
            # too, silently discarding volumes on unrelated KeyErrors.
            try:
                maps = mappings[id_str]
                author = maps["AUTHOR"]
                title = maps["TITLE"]
                pubdate = maps["PUBDATE"]
            except KeyError:
                continue

            obj = Parsed()
            obj.a = author
            obj.t = title
            obj.y = pubdate

            with open(in_dir + txt_f, 'r', encoding='utf-8') as txt_in:
                for line in txt_in:
                    add_content(line, obj, 'swedish')

            # 'with' closes the handle; the explicit close() was redundant.
            with open(out_dir + txt_f[:-4] + '.json',
                      'w',
                      encoding='utf-8') as out:
                out.write(build_json(obj))
Example #4
0
def parse_files(in_dir, out_dir, htids, language):
    """
    Walk *in_dir* for leaf folders containing .xml volume descriptors,
    match each volume's HTID against *htids*, extract its text from the
    sibling .zip archive(s), and write one JSON file per volume.

    :param in_dir: root directory of the volume tree.
    :param out_dir: directory (with trailing slash) for <htid>.json output.
    :param htids: mapping of HTID -> (author, title, year).
    :param language: language code passed through to add_content().
    """
    for folder, subfolders, files in os.walk(in_dir):
        # Only leaf directories hold volume data.
        if subfolders:
            continue
        for xml_file in files:
            if not xml_file.endswith(".xml"):
                continue

            htid_test = test_file_htid(htids, folder, xml_file)

            # Only build the file if its HTID is in the reference set.
            if not htid_test[0]:
                continue
            htid = htid_test[1]
            obj = Parsed()

            # Replace periods for file-naming.
            obj.h = htid.replace(".", "_")

            try:
                obj.a = htids[htid][0]
                obj.t = htids[htid][1]
                obj.y = htids[htid][2]
            except KeyError:
                print("File with HTID {0} not found in CSV reference file.".format(htid))

            # Ingest the text from every zip archive in this folder.
            for zip_file in files:
                if zip_file.endswith(".zip"):
                    with zipfile.ZipFile(folder + "/" + zip_file, 'r') as zf:
                        for txt_file in zf.namelist():
                            if txt_file.endswith(".txt"):
                                text = zf.read(txt_file).decode('utf-8')
                                add_content(text, obj, language)

            # BUG FIX: the original wrote the JSON inside the zip loop,
            # re-writing the output once per file in the folder; write it
            # exactly once, after all archives have been ingested.
            with open(out_dir + str(obj.h) + ".json", 'w',
                      encoding='utf-8') as out:
                out.write(build_json(obj))
Example #5
0
    def _parse_files(self, doc, subdir):
        """
        Parse an individual XML volume and write it out as JSON.

        The file is looked up first in self.input_dir and, failing that,
        in *subdir*.

        :param doc: XML file name, including the .xml extension.
        :param subdir: fallback directory containing *doc*.
        :raises KeyError: if doc's ID is missing from self.mapping.
        """

        try:
            f = open("{0}/{1}".format(self.input_dir, doc), 'r')
        except FileNotFoundError:
            f = open("{0}/{1}".format(subdir, doc), 'r')

        # 'with' guarantees the handle is closed even if parsing raises
        # (the original left f open until the end and leaked it on error).
        with f:
            tree = BeautifulSoup(f.read(), 'xml')

        obj = Parsed()
        self.get_text(tree, obj)

        pub_info = self.mapping[doc[:-4]]

        obj.a = pub_info["author"]
        obj.t = pub_info["title"]
        obj.y = pub_info["pub_date"]

        with open("{0}/{1}.json".format(self.output_dir, doc[:-4]),
                  'w',
                  encoding='utf-8') as out:
            out.write(build_json(obj))
Example #6
0
def parse_txt(in_dir, mappings, out_dir):
    """
    Iterate over a directory of Runeberg volumes (one sub-directory per
    volume, page files under <vol>/Pages/) and write each volume out as
    a JSON object. Volumes whose ID cannot be resolved are skipped.

    :param in_dir: root directory containing one folder per volume.
    :param mappings: dict of volume ID -> {"AUTHOR", "TITLE", "PUBDATE"}.
    :param out_dir: directory (with trailing slash) for .json output.
    """

    for subdir, dirs, files in os.walk(in_dir):
        for vol in tqdm.tqdm(dirs):

            # Skip hidden and empty directory names.
            if vol == "" or vol[0] == ".":
                continue

            obj = Parsed()

            # The volume's ID is stored in its 'title' file. Treat a
            # missing title file like an unmapped ID (the original only
            # caught KeyError and crashed on FileNotFoundError).
            try:
                with open("{}/{}/title".format(in_dir, vol),
                          'r') as title_str:
                    id_str = title_str.read()
                maps = mappings[id_str]
            except (KeyError, FileNotFoundError):
                continue

            obj.a = maps["AUTHOR"]
            obj.t = maps["TITLE"]
            obj.y = maps["PUBDATE"]

            # Fresh loop variables: the original reused subdir/dirs/files
            # here, shadowing the outer os.walk() bindings.
            for page_dir, _, page_files in os.walk(
                    "{}/{}/Pages/".format(in_dir, vol)):
                # os.walk yields names in arbitrary order; sort so the
                # pages are concatenated deterministically.
                for text_f in sorted(page_files):
                    if text_f != "whole-page-ok.lst" and text_f[0] != ".":
                        with open("{}/{}/Pages/{}".format(in_dir, vol, text_f),
                                  'r') as txt_in:
                            for line in txt_in:
                                add_content(line, obj, 'swedish')

            # NOTE(review): vol[:-4] drops the last 4 chars of the
            # directory name — presumably an extension-like suffix;
            # preserved as-is, but worth confirming against the data.
            with open(out_dir + vol[:-4] + '.json',
                      'w',
                      encoding='utf-8') as out:
                out.write(build_json(obj))
Example #7
0
def main():
    """
    Command-line entry point: parse a directory of XML volumes into
    per-volume JSON files, resolving publication info from a CSV file.

    Flags: -i input directory, -o output directory, -csv publication
    dates CSV (required; fail() is invoked when it is absent).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i",
                        metavar='in-directory',
                        action="store",
                        help="input directory argument")
    parser.add_argument("-o", help="output directory argument", action="store")
    parser.add_argument("-csv",
                        help="csv file with publication dates",
                        action="store")

    try:
        args = parser.parse_args()
    except IOError:
        fail("IOError")

    build_out(args.o)

    if args.csv is not None:
        ids = parse_csv(args.csv)
    else:
        fail("Please specify input csv file path")

    for subdir, dirs, files in os.walk(args.i):
        for xmldoc in tqdm.tqdm(files):
            # Skip hidden files such as .DS_Store.
            if xmldoc[0] == ".":
                continue

            # os.path.join works whether or not the directory arguments
            # carry a trailing slash (the original used raw concatenation).
            tree = ET.parse(os.path.join(args.i, xmldoc))
            root = tree.getroot()
            base_url = get_id(root)
            obj = Parsed()
            get_text(root, obj)

            # Only emit volumes that yielded some text content.
            if len(obj.c) > 0:
                pub_info = get_pub_info(ids, base_url)
                obj.a, obj.t, obj.y = pub_info[0], pub_info[1], pub_info[2]
                # 'with' closes the handle; explicit close() was redundant.
                with open(os.path.join(args.o, xmldoc[:-4] + '.json'),
                          'w',
                          encoding='utf-8') as out:
                    out.write(build_json(obj))