Example #1
0
def build_json(file: Parsed):
    """
    Construct a pretty-printed JSON string representing a volume in a corpus.

    Missing bibliographic fields are replaced with human-readable
    placeholders, embedded newlines are stripped from the display fields,
    and the chapter list is normalized via filter_chapters().

    :param file: parsed volume record; its fields are mutated in place.
    :return: JSON string (sorted keys, 4-space indent, non-ASCII preserved).
    """

    # Placeholder text for each bibliographic field that may be absent.
    # NOTE(review): the original treated a missing ISBN as '' but every
    # other missing field as None; both cases are handled uniformly here.
    placeholders = (
        ('t', "No title listed"),
        ('a', "No author listed"),
        ('p', "No publisher listed"),
        ('i', "No ISBN listed"),
        ('d', "No document type"),
        ('h', "No HTID for this file"),
    )
    for attr, placeholder in placeholders:
        if getattr(file, attr) in (None, ''):
            setattr(file, attr, placeholder)

    # Embedded newlines would garble these single-line display fields.
    for attr in ('t', 'a', 'p', 'd'):
        setattr(file, attr, getattr(file, attr).replace("\n", " "))

    file.ch = filter_chapters(file.ch)

    jfile = json.dumps(
        {
            'Title': file.t,
            'Author': file.a,
            'Publisher': file.p,
            'Date': file.y,
            'ISBN': file.i,
            'Document Type': file.d,
            'List of chapters': file.ch,
            'HTID': file.h,
            'Text': file.c,
            'Stemmed': file.cstem,
            'Filtered': file.tx,
            'Filtered Stemmed': file.txstem,
            'Full Sentences': file.c_sent,
            'Filtered Sentences': file.tx_sent,
            'Stemmed Sentences': file.cstem_sent,
            'Filtered Stemmed Sentences': file.txstem_sent,
            'URL': file.url
        },
        sort_keys=True,
        indent=4,
        separators=(',', ': '),
        ensure_ascii=False)
    return jfile
Example #2
0
def parse_txt(in_dir, ids, out_dir):
    """
    Iterate over a directory of Gutenberg text files and write each
    volume out as a JSON object.

    :param in_dir: directory containing the .txt volumes.
    :param ids: reference table used by match_pub_info() to resolve
        author / title / year from a volume's ID number.
    :param out_dir: directory receiving one <name>.json file per volume.
    """

    for subdir, dirs, files in os.walk(in_dir):
        for txt_f in tqdm.tqdm(files):

            # Skip hidden files such as .DS_Store.
            if txt_f[0] == ".":
                continue

            reading = False
            obj = Parsed()

            # os.path.join(subdir, ...) is correct for files found in
            # nested directories by os.walk (the original concatenated
            # in_dir + txt_f, which also required a trailing slash).
            with open(os.path.join(subdir, txt_f), 'r',
                      encoding='utf-8') as txt_in:
                for line in txt_in:
                    # Publication metadata precedes the body text.
                    if 'Posting Date' in line:
                        idno = get_idno(line)
                        pub_info = match_pub_info(idno, ids)
                        obj.a, obj.t, obj.y = (pub_info[1], pub_info[2],
                                               pub_info[3])
                    # Gutenberg delimits the body with START/END sentinels.
                    if 'START OF THIS PROJECT GUTENBERG EBOOK' in line:
                        reading = True
                    if 'END OF THIS PROJECT GUTENBERG EBOOK' in line:
                        reading = False
                    # Do not ingest the START sentinel line itself.
                    if reading and 'START OF THIS PROJECT GUTENBERG EBOOK' not in line:
                        add_content(line, obj, 'german')

            # 'with' closes the handle; the explicit close() was redundant.
            with open(os.path.join(out_dir, txt_f[:-4] + '.json'),
                      'w',
                      encoding='utf-8') as out:
                out.write(build_json(obj))
Example #3
0
def parse_txt(in_dir, mappings, out_dir):
    """
    Iterate over a directory of Banken text files and write each volume
    out as a JSON object. Volumes without an entry in *mappings* are
    skipped silently.

    :param in_dir: directory (with trailing slash) of .txt volumes.
    :param mappings: dict of volume ID -> {"AUTHOR", "TITLE", "PUBDATE"}.
    :param out_dir: directory (with trailing slash) for .json output.
    """

    for subdir, dirs, files in os.walk(in_dir):
        for txt_f in tqdm.tqdm(files):

            # Skip hidden files such as .DS_Store.
            if txt_f[0] == ".":
                continue

            id_str = txt_f[:-4]

            # Narrow try: only the metadata lookups may legitimately
            # raise KeyError. The original wrapped parsing and writing
            # too, silently discarding volumes on unrelated KeyErrors.
            try:
                maps = mappings[id_str]
                author = maps["AUTHOR"]
                title = maps["TITLE"]
                pubdate = maps["PUBDATE"]
            except KeyError:
                continue

            obj = Parsed()
            obj.a = author
            obj.t = title
            obj.y = pubdate

            with open(in_dir + txt_f, 'r', encoding='utf-8') as txt_in:
                for line in txt_in:
                    add_content(line, obj, 'swedish')

            # 'with' closes the handle; the explicit close() was redundant.
            with open(out_dir + txt_f[:-4] + '.json',
                      'w',
                      encoding='utf-8') as out:
                out.write(build_json(obj))
Example #4
0
def parse_files(in_dir, out_dir, htids, language):
    """
    Walk *in_dir* for leaf folders containing .xml volume descriptors,
    match each volume's HTID against *htids*, extract its text from the
    sibling .zip archive(s), and write one JSON file per volume.

    :param in_dir: root directory of the volume tree.
    :param out_dir: directory (with trailing slash) for <htid>.json output.
    :param htids: mapping of HTID -> (author, title, year).
    :param language: language code passed through to add_content().
    """
    for folder, subfolders, files in os.walk(in_dir):
        # Only leaf directories hold volume data.
        if subfolders:
            continue
        for xml_file in files:
            if not xml_file.endswith(".xml"):
                continue

            htid_test = test_file_htid(htids, folder, xml_file)

            # Only build the file if its HTID is in the reference set.
            if not htid_test[0]:
                continue
            htid = htid_test[1]
            obj = Parsed()

            # Replace periods for file-naming.
            obj.h = htid.replace(".", "_")

            try:
                obj.a = htids[htid][0]
                obj.t = htids[htid][1]
                obj.y = htids[htid][2]
            except KeyError:
                print("File with HTID {0} not found in CSV reference file.".format(htid))

            # Ingest the text from every zip archive in this folder.
            for zip_file in files:
                if zip_file.endswith(".zip"):
                    with zipfile.ZipFile(folder + "/" + zip_file, 'r') as zf:
                        for txt_file in zf.namelist():
                            if txt_file.endswith(".txt"):
                                text = zf.read(txt_file).decode('utf-8')
                                add_content(text, obj, language)

            # BUG FIX: the original wrote the JSON inside the zip loop,
            # re-writing the output once per file in the folder; write it
            # exactly once, after all archives have been ingested.
            with open(out_dir + str(obj.h) + ".json", 'w',
                      encoding='utf-8') as out:
                out.write(build_json(obj))
Example #5
0
    def _parse_files(self, doc, subdir):
        """
        Parse an individual XML volume and write it out as JSON.

        The file is looked up first in self.input_dir and, failing that,
        in *subdir*.

        :param doc: XML file name, including the .xml extension.
        :param subdir: fallback directory containing *doc*.
        :raises KeyError: if doc's ID is missing from self.mapping.
        """

        try:
            f = open("{0}/{1}".format(self.input_dir, doc), 'r')
        except FileNotFoundError:
            f = open("{0}/{1}".format(subdir, doc), 'r')

        # 'with' guarantees the handle is closed even if parsing raises
        # (the original left f open until the end and leaked it on error).
        with f:
            tree = BeautifulSoup(f.read(), 'xml')

        obj = Parsed()
        self.get_text(tree, obj)

        pub_info = self.mapping[doc[:-4]]

        obj.a = pub_info["author"]
        obj.t = pub_info["title"]
        obj.y = pub_info["pub_date"]

        with open("{0}/{1}.json".format(self.output_dir, doc[:-4]),
                  'w',
                  encoding='utf-8') as out:
            out.write(build_json(obj))
Example #6
0
def parse_txt(in_dir, mappings, out_dir):
    """
    Iterate over a directory of Runeberg volumes (one sub-directory per
    volume, page files under <vol>/Pages/) and write each volume out as
    a JSON object. Volumes whose ID cannot be resolved are skipped.

    :param in_dir: root directory containing one folder per volume.
    :param mappings: dict of volume ID -> {"AUTHOR", "TITLE", "PUBDATE"}.
    :param out_dir: directory (with trailing slash) for .json output.
    """

    for subdir, dirs, files in os.walk(in_dir):
        for vol in tqdm.tqdm(dirs):

            # Skip hidden and empty directory names.
            if vol == "" or vol[0] == ".":
                continue

            obj = Parsed()

            # The volume's ID is stored in its 'title' file. Treat a
            # missing title file like an unmapped ID (the original only
            # caught KeyError and crashed on FileNotFoundError).
            try:
                with open("{}/{}/title".format(in_dir, vol),
                          'r') as title_str:
                    id_str = title_str.read()
                maps = mappings[id_str]
            except (KeyError, FileNotFoundError):
                continue

            obj.a = maps["AUTHOR"]
            obj.t = maps["TITLE"]
            obj.y = maps["PUBDATE"]

            # Fresh loop variables: the original reused subdir/dirs/files
            # here, shadowing the outer os.walk() bindings.
            for page_dir, _, page_files in os.walk(
                    "{}/{}/Pages/".format(in_dir, vol)):
                # os.walk yields names in arbitrary order; sort so the
                # pages are concatenated deterministically.
                for text_f in sorted(page_files):
                    if text_f != "whole-page-ok.lst" and text_f[0] != ".":
                        with open("{}/{}/Pages/{}".format(in_dir, vol, text_f),
                                  'r') as txt_in:
                            for line in txt_in:
                                add_content(line, obj, 'swedish')

            # NOTE(review): vol[:-4] drops the last 4 chars of the
            # directory name — presumably an extension-like suffix;
            # preserved as-is, but worth confirming against the data.
            with open(out_dir + vol[:-4] + '.json',
                      'w',
                      encoding='utf-8') as out:
                out.write(build_json(obj))
Example #7
0
def main():
    """
    Command-line entry point: parse a directory of XML volumes into
    per-volume JSON files, resolving publication info from a CSV file.

    Flags: -i input directory, -o output directory, -csv publication
    dates CSV (required; fail() is invoked when it is absent).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i",
                        metavar='in-directory',
                        action="store",
                        help="input directory argument")
    parser.add_argument("-o", help="output directory argument", action="store")
    parser.add_argument("-csv",
                        help="csv file with publication dates",
                        action="store")

    try:
        args = parser.parse_args()
    except IOError:
        fail("IOError")

    build_out(args.o)

    if args.csv is not None:
        ids = parse_csv(args.csv)
    else:
        fail("Please specify input csv file path")

    for subdir, dirs, files in os.walk(args.i):
        for xmldoc in tqdm.tqdm(files):
            # Skip hidden files such as .DS_Store.
            if xmldoc[0] == ".":
                continue

            # os.path.join works whether or not the directory arguments
            # carry a trailing slash (the original used raw concatenation).
            tree = ET.parse(os.path.join(args.i, xmldoc))
            root = tree.getroot()
            base_url = get_id(root)
            obj = Parsed()
            get_text(root, obj)

            # Only emit volumes that yielded some text content.
            if len(obj.c) > 0:
                pub_info = get_pub_info(ids, base_url)
                obj.a, obj.t, obj.y = pub_info[0], pub_info[1], pub_info[2]
                # 'with' closes the handle; explicit close() was redundant.
                with open(os.path.join(args.o, xmldoc[:-4] + '.json'),
                          'w',
                          encoding='utf-8') as out:
                    out.write(build_json(obj))