Example 1
def normalize_records(inputfiles, outputdir):
    print("Normalizing", len(inputfiles), "records to", outputdir)
    for index, (filename, record) in enumerate(read_records(inputfiles)):
        progress((index+1)/len(inputfiles))
        normalized_record = normalize(record)
        out_file = os.path.basename(filename).replace("xml", "json")
        write_json_file(outputdir, out_file, normalized_record)
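All of these examples call a handful of helpers (read_records, write_json_file, progress) and assume module-level imports (os, json, csv, codecs) that are defined elsewhere in the original code and are not shown here. The following is only a minimal sketch of what those helpers might look like, assuming read_records yields (filename, raw content) pairs and write_json_file serializes one record into the output directory; the actual implementations may differ.

import json
import os
import sys


def read_records(inputfiles):
    # Hypothetical sketch: yield (filename, raw file content) pairs.
    for filename in inputfiles:
        with open(filename, encoding='utf-8') as infile:
            yield filename, infile.read()


def write_json_file(outputdir, out_file, record):
    # Hypothetical sketch: serialize one record as UTF-8 JSON in outputdir.
    os.makedirs(outputdir, exist_ok=True)
    with open(os.path.join(outputdir, out_file), 'w', encoding='utf-8') as outfile:
        json.dump(record, outfile, ensure_ascii=False, indent=2)


def progress(fraction):
    # Hypothetical sketch: overwrite one console line with a percentage.
    sys.stdout.write("\r{:.0%}".format(fraction))
    sys.stdout.flush()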
Example 2
def normalize_records(inputfiles, outputdir):
    print("Normalizing", len(inputfiles), "records to", outputdir)
    for index, (filename, record) in enumerate(read_records(inputfiles)):
        progress((index + 1) / len(inputfiles))
        normalized_record = normalize(record)
        out_file = os.path.basename(filename).replace("xml", "json")
        write_json_file(outputdir, out_file, normalized_record)
Example 3
def summarize_records(inputfiles):
    uris = []  # collected GND URIs; assumes summarize() returns the URIs found in one record
    for filename, record in read_records(inputfiles):
        data = json.loads(record)
        uris.extend(summarize(data))
    uris_unique = sorted(set(uris))
    print(len(uris_unique), "different GND references:")
    for uri in uris_unique:
        print(uri)
Example 4
def summarize_sameas(inputfiles, outputfile):
    print("Summarizing", len(inputfiles), "authors in", outputfile)
    # An explicit encoding makes codecs.open return a text-mode stream.
    with codecs.open(outputfile, 'w', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, delimiter=';',
                                fieldnames=common.sameas_fieldnames,
                                lineterminator='\n')
        writer.writeheader()
        for filename, record in read_records(inputfiles):
            # json.loads() no longer accepts an encoding argument on Python 3.
            data = json.loads(record)
            entries = summarize_sameas_data(data, common.sameas_fieldnames)
            writer.writerows(entries)
Example 5
def normalize_records(inputfiles, outputdir):
    print("Normalizing", len(inputfiles), "records to", outputdir)
    for index, (filename, record) in enumerate(read_records(inputfiles)):
        progress((index + 1) / len(inputfiles))
        normalized_record = normalize(record)
        print("normalized record:", normalized_record)
        out_file = os.path.basename(filename).replace("xml", "json")
        write_json_file(outputdir, out_file, normalized_record)
Example 6
def enrich_records(inputfiles, outputdir, force=False):
    print("Enriching", len(inputfiles), "records. Saving to", outputdir)
    for index, (filename, record) in enumerate(read_records(inputfiles)):
        progress((index + 1) / len(inputfiles))
        out_file = os.path.basename(filename).replace(".json",
                                                      "_enriched.json")
        out_path = os.path.join(outputdir, out_file)
        if os.path.exists(out_path) and not force:
            print(out_file, "already enriched. Skipping...")
        else:
            enriched_record = enrich(record)
            write_json_file(outputdir, out_file, enriched_record)
Example 7
def enrich_records(inputfiles, outputdir, force=False):
    print("Enriching", len(inputfiles), "records. Saving to", outputdir)
    for index, (filename, record) in enumerate(read_records(inputfiles)):
        progress((index + 1) / len(inputfiles))
        out_file = os.path.basename(filename).replace(".json",
                                                      "_enriched.json")
        out_path = outputdir + "/" + out_file
        if (os.path.exists(out_path) and not force):
            print(out_file, "already enriched. Skipping...")
        else:
            enriched_record = enrich(record)
            write_json_file(outputdir, out_file, enriched_record)
Example 8
def summarize_sameas(inputfiles, outputfile):
    print("Summarizing", len(inputfiles), "authors in", outputfile)
    with codecs.open(outputfile, 'w', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile,
                                delimiter=';',
                                fieldnames=common.sameas_fieldnames,
                                lineterminator='\n')
        writer.writeheader()
        for filename, record in read_records(inputfiles):
            data = json.loads(record)
            entries = summarize_sameas_data(data, common.sameas_fieldnames)
            writer.writerows(entries)
Example 9
def summarize_records(inputfiles, outputfile):
    print("Summarizing", len(inputfiles), "records in", outputfile)
    with open(outputfile, 'w', newline='') as csvfile:  # newline='' as recommended by the csv docs
        fieldnames = [
            'id', 'links_artwork', 'persons', 'links_person_gnd',
            'links_person_dbpedia', 'links_person_viaf',
            'related_europeana_items'
        ]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for filename, record in read_records(inputfiles):
            data = json.loads(record)
            entry = summarize(data)
            writer.writerow(entry)
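For writer.writerow(entry) to succeed here, summarize(data) has to return a single dict whose keys match fieldnames (unlike the URI-collecting variant in Example 3). A hypothetical sketch follows; the record layout and the choice of joining multi-valued fields with '|' are assumptions, not taken from the original code.

def summarize(data):
    # Hypothetical sketch only: the real record layout is not shown in these snippets.
    # Multi-valued fields are flattened to '|'-joined strings so DictWriter can write them.
    return {
        'id': data.get('id', ''),
        'links_artwork': '|'.join(data.get('links_artwork', [])),
        'persons': '|'.join(data.get('persons', [])),
        'links_person_gnd': '|'.join(data.get('links_person_gnd', [])),
        'links_person_dbpedia': '|'.join(data.get('links_person_dbpedia', [])),
        'links_person_viaf': '|'.join(data.get('links_person_viaf', [])),
        'related_europeana_items': '|'.join(data.get('related_europeana_items', [])),
    }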
Example 10
def summarize_authors(inputfiles, outputfile):
    print("Summarizing", len(inputfiles), "authors in", outputfile)
    with codecs.open(outputfile, 'w', encoding='utf-8') as csvfile:
        fieldnames = ['onb id',
                      'author name',
                      'gnd url',
                      'dbpedia id']

        writer = csv.DictWriter(csvfile, delimiter=';',
                                fieldnames=fieldnames, lineterminator='\n')
        writer.writeheader()
        for filename, record in read_records(inputfiles):
            data = json.loads(record)
            entries = summarize_authors_data(data, fieldnames)
            writer.writerows(entries)
Example 11
def summarize_titles(inputfiles, outputfile):
    print("Summarizing", len(inputfiles), "titles in", outputfile)
    with codecs.open(outputfile, 'w', encoding='utf-8') as csvfile:
        fieldnames = ['gnd',
                      'author',
                      'subject',
                      'title']

        writer = csv.DictWriter(csvfile, delimiter=';',
                                fieldnames=fieldnames, lineterminator='\n')
        writer.writeheader()
        for filename, record in read_records(inputfiles):
            data = json.loads(record)
            entry = summarize_titles_data(data, fieldnames)
            writer.writerow(entry)
Example 12
def summarize_authors(inputfiles, outputfile):
    print("Summarizing", len(inputfiles), "authors in", outputfile)
    with codecs.open(outputfile, 'w', encoding='utf-8') as csvfile:
        fieldnames = ['onb id', 'author name', 'gnd url', 'dbpedia id']

        writer = csv.DictWriter(csvfile,
                                delimiter=';',
                                fieldnames=fieldnames,
                                lineterminator='\n')
        writer.writeheader()
        for filename, record in read_records(inputfiles):
            data = json.loads(record)
            entries = summarize_authors_data(data, fieldnames)
            writer.writerows(entries)
Example 13
def summarize_titles(inputfiles, outputfile):
    print("Summarizing", len(inputfiles), "titles in", outputfile)
    with codecs.open(outputfile, 'w', encoding='utf-8') as csvfile:
        fieldnames = ['gnd', 'author', 'subject', 'title']

        writer = csv.DictWriter(csvfile,
                                delimiter=';',
                                fieldnames=fieldnames,
                                lineterminator='\n')
        writer.writeheader()
        for filename, record in read_records(inputfiles):
            data = json.loads(record)
            entry = summarize_titles_data(data, fieldnames)
            writer.writerow(entry)
Example 14
def summarize_records(inputfiles, outputfile):
    print("Summarizing", len(inputfiles), "records in", outputfile)
    with open(outputfile, 'w', newline='') as csvfile:
        fieldnames = ['id',
                      'links_artwork',
                      'persons',
                      'links_person_gnd',
                      'links_person_dbpedia',
                      'links_person_viaf',
                      'related_europeana_items']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for filename, record in read_records(inputfiles):
            data = json.loads(record)
            entry = summarize(data)
            writer.writerow(entry)
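Taken together, the snippets form a small normalize, enrich, summarize pipeline over record files. The following driver is purely illustrative, assuming the functions shown above are importable from one module; every path and glob pattern is a made-up placeholder, not taken from the original code.

import glob

# Purely illustrative wiring of the steps above; all paths are placeholders.
xml_files = sorted(glob.glob("data/raw/*.xml"))
normalize_records(xml_files, "data/normalized")

json_files = sorted(glob.glob("data/normalized/*.json"))
enrich_records(json_files, "data/enriched", force=False)

enriched_files = sorted(glob.glob("data/enriched/*_enriched.json"))
summarize_records(enriched_files, "data/summary.csv")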