import codecs
import csv
import json
import os

import common  # project-local module; provides sameas_fieldnames (not shown in this excerpt)


def normalize_records(inputfiles, outputdir):
    print("Normalizing", len(inputfiles), "records to", outputdir)
    for index, (filename, record) in enumerate(read_records(inputfiles)):
        progress((index + 1) / len(inputfiles))
        normalized_record = normalize(record)
        # Match the extension including the dot so other "xml" substrings
        # in the filename survive the replacement.
        out_file = os.path.basename(filename).replace(".xml", ".json")
        write_json_file(outputdir, out_file, normalized_record)
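# normalize_records and the functions below depend on a few helpers that are
# not part of this excerpt. The sketches below are assumptions inferred from
# the call sites, not the project's real implementations.

def read_records(inputfiles):
    # Yield (filename, file contents) pairs, one per input file.
    for filename in inputfiles:
        with codecs.open(filename, 'r', encoding='utf-8') as f:
            yield filename, f.read()


def progress(fraction):
    # Print a simple in-place percentage indicator (fraction in [0, 1]).
    print("\r{:.0%}".format(fraction), end='', flush=True)


def write_json_file(outputdir, out_file, record):
    # Serialize a (JSON-serializable) record as UTF-8 JSON under outputdir.
    with codecs.open(os.path.join(outputdir, out_file), 'w',
                     encoding='utf-8') as f:
        json.dump(record, f, indent=2, ensure_ascii=False)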
def summarize_records(inputfiles):
    uris = []
    for filename, record in read_records(inputfiles):
        data = json.loads(record)
        # In this variant, summarize() is assumed to return the GND URIs
        # found in one record; the original one-liner dropped its return
        # value and never defined uris.
        uris.extend(summarize(data))
    uris_unique = sorted(set(uris))
    print(len(uris_unique), "different GND references:")
    for uri in uris_unique:
        print(uri)
def summarize_sameas(inputfiles, outputfile):
    print("Summarizing", len(inputfiles), "authors in", outputfile)
    with codecs.open(outputfile, 'w', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, delimiter=';',
                                fieldnames=common.sameas_fieldnames,
                                lineterminator='\n')
        writer.writeheader()
        for filename, record in read_records(inputfiles):
            # json.loads() no longer accepts an encoding argument in
            # Python 3; record is already a decoded str here.
            data = json.loads(record)
            entries = summarize_sameas_data(data, common.sameas_fieldnames)
            writer.writerows(entries)
def enrich_records(inputfiles, outputdir, force=False):
    print("Enriching", len(inputfiles), "records. Saving to", outputdir)
    for index, (filename, record) in enumerate(read_records(inputfiles)):
        progress((index + 1) / len(inputfiles))
        out_file = os.path.basename(filename).replace(".json", "_enriched.json")
        out_path = os.path.join(outputdir, out_file)
        # Skip records that were already enriched unless force is set.
        if os.path.exists(out_path) and not force:
            print(out_file, "already enriched. Skipping...")
        else:
            enriched_record = enrich(record)
            write_json_file(outputdir, out_file, enriched_record)
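# Sketch of a re-run that uses the force flag; the glob pattern and directory
# names are illustrative assumptions, not taken from the original project.
import glob

def reenrich_all():
    # Without force=True, enrich_records() skips every record whose
    # *_enriched.json output already exists; with it, enrich() runs again
    # for all records and overwrites the previous output.
    enrich_records(sorted(glob.glob("normalized/*.json")), "enriched",
                   force=True)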
def summarize_records(inputfiles, outputfile):
    print("Summarizing", len(inputfiles), "records in", outputfile)
    # newline='' prevents the csv module from writing blank lines on Windows.
    with open(outputfile, 'w', newline='') as csvfile:
        fieldnames = ['id', 'links_artwork', 'persons', 'links_person_gnd',
                      'links_person_dbpedia', 'links_person_viaf',
                      'related_europeana_items']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for filename, record in read_records(inputfiles):
            data = json.loads(record)
            entry = summarize(data)
            writer.writerow(entry)
def summarize_authors(inputfiles, outputfile):
    print("Summarizing", len(inputfiles), "authors in", outputfile)
    with codecs.open(outputfile, 'w', encoding='utf-8') as csvfile:
        fieldnames = ['onb id', 'author name', 'gnd url', 'dbpedia id']
        writer = csv.DictWriter(csvfile, delimiter=';', fieldnames=fieldnames,
                                lineterminator='\n')
        writer.writeheader()
        for filename, record in read_records(inputfiles):
            data = json.loads(record)  # encoding kwarg dropped: Python 3 takes str
            entries = summarize_authors_data(data, fieldnames)
            writer.writerows(entries)
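# The real summarize_authors_data is not part of this excerpt. This sketch
# only illustrates its contract: DictWriter.writerows() expects one dict per
# row, keyed exactly by the fieldnames. The record layout assumed here is a
# guess for illustration, not the project's actual structure.
def summarize_authors_data_sketch(data, fieldnames):
    rows = []
    for person in data.get('persons', []):  # assumed record structure
        rows.append({
            'onb id': data.get('id', ''),
            'author name': person.get('name', ''),
            'gnd url': person.get('gnd', ''),
            'dbpedia id': person.get('dbpedia', ''),
        })
    return rows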
def summarize_titles(inputfiles, outputfile):
    print("Summarizing", len(inputfiles), "titles in", outputfile)
    with codecs.open(outputfile, 'w', encoding='utf-8') as csvfile:
        fieldnames = ['gnd', 'author', 'subject', 'title']
        writer = csv.DictWriter(csvfile, delimiter=';', fieldnames=fieldnames,
                                lineterminator='\n')
        writer.writeheader()
        for filename, record in read_records(inputfiles):
            data = json.loads(record)
            entry = summarize_titles_data(data, fieldnames)
            writer.writerow(entry)
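# A possible end-to-end run chaining the steps above: XML records are
# normalized to JSON, the JSON is enriched, and the result is summarized to
# CSV. All paths and glob patterns here are illustrative assumptions.
import glob

if __name__ == "__main__":
    normalize_records(sorted(glob.glob("raw/*.xml")), "normalized")
    enrich_records(sorted(glob.glob("normalized/*.json")), "enriched")
    summarize_records(sorted(glob.glob("enriched/*_enriched.json")),
                      "summary.csv")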