def load_properties(): with codecs.open(WIKIDATA_PROP_FILE, 'w') as csvfile: writer = csv.DictWriter(csvfile, delimiter=';', fieldnames=wikidata_property_fieldnames, lineterminator='\n') writer.writeheader() for id in range(MAX_PROP_ID)[1:]: wikidata_response_json = common.is_stored_as_json_file( WIKIDATA_PROPERTY_DIR + common.SLASH + str(id)) store = True if (wikidata_response_json == None): print 'file not exists for property:', id wikidata_response = retrieve_wikidata_property(id) try: wikidata_response_json = json.loads( wikidata_response.content) store_wikidata_property(id, wikidata_response_json) except Exception as ex: store = False print 'property response error. ID:', id, ex if store: entry = build_wikidata_property_entry(wikidata_response_json, id) with open(WIKIDATA_PROP_FILE, 'ab') as csvfile: writer = csv.DictWriter( csvfile, delimiter=';', fieldnames=wikidata_property_fieldnames, lineterminator='\n') writer.writerow(entry)
def retrieve_wikidata_compositions_by_freebase_id(inputfile):
    """For each row of the input CSV, look up the wikidata composition by its
    freebase id and cache the composition data as a local JSON file.

    NOTE(review): the source was collapsed onto one line; the indentation below
    is a reconstruction. The cache-miss branch only logs (the actual retrieval
    is commented out), so store_wikidata_composition_data() is assumed to run
    for every hit with items -- confirm against version control history.
    """
    summary = summarize.read_csv_summary(inputfile)
    for row in summary[1:]:  # ignore first row, which is a header
        FREEBASE_ID_COL = 1
        print row[FREEBASE_ID_COL]
        wikidata_composition_response = retrieve_wikidata_composition_by_freebase_id(
            row[FREEBASE_ID_COL])
        print wikidata_composition_response
        try:
            wikidata_composition_response_json = json.loads(
                wikidata_composition_response.content)
            items = wikidata_composition_response_json[ITEMS_JSON]
            if len(items) > 0:
                # first matching wikidata item is taken as the composition id
                wikidata_composition_id = items[0]
                print 'wikidata_composition_id:', wikidata_composition_id
                # cache key is the full API query URL string
                composition_response_json = common.is_stored_as_json_file(
                    WIKIDATA_API_URL + ITEMS_JSON + '[' +
                    str(wikidata_composition_id) + ']&' + PROPS_JSON + '=*')
                if (composition_response_json == None):
                    #inputfile = glob.glob(WIKIDATA_COMPOSITION_DATA_DIR + SLASH + str(wikidata_composition_id))
                    #if not inputfile:
                    print 'composition data not exists for composition:', wikidata_composition_id
                    #composition_response_json = retrieve_wikidata_composition_data(wikidata_composition_id)
                print 'composition json:', composition_response_json
                store_wikidata_composition_data(wikidata_composition_id,
                                                composition_response_json)
                # store_wikidata_composition_data(wikidata_composition_id, composition_response_json.content)
        except KeyError as ke:
            # response JSON had no ITEMS_JSON key for this freebase id
            print 'no composition items found:', row[FREEBASE_ID_COL], ke
def retrieve_wikidata_compositions_by_musicbrainz_id(inputfile, outputfile):
    """Map musicbrainz compositions to wikidata and VIAF ids.

    Reads the input CSV (musicbrainz id, author name, title), queries wikidata
    for each musicbrainz id, caches composition data, resolves the VIAF id, and
    writes one mapping row per input row to *outputfile*.

    NOTE(review): the source was collapsed onto one line; indentation below is
    a reconstruction. A mapping row is written even when no wikidata item is
    found (with wikidata_composition_id and viaf_id left at 0) -- confirm.
    """
    with codecs.open(outputfile, 'w') as csvfile:
        writer = csv.DictWriter(csvfile, delimiter=';',
                                fieldnames=common.map_compositions_fieldnames,
                                lineterminator='\n')
        writer.writeheader()
        summary = summarize.read_csv_summary(inputfile)
        for row in summary[1:]:  # ignore first row, which is a header
            MUSICBRAINZ_ID_COL = 0
            MUSICBRAINZ_AUTHOR_NAME_COL = 1
            MUSICBRAINZ_TITLE_COL = 2
            musicbrainz_id = row[MUSICBRAINZ_ID_COL]
            print musicbrainz_id
            wikidata_composition_response = retrieve_wikidata_composition_by_musicbrainz_id(
                musicbrainz_id)
            print wikidata_composition_response
            try:
                wikidata_composition_response_json = json.loads(
                    wikidata_composition_response.content)
                items = wikidata_composition_response_json[ITEMS_JSON]
                # defaults used in the output row when no wikidata item matches
                viaf_id = 0
                wikidata_composition_id = 0
                if len(items) > 0:
                    wikidata_composition_id = items[0]
                    print 'wikidata_composition_id:', wikidata_composition_id
                    # cache key is the full API query URL string
                    composition_response_json = common.is_stored_as_json_file(
                        WIKIDATA_API_URL + ITEMS_JSON + '[' +
                        str(wikidata_composition_id) + ']&' + PROPS_JSON + '=*')
                    if (composition_response_json == None):
                        print 'composition data not exists for composition:', wikidata_composition_id
                    print 'composition json:', composition_response_json
                    store_wikidata_composition_data(
                        wikidata_composition_id, composition_response_json)
                    wikidata_composition_viaf_response = retrieve_wikidata_composition_viaf_id_by_wikidata_id(
                        wikidata_composition_id)
                    try:
                        #wikidata_composition_viaf_response_json = wikidata_composition_viaf_response.json()
                        wikidata_composition_viaf_response_json = json.loads(
                            wikidata_composition_viaf_response.content)
                        #items = wikidata_composition_response_json[ITEMS_JSON]
                        viaf_id = extract_viaf_id_from_wikidata_composition_id(
                            wikidata_composition_viaf_response_json)
                    except:
                        # NOTE(review): bare except also hides parse errors;
                        # viaf_id stays 0 in that case
                        print 'No VIAF id found for composition ID:', wikidata_composition_id
                print 'viaf id:', viaf_id
                entry = build_composition_mapping_entry(
                    row[MUSICBRAINZ_TITLE_COL],
                    row[MUSICBRAINZ_AUTHOR_NAME_COL],
                    wikidata_composition_id, viaf_id, musicbrainz_id)
                writer.writerow(entry)
            except KeyError as ke:
                # response JSON had no ITEMS_JSON key for this musicbrainz id
                print 'no composition items found:', row[
                    MUSICBRAINZ_ID_COL], ke
def retrieve_wikidata_objects_by_internet_archive_id(inputfile, outputfile):
    """Map internet-archive band entries to wikidata and musicbrainz ids.

    Reads the input CSV (internet-archive id path, band name), queries wikidata
    for each internet-archive id, caches band data on a miss, extracts the
    musicbrainz artist id, and writes one mapping row per input row.

    NOTE(review): source was collapsed onto one line; indentation below is a
    reconstruction. The musicbrainz extraction uses the fresh-retrieval
    variable (wikidata_band_data_response_json), so it can only live inside
    the cache-miss branch; on a cache hit musicbrainz_id stays 0 -- confirm.
    """
    with codecs.open(outputfile, 'w') as csvfile:
        writer = csv.DictWriter(csvfile, delimiter=';',
                                fieldnames=common.map_band_fieldnames,
                                lineterminator='\n')
        writer.writeheader()
        summary = summarize.read_csv_summary(inputfile)
        for row in summary[1:]:  # ignore first row, which is a header
            INTERNET_ARCHIVE_ID_COL = 0
            BAND_NAME_COL = 1
            internet_archive_id_path = row[INTERNET_ARCHIVE_ID_COL]
            # the id is the last path segment of the stored id path
            internet_archive_id = internet_archive_id_path.split("/")[-1]
            print "internet_archive_id:", internet_archive_id
            wikidata_object_response = retrieve_wikidata_object_by_internet_archive_id(
                internet_archive_id)
            print wikidata_object_response
            try:
                wikidata_object_response_json = json.loads(
                    wikidata_object_response.content)
                items = wikidata_object_response_json[ITEMS_JSON]
                # defaults used in the output row when no wikidata item matches
                wikidata_band_id = 0
                musicbrainz_id = 0
                if len(items) > 0:
                    wikidata_band_id = items[0]
                    print 'wikidata_band_id:', wikidata_band_id
                    # cache key is the full API query URL string
                    wikidata_band_response_json = common.is_stored_as_json_file(
                        WIKIDATA_API_URL + ITEMS_JSON + '[' +
                        str(wikidata_band_id) + ']&' + PROPS_JSON + '=*')
                    if (wikidata_band_response_json == None):
                        print 'band data not exists for id:', wikidata_band_id
                        band_data_response = retrieve_wikidata_band_data(
                            wikidata_band_id)
                        wikidata_band_data_response_json = common.validate_response_json(
                            band_data_response)
                        store_wikidata_band_data(wikidata_band_id,
                                                 wikidata_band_data_response_json)
                        print 'band json:', wikidata_band_data_response_json
                        try:
                            musicbrainz_id = extract_property_value(
                                wikidata_band_data_response_json,
                                MUSIC_BRAINZ_ARTIST_ID_PROP)
                        except:
                            # NOTE(review): bare except; musicbrainz_id stays 0
                            print 'No musicbrainz id found for band ID:', wikidata_band_id
                print 'musicbrainz id:', musicbrainz_id
                entry = build_band_mapping_entry(row[BAND_NAME_COL],
                                                 wikidata_band_id,
                                                 internet_archive_id,
                                                 musicbrainz_id)
                writer.writerow(entry)
            except KeyError as ke:
                # response JSON had no ITEMS_JSON key for this archive id
                print 'no composition items found:', row[
                    INTERNET_ARCHIVE_ID_COL], ke
def store_compositions_data(composition_id, response): filename = str(composition_id).replace(FREEBASE_ID_PREFIX,'') + common.JSON_EXT response_json = common.is_stored_as_json_file(FREEBASE_COMPOSITIONS_DATA_DIR + common.SLASH + filename) if(response_json == None): #inputfile = glob.glob(FREEBASE_COMPOSITIONS_DATA_DIR + SLASH + filename) #if not inputfile: print 'composition not exists for ID:', composition_id common.write_json_file(FREEBASE_COMPOSITIONS_DATA_DIR, filename, response)
def store_compositions(author_id, response): filename = str(author_id).replace(FREEBASE_ID_PREFIX, '') + common.JSON_EXT response_json = common.is_stored_as_json_file(FREEBASE_COMPOSITIONS_DIR + common.SLASH + filename) if (response_json == None): #inputfile = glob.glob(FREEBASE_COMPOSITIONS_DIR + SLASH + filename) #if not inputfile: print 'composition not exists for author:', author_id common.write_json_file(FREEBASE_COMPOSITIONS_DIR, filename, response)
def get_wikidata_author_id_by_gnd(gnd, line):
    """Resolve the wikidata author id for a GND id, using the local cache.

    NOTE(review): a second definition of this function appears later in the
    file and overrides this one -- reconcile the duplicates. Source was
    collapsed onto one line; the extract/print/store calls are reconstructed
    at function level so a cache hit still yields an id (placing them inside
    the if-branch would raise NameError on a hit) -- TODO confirm.
    """
    row = line.split(";")
    # cache lookup keyed by '<onb-id>_<gnd>*' in WIKIDATA_AUTHOR_DIR
    wikidata_author_id_response_json = common.is_stored_as_json_file(WIKIDATA_AUTHOR_DIR + common.SLASH + row[ONB_COL] + common.UNDERSCORE + gnd + '*')
    if(wikidata_author_id_response_json == None):
        print 'onb_wikidata not exists for ONB:', row[ONB_COL]
        wikidata_author_id_response = retrieve_wikidata_author_id(gnd)
        wikidata_author_id_response_json = wikidata_author_id_response.json()
    wikidata_author_id = extract_wikidata_author_id(wikidata_author_id_response_json)
    print 'wikidata_author_id', wikidata_author_id
    store_wikidata_author_id(line, wikidata_author_id, gnd, wikidata_author_id_response_json)
    return wikidata_author_id
def store_author_data(writer, gnd, gnd_cache, line):
    """Resolve a GND to a wikidata author, cache the author data, and write
    one author CSV row via *writer*.

    NOTE(review): two later definitions of this function override this one --
    reconcile the duplicates. Also note gnd_cache is checked for the GND but
    appended with the wikidata id, so the two id spaces share one list --
    presumably intentional de-duplication, verify against callers.
    Source was collapsed onto one line; indentation is reconstructed.
    """
    if gnd not in gnd_cache:
        wikidata_author_id = get_wikidata_author_id_by_gnd(gnd, line)
        if(wikidata_author_id and wikidata_author_id not in gnd_cache):
            gnd_cache.append(wikidata_author_id)
            wikidata_author_data_response_json = common.is_stored_as_json_file(
                WIKIDATA_AUTHOR_DATA_DIR + common.SLASH + str(wikidata_author_id) + '*')
            if(wikidata_author_data_response_json == None):
                print 'wikidata not exists for wikidata author ID:', wikidata_author_id
                author_data_response = retrieve_wikidata_author_data(wikidata_author_id)
                wikidata_author_data_response_json = json.loads(author_data_response.content)
                store_wikidata_author_data(wikidata_author_id, wikidata_author_data_response_json)
            # claims of entity 'Q<id>' feed the CSV entry builder
            property_dict = wikidata_author_data_response_json['entities']['Q'+str(wikidata_author_id)]['claims']
            entry = build_wikidata_author_entry(property_dict, line, wikidata_author_id)
            writer.writerow(entry)
def get_wikidata_author_id_by_gnd(gnd, line):
    """Resolve the wikidata author id for a GND id, using the local cache.

    This redefines (and overrides) the earlier identical function in this
    file -- reconcile the duplicates.
    NOTE(review): source was collapsed onto one line; the extract/print/store
    calls are reconstructed at function level so a cache hit still yields an
    id (inside the if-branch they would NameError on a hit) -- TODO confirm.
    """
    row = line.split(";")
    # cache lookup keyed by '<onb-id>_<gnd>*' in WIKIDATA_AUTHOR_DIR
    wikidata_author_id_response_json = common.is_stored_as_json_file(
        WIKIDATA_AUTHOR_DIR + common.SLASH + row[ONB_COL] +
        common.UNDERSCORE + gnd + '*')
    if (wikidata_author_id_response_json == None):
        print 'onb_wikidata not exists for ONB:', row[ONB_COL]
        wikidata_author_id_response = retrieve_wikidata_author_id(gnd)
        wikidata_author_id_response_json = wikidata_author_id_response.json()
    wikidata_author_id = extract_wikidata_author_id(
        wikidata_author_id_response_json)
    print 'wikidata_author_id', wikidata_author_id
    store_wikidata_author_id(line, wikidata_author_id, gnd,
                             wikidata_author_id_response_json)
    return wikidata_author_id
def store_author_data(writer, gnd, gnd_cache, line):
    """Resolve a GND to a wikidata author, cache the author data, and write
    one author CSV row via *writer*.

    NOTE(review): this is the second of three definitions of this name; a
    later one overrides it. This variant differs from the others: it uses
    common.WIKIDATA_AUTHOR_DATA_DIR, validates via common.validate_response_json,
    and passes the whole response JSON (not the claims dict) to
    build_wikidata_author_entry. Source was collapsed onto one line;
    indentation is reconstructed.
    """
    if gnd not in gnd_cache:
        wikidata_author_id = get_wikidata_author_id_by_gnd(gnd, line)
        if (wikidata_author_id and wikidata_author_id not in gnd_cache):
            # NOTE(review): cache is checked for the GND but appended with the
            # wikidata id -- two id spaces share one list, verify intent
            gnd_cache.append(wikidata_author_id)
            wikidata_author_data_response_json = common.is_stored_as_json_file(
                common.WIKIDATA_AUTHOR_DATA_DIR + common.SLASH +
                str(wikidata_author_id) + '*')
            if (wikidata_author_data_response_json == None):
                print 'wikidata not exists for wikidata author ID:', wikidata_author_id
                author_data_response = retrieve_wikidata_author_data(
                    wikidata_author_id)
                wikidata_author_data_response_json = common.validate_response_json(
                    author_data_response)  #author_data_response.json()
                store_wikidata_author_data(wikidata_author_id,
                                           wikidata_author_data_response_json)
            entry = build_wikidata_author_entry(
                wikidata_author_data_response_json, line, wikidata_author_id)
            writer.writerow(entry)
def store_author_data(writer, gnd, gnd_cache, line):
    """Resolve a GND to a wikidata author, cache the author data, and write
    one author CSV row via *writer*.

    This is the third definition of this name in the file and the one that
    takes effect at import time -- reconcile the duplicates.
    NOTE(review): source was collapsed onto one line; indentation is
    reconstructed. gnd_cache is checked for the GND but appended with the
    wikidata id, mixing two id spaces in one list -- verify intent.
    """
    if gnd not in gnd_cache:
        wikidata_author_id = get_wikidata_author_id_by_gnd(gnd, line)
        if (wikidata_author_id and wikidata_author_id not in gnd_cache):
            gnd_cache.append(wikidata_author_id)
            wikidata_author_data_response_json = common.is_stored_as_json_file(
                WIKIDATA_AUTHOR_DATA_DIR + common.SLASH +
                str(wikidata_author_id) + '*')
            if (wikidata_author_data_response_json == None):
                print 'wikidata not exists for wikidata author ID:', wikidata_author_id
                author_data_response = retrieve_wikidata_author_data(
                    wikidata_author_id)
                wikidata_author_data_response_json = json.loads(
                    author_data_response.content)
                store_wikidata_author_data(wikidata_author_id,
                                           wikidata_author_data_response_json)
            # claims of entity 'Q<id>' feed the CSV entry builder
            property_dict = wikidata_author_data_response_json['entities'][
                'Q' + str(wikidata_author_id)]['claims']
            entry = build_wikidata_author_entry(property_dict, line,
                                                wikidata_author_id)
            writer.writerow(entry)
def load_properties(): with codecs.open(WIKIDATA_PROP_FILE, 'w') as csvfile: writer = csv.DictWriter(csvfile, delimiter=';', fieldnames=wikidata_property_fieldnames, lineterminator='\n') writer.writeheader() for id in range(MAX_PROP_ID)[1:]: wikidata_response_json = common.is_stored_as_json_file(WIKIDATA_PROPERTY_DIR + common.SLASH + str(id)) store = True if(wikidata_response_json == None): print 'file not exists for property:', id wikidata_response = retrieve_wikidata_property(id) try: wikidata_response_json = json.loads(wikidata_response.content) store_wikidata_property(id, wikidata_response_json) except Exception as ex: store = False print 'property response error. ID:', id, ex if store: entry = build_wikidata_property_entry(wikidata_response_json, id) with open(WIKIDATA_PROP_FILE, 'ab') as csvfile: writer = csv.DictWriter(csvfile, delimiter=';', fieldnames=wikidata_property_fieldnames, lineterminator='\n') writer.writerow(entry)