Ejemplo n.º 1
0
def load_properties():

    with codecs.open(WIKIDATA_PROP_FILE, 'w') as csvfile:
        writer = csv.DictWriter(csvfile,
                                delimiter=';',
                                fieldnames=wikidata_property_fieldnames,
                                lineterminator='\n')
        writer.writeheader()
        for id in range(MAX_PROP_ID)[1:]:
            wikidata_response_json = common.is_stored_as_json_file(
                WIKIDATA_PROPERTY_DIR + common.SLASH + str(id))
            store = True
            if (wikidata_response_json == None):
                print 'file not exists for property:', id
                wikidata_response = retrieve_wikidata_property(id)
                try:
                    wikidata_response_json = json.loads(
                        wikidata_response.content)
                    store_wikidata_property(id, wikidata_response_json)
                except Exception as ex:
                    store = False
                    print 'property response error. ID:', id, ex

            if store:
                entry = build_wikidata_property_entry(wikidata_response_json,
                                                      id)
                with open(WIKIDATA_PROP_FILE, 'ab') as csvfile:
                    writer = csv.DictWriter(
                        csvfile,
                        delimiter=';',
                        fieldnames=wikidata_property_fieldnames,
                        lineterminator='\n')
                    writer.writerow(entry)
Ejemplo n.º 2
0
def retrieve_wikidata_compositions_by_freebase_id(inputfile):

    summary = summarize.read_csv_summary(inputfile)
    for row in summary[1:]:  # ignore first row, which is a header
        FREEBASE_ID_COL = 1
        print row[FREEBASE_ID_COL]
        wikidata_composition_response = retrieve_wikidata_composition_by_freebase_id(
            row[FREEBASE_ID_COL])
        print wikidata_composition_response
        try:
            wikidata_composition_response_json = json.loads(
                wikidata_composition_response.content)
            items = wikidata_composition_response_json[ITEMS_JSON]
            if len(items) > 0:
                wikidata_composition_id = items[0]
                print 'wikidata_composition_id:', wikidata_composition_id
                composition_response_json = common.is_stored_as_json_file(
                    WIKIDATA_API_URL + ITEMS_JSON + '[' +
                    str(wikidata_composition_id) + ']&' + PROPS_JSON + '=*')
                if (composition_response_json == None):
                    #inputfile = glob.glob(WIKIDATA_COMPOSITION_DATA_DIR + SLASH + str(wikidata_composition_id))
                    #if not inputfile:
                    print 'composition data not exists for composition:', wikidata_composition_id
                    #composition_response_json = retrieve_wikidata_composition_data(wikidata_composition_id)
                    print 'composition json:', composition_response_json
                    store_wikidata_composition_data(wikidata_composition_id,
                                                    composition_response_json)
#                    store_wikidata_composition_data(wikidata_composition_id, composition_response_json.content)
        except KeyError as ke:
            print 'no composition items found:', row[FREEBASE_ID_COL], ke
Ejemplo n.º 3
0
def retrieve_wikidata_compositions_by_musicbrainz_id(inputfile, outputfile):

    with codecs.open(outputfile, 'w') as csvfile:
        writer = csv.DictWriter(csvfile,
                                delimiter=';',
                                fieldnames=common.map_compositions_fieldnames,
                                lineterminator='\n')
        writer.writeheader()

        summary = summarize.read_csv_summary(inputfile)
        for row in summary[1:]:  # ignore first row, which is a header
            MUSICBRAINZ_ID_COL = 0
            MUSICBRAINZ_AUTHOR_NAME_COL = 1
            MUSICBRAINZ_TITLE_COL = 2
            musicbrainz_id = row[MUSICBRAINZ_ID_COL]
            print musicbrainz_id
            wikidata_composition_response = retrieve_wikidata_composition_by_musicbrainz_id(
                musicbrainz_id)
            print wikidata_composition_response
            try:
                wikidata_composition_response_json = json.loads(
                    wikidata_composition_response.content)
                items = wikidata_composition_response_json[ITEMS_JSON]
                viaf_id = 0
                wikidata_composition_id = 0
                if len(items) > 0:
                    wikidata_composition_id = items[0]
                    print 'wikidata_composition_id:', wikidata_composition_id
                    composition_response_json = common.is_stored_as_json_file(
                        WIKIDATA_API_URL + ITEMS_JSON + '[' +
                        str(wikidata_composition_id) + ']&' + PROPS_JSON +
                        '=*')
                    if (composition_response_json == None):
                        print 'composition data not exists for composition:', wikidata_composition_id
                        print 'composition json:', composition_response_json
                        store_wikidata_composition_data(
                            wikidata_composition_id, composition_response_json)
                    wikidata_composition_viaf_response = retrieve_wikidata_composition_viaf_id_by_wikidata_id(
                        wikidata_composition_id)
                    try:
                        #wikidata_composition_viaf_response_json = wikidata_composition_viaf_response.json()
                        wikidata_composition_viaf_response_json = json.loads(
                            wikidata_composition_viaf_response.content)
                        #items = wikidata_composition_response_json[ITEMS_JSON]
                        viaf_id = extract_viaf_id_from_wikidata_composition_id(
                            wikidata_composition_viaf_response_json)
                    except:
                        print 'No VIAF id found for composition ID:', wikidata_composition_id

                    print 'viaf id:', viaf_id

                entry = build_composition_mapping_entry(
                    row[MUSICBRAINZ_TITLE_COL],
                    row[MUSICBRAINZ_AUTHOR_NAME_COL], wikidata_composition_id,
                    viaf_id, musicbrainz_id)
                writer.writerow(entry)
            except KeyError as ke:
                print 'no composition items found:', row[
                    MUSICBRAINZ_ID_COL], ke
Ejemplo n.º 4
0
def retrieve_wikidata_objects_by_internet_archive_id(inputfile, outputfile):

    with codecs.open(outputfile, 'w') as csvfile:
        writer = csv.DictWriter(csvfile,
                                delimiter=';',
                                fieldnames=common.map_band_fieldnames,
                                lineterminator='\n')
        writer.writeheader()

        summary = summarize.read_csv_summary(inputfile)
        for row in summary[1:]:  # ignore first row, which is a header
            INTERNET_ARCHIVE_ID_COL = 0
            BAND_NAME_COL = 1
            internet_archive_id_path = row[INTERNET_ARCHIVE_ID_COL]
            internet_archive_id = internet_archive_id_path.split("/")[-1]
            print "internet_archive_id:", internet_archive_id
            wikidata_object_response = retrieve_wikidata_object_by_internet_archive_id(
                internet_archive_id)
            print wikidata_object_response
            try:
                wikidata_object_response_json = json.loads(
                    wikidata_object_response.content)
                items = wikidata_object_response_json[ITEMS_JSON]
                wikidata_band_id = 0
                musicbrainz_id = 0
                if len(items) > 0:
                    wikidata_band_id = items[0]
                    print 'wikidata_band_id:', wikidata_band_id
                    wikidata_band_response_json = common.is_stored_as_json_file(
                        WIKIDATA_API_URL + ITEMS_JSON + '[' +
                        str(wikidata_band_id) + ']&' + PROPS_JSON + '=*')
                    if (wikidata_band_response_json == None):
                        print 'band data not exists for id:', wikidata_band_id
                        band_data_response = retrieve_wikidata_band_data(
                            wikidata_band_id)
                        wikidata_band_data_response_json = common.validate_response_json(
                            band_data_response)
                    store_wikidata_band_data(wikidata_band_id,
                                             wikidata_band_data_response_json)
                    print 'band json:', wikidata_band_data_response_json

                    try:
                        musicbrainz_id = extract_property_value(
                            wikidata_band_data_response_json,
                            MUSIC_BRAINZ_ARTIST_ID_PROP)
                    except:
                        print 'No musicbrainz id found for band ID:', wikidata_band_id

                    print 'musicbrainz id:', musicbrainz_id

                entry = build_band_mapping_entry(row[BAND_NAME_COL],
                                                 wikidata_band_id,
                                                 internet_archive_id,
                                                 musicbrainz_id)
                writer.writerow(entry)
            except KeyError as ke:
                print 'no composition items found:', row[
                    INTERNET_ARCHIVE_ID_COL], ke
Ejemplo n.º 5
0
def store_compositions_data(composition_id, response):

    filename = str(composition_id).replace(FREEBASE_ID_PREFIX,'') + common.JSON_EXT
    response_json = common.is_stored_as_json_file(FREEBASE_COMPOSITIONS_DATA_DIR + common.SLASH + filename)
    if(response_json == None):
    #inputfile = glob.glob(FREEBASE_COMPOSITIONS_DATA_DIR + SLASH + filename)
    #if not inputfile:
        print 'composition not exists for ID:', composition_id
        common.write_json_file(FREEBASE_COMPOSITIONS_DATA_DIR, filename, response)
Ejemplo n.º 6
0
def store_compositions(author_id, response):

    filename = str(author_id).replace(FREEBASE_ID_PREFIX, '') + common.JSON_EXT
    response_json = common.is_stored_as_json_file(FREEBASE_COMPOSITIONS_DIR +
                                                  common.SLASH + filename)
    if (response_json == None):
        #inputfile = glob.glob(FREEBASE_COMPOSITIONS_DIR + SLASH + filename)
        #if not inputfile:
        print 'composition not exists for author:', author_id
        common.write_json_file(FREEBASE_COMPOSITIONS_DIR, filename, response)
Ejemplo n.º 7
0
def get_wikidata_author_id_by_gnd(gnd, line):

    row = line.split(";")
    wikidata_author_id_response_json = common.is_stored_as_json_file(WIKIDATA_AUTHOR_DIR + common.SLASH
                                                                     + row[ONB_COL] + common.UNDERSCORE + gnd + '*')
    if(wikidata_author_id_response_json == None):
        print 'onb_wikidata not exists for ONB:', row[ONB_COL]
        wikidata_author_id_response = retrieve_wikidata_author_id(gnd)
        wikidata_author_id_response_json = wikidata_author_id_response.json()
    wikidata_author_id = extract_wikidata_author_id(wikidata_author_id_response_json)
    print 'wikidata_author_id', wikidata_author_id
    store_wikidata_author_id(line, wikidata_author_id, gnd, wikidata_author_id_response_json)
    return wikidata_author_id
Ejemplo n.º 8
0
def store_author_data(writer, gnd, gnd_cache, line):

    if gnd not in gnd_cache:
        wikidata_author_id = get_wikidata_author_id_by_gnd(gnd, line)
        if(wikidata_author_id and wikidata_author_id not in gnd_cache):
            gnd_cache.append(wikidata_author_id)
            wikidata_author_data_response_json = common.is_stored_as_json_file(
                WIKIDATA_AUTHOR_DATA_DIR + common.SLASH + str(wikidata_author_id) + '*')
            if(wikidata_author_data_response_json == None):
                print 'wikidata not exists for wikidata author ID:', wikidata_author_id
                author_data_response = retrieve_wikidata_author_data(wikidata_author_id)
                wikidata_author_data_response_json = json.loads(author_data_response.content)
            store_wikidata_author_data(wikidata_author_id, wikidata_author_data_response_json)
            property_dict = wikidata_author_data_response_json['entities']['Q'+str(wikidata_author_id)]['claims']
            entry = build_wikidata_author_entry(property_dict, line, wikidata_author_id)
            writer.writerow(entry)
Ejemplo n.º 9
0
def get_wikidata_author_id_by_gnd(gnd, line):

    row = line.split(";")
    wikidata_author_id_response_json = common.is_stored_as_json_file(
        WIKIDATA_AUTHOR_DIR + common.SLASH + row[ONB_COL] + common.UNDERSCORE +
        gnd + '*')
    if (wikidata_author_id_response_json == None):
        print 'onb_wikidata not exists for ONB:', row[ONB_COL]
        wikidata_author_id_response = retrieve_wikidata_author_id(gnd)
        wikidata_author_id_response_json = wikidata_author_id_response.json()
    wikidata_author_id = extract_wikidata_author_id(
        wikidata_author_id_response_json)
    print 'wikidata_author_id', wikidata_author_id
    store_wikidata_author_id(line, wikidata_author_id, gnd,
                             wikidata_author_id_response_json)
    return wikidata_author_id
Ejemplo n.º 10
0
def store_author_data(writer, gnd, gnd_cache, line):

    if gnd not in gnd_cache:
        wikidata_author_id = get_wikidata_author_id_by_gnd(gnd, line)
        if (wikidata_author_id and wikidata_author_id not in gnd_cache):
            gnd_cache.append(wikidata_author_id)
            wikidata_author_data_response_json = common.is_stored_as_json_file(
                common.WIKIDATA_AUTHOR_DATA_DIR + common.SLASH +
                str(wikidata_author_id) + '*')
            if (wikidata_author_data_response_json == None):
                print 'wikidata not exists for wikidata author ID:', wikidata_author_id
                author_data_response = retrieve_wikidata_author_data(
                    wikidata_author_id)
                wikidata_author_data_response_json = common.validate_response_json(
                    author_data_response)  #author_data_response.json()
            store_wikidata_author_data(wikidata_author_id,
                                       wikidata_author_data_response_json)
            entry = build_wikidata_author_entry(
                wikidata_author_data_response_json, line, wikidata_author_id)
            writer.writerow(entry)
Ejemplo n.º 11
0
def store_author_data(writer, gnd, gnd_cache, line):

    if gnd not in gnd_cache:
        wikidata_author_id = get_wikidata_author_id_by_gnd(gnd, line)
        if (wikidata_author_id and wikidata_author_id not in gnd_cache):
            gnd_cache.append(wikidata_author_id)
            wikidata_author_data_response_json = common.is_stored_as_json_file(
                WIKIDATA_AUTHOR_DATA_DIR + common.SLASH +
                str(wikidata_author_id) + '*')
            if (wikidata_author_data_response_json == None):
                print 'wikidata not exists for wikidata author ID:', wikidata_author_id
                author_data_response = retrieve_wikidata_author_data(
                    wikidata_author_id)
                wikidata_author_data_response_json = json.loads(
                    author_data_response.content)
            store_wikidata_author_data(wikidata_author_id,
                                       wikidata_author_data_response_json)
            property_dict = wikidata_author_data_response_json['entities'][
                'Q' + str(wikidata_author_id)]['claims']
            entry = build_wikidata_author_entry(property_dict, line,
                                                wikidata_author_id)
            writer.writerow(entry)
Ejemplo n.º 12
0
def load_properties():

    with codecs.open(WIKIDATA_PROP_FILE, 'w') as csvfile:
        writer = csv.DictWriter(csvfile, delimiter=';', fieldnames=wikidata_property_fieldnames, lineterminator='\n')
        writer.writeheader()
        for id in range(MAX_PROP_ID)[1:]:
            wikidata_response_json = common.is_stored_as_json_file(WIKIDATA_PROPERTY_DIR + common.SLASH + str(id))
            store = True
            if(wikidata_response_json == None):
                print 'file not exists for property:', id
                wikidata_response = retrieve_wikidata_property(id)
                try:
                    wikidata_response_json = json.loads(wikidata_response.content)
                    store_wikidata_property(id, wikidata_response_json)
                except Exception as ex:
                    store = False
                    print 'property response error. ID:', id, ex

            if store:
                entry = build_wikidata_property_entry(wikidata_response_json, id)
                with open(WIKIDATA_PROP_FILE, 'ab') as csvfile:
                    writer = csv.DictWriter(csvfile, delimiter=';', fieldnames=wikidata_property_fieldnames, lineterminator='\n')
                    writer.writerow(entry)