def artist_solo_end_activity_year(artist_wikidata_id, artist_type) -> 'year':
    """Extract the year in which a solo artist ended their musical career.

    Queries the "work period (end)" property (wdt:P2032) of the entity and
    keeps only the year part of the returned date.

    Arguments:
        artist_wikidata_id {dict} -- Mapping whose 'value' entry is the entity
            reference spliced into the SPARQL query. May be None
        artist_type {dict} -- Mapping whose 'value' entry is the artist type.
            Only 'Person' is processed. May be None

    Returns:
        dict -- {'value': year}, where year is a 4 digit string (Eg. '1994').
        None when an argument is None, when the artist is not a 'Person', when
        the query yields zero or several values, or when the date does not
        match the expected pattern (a warning is logged in the last three
        cases).
    """
    # A missing id or a missing type both mean there is nothing to extract;
    # checking the type too avoids a TypeError on artist_type['value'].
    if artist_wikidata_id is None or artist_type is None:
        return None
    if artist_type['value'] != 'Person':
        return None

    query = "select ?y where {" + \
        artist_wikidata_id['value'] + " wdt:P2032 ?y .}"
    results = query_sparql(query)

    if len(results) == 0:
        logging.getLogger('root.features').warning(
            f"No attribute work period (end) associated with entity {artist_wikidata_id['value']}"
        )
        return None

    if len(results) > 1:
        # Ambiguous data: do not pick one of the values arbitrarily
        logging.getLogger('root.features').warning(
            f"Found more than one value for work period (end) for entity {artist_wikidata_id['value']}, skipping"
        )
        return None

    date = results[0]['y']['value']

    # Only accept dates shaped like 1994-01-01T00:00:00Z
    if not re.match(r"^\d{4}-\d{2}-\d{2}T00:00:00Z$", date):
        logging.getLogger('root.features').warning(
            f"Date {date} does not match pattern")
        return None

    year = date.split('-')[0]
    return {'value': year}
Ejemplo n.º 2
0
def build_genres_musicbrainz_to_wikidata_dict():
    """Build and save the musicbrainz id -> wikidata id dictionary for genres

    The values of the saved musicbrainz genres dictionary are the candidate
    musicbrainz ids. Each of them is searched as a wdt:P8052 value, and it is
    kept (as a "wd:<id>" string) only when exactly one entity is found. The
    result is written with np.save to genres_musicbrainz_to_wikidata inside
    preprocessed_dataset_path.

    Raises:
        FileNotFoundError: When the musicbrainz genres dictionary file is
        missing. Its values are the keys of the dictionary built here
    """
    sys.setrecursionlimit(10000)

    try:
        genres_dictionary = np.load(
            f'{preprocessed_dataset_path}/musicbrainz_genres_dictionary.npy',
            allow_pickle=True).item()
    except FileNotFoundError:
        raise FileNotFoundError(
            "Musicbrainz genres dictionary is missing. Create using function build_genres_dictionary in genres_musicbrainz.py"
        )

    musicbrainz_to_wikidata = {}
    for musicbrainz_id in tqdm(list(genres_dictionary.values())):
        results = query_sparql('select ?i where{?i wdt:P8052 "' +
                               musicbrainz_id + '"}')

        # Zero or several matches are both unusable: warn and move on
        if len(results) != 1:
            logging.getLogger(
                'root.data.build_genres_musicbrainz_to_wikidata_dict'
            ).warning(
                f"Found {len(results)} correspondences for genre {musicbrainz_id} in wikidata. Skipping"
            )
            continue

        # Keep only the last path segment of the entity URI
        musicbrainz_to_wikidata[
            musicbrainz_id] = f"wd:{results[0]['i']['value'].split('/')[-1]}"

    np.save(f'{preprocessed_dataset_path}/genres_musicbrainz_to_wikidata',
            musicbrainz_to_wikidata)
def build_genres_ancestor_wikidata_dict():
    """Build and save the dictionary of ancestors of each wikidata genre

    The genres considered are the values of the saved
    genres_musicbrainz_to_wikidata dictionary. For each of them the wdt:P279
    values are collected as "wd:<id>" strings; genres without any value are
    left out and reported with a warning. The result is written with np.save
    to genres_ancestor_wikidata inside preprocessed_dataset_path.

    Raises:
        FileNotFoundError: When the file genres_musicbrainz_to_wikidata is
        missing. It states which wikidata genres can be encountered
    """
    sys.setrecursionlimit(10000)

    try:
        musicbrainz_to_wikidata = np.load(
            f'{preprocessed_dataset_path}/genres_musicbrainz_to_wikidata.npy',
            allow_pickle=True).item()
    except FileNotFoundError:
        raise FileNotFoundError(
            "Genres musicbrainz to wikidata dictionary is missing. Create using function build_genres_musicbrainz_to_wikidata_dict in genres_musicbrainz_to_wikidata.py"
        )

    ancestors_by_genre = {}
    for wikidata_id in tqdm(list(musicbrainz_to_wikidata.values())):
        results = query_sparql('select ?i where{' + wikidata_id +
                               ' wdt:P279 ?i}')

        if len(results) == 0:
            logging.getLogger('root.data.genres_ancestor_wikidata').warning(
                f"No ancestors found for the music genre {wikidata_id}")
            continue

        # Keep only the last path segment of every returned URI
        ancestors_by_genre[wikidata_id] = [
            f"wd:{r['i']['value'].split('/')[-1]}" for r in results
        ]

    np.save(f'{preprocessed_dataset_path}/genres_ancestor_wikidata',
            ancestors_by_genre)
def get_label_entity(entity):
    """Return the first English label found for an entity

    Arguments:
        entity {str} -- Entity reference, spliced verbatim into the SPARQL query

    Returns:
        The 'value' of the first rdfs:label binding whose language matches "en"

    Raises:
        ValueError: When the query returns no label for the entity
    """
    query = ('select ?l where {' + entity +
             ' rdfs:label ?l . FILTER(langMatches(lang(?l),"en")) }')
    labels = query_sparql(query)

    if len(labels) == 0:
        raise ValueError(f"Entity {entity} has not label")

    return labels[0]['l']['value']
Ejemplo n.º 5
0
# Ordered (keywords, award series) pairs. They are checked in this order and
# the first pair having a keyword contained in the ceremony label wins.
_AWARD_SERIES_KEYWORDS = (
    (('Grammy', ), 'Grammy Award'),
    (('MTV Video Music Awards', ), 'MTV Video Music Award'),
    (('MTV Music Awards', ), 'MTV Music Award'),
    (('American Music Awards', ), 'American Music Award'),
    (('World Music Awards', ), 'World Music Award'),
    (('Tony Award', ), 'Tony Award'),
    (('Golden Raspberry Awards', ), 'Golden Raspberry Award'),
    (('BRIT Awards', 'Brit Awards'), 'Brit Award'),
    (('BET', ), 'BET Award'),
    (("People's Choice Awards", ), "People's Choice Award"),
    (('Academy Award', ), 'Oscar'),
)


def _award_series_for_ceremony(ceremony_label):
    """Return the award series matching the ceremony label, None if unknown"""
    for keywords, series in _AWARD_SERIES_KEYWORDS:
        if any(keyword in ceremony_label for keyword in keywords):
            return series
    return None


def artist_awards(artist_wikidata_id) -> 'award_wikidata':
    """Extracts the list of awards won by that artist

    For every award (P166 statement with a P585 date) a dictionary is built
    with up to three fields:

    - year: The year the award was received, as a 4 digit string
    - award_id: Eg the wikidata id of Grammy award for best Song ("wd:<id>")
    - award_series: Eg Grammy Award, Brit award, .. Only present when the
      statement has an English ceremony label (P805) that contains one of the
      known keywords

    Arguments:
        artist_wikidata_id {dict} -- Mapping whose 'value' entry is the entity
            reference spliced into the SPARQL query

    Returns:
        list -- One {'value': award_dictionary} item per query result, or None
        when the query returns no result

    Raises:
        ValueError: When the date of an award does not match the pattern
        YYYY-MM-DDT00:00:00Z
    """
    entity = artist_wikidata_id['value']
    query = "SELECT ?award ?cerimony_label ?year {" + entity + " p:P166 ?a . \
        ?a ps:P166 ?award . ?a pq:P585 ?year . OPTIONAL {?a pq:P805 ?cerimony . \
        ?cerimony rdfs:label ?cerimony_label . FILTER (lang(?cerimony_label) = 'en')} }"

    results = query_sparql(query)
    if len(results) == 0:
        return None

    awards = []
    for r in results:
        d = {}

        year = r['year']['value']
        if not re.match(r"^\d{4}-\d{2}-\d{2}T00:00:00Z$", year):
            raise ValueError(
                f"Year {year} associated with Award of artist {artist_wikidata_id['value']} is not well formated"
            )
        d['year'] = year.split('-')[0]

        # Keep only the last path segment of the award URI
        award_id = r['award']['value']
        d['award_id'] = f"wd:{award_id.split('/')[-1]}"

        try:
            ceremony_label = r['cerimony_label']['value']
        except KeyError:
            # The ceremony is in an OPTIONAL clause, so it may be missing
            pass
        else:
            series = _award_series_for_ceremony(ceremony_label)
            if series is not None:
                d['award_series'] = series
            else:
                logging.getLogger('root.features').warning(
                    f"Not able to associate any award series to the award {ceremony_label}"
                )

        awards.append({'value': d})

    return awards