Beispiel #1
0
def get_MusicGenres_ents(lang):
    """Fetch all DBpedia resources typed as MusicGenre.

    The endpoint is queried page by page (10000 rows per page); paging stops
    as soon as a page comes back with fewer rows than the page size.

    :param lang: the targetted language, used to pick the DBpedia endpoint
    :return: a set holding the values bound to ``?genre``
    """
    query_template = Template("""SELECT ?genre {{
                SELECT ?genre
                    WHERE {
                        ?genre rdf:type <http://dbpedia.org/ontology/MusicGenre>
                    }
                    ORDER BY ?genre
                }}
                OFFSET $offset
                LIMIT 10000""")
    # Must stay in sync with the LIMIT hard-coded in the query above.
    page_size = 10000

    client = SPARQLWrapper(utils.get_endpoint_for_lang(lang) + "sparql")
    client.setReturnFormat(JSON)

    found = set()
    offset = 0
    while True:
        client.setQuery(query_template.substitute({'offset': offset}))
        rows = client.query().convert()["results"]["bindings"]
        found.update(row["genre"]["value"] for row in rows)
        # A short page means there is nothing left to fetch.
        if len(rows) < page_size:
            return found
        offset += page_size
Beispiel #2
0
def collect_genres_from_seeds(genres, lang):
    """Crawl DBpedia genre relations starting from the given music genres.

    Seeds are handled in batches of 50. For each batch two queries are run,
    one with the seed as the object of the relation and one with the seed as
    the subject, and each query is handed to ``process_query``.

    :param genres: the seed DBpedia music genres; every processed seed is
        added back to it through ``add``, so it is updated in place
    :param lang: the targetted language
    :return: the ``relations`` dictionary that was passed to every
        ``process_query`` call
    """
    seed_as_object_template = Template("""SELECT ?property, ?genre2, ?genre1
                  WHERE {
                    ?genre2 ?property ?genre1.
                    FILTER (?genre1 IN ($list)).
                    FILTER (?property IN ($genre_rels))
                  }""")
    seed_as_subject_template = Template("""SELECT ?property, ?genre2, ?genre1
                    WHERE {
                    ?genre1 ?property ?genre2.
                    FILTER (?genre1 IN ($list)).
                    FILTER (?property IN ($genre_rels))
                  }""")

    client = SPARQLWrapper(utils.get_endpoint_for_lang(lang) + "sparql")
    client.setReturnFormat(JSON)
    rels_filter = utils.get_genre_rels_filter(lang)

    seeds = list(genres)
    relations = {}
    batch_size = 50

    cursor = 0
    # len(seeds) is re-read on every pass: process_query receives ``seeds``
    # and presumably appends newly discovered genres to it (TODO confirm).
    while cursor < len(seeds):
        batch = seeds[cursor:cursor + batch_size]
        seeds_filter = utils.get_seeds_filter(batch)
        for seed in batch:
            genres.add(seed)
        cursor += len(batch)

        params = {'list': seeds_filter, 'genre_rels': rels_filter}
        process_query(seed_as_object_template.substitute(params),
                      client, relations, seeds, genres)
        process_query(seed_as_subject_template.substitute(params),
                      client, relations, seeds, genres, True)

    return relations
def get_genres_for_entities(seeds, query_template, lang, ent_ids):
    """Fetch the DBpedia genres attached to previously collected music items.

    The seeds are queried in batches (50 per query for ``'ja'``, 100 for any
    other language) and the returned genres are grouped by entity id.

    :param seeds: the DBpedia URLs of the music items previously collected
    :param query_template: the SPARQL query template to be executed; it is
        substituted with a ``list`` value
    :param lang: the language of the DBpedia version to be queried
    :param ent_ids: mapping from the entity value found in a result row to
        the unique id of the music item
    :return: dictionary mapping each entity id to the list of its genres
    :raises Exception: if ``lang`` is not in ``langs``
    """
    if lang not in langs:
        raise Exception('Language not tested. It may require modifications of DBpedia entity names')
    print("Language, ", lang)

    client = SPARQLWrapper(utils.get_endpoint_for_lang(lang) + "sparql")
    client.setReturnFormat(JSON)

    batch_size = 50 if lang == 'ja' else 100
    genres_by_id = {}

    start = 0
    while start < len(seeds):
        end = min(start + batch_size, len(seeds))
        print("Processing next 100 entities... ", start, end)

        seeds_filter = utils.get_seeds_filter(seeds[start:end])
        start = end
        client.setQuery(query_template.substitute({'list': seeds_filter}))

        rows = client.query().convert()["results"]["bindings"]
        for row in rows:
            ent_id = ent_ids[row["entity"]["value"]]
            genres_by_id.setdefault(ent_id, []).append(row["genre"]["value"])

    return genres_by_id
def get_relevant_music_entities(query_template,
                                lang,
                                ent_types=('MusicalWork', 'MusicalArtist',
                                           'Band')):
    """Collect DBpedia music artists and their works.

    ``get_dbp_ents`` is called once per entity type, in the order given, with
    the query parameters ``other_lang_cond`` and ``entity_type``.

    :param query_template: the SPARQL query template to be executed
    :param lang: the language of DBpedia to be crawled
    :param ent_types: the types of entities to be crawled; the default is an
        immutable tuple so it cannot be altered between calls
    :raises Exception: if ``lang`` is not in ``langs``
    """
    if lang not in langs:
        raise Exception('Language not tested.')

    query_params = {'other_lang_cond': utils.get_alias_filter(lang, langs)}
    endpoint = utils.get_endpoint_for_lang(lang)
    for ent in ent_types:
        print("Entity type: ", ent)
        # The same dict is reused across iterations, as before; only the
        # entity type changes from one call to the next.
        query_params['entity_type'] = ent
        get_dbp_ents(endpoint, query_template, query_params, lang)
Beispiel #5
0
def get_MusicGenres_aliases(lang, genres):
    """Collect the ``owl:sameAs`` aliases of the DBpedia music genres.

    Results are fetched page by page (10000 rows per page) until a short page
    is returned.

    :param lang: the targetted language
    :param genres: container indexed by the value ``utils.get_lang`` returns
        for an alias (presumably a language code); each entry must support
        ``add`` and receives the alias, so it is updated in place
    :return: dictionary mapping each genre to the set of its aliases
    """
    query_template_alias = Template("""SELECT ?genre, ?alias {{
                SELECT ?genre, ?alias
                    WHERE {
                        ?genre rdf:type <http://dbpedia.org/ontology/MusicGenre>.
                        ?genre owl:sameAs ?alias.
                        FILTER ($other_lang_cond)
                    }
                ORDER BY ?genre
                }}
                OFFSET $offset
                LIMIT 10000""")
    # Must stay in sync with the LIMIT hard-coded in the query above.
    page_size = 10000

    endpoint = utils.get_endpoint_for_lang(lang)
    alias_filter = utils.get_alias_filter(lang, langs)
    client = SPARQLWrapper(endpoint + "sparql")
    client.setReturnFormat(JSON)

    aliases_by_genre = {}
    offset = 0
    while True:
        params = {'offset': offset, 'other_lang_cond': alias_filter}
        client.setQuery(query_template_alias.substitute(params))
        rows = client.query().convert()["results"]["bindings"]
        for row in rows:
            genre_uri = row["genre"]["value"]
            alias_uri = row["alias"]["value"]
            aliases_by_genre.setdefault(genre_uri, set()).add(alias_uri)
            # Also record the alias under the entry selected by its language.
            genres[utils.get_lang(alias_uri)].add(alias_uri)
        # A short page means there is nothing left to fetch.
        if len(rows) < page_size:
            return aliases_by_genre
        offset += page_size
Beispiel #6
0
def collect_aliases_from_seeds(seeds, lang, genre_aliases):
    """Look up the ``owl:sameAs`` aliases of the given music genre seeds.

    Seeds are queried in batches of 50 and every (genre, alias) pair found is
    recorded in ``genre_aliases``. Nothing is returned.

    :param seeds: the seed music genres
    :param lang: the targetted language
    :param genre_aliases: the dictionary to be updated in place; maps a genre
        to the set of its aliases
    """
    query_template = Template("""SELECT DISTINCT ?genre, ?alias
                  WHERE {
                    ?genre owl:sameAs ?alias.
                    FILTER (?genre IN ($list)).
                    FILTER ($other_lang_cond)
                  }""")
    endpoint = utils.get_endpoint_for_lang(lang)
    alias_filter = utils.get_alias_filter(lang, langs)
    client = SPARQLWrapper(endpoint + "sparql")
    client.setReturnFormat(JSON)

    batch_size = 50
    for batch_start in range(0, len(seeds), batch_size):
        batch = seeds[batch_start:batch_start + batch_size]
        params = {
            'list': utils.get_seeds_filter(batch),
            'other_lang_cond': alias_filter,
        }
        client.setQuery(query_template.substitute(params))

        rows = client.query().convert()["results"]["bindings"]
        for row in rows:
            genre_uri = row["genre"]["value"]
            alias_uri = row["alias"]["value"]
            genre_aliases.setdefault(genre_uri, set()).add(alias_uri)