def get_MusicGenres_ents(lang):
    """Fetch every DBpedia resource typed as ``dbo:MusicGenre``.

    The SPARQL endpoint resolved for ``lang`` is queried page by page
    (10000 rows per page, ordered by ``?genre``) until a page comes back
    with fewer rows than the page size.

    :param lang: the targeted language, used to pick the DBpedia endpoint
    :return: a set with the values bound to ``?genre`` across all pages
    """
    # Must stay in sync with the LIMIT hard-coded in the query text below.
    page_size = 10000
    paged_query = Template(
        "SELECT ?genre {{ SELECT ?genre WHERE { "
        "?genre rdf:type <http://dbpedia.org/ontology/MusicGenre> "
        "} ORDER BY ?genre }} OFFSET $offset LIMIT 10000"
    )

    client = SPARQLWrapper(utils.get_endpoint_for_lang(lang) + "sparql")
    client.setReturnFormat(JSON)

    found = set()
    offset = 0
    while True:
        client.setQuery(paged_query.substitute({'offset': offset}))
        bindings = client.query().convert()["results"]["bindings"]
        found.update(row["genre"]["value"] for row in bindings)
        # A short page means the result set is exhausted.
        if len(bindings) < page_size:
            return found
        offset += page_size
def get_genres_for_entities(seeds, query_template, lang, ent_ids):
    """Look up the DBpedia genres attached to previously collected items.

    The seeds are sent to the endpoint in batches (50 per query for
    ``'ja'``, 100 otherwise); each returned ``?entity`` is translated to
    its id through ``ent_ids`` and its ``?genre`` values are accumulated.

    :param seeds: sliceable sequence of DBpedia URLs of the music items
    :param query_template: template exposing ``substitute``; it is given
        the keys ``list`` and ``genre_keyword``
    :param lang: the language of the DBpedia version to be queried
    :param ent_ids: mapping from entity URL to the item's unique id
    :return: dict mapping each item id to the list of its genres, in the
        order the endpoint returned them
    :raises Exception: if ``lang`` is not one of the module-level ``langs``
    :raises KeyError: if a returned entity is missing from ``ent_ids``
    """
    if lang not in langs:
        raise Exception(
            'Language not tested. It may require modifications of DBpedia entity names'
        )
    print("Language, ", lang)

    endpoint = utils.get_endpoint_for_lang(lang)
    genre_keyword = utils.get_genre_keyword(lang)
    client = SPARQLWrapper(endpoint + "sparql")
    client.setReturnFormat(JSON)

    # Japanese seeds are queried in smaller batches than other languages.
    batch_size = 50 if lang == 'ja' else 100

    genres_by_entity = {}
    for start in range(0, len(seeds), batch_size):
        end = min(start + batch_size, len(seeds))
        print("Processing next 100 entities... ", start, end)

        seeds_filter = utils.get_seeds_filter(seeds[start:end])
        client.setQuery(query_template.substitute({
            'list': seeds_filter,
            'genre_keyword': genre_keyword
        }))

        for row in client.query().convert()["results"]["bindings"]:
            item_id = ent_ids[row["entity"]["value"]]
            item_genres = genres_by_entity.setdefault(item_id, [])
            item_genres.append(row["genre"]["value"])
    return genres_by_entity
def collect_genres_from_seeds(genres, lang):
    """Crawl the relations of the given DBpedia music genres.

    Seeds are processed 50 at a time. Each batch is added to ``genres``
    and then queried twice: once with the seed as object of a genre
    relation, once with the seed as subject. Both result sets are handed
    to ``process_query``.

    :param genres: collection of DBpedia music genres; it must support
        ``add`` and is updated in place
    :param lang: the targeted language
    :return: the ``relations`` dict filled in by ``process_query``
    """
    as_object_query = Template(
        "SELECT ?property, ?genre2, ?genre1 WHERE { "
        "?genre2 ?property ?genre1. "
        "FILTER (?genre1 IN ($list)). "
        "FILTER (?property IN ($genre_rels)) }"
    )
    as_subject_query = Template(
        "SELECT ?property, ?genre2, ?genre1 WHERE { "
        "?genre1 ?property ?genre2. "
        "FILTER (?genre1 IN ($list)). "
        "FILTER (?property IN ($genre_rels)) }"
    )

    client = SPARQLWrapper(utils.get_endpoint_for_lang(lang) + "sparql")
    client.setReturnFormat(JSON)
    rels_filter = utils.get_genre_rels_filter(lang)

    batch_size = 50
    seeds = list(genres)
    relations = {}
    cursor = 0
    # len(seeds) is re-read on every pass: seeds is passed to process_query,
    # which presumably appends newly discovered genres -- confirm there.
    while cursor < len(seeds):
        batch = seeds[cursor:cursor + batch_size]
        seeds_filter = utils.get_seeds_filter(batch)
        for seed in batch:
            genres.add(seed)
        cursor += len(batch)

        params = {'list': seeds_filter, 'genre_rels': rels_filter}
        process_query(as_object_query.substitute(params),
                      client, relations, seeds, genres)
        process_query(as_subject_query.substitute(params),
                      client, relations, seeds, genres, True)
    return relations
def get_MusicGenres_aliases(lang, genres):
    """Collect the ``owl:sameAs`` aliases of DBpedia music genres.

    Resources typed ``dbo:MusicGenre`` are paged through 10000 rows at a
    time, keeping only the aliases accepted by the filter built from
    ``utils.get_alias_filter``.

    :param lang: the targeted language, used to pick the DBpedia endpoint
    :param genres: container indexed by the language returned by
        ``utils.get_lang`` whose values support ``add``; every alias is
        added under its own language (updated in place)
    :return: dict mapping each genre to the set of its aliases
    """
    # Must stay in sync with the LIMIT hard-coded in the query text below.
    page_size = 10000
    paged_query = Template(
        "SELECT ?genre, ?alias {{ SELECT ?genre, ?alias WHERE { "
        "?genre rdf:type <http://dbpedia.org/ontology/MusicGenre>. "
        "?genre owl:sameAs ?alias. "
        "FILTER ($other_lang_cond) "
        "} ORDER BY ?genre }} OFFSET $offset LIMIT 10000"
    )

    endpoint = utils.get_endpoint_for_lang(lang)
    lang_filter = utils.get_alias_filter(lang, langs)
    client = SPARQLWrapper(endpoint + "sparql")
    client.setReturnFormat(JSON)

    aliases_by_genre = {}
    offset = 0
    while True:
        client.setQuery(paged_query.substitute({
            'offset': offset,
            'other_lang_cond': lang_filter,
        }))
        bindings = client.query().convert()["results"]["bindings"]
        for row in bindings:
            known_aliases = aliases_by_genre.setdefault(row["genre"]["value"], set())
            alias = row["alias"]["value"]
            known_aliases.add(alias)
            # Also register the alias as a genre of its own language.
            genres[utils.get_lang(alias)].add(alias)
        # A short page means the result set is exhausted.
        if len(bindings) < page_size:
            return aliases_by_genre
        offset += page_size
def get_relevant_music_entities(query_template, lang,
                                ent_types=('MusicalWork', 'MusicalArtist', 'Band')):
    """Collect DBpedia music artists and their works.

    For every entity type, ``get_dbp_ents`` is invoked with the endpoint
    for ``lang`` and a parameter dict holding ``other_lang_cond``,
    ``genre_keyword`` and the current ``entity_type``.

    :param query_template: the SPARQL query template to be executed
    :param lang: the language of DBpedia to be crawled
    :param ent_types: iterable of entity type names to be crawled; the
        default is an immutable tuple so it cannot be altered between calls
    :return: None; whatever ``get_dbp_ents`` returns is discarded
    :raises Exception: if ``lang`` is not one of the module-level ``langs``
    """
    if lang not in langs:
        raise Exception('Language not tested.')

    # One dict is shared across iterations (only 'entity_type' changes), so
    # anything get_dbp_ents stores in it carries over to the next type.
    query_params = {
        'other_lang_cond': utils.get_alias_filter(lang, langs),
        'genre_keyword': utils.get_genre_keyword(lang),
    }
    endpoint = utils.get_endpoint_for_lang(lang)

    for ent in ent_types:
        print("Entity type: ", ent)
        query_params['entity_type'] = ent
        get_dbp_ents(endpoint, query_template, query_params, lang)
def collect_aliases_from_seeds(seeds, lang, genre_aliases):
    """Add the ``owl:sameAs`` aliases of the seed genres to ``genre_aliases``.

    Seeds are sent to the endpoint 50 at a time; only aliases accepted by
    the filter built from ``utils.get_alias_filter`` are kept.

    :param seeds: sliceable sequence of seed music genres
    :param lang: the targeted language, used to pick the DBpedia endpoint
    :param genre_aliases: dict mapping a genre to the set of its aliases;
        updated in place, nothing is returned
    """
    alias_query = Template(
        "SELECT DISTINCT ?genre, ?alias WHERE { "
        "?genre owl:sameAs ?alias. "
        "FILTER (?genre IN ($list)). "
        "FILTER ($other_lang_cond) }"
    )

    endpoint = utils.get_endpoint_for_lang(lang)
    lang_filter = utils.get_alias_filter(lang, langs)
    client = SPARQLWrapper(endpoint + "sparql")
    client.setReturnFormat(JSON)

    batch_size = 50
    for start in range(0, len(seeds), batch_size):
        seeds_filter = utils.get_seeds_filter(seeds[start:start + batch_size])
        client.setQuery(alias_query.substitute({
            'list': seeds_filter,
            'other_lang_cond': lang_filter,
        }))
        for row in client.query().convert()["results"]["bindings"]:
            genre = row["genre"]["value"]
            alias = row["alias"]["value"]
            genre_aliases.setdefault(genre, set()).add(alias)