def test_generate(self):
    dataset = Dataset("A fancy dataset")
    parser = DiscogsSongParser(file_path=rel_path_to_file("../../files/releases_piece_small.xml", __file__),
                               dataset=dataset)
    generator = get_clean_graph_generator_mongo_repos()
    generator.generate_turtle_song_graph(
        file_path=rel_path_to_file("../../files/out/large_discogs_release.ttl", __file__),
        song_parser=parser)
def test_entity_detection(self):
    parser = UsosSongParser(dataset=Dataset("some_dataset"),
                            source_file=rel_path_to_file("../../files/in/mini_uso.tsv", __file__))
    counter = 0
    for song in parser.parse_songs():
        counter += 1
    self.assertEqual(8, counter, "Unexpected number of songs. Expected 8 but parsed " + str(counter))
def test_titles_detection(self):
    parser = UsosSongParser(dataset=Dataset("some_dataset"),
                            source_file=rel_path_to_file("../../files/in/mini_uso.tsv", __file__))
    expected_dict = {
        "kentucky woman": False,
        "why tell me why": False,
        "acapella": False,
        "human": False,
        "carry out": False,
        "don't stop the party": False,
        "whistle": False,
        "somebody that i used to know": False
    }
    unexpected = []
    for song in parser.parse_songs():
        if song.canonical.strip() in expected_dict:
            expected_dict[song.canonical.strip()] = True
        else:
            unexpected.append(song.canonical)
    self.assertEqual(0, len(unexpected), "Some unexpected songs were parsed: " + str(unexpected))
    not_found = []
    for a_expected in expected_dict:
        if not expected_dict[a_expected]:
            not_found.append(a_expected)
    self.assertEqual(0, len(not_found), "Some expected songs were not found: " + str(not_found))
def test_writers_detection(self):
    parser = UsosSongParser(dataset=Dataset("some_dataset"),
                            source_file=rel_path_to_file("../../files/in/mini_uso.tsv", __file__))
    expected_dict = {
        "Amy Heidemann": False,
        "Martin Johnson": False,
        "Nick Noonan": False,
        "Sam Hollander": False,
        "Brandon Flowers": False,
        "Dave Keuning": False,
        "Ronnie Vannucci": False,
        "Jerome Harmon": False,
        "Jim Beanz": False,
        "Justin Timberlake": False,
        "Timothy Clayton": False,
        "Timothy Mosley": False,
        "Backer": False,
        "Wally De": False
    }
    unexpected = []
    for song in parser.parse_songs():
        for a_coll in song.collaborations:
            if a_coll.collaborator.canonical.strip() in expected_dict:
                expected_dict[a_coll.collaborator.canonical.strip()] = True
            else:
                unexpected.append(a_coll.collaborator.canonical)
    self.assertEqual(0, len(unexpected), "Some unexpected writers were parsed: " + str(unexpected))
    not_found = []
    for a_expected in expected_dict:
        if not expected_dict[a_expected]:
            not_found.append(a_expected)
    self.assertEqual(0, len(not_found), "Some expected writers were not found: " + str(not_found))
def test_artist_detection(self):
    parser = UsosSongParser(dataset=Dataset("some_dataset"),
                            source_file=rel_path_to_file("../../files/in/mini_uso.tsv", __file__))
    expected_dict = {
        "Neil Diamond": False,
        "Anita Meyer": False,
        "Karmin": False,
        "The Killers": False,
        "Justin Timberlake": False,
        "Timbaland": False,
        "The Black Eyed Peas": False,
        "Flo Rida": False,
        "Gotye": False
    }
    unexpected = []
    for song in parser.parse_songs():
        for artist in song.artists:
            if artist.canonical.strip() in expected_dict:
                expected_dict[artist.canonical.strip()] = True
            else:
                unexpected.append(artist.canonical)
    self.assertEqual(0, len(unexpected), "Some unexpected artists were parsed: " + str(unexpected))
    not_found = []
    for a_expected in expected_dict:
        if not expected_dict[a_expected]:
            not_found.append(a_expected)
    self.assertEqual(0, len(not_found), "Some expected artists were not found: " + str(not_found))
def test_many_real_songs(self):
    # Smoke test: parse a large number of songs from a real Discogs dump,
    # with no assertions; it just checks that parsing does not crash.
    parser = DiscogsSongParser(file_path=rel_path_to_file("../../files/discogs_releases.xml", __file__),
                               dataset=Dataset(title="TestDataset"))
    counter = 0
    for a_song in parser.parse_songs():
        counter += 1
        if counter % 50000 == 0:  # Stop after 50,000 songs
            break
def test_many_songs_parsed(self):
    # Smoke test: parse 50,000 songs from a real USOS file, with no assertions.
    parser = UsosSongParser(dataset=Dataset("some_dataset"),
                            source_file=rel_path_to_file("../../files/in/bmat2heaven.tsv", __file__))
    counter = 50000
    for a_song in parser.parse_songs():
        counter -= 1
        if counter <= 0:
            break
def get_executer_memory_repos_file_rdflib_graph(str_json_config):
    print "loading graph"
    # Graph from file
    graph_path = rel_path_to_file("files/mini_usos/mini_usos_graph.ttl", __file__)
    rdflib_graph = Graph()
    rdflib_graph.load(graph_path, format="turtle")
    mera_graph = MeraRdflibGraph(rdflib_graph)

    print "loading artist ngrams"
    # Memory artist repo from file
    artist_ngram_path = rel_path_to_file("files/mini_usos/artist.json", __file__)
    repo_artist = MemoryEntityNgrams(base_entity_uri=base_entities_URI,
                                     type_of_entity_collection=ARTIST_COLLECTION,
                                     load_file=artist_ngram_path)

    print "loading songs ngrams"
    # Memory song repo from file
    song_ngram_path = rel_path_to_file("files/mini_usos/song.json", __file__)
    repo_song = MemoryEntityNgrams(base_entity_uri=base_entities_URI,
                                   type_of_entity_collection=SONG_COLLECTION,
                                   load_file=song_ngram_path)

    print "loading repo counter"
    # Memory counter repo from file
    counter_path = rel_path_to_file("files/mini_usos/counter.json", __file__)
    repo_counter = MemoryEntityCounter(load_file=counter_path)

    print "loading matcher"
    # Matcher over those structures
    matcher = MeraMatcher(graph=mera_graph,
                          artist_ngrams_repository=repo_artist,
                          song_ngrams_repository=repo_song,
                          entity_counter_repository=repo_counter,
                          match_config=translate_json_to_mera_match_config(json.loads(str_json_config)))

    # Formater to json
    formater = FormaterToJson()

    print "loading executer"
    # Executer over those structures
    executer = QueryExecuter(matcher=matcher, formater=formater)
    return executer
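# A minimal usage sketch of the factory above (not part of the original file).
# It assumes a JSON match config such as files/usos/base_config.json and a
# queries file like the one produced by CWRQueryGenerator; both paths are
# illustrative placeholders:
#
# with open(rel_path_to_file("files/usos/base_config.json", __file__), "r") as config_io:
#     executer = get_executer_memory_repos_file_rdflib_graph(config_io.read())
#     res = executer.execute_queries_from_file(
#         file_path=rel_path_to_file("files/out/posible_queries.json", __file__))
#     print res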
def test_json_to_mera_match_config(self):
    with open(rel_path_to_file(rel_path="../../files/usos/base_config.json", base_file=__file__), "r") as json_source:
        source_content = json.load(json_source)
        config_result = translate_json_to_mera_match_config(source_content)
        self.assertEqual(0.65, config_result.get_minimum_of_type("artist"))
        self.assertEqual(0.65, config_result.get_minimum_of_type("song"))
        self.assertEqual(40, config_result.top_k_blocking_function())
        self.assertEqual(5, config_result.top_k_results())
        self.assertEqual(1.60, config_result.get_command_threshold("find_song"))
        self.assertEqual(0.80, config_result.get_command_relevance_of_a_type(command_name="find_song",
                                                                             target_type="artist"))
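# A hypothetical sketch of what base_config.json might contain, given the
# assertions above. The key names are guesses, not the actual schema; only
# the values come from the test:
#
# {
#     "type_minimums": {"artist": 0.65, "song": 0.65},
#     "top_k_blocking_function": 40,
#     "top_k_results": 5,
#     "commands": {
#         "find_song": {
#             "threshold": 1.60,
#             "relevance": {"artist": 0.80}
#         }
#     }
# }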
def test_entity_detection(self):
    parser = DiscogsSongParser(file_path=rel_path_to_file("../../files/releases_piece_big.xml", __file__),
                               dataset=Dataset(title="TestDataset"))
    counter_songs = 0
    counter_writers = 0
    for a_song in parser.parse_songs():
        counter_songs += 1
        for a_coll in a_song.collaborations:
            if a_coll.role == ROLE_WRITER:
                counter_writers += 1
        for an_alt in a_song.alternative_titles:
            print a_song.canonical, an_alt
    self.assertEqual(281, counter_songs, msg="Expected 281 songs, but parsed " + str(counter_songs))
    self.assertEqual(427, counter_writers, msg="Expected 427 writers, but parsed " + str(counter_writers))
__author__ = 'Dani'

from wmera.query_gen.query_generator_cwr import CWRQueryGenerator
from wmera.utils import rel_path_to_file

query_gen = CWRQueryGenerator(
    queries_path=rel_path_to_file("../../files/in/cwr-json-to-mera-json/works_group_full.json", __file__),
    config_path="Doesntmatteryet")

query_gen.gen_srialized_mera_json(file_path="../../files/out/cwr-json-to-mera-json/posible_queries.json")
__author__ = 'Dani'

from wmera.mera_core.model.entities import Dataset
from wmera.utils import rel_path_to_file
from test.t_utils.t_factory import get_clean_graph_generator_memory_repos
from wmera.parsers.usos.usos_song_parser import UsosSongParser

parser = UsosSongParser(dataset=Dataset("Uso_bmat2heaven"),
                        source_file=rel_path_to_file("../../files/mini_usos/mini_bmat2heaven.tsv", __file__))
generator = get_clean_graph_generator_memory_repos()
generator.generate_turtle_song_graph(file_path=rel_path_to_file("../../files/out/usos_graph.ttl", __file__),
                                     song_parser=parser,
                                     isolated=True)
generator._repo_artists.save_content(rel_path_to_file("../../files/out/artist_ngrams_usos.json", __file__))
generator._repo_songs.save_content(rel_path_to_file("../../files/out/song_ngrams_usos.json", __file__))
generator._repo_counter.save_content(rel_path_to_file("../../files/out/counter_usos.json", __file__))
def test_generate_song(self):
    generator = get_clean_graph_generator_mongo_repos()
    generator.generate_turtle_song_graph(file_path=rel_path_to_file("../../files/out/test_song_gen.ttl", __file__),
                                         song_parser=FakeSongParser(dataset=Dataset("A_Dataset")))
__author__ = 'Dani'

from wmera.mera_core.model.entities import Dataset
from wmera.utils import rel_path_to_file
from test.t_utils.t_factory import get_clean_graph_generator_mongo_repos
from wmera.parsers.usos.usos_song_parser import UsosSongParser

parser = UsosSongParser(dataset=Dataset("Uso_bmat2heaven"),
                        source_file=rel_path_to_file("../../files/mini_usos/mini_bmat2heaven.tsv", __file__))
generator = get_clean_graph_generator_mongo_repos()
generator.generate_turtle_song_graph(file_path=rel_path_to_file("../../files/out/usos_graph.ttl", __file__),
                                     song_parser=parser,
                                     isolated=True)
print len(art1), len(art2_3), len(art4_plus)

with open("songs_1_artist.tsv", "w") as target_file:
    for song in art1:
        target_file.write(serialize_song_tsv_artist(song) + "\n")

with open("songs_2_3_artist.tsv", "w") as target_file:
    for song in art2_3:
        target_file.write(serialize_song_tsv_artist(song) + "\n")

with open("songs_4_plus_artist.tsv", "w") as target_file:
    for song in art4_plus:
        target_file.write(serialize_song_tsv_artist(song) + "\n")

num_songs = 45458287
# num_songs = 100
# desired_songs = 3500

extract_random_tsv_songs(file_path=rel_path_to_file("../../files/discogs_releases.xml", __file__),
                         total_songs=num_songs,
                         desired_songs=3500)
    return result


def name_included_in_min_levenshtein_distance(name, target_et):
    for elem in target_et:
        if levenshtein(elem, name) <= 1:
            return True
    return False


##################### ------------ Program ----------- #########################

print "Hey, it's me"

clean_artist = set_of_words_from_file_separator(rel_path_to_file("files/clean_artist.txt", __file__))
noisy_artist = set_of_words_from_file_separator(rel_path_to_file("files/noisy_artist.txt", __file__))
clean_songs = set_of_words_from_file_separator(rel_path_to_file("files/clean_song.txt", __file__))
noisy_songs = set_of_words_from_file_separator(rel_path_to_file("files/noisy_song.txt", __file__))

artist_parsed = clean_artist.union(noisy_artist)
songs_parsed = clean_songs.union(noisy_songs)

noisy_queries = list_of_lists_of_words_from_file_lines(
    rel_path_to_file("files/noisy-musical-queries.txt", __file__))
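# Quick illustration of name_included_in_min_levenshtein_distance above (not in
# the original script): it returns True iff some element of the target set is
# within Levenshtein distance 1 of the given name.
#
# name_included_in_min_levenshtein_distance("beatles", {"beatle", "stones"})  # True: "beatle" is one edit away
# name_included_in_min_levenshtein_distance("beatles", {"stones"})            # False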
__author__ = 'Dani'

from wmera.utils import rel_path_to_file
from test.t_utils.t_factory import get_clean_repo_counter_memory, get_clean_repo_song_memory, \
    get_clean_repo_artist_memory, get_clean_repo_artist_mongo, get_clean_repo_counter_mongo, \
    get_clean_repo_songs_mongo
from wmera.adapters.in_memory_to_mogno import dump_in_memory_ngrams_into_mongo_ngrams, \
    dump_in_memory_counter_into_mongo_counter

# Artist ngrams
mongo_artist = get_clean_repo_artist_mongo()
memory_artist = get_clean_repo_artist_memory()
memory_artist.load_content(rel_path_to_file("../../files/out/artist_ngrams_usos.json", __file__))
dump_in_memory_ngrams_into_mongo_ngrams(in_memory_repo=memory_artist, mongo_repo=mongo_artist)
memory_artist = None  # Free memory

# Song ngrams
mongo_song = get_clean_repo_songs_mongo()
memory_song = get_clean_repo_song_memory()
memory_song.load_content(rel_path_to_file("../../files/out/song_ngrams_usos.json", __file__))
dump_in_memory_ngrams_into_mongo_ngrams(in_memory_repo=memory_song, mongo_repo=mongo_song)
memory_song = None  # Free memory
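# Counter: the imports above also bring in the counter helpers, so the script
# presumably finishes by dumping the counter repo the same way. A sketch of
# that assumed final step (keyword names assumed to mirror the ngrams dump):
mongo_counter = get_clean_repo_counter_mongo()
memory_counter = get_clean_repo_counter_memory()
memory_counter.load_content(rel_path_to_file("../../files/out/counter_usos.json", __file__))
dump_in_memory_counter_into_mongo_counter(in_memory_repo=memory_counter, mongo_repo=mongo_counter)
memory_counter = None  # Free memory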
from wmera.mera_core.model.entities import Dataset


class FakeFormater(MeraFormaterInterface):
    def format_mera_results(self, list_of_dicts_with_list_of_base_results):
        for a_stuff in list_of_dicts_with_list_of_base_results:
            print a_stuff


executer = QueryExecuter(matcher=get_mera_matcher_with_data(graph_path="../../files/out/usos_graph.ttl",
                                                            ngram_song_path="../../files/out/song_ngrams_usos.json",
                                                            ngram_artist_path="../../files/out/artist_ngrams_usos.json",
                                                            counter_path="../../files/out/counter_usos.json"),
                         formater=FormaterToJson())

# executer = QueryExecuter(matcher=get_empty_mera_matcher(),
#                          formater=FormaterToJson())

res = executer.execute_queries_from_file(
    file_path=rel_path_to_file("../../files/out/cwr-json-to-mera-json/posible_queries.json", __file__))
print res

with open("../../files/in/mera_results_mini_usos.json", "r") as file_io:
    json_matches = file_io.read()
    executer.introduce_json_matches_in_graph(json_matches_str=json_matches,
                                             dataset_obj=Dataset(title="MiDatasetCWR"),
                                             serialization_path="../../files/out/usos_graph_ENRICHED.ttl")
def test_graph_repeated_artist(self):
    file_path = rel_path_to_file("../../files/out/repeated_artists_mini_graph.ttl", __file__)
    artist_parser = FakeRepeatedArtistsParser(Dataset("MyTest", date="2000-feb-15"))
    generator = get_clean_graph_generator_mongo_repos()
    generator.generate_turtle_artist_graph(file_path, artist_parser)
        result = {}
        with open("../files/consultas_aol/consultas-AOL.txt") as in_file:
            tmp_line = ""
            counter = 0
            for line in in_file:
                counter += 1
                if counter % 100000 == 0:
                    print counter
                tmp_line = line.split("\t")
                if len(tmp_line) == 2:
                    tmp_line = tmp_line[1].strip().lower()
                    for a_feat_var in self._feat_vars:
                        if a_feat_var in tmp_line:
                            # print tmp_line
                            if tmp_line in result:
                                result[tmp_line] += 1
                            else:
                                result[tmp_line] = 1
                            break
        return result


##############

extractor = AolNoisyQueryExtractor(rel_path_to_file("../files/consultas_aol/", __file__))
extractor.run()