def test_many_real_songs(self):
    """Smoke-test the parser against the full Discogs releases dump.

    There are no assertions: the test passes as long as pulling songs
    from the parser raises no exception. It stops early so it does not
    have to walk the whole file.
    """
    releases_path = rel_path_to_file("../../files/discogs_releases.xml", __file__)
    song_parser = DiscogsSongParser(file_path=releases_path,
                                    dataset=Dataset(title="TestDataset"))
    # Numbering starts at 2, so the cut-off below triggers once 49,999 songs
    # have been parsed (or the loop ends earlier if the file has fewer).
    for song_number, _song in enumerate(song_parser.parse_songs(), 2):
        if song_number % 50000 == 0:
            break
def test_entity_detection(self): parser = DiscogsSongParser(file_path=rel_path_to_file("../../files/releases_piece_big.xml", __file__), dataset=Dataset(title="TestDataset")) counter_songs = 0 counter_writers = 0 for a_song in parser.parse_songs(): counter_songs += 1 for a_coll in a_song.collaborations: if a_coll.role == ROLE_WRITER: counter_writers += 1 for an_alt in a_song.alternative_titles: print a_song.canonical, an_alt self.assertEqual(281, counter_songs, msg="Expected 281 songs, but parsed " + str(counter_songs)) self.assertEqual(427, counter_writers, msg="Expected 427 songs with writter, but parsed " + str(counter_writers))
def _process_songs_node(songs_node, artists, collaborations, album, genres, country, release_date, release_id):
    """Yield a ``Song`` for each child of ``songs_node`` with a usable title.

    For every track node, the title, the track position and the track-level
    collaborations are read from its child elements. Collaborations whose
    role is ``ROLE_FEATURER`` are added to that track's artists.

    :param songs_node: iterable node whose children are the track nodes.
    :param artists: release-level artists; copied per track, never mutated.
    :param collaborations: release-level collaborations, passed unchanged to
        every yielded ``Song``.
    :param album: accepted but not used by this implementation.
    :param genres: accepted but not used by this implementation.
    :param country: accepted but not used by this implementation.
    :param release_date: accepted but not used by this implementation.
    :param release_id: combined with the track position through
        ``DiscogsSongParser.build_discogs_id`` to form the song's discogs id.
    :return: generator of ``Song`` objects. Tracks whose title is in
        ``EMPTY_CONTENT`` are skipped.
    """
    # Snapshot the children up front: this is a generator, so the caller may
    # touch ``songs_node`` while iteration is suspended at a ``yield``.
    for song_node in list(songs_node):
        title = None
        discogs_id = None
        # Work on a per-track copy. Appending to the shared ``artists`` list
        # would leak one track's featuring artists into every other track of
        # the release (and into the caller's own list).
        song_artists = list(artists)
        for elem in song_node:
            if elem.tag == SONG_TITLE:
                title = normalize_discogs_name(elem.text)
            elif elem.tag == TRACK_POSITION:
                discogs_id = DiscogsSongParser.build_discogs_id(release_id, elem.text)
            elif elem.tag == COLLABORATIONS:
                # Each item is indexed as (role, value): item[0] is compared
                # with the role constant, item[1] is what gets stored.
                for a_coll in DiscogsSongParserFilteringNoNamevars._process_collaborations_node(elem):
                    if a_coll[0] == ROLE_FEATURER:
                        song_artists.append(a_coll[1])
                    # Non-featurer track-level collaborations are not attached
                    # to the Song.
                    # NOTE(review): confirm whether they should be merged with
                    # the release-level ``collaborations``.
        if title not in EMPTY_CONTENT:
            yield Song(canonical=title, discogs_id=discogs_id,
                       artists=song_artists, collaborations=collaborations)