def test_scrape_songs_by_genre(self):
    """
    Checks gc.scrape_songs_by_genre against timbre data read by hand.

    The first five song files under BASEDIR are opened directly and their
    segment timbre arrays are collected; the song at index 1 is treated as
    the single expected negative example and every other song as a positive
    one (presumably it is the only non-'pop' song in the fixture directory
    -- confirm against the test data). The counts and array shapes are then
    compared with the result of scraping the same directory for 'pop'.
    """
    msg = "scrape songs by genre"
    songs = hh.get_all_files(BASEDIR)[:5]
    positive_correct = []
    negative_correct = []
    for idx, song_loc in enumerate(songs):
        h5 = hh.open_h5_file_read(song_loc)
        try:
            almost_mfccs = hh.get_segments_timbre(h5)
        finally:
            # Release the file handle even when reading the timbre fails.
            h5.close()
        if idx == 1:
            negative_correct.append(almost_mfccs)
        else:
            positive_correct.append(almost_mfccs)

    positive_examples, negative_examples = gc.scrape_songs_by_genre(
        BASEDIR, 'pop', 5)
    positive_shapes = [example.shape for example in positive_examples]

    # assertEqual, not assertAlmostEqual: the counts are integers, and the
    # third positional argument of assertAlmostEqual is `places`, so passing
    # the message there raises TypeError on a mismatch instead of reporting it.
    self.assertEqual(len(positive_correct), len(positive_examples), msg=msg)
    self.assertEqual(len(negative_correct), len(negative_examples), msg=msg)
    self.assertIn(positive_correct[0].shape, positive_shapes, msg=msg)
    self.assertNotIn(negative_correct[0].shape, positive_shapes, msg=msg)
def start_echonest_scrape(self):
    """
    Runs through EchoNest song data, extracts artist name and title,
    searches YouTube, and extracts comments.

    Song files are listed from self.basedir and truncated to self.limit
    when it is set; self.limit is then updated to the number of songs that
    will actually be processed. Each successfully scraped song is passed to
    self.add; a song whose scrape raises is reported on stdout and counted
    through self.add_exception.
    """
    songs = hh.get_all_files(self.basedir)
    if self.limit:
        songs = songs[:self.limit]
    self.limit = len(songs)
    print('Scraping %d youtube songs' % len(songs))
    for idx, song_loc in enumerate(songs):
        h5 = hh.open_h5_file_read(song_loc)
        try:
            artist = hh.get_artist_name(h5).decode('utf-8')
            title = hh.get_title(h5).decode('utf-8')
        finally:
            # Only the metadata is needed from the file; close it right away
            # so the handle is released even if reading or decoding fails.
            h5.close()
        song = YouTubeSong(artist, title)
        try:
            song.print_header(idx)
            # NOTE(review): min_num_comments is passed for both of the first
            # two arguments -- confirm against obtain_comments' signature.
            song.obtain_comments(self.min_num_comments,
                                 self.min_num_comments, self.filter_list)
            self.add(song)
        except Exception:
            # Catch Exception rather than using a bare except so Ctrl-C and
            # SystemExit can still stop a long-running scrape.
            # TODO: catch 403 error because it probably means i ran out of YouTube requests quota
            print(' EXCEPTION occurred with this piece, ignoring')
            self.add_exception()
def scrape_songs_by_genre(basedir, genre, limit=None):
    """
    Returns the mfccs of songs that are in the genre, and outside of the genre

    A song counts as being in the genre when `genre` appears among its
    artist terms (decoded as UTF-8).

    :param basedir: the directory of songs we should classify
    :param genre: the genre we are looking to classify
    :param limit: the maximum number of songs within the directory that
        should be classified; a falsy value (None or 0) means no limit
    :return: mfccs of songs in the genre, mfccs of songs not in the genre
    """
    songs = hh.get_all_files(basedir)
    if limit:
        songs = songs[:limit]
    print("\n\nScraping %d songs for genre: %s" % (len(songs), genre))

    positive_examples = []
    negative_examples = []
    for song_loc in songs:
        h5 = hh.open_h5_file_read(song_loc)
        try:
            artist_tags = [
                term.decode('utf-8') for term in hh.get_artist_terms(h5)
            ]
            almost_mfccs = hh.get_segments_timbre(h5)
        finally:
            # Always release the file handle, even if a getter or the
            # decoding raises part-way through.
            h5.close()
        if genre in artist_tags:
            positive_examples.append(almost_mfccs)
        else:
            negative_examples.append(almost_mfccs)
    return positive_examples, negative_examples
def scrape_genres(basedir, weightType, limit=None):
    """
    Scrapes a directory of TheEchoNest songs and compiles a list of most
    commonly-occurring song descriptors

    For every song the artist terms are decoded as UTF-8 and accumulated
    into a running tally through add_weights; FREQUENCY and WEIGHT modes
    additionally pass the per-term frequency or weight values read from
    the file. An unrecognised weightType terminates the program via
    sys.exit as soon as the first song is processed.

    :param basedir: the directory to scrape
    :param weightType: WeightType enum for method to determine counts
    :param limit: the maximum number of songs within the directory that
        should be scraped; a falsy value (None or 0) means no limit
    :return: returns a sorted array of tuples of (descriptor, count) sorted
        by highest count first
    """
    songs = hh.get_all_files(basedir)
    if limit:
        songs = songs[:limit]
    print("Scraping %d songs" % len(songs))

    tags_counts = {}
    for song_loc in songs:
        h5 = hh.open_h5_file_read(song_loc)
        try:
            artist_tags = [
                term.decode('utf-8') for term in hh.get_artist_terms(h5)
            ]
            if weightType == WeightType.BINARY:
                add_weights(tags_counts, artist_tags)
            elif weightType == WeightType.FREQUENCY:
                artist_tags_freqs = hh.get_artist_terms_freq(h5)
                add_weights(tags_counts, artist_tags, artist_tags_freqs)
            elif weightType == WeightType.WEIGHT:
                artist_tags_weights = hh.get_artist_terms_weight(h5)
                add_weights(tags_counts, artist_tags, artist_tags_weights)
            else:
                sys.exit("bad WeightType specified")
        finally:
            # Runs on normal completion, on errors, and on the SystemExit
            # raised above, so the file handle is never left open.
            h5.close()
    return sorted_array(tags_counts)
elif song.error == Objects.ERROR_NO_TOKEN: aggregate[4] += 1 else: # assuming is a video duration error aggregate[5] += 1 songs = hh.get_all_files(BASEDIR) #TODO songs = random.shuffle(songs) print('Testing %d songs' % len(songs)) num_total = 0 filt_songs = [] aggregate = [0] * 7 for idx, song_loc in enumerate(songs): h5 = hh.open_h5_file_read(song_loc) artist = hh.get_artist_name(h5).decode('UTF-8') title = hh.get_title(h5).decode('UTF-8') print('%4d %s - %s' % (idx, artist, title)) try: # A songA = Objects.Song(artist, title) #filter_tag_list_A = [] filter_tag_list_A = [Filters.REMOVE_NONENGLISH] get_comments(songA, filter_tag_list_A, 'A', filt_songs) # B ''' songB = Objects.Song(artist, title) filter_tag_list_B = [Filters.REMOVE_LONG,