Beispiel #1
0
    def test_scrape_songs_by_genre(self):
        """Check that scrape_songs_by_genre splits the first five songs as expected.

        The expected split is built by hand from the first five files under
        BASEDIR: the song at index 1 is treated as the only example outside
        the 'pop' genre, every other song as a 'pop' example.
        """
        songs = hh.get_all_files(BASEDIR)[:5]

        positive_correct = []
        negative_correct = []

        for idx, song_loc in enumerate(songs):
            h5 = hh.open_h5_file_read(song_loc)
            try:
                almost_mfccs = hh.get_segments_timbre(h5)
                # Fixture assumption: only the second song is outside the genre.
                if idx == 1:
                    negative_correct.append(almost_mfccs)
                else:
                    positive_correct.append(almost_mfccs)
            finally:
                # Release the file handle even if reading the timbre data fails.
                h5.close()

        positive_examples, negative_examples = gc.scrape_songs_by_genre(
            BASEDIR, 'pop', 5)
        positive_shapes = [example.shape for example in positive_examples]

        # assertEqual, not assertAlmostEqual: the third positional argument of
        # assertAlmostEqual is `places`, so passing the message there raises a
        # TypeError on mismatch instead of reporting a clean failure.
        self.assertEqual(len(positive_correct), len(positive_examples),
                         "scrape songs by genre")
        self.assertEqual(len(negative_correct), len(negative_examples),
                         "scrape songs by genre")

        # Examples are compared by shape only, as a cheap identity check.
        self.assertIn(positive_correct[0].shape, positive_shapes,
                      "scrape songs by genre")
        self.assertNotIn(negative_correct[0].shape, positive_shapes,
                         "scrape songs by genre")
Beispiel #2
0
    def start_echonest_scrape(self):
        """Scrape YouTube comments for the EchoNest songs found under self.basedir.

        For each song file the artist name and title are read, a YouTubeSong
        is built from them, and its comments are fetched. Songs that scrape
        successfully are passed to self.add(); a failure is reported on stdout
        and recorded through self.add_exception(), and the loop continues with
        the next song.

        Side effect: self.limit is overwritten with the number of songs that
        are actually processed.
        """
        songs = hh.get_all_files(self.basedir)
        if self.limit:
            songs = songs[:self.limit]
        self.limit = len(songs)
        print('Scraping %d youtube songs' % len(songs))

        for idx, song_loc in enumerate(songs):
            # Only the metadata is needed from the file, so close it before the
            # (slow) network scrape and even if decoding fails.
            h5 = hh.open_h5_file_read(song_loc)
            try:
                artist = hh.get_artist_name(h5).decode('utf-8')
                title = hh.get_title(h5).decode('utf-8')
            finally:
                h5.close()

            song = YouTubeSong(artist, title)

            try:
                song.print_header(idx)
                # NOTE(review): min_num_comments is passed for both of the
                # first two arguments -- confirm this is intended.
                song.obtain_comments(self.min_num_comments,
                                     self.min_num_comments, self.filter_list)
                self.add(song)

            except Exception:
                # Exception rather than a bare except, so that Ctrl-C
                # (KeyboardInterrupt) and SystemExit still stop the scrape.
                # TODO: catch 403 error because it probably means i ran out of YouTube requests quota
                print('     EXCEPTION occurred with this piece, ignoring')
                self.add_exception()
Beispiel #3
0
def scrape_songs_by_genre(basedir, genre, limit=None):
    """
    Returns the timbre data (near-MFCCs) of songs inside and outside a genre

    A song counts as belonging to the genre when the genre string appears
    among its decoded artist terms.

    :param basedir: the directory of songs we should classify
    :param genre: the genre we are looking to classify
    :param limit: the maximum number of songs within the directory that should
        be classified; a falsy value (None or 0) means no limit
    :return: mfccs of songs in the genre, mfccs of songs not in the genre
    """
    songs = hh.get_all_files(basedir)
    if limit:
        songs = songs[:limit]
    print("\n\nScraping %d songs for genre: %s" % (len(songs), genre))

    positive_examples = []
    negative_examples = []

    for song_loc in songs:
        h5 = hh.open_h5_file_read(song_loc)
        try:
            artist_tags = [
                term.decode('utf-8') for term in hh.get_artist_terms(h5)
            ]
            almost_mfccs = hh.get_segments_timbre(h5)
            if genre in artist_tags:
                positive_examples.append(almost_mfccs)
            else:
                negative_examples.append(almost_mfccs)
        finally:
            # Close the file even when a getter or the decoding raises.
            h5.close()

    return positive_examples, negative_examples
def scrape_genres(basedir, weightType, limit=None):
    """
    Scrapes a directory of TheEchoNest songs and compiles a list of most commonly-occurring song descriptors

    :param basedir: the directory to scrape
    :param weightType: WeightType enum for method to determine counts
        (BINARY, FREQUENCY or WEIGHT); any other value terminates the program
        via sys.exit once the first song is processed
    :param limit: the maximum number of songs within the directory that should
        be scraped; a falsy value (None or 0) means no limit
    :return: the result of sorted_array() applied to the accumulated counts,
        described by the original author as (descriptor, count) tuples sorted
        by highest count first
    """
    songs = hh.get_all_files(basedir)
    if limit:
        songs = songs[:limit]
    print("Scraping %d songs" % len(songs))

    tags_counts = {}

    for song_loc in songs:
        h5 = hh.open_h5_file_read(song_loc)
        try:
            artist_tags = [
                term.decode('utf-8') for term in hh.get_artist_terms(h5)
            ]

            if weightType == WeightType.BINARY:
                # No per-tag values are passed in binary mode.
                add_weights(tags_counts, artist_tags)

            elif weightType == WeightType.FREQUENCY:
                artist_tags_freqs = hh.get_artist_terms_freq(h5)
                add_weights(tags_counts, artist_tags, artist_tags_freqs)

            elif weightType == WeightType.WEIGHT:
                artist_tags_weights = hh.get_artist_terms_weight(h5)
                add_weights(tags_counts, artist_tags, artist_tags_weights)

            else:
                sys.exit("bad WeightType specified")
        finally:
            # Runs on normal completion, on getter errors and on the
            # SystemExit raised above, so the file handle is never leaked.
            h5.close()

    sorted_tags = sorted_array(tags_counts)
    return sorted_tags
Beispiel #5
0
        elif song.error == Objects.ERROR_NO_TOKEN:
            aggregate[4] += 1
        else:
            # assuming is a video duration error
            aggregate[5] += 1


# Gather every song file under BASEDIR.
# TODO: shuffle the songs. random.shuffle() shuffles in place and returns
# None, so call random.shuffle(songs) rather than assigning its result.
songs = hh.get_all_files(BASEDIR)
print('Testing %d songs' % len(songs))
num_total = 0
# Passed to get_comments() for each song in the loop below.
filt_songs = []

# Seven counters; slots 4 and 5 are incremented by the error handling above.
aggregate = [0] * 7

for idx, song_loc in enumerate(songs):
    h5 = hh.open_h5_file_read(song_loc)
    artist = hh.get_artist_name(h5).decode('UTF-8')
    title = hh.get_title(h5).decode('UTF-8')
    print('%4d %s - %s' % (idx, artist, title))

    try:
        # A
        songA = Objects.Song(artist, title)
        #filter_tag_list_A = []
        filter_tag_list_A = [Filters.REMOVE_NONENGLISH]
        get_comments(songA, filter_tag_list_A, 'A', filt_songs)

        # B
        '''
        songB = Objects.Song(artist, title)
        filter_tag_list_B = [Filters.REMOVE_LONG,