Ejemplo n.º 1
0
    def test_run_filters_simple(self):
        """Check run_filters when the English filter is the only filter.

        Loads the scraper state stored in SAVEFILE, takes the third passing
        song and passes its filtered comments to the
        ``english_filter_3rd_song`` helper.
        """
        scraper = SongAggregateScraper()
        scraper.load_from_file(SAVEFILE)
        third_song = scraper.get_passing_songs()[2]

        english_only = fltr.run_filters([english_filter], third_song.comments)
        self.english_filter_3rd_song(english_only)
    def test_scrape_5m(self):
        """Scrape the ids from random_songs and expect >= 2 passing songs.

        The video ids come from ``random_songs(LIMIT, 0)`` and are scraped
        with MIN_NUM_COMMENTS, MAX_NUM_COMMENTS and FILTER_LIST.
        """
        video_ids = random_songs(LIMIT, 0)

        aggr = SongAggregateScraper()
        aggr.scrape_video_ids(video_ids, MIN_NUM_COMMENTS, MAX_NUM_COMMENTS,
                              FILTER_LIST)

        # assertGreaterEqual reports the actual count when the check fails,
        # which assertTrue on the bare comparison would hide.
        self.assertGreaterEqual(
            len(aggr.get_passing_songs()), 2, "5 million youtube scraper")
 def test_scrape_echonest(self):
     """Scrape via scrape_from_echonest and expect >= 2 passing songs.

     The scraper is run on BASEDIR with MIN_NUM_COMMENTS, MAX_NUM_COMMENTS
     and FILTER_LIST, with LIMIT passed as the ``limit`` keyword.
     """
     aggr = SongAggregateScraper()
     aggr.scrape_from_echonest(BASEDIR,
                               MIN_NUM_COMMENTS,
                               MAX_NUM_COMMENTS,
                               FILTER_LIST,
                               limit=LIMIT)

     # assertGreaterEqual reports the actual count when the check fails,
     # which assertTrue on the bare comparison would hide.
     self.assertGreaterEqual(
         len(aggr.get_passing_songs()), 2, "echonest youtube scraper")
Ejemplo n.º 4
0
    def test_run_filters_medium(self):
        """Check run_filters with the English and length filters combined.

        Uses the ninth passing song from the state stored in SAVEFILE and
        asserts that 20 comments remain after both filters are applied.
        """
        scraper = SongAggregateScraper()
        scraper.load_from_file(SAVEFILE)
        ninth_song = scraper.get_passing_songs()[8]

        active_filters = [english_filter, length_filter]
        remaining = fltr.run_filters(active_filters, ninth_song.comments)

        self.assertEqual(20, len(remaining), "run_filters 2 filters")
def load_comments():
    """Print the filtered comments of every passing song stored in SAVEFILE.

    The number of passing songs is printed first. Each song's comments are
    then run through the English, length, YouTube-topics and brutish-music
    filters, and every item of the result is printed with its index.
    """
    scraper = SongAggregateScraper()
    scraper.load_from_file(SAVEFILE)
    songs = scraper.get_passing_songs()
    active_filters = [
        fltr.english_filter,
        fltr.length_filter,
        fltr.youtube_topics_filter,
        fltr.brutish_music_filter,
    ]

    print(len(songs))

    for entry in songs:
        kept = fltr.run_filters(active_filters, entry.comments)
        for position, text in enumerate(kept):
            print(position, text)
def scrape():
    """Scrape the ids from random_songs(NUM_SONGS), summarise and save.

    The scrape uses MIN_NUM_COMMENTS, MAX_NUM_COMMENTS and FILTER_LIST; the
    summary is printed and the result is then written to SAVEFILE.
    """
    song_ids = random_songs(NUM_SONGS)

    scraper = SongAggregateScraper()
    scraper.scrape_video_ids(
        song_ids,
        MIN_NUM_COMMENTS,
        MAX_NUM_COMMENTS,
        FILTER_LIST,
    )
    scraper.print_summary()
    scraper.save_to_file(SAVEFILE)
Ejemplo n.º 7
0
    def test_are_key_words_in_comment(self):
        """Run the English key-word check on a song with Brazilian comments.

        Every comment of the third passing song (loaded from SAVEFILE) is
        tested with ``english_filter.are_key_words_in_comment``. The song
        must have 40 unfiltered comments; the accepted ones are passed to
        the ``english_filter_3rd_song`` helper.
        """
        scraper = SongAggregateScraper()
        scraper.load_from_file(SAVEFILE)
        third_song = scraper.get_passing_songs()[2]

        accepted = [
            text for text in third_song.comments
            if english_filter.are_key_words_in_comment(text)
        ]

        self.assertEqual(len(third_song.comments), 40, "unfiltered length")
        self.english_filter_3rd_song(accepted)
Ejemplo n.º 8
0
    def test_length_filter(self):
        """Apply the length filter to a song whose comments include lyrics.

        The ninth passing song from SAVEFILE is filtered for English first
        (26 comments asserted); the length filter is then run on that result
        (20 comments asserted).
        """
        scraper = SongAggregateScraper()
        scraper.load_from_file(SAVEFILE)
        ninth_song = scraper.get_passing_songs()[8]

        english_only = fltr.run_filters([english_filter], ninth_song.comments)
        length_limited = fltr.run_filters([length_filter], english_only)

        self.assertEqual(26, len(english_only), "length unfiltered quantity")
        self.assertEqual(20, len(length_limited), "length filtered quantity")
# Project Libraries
import filter as fltr
from song_aggregate import SongAggregateScraper
from five_million import random_songs

# General Libraries
""" Scrapes comments from a subset of 5 million YouTube song dataset """

# Path passed as the first argument of scrape_from_echonest below.
# NOTE(review): relative path — presumably resolved against the current
# working directory; confirm before running from another location.
BASEDIR = '../../../../Thesis/data_sample/W/D'
# Values passed to both scrape calls below as the comment-count arguments.
MIN_NUM_COMMENTS = 40
MAX_NUM_COMMENTS = 80
# Passed to random_songs and as the `limit` keyword of scrape_from_echonest.
NUM_SONGS = 100
filter_list = [fltr.english_filter]

# Run 1: scrape the video ids returned by random_songs(NUM_SONGS).
video_ids = random_songs(NUM_SONGS)

aggr_5m = SongAggregateScraper()
aggr_5m.scrape_video_ids(video_ids, MIN_NUM_COMMENTS, MAX_NUM_COMMENTS,
                         filter_list)

# Run 2: scrape via scrape_from_echonest using BASEDIR, with the same
# comment-count values and filter list.
aggr_echo = SongAggregateScraper()
aggr_echo.scrape_from_echonest(BASEDIR,
                               MIN_NUM_COMMENTS,
                               MAX_NUM_COMMENTS,
                               filter_list,
                               limit=NUM_SONGS)

# Print the summary of each run, in the order the runs were made.
aggr_5m.print_summary()
aggr_echo.print_summary()
    def test_save_and_load(self):
        """Round-trip a scraper through save_to_file and load_from_file.

        A scraper is filled via ``scrape_from_echonest`` and saved to
        'test_file'. A second, fresh scraper must start with
        ``max_num_comments`` set to None and, after loading that file, must
        match the first scraper on ``max_num_comments``, ``aggregate[3]``,
        and the length and first entry of the first passing song's comments.
        """
        aggr = SongAggregateScraper()
        aggr.scrape_from_echonest(BASEDIR,
                                  MIN_NUM_COMMENTS,
                                  MAX_NUM_COMMENTS,
                                  FILTER_LIST,
                                  limit=LIMIT)
        good_songs = aggr.get_passing_songs()
        aggr.save_to_file('test_file')

        aggr2 = SongAggregateScraper()
        # assertIsNone shows the unexpected value on failure, unlike
        # assertTrue applied to an `is None` comparison.
        self.assertIsNone(aggr2.max_num_comments, "aggr2 is empty")

        aggr2.load_from_file('test_file')
        recover_good_songs = aggr2.get_passing_songs()
        self.assertEqual(aggr.max_num_comments, aggr2.max_num_comments,
                         "metadata preserved")
        self.assertEqual(aggr.aggregate[3], aggr2.aggregate[3],
                         "aggregate statistics preserved")
        self.assertEqual(len(good_songs[0].comments),
                         len(recover_good_songs[0].comments),
                         "passing songs preserved")
        self.assertEqual(good_songs[0].comments[0],
                         recover_good_songs[0].comments[0],
                         "passing songs comments preserved")