def test_get_movielist_movie_attributes():
    """Verify the dict built from one ``<li>`` entry of a Criticker movie list.

    Three fixtures are parsed in turn. For each one the returned key set is
    checked; the first fixture is additionally checked field by field.
    """
    crit_scraper = CritickerScraper()
    base_keys = {'crit_id', 'crit_url', 'title', 'year', 'date_added'}

    def parse_first_item(fixture_path):
        # Only the first <li> element of the fixture is handed to the scraper.
        first_item = BeautifulSoup(read_file(fixture_path), "lxml").find('li')
        return crit_scraper.get_movielist_movie_attributes(first_item)

    # Fixture 1: the entry carries a 'psi' value under my_ratings.
    movie_info = parse_first_item(
        'test/fixtures/criticker-normal-movie-in-movie-list.html')
    assert set(movie_info.keys()) == base_keys | {'my_ratings'}
    assert movie_info['my_ratings']['tijl'] == {'psi': 55}
    assert movie_info['crit_id'] == 26496
    assert movie_info['crit_url'] == 'https://www.criticker.com/film/Issiz-adam/'
    assert movie_info['title'] == 'Issiz adam'
    assert movie_info['year'] == 2008
    assert 'date_added' in movie_info
    # date_added is expected to be stamped at parse time.
    assert arrow.get(movie_info['date_added']).humanize() == 'just now'

    # Fixture 2: the entry carries a 'rating' value instead of a 'psi'.
    movie_info = parse_first_item(
        'test/fixtures/criticker-rated-movie-in-movie-list.html')
    assert set(movie_info.keys()) == base_keys | {'my_ratings'}
    assert movie_info['my_ratings']['tijl'] == {'rating': 61}

    # Fixture 3: no 'my_ratings' key is produced at all.
    movie_info = parse_first_item(
        'test/fixtures/criticker-nopsi-movie-in-movie-list.html')
    assert set(movie_info.keys()) == base_keys
def test_fibonacci():
    """Spot-check ``CritickerScraper.fibonacci`` on a few small indices."""
    crit_scraper = CritickerScraper()
    # (index, expected value) pairs, checked in this order.
    expected_by_index = ((0, 0), (1, 1), (2, 1), (3, 2), (5, 5))
    for index, expected in expected_by_index:
        assert crit_scraper.fibonacci(index) == expected
def test_get_movie_info_no_votes():
    """The 16-Fathoms-Deep fixture must yield ``crit_votes == 0``."""
    movie_url = 'https://www.criticker.com/film/16-Fathoms-Deep/'
    crit_scraper = CritickerScraper()
    with requests_mock.mock() as mock_http:
        # Serve the saved page instead of hitting the network.
        mock_http.get(
            movie_url,
            text=read_file('test/fixtures/criticker-16-fathoms-deep.html'))
        movie_info = crit_scraper.get_movie_info(movie_url)
    assert movie_info.get('crit_votes') == 0
def test_get_movie_info_no_poster():
    """The 8-Tire-on-the-Ice fixture must yield no ``poster_url``."""
    movie_url = 'https://www.criticker.com/film/8-Tire-on-the-Ice/'
    crit_scraper = CritickerScraper()
    with requests_mock.mock() as mock_http:
        # Serve the saved page instead of hitting the network.
        mock_http.get(
            movie_url,
            text=read_file('test/fixtures/criticker-8-tire-on-the-ice.html'))
        movie_info = crit_scraper.get_movie_info(movie_url)
    assert movie_info.get('poster_url') is None
def test_get_movie_info_no_rating_of_my_own():
    """The The-Mask fixture must carry no personal ``rating`` for 'tijl'."""
    movie_url = 'http://www.criticker.com/film/The-Mask/'
    crit_scraper = CritickerScraper()
    with requests_mock.mock() as mock_http:
        # Serve the saved page instead of hitting the network.
        mock_http.get(
            movie_url,
            text=read_file('test/fixtures/criticker-the-mask.html'))
        movie_info = crit_scraper.get_movie_info(movie_url)
    # The 'tijl' entry itself must exist; only its 'rating' is absent/None.
    assert movie_info['my_ratings']['tijl'].get('rating') is None
def test_get_movie_info_no_trailer():
    """The Daens fixture must yield ``trailer_url`` present but ``None``."""
    movie_url = 'http://www.criticker.com/film/Daens/'
    crit_scraper = CritickerScraper()
    with requests_mock.mock() as mock_http:
        # Serve the saved page instead of hitting the network.
        mock_http.get(
            movie_url,
            text=read_file('test/fixtures/criticker-daens.html'))
        movie_info = crit_scraper.get_movie_info(movie_url)
    # Indexing (not .get) is deliberate: the key itself has to be there.
    assert movie_info['trailer_url'] is None
def test_get_movie_list_popularity_page(mocker):
    """Check the (movies, page count) pair returned for one list page.

    NOTE(review): the ``mocker`` fixture is requested but no patching is
    visible in this function; the expected values presumably rely on a stub
    set up elsewhere -- confirm.
    """
    page_url = 'https://www.criticker.com/films/?filter=n9zp9zf2000zor&p=1'
    crit_scraper = CritickerScraper()
    page_result = crit_scraper.get_movie_list_page(
        page_url, pagenr=1, popularity=9)
    movies, nr_pages = page_result
    assert nr_pages == 6
    assert len(movies) == 5
    assert movies[0] == {'crit_id': 1}
def test_get_movie_list_html():
    """Check the raw list items and page count parsed from a list page."""
    list_url = 'https://www.criticker.com/films/?filter=or&view=all'
    crit_scraper = CritickerScraper()
    with requests_mock.mock() as mock_http:
        # Serve the saved page instead of hitting the network.
        mock_http.get(
            list_url,
            text=read_file('test/fixtures/criticker-movie-list.html'))
        html_result = crit_scraper.get_movie_list_html(list_url)
    movie_list, nr_pages = html_result
    assert nr_pages == 2283
    assert len(movie_list) == 60
    # The items are returned unparsed, as a BeautifulSoup ResultSet of Tags.
    assert isinstance(movie_list, ResultSet)
    assert isinstance(movie_list[0], Tag)
def test_get_movies_of_popularity(mocker):
    """Check the concatenated result and the per-page calls that were made.

    NOTE(review): the assertions read ``call_count`` / ``call_args_list``
    from ``get_movie_list_popularity_page``, so that method must be a mock;
    the patching is not visible in this function -- confirm where it happens.
    """
    crit_scraper = CritickerScraper()
    movies = crit_scraper.get_movies_of_popularity(popularity=8, min_year=2000)
    assert movies == [1, 2, 3, 1, 2, 3, 1, 2, 3]

    page_fetcher = crit_scraper.get_movie_list_popularity_page
    assert page_fetcher.call_count == 4
    # Index [1] of a recorded call is its keyword-argument dict.
    first_call_kwargs = page_fetcher.call_args_list[0][1]
    assert first_call_kwargs == {'min_year': 2000, 'popularity': 8}
    second_call_kwargs = page_fetcher.call_args_list[1][1]
    assert second_call_kwargs == \
        {'min_year': 2000, 'popularity': 8, 'pagenr': 1}
def test_get_movies(mocker):
    """Check what ``get_movies`` saves and which popularity levels it walks.

    The test tables are created up front and are always removed again, even
    when an assertion or the scraper call fails, so that a failing run does
    not leave ``qmdb_test`` tables behind for the next test.

    NOTE(review): the assertions read ``call_args_list`` from
    ``db.save_movies`` and ``get_movies_of_popularity``, so both must be
    mocks; the patching is not visible in this function -- confirm.
    """
    create_test_tables()
    db = MySQLDatabase(schema='qmdb_test', env='test')
    try:
        crit_scraper = CritickerScraper()
        crit_scraper.get_movies(db, start_popularity=8)

        # First positional argument of the first save_movies call.
        save_movies_call_args = db.save_movies.call_args_list[0][0]
        assert save_movies_call_args[0] == [1, 1, 1]

        # Keyword arguments of the first three get_movies_of_popularity calls.
        popularity_calls = crit_scraper.get_movies_of_popularity.call_args_list
        assert popularity_calls[0][1] == \
            {'debug': False, 'min_year': 2013, 'popularity': 10}
        assert popularity_calls[1][1] == \
            {'debug': False, 'min_year': 2016, 'popularity': 9}
        assert popularity_calls[2][1] == \
            {'debug': False, 'min_year': 2018, 'popularity': 8}
    finally:
        # Cleanup must not depend on the assertions above passing.
        remove_test_tables(db)
def test_get_movie_info():
    """Check the fields scraped from the saved The-Matrix movie page."""
    movie_url = 'http://www.criticker.com/film/The-Matrix/'
    crit_scraper = CritickerScraper()
    with requests_mock.mock() as mock_http:
        # Serve the saved page instead of hitting the network.
        mock_http.get(
            movie_url,
            text=read_file('test/fixtures/criticker-the-matrix.html'))
        movie_info = crit_scraper.get_movie_info(movie_url)

    assert movie_info['poster_url'] == \
        'https://www.criticker.com/img/films/posters/The-Matrix.jpg'
    assert movie_info['imdbid'] == 133093
    # Rating and vote count are compared with a tolerance, not exactly.
    assert movie_info['crit_rating'] == pytest.approx(7.71, 0.3)
    assert movie_info['crit_votes'] == pytest.approx(27493, 1000)
    own_ratings = movie_info['my_ratings']['tijl']
    assert own_ratings['rating'] == 93
    assert own_ratings['psi'] == pytest.approx(80, 10)
    assert movie_info['trailer_url'] == \
        'https://www.youtube.com/watch?v=vKQi3bBA1y8'
def __init__(self, **kwargs):
    """Set up sources, update-period multipliers, scrapers and rate limits.

    Args:
        **kwargs: Overrides merged into ``self.multipliers`` before the
            derived ``multiplier_<source>`` entries are computed.
    """
    self.sources = [
        'criticker', 'omdb', 'imdb_main', 'imdb_release', 'imdb_metacritic',
        'imdb_keywords', 'imdb_taglines', 'imdb_vote_details', 'imdb_plot',
        'ptp'
    ]
    multipliers = {
        'base_multiplier': 1,
        'base_multiplier_criticker': 1,
        'base_multiplier_omdb': 10,
        'base_multiplier_imdb_main': 1,
        'base_multiplier_imdb_release': 10,
        'base_multiplier_imdb_metacritic': 5,
        'base_multiplier_imdb_keywords': 10,
        'base_multiplier_imdb_taglines': 10,
        'base_multiplier_imdb_vote_details': 2,
        'base_multiplier_imdb_plot': 10,
        'base_multiplier_ptp': 1,
        'firsttime_speedup': 50000
    }
    self.multipliers = multipliers
    multipliers.update(kwargs)

    # Derived entries: per-source multiplier scaled by the global one, plus a
    # much smaller "first time" variant (divided by firsttime_speedup).
    overall = multipliers['base_multiplier']
    speedup = multipliers['firsttime_speedup']
    for source in self.sources:
        regular = multipliers['base_multiplier_' + source] * overall
        multipliers['multiplier_' + source] = regular
        multipliers['multiplier_' + source + '_firsttime'] = regular / speedup

    self.crit_scraper = CritickerScraper()
    self.omdb_scraper = OMDBScraper()
    self.imdb_scraper = IMDBScraper()
    self.ptp_scraper = PassThePopcornScraper()

    # Filled in later from the database contents.
    self.years = None
    self.crit_pop = None
    self.earliest_date_added = None

    self.max_connections_per_hour = {
        'criticker': 400,
        'omdb': 40,
        'imdb': 800,
        'ptp': 400
    }
class Updater(object):
    """Decide which movies need refreshing from which source, and refresh them.

    For every movie an update period is derived from its release year and its
    Criticker popularity; a randomised next-update moment is drawn per source,
    the due updates are spread out per scraper, and then executed one by one.
    """

    def __init__(self, **kwargs):
        """Set up sources, update-period multipliers, scrapers and rate limits.

        Args:
            **kwargs: Overrides merged into ``self.multipliers`` before the
                derived ``multiplier_<source>`` entries are computed.
        """
        self.sources = [
            'criticker', 'omdb', 'imdb_main', 'imdb_release',
            'imdb_metacritic', 'imdb_keywords', 'imdb_taglines',
            'imdb_vote_details', 'imdb_plot', 'ptp'
        ]
        self.multipliers = {
            'base_multiplier': 1,
            'base_multiplier_criticker': 1,
            'base_multiplier_omdb': 10,
            'base_multiplier_imdb_main': 1,
            'base_multiplier_imdb_release': 10,
            'base_multiplier_imdb_metacritic': 5,
            'base_multiplier_imdb_keywords': 10,
            'base_multiplier_imdb_taglines': 10,
            'base_multiplier_imdb_vote_details': 2,
            'base_multiplier_imdb_plot': 10,
            'base_multiplier_ptp': 1,
            'firsttime_speedup': 50000
        }
        self.multipliers.update(kwargs)
        # Derived entries: per-source multiplier scaled by the global one,
        # plus a "first time" variant divided by firsttime_speedup.
        for source in self.sources:
            self.multipliers['multiplier_' + source] = \
                self.multipliers['base_multiplier_' + source] * self.multipliers['base_multiplier']
            self.multipliers['multiplier_' + source + '_firsttime'] = \
                self.multipliers['base_multiplier_' + source] * self.multipliers['base_multiplier'] / \
                self.multipliers['firsttime_speedup']
        self.crit_scraper = CritickerScraper()
        self.omdb_scraper = OMDBScraper()
        self.imdb_scraper = IMDBScraper()
        self.ptp_scraper = PassThePopcornScraper()
        # Filled in by get_movies_stats().
        self.years = None
        self.crit_pop = None
        self.earliest_date_added = None
        # Upper bound on scheduled updates per hour, keyed by scraper.
        self.max_connections_per_hour = {
            'criticker': 400,
            'omdb': 40,
            'imdb': 800,
            'ptp': 400
        }

    def update_movies(self, db, n=None, weibull_lambda=1.5):
        """Run the due updates, earliest first, sleeping until each is due.

        Args:
            db: Database object exposing ``movies`` (keyed by crit_id) and
                ``set_movie``.
            n: Maximum number of updates to run; ``None`` runs all due ones.
            weibull_lambda: Shape passed on to the random period draw.
        """
        self.get_movies_stats(db)
        updates = self.get_all_next_updates(db, weibull_lambda=weibull_lambda)
        # One spread-out sequence per scraper; 'imdb' covers all imdb_* sources.
        scheduled = []
        for scraper_name in ('criticker', 'omdb', 'imdb', 'ptp'):
            scheduled += self.get_source_update_sequence(updates, scraper_name)
        sorted_seq = sorted(scheduled, key=itemgetter('next_update'))
        # Slicing with n=None keeps the whole sequence.
        for source_to_update in sorted_seq[:n]:
            movie = db.movies[source_to_update['crit_id']]
            # Always wait at least one second between updates.
            time_to_sleep = max(
                1,
                (source_to_update['next_update'] - arrow.now()).total_seconds())
            last_updated = getattr(movie,
                                   source_to_update['source'] + '_updated')
            crit_popularity = movie.crit_popularity
            if crit_popularity is None:
                # Display-only fallback for the log line below.
                crit_popularity = 5
            print(
                "{}: Updating {} info for '{}' ({}, popularity {:.1f}) {}. Last updated {}."
                .format(arrow.now().format('HH:mm:ss'),
                        source_to_update['source'],
                        movie.title,
                        movie.year,
                        crit_popularity,
                        arrow.now().shift(seconds=time_to_sleep).humanize(),
                        humanized_time(last_updated)))
            time.sleep(time_to_sleep)
            self.update_source(db, source_to_update)

    def _feature_stats(self, values):
        """Return min/median/max of ``values`` plus the fitted a/b parameters."""
        stats = {
            'min': np.min(values),
            'median': np.median(values),
            'max': np.max(values)
        }
        stats['b_parameter'] = self.b_parameter(stats['max'] - stats['median'],
                                                stats['max'] - stats['min'])
        stats['a_parameter'] = self.a_parameter(stats['max'] - stats['median'],
                                                stats['b_parameter'])
        return stats

    def get_movies_stats(self, db):
        """Compute year and popularity statistics over all movies in ``db``.

        Stores the results in ``self.years``, ``self.crit_pop`` and
        ``self.earliest_date_added``. Movies without a Criticker popularity
        are left out of the popularity statistics.
        """
        years_numbers = [
            db.movies[crit_id].get_floating_release_year()
            for crit_id in db.movies
        ]
        crit_pop_nrs = [
            db.movies[crit_id].crit_popularity for crit_id in db.movies
            if db.movies[crit_id].crit_popularity is not None
        ]
        self.years = self._feature_stats(years_numbers)
        self.crit_pop = self._feature_stats(crit_pop_nrs)
        self.earliest_date_added = np.min(
            [db.movies[crit_id].date_added for crit_id in db.movies])

    @staticmethod
    def b_parameter(median_feature,
                    max_feature,
                    median_period=6,
                    max_period=36):
        """Exponent ``b`` of the period score ``exp(a * feature ** b)``.

        Chosen so that log(period) at ``max_feature`` relates to log(period)
        at ``median_feature`` as ``log(max_period)`` to ``log(median_period)``.
        """
        return np.log(np.log(max_period) / np.log(median_period)) / np.log(
            max_feature / median_feature)

    @staticmethod
    def a_parameter(median_feature, b_parameter, median_period=8):
        """Scale ``a`` such that the score at ``median_feature`` is ``median_period``.

        NOTE(review): the default ``median_period`` here is 8 while
        ``b_parameter`` defaults to 6 -- confirm whether that is intentional.
        """
        return np.log(median_period) / np.power(median_feature, b_parameter)

    def get_all_next_updates(self, db, weibull_lambda=1.5):
        """Return the update entries of all movies that are due by now."""
        seq = []
        for _crit_id, movie in db.movies.items():
            seq.extend(
                self.calculate_next_updates(movie,
                                            weibull_lambda=weibull_lambda))
        now = arrow.now()
        return [u for u in seq if u['next_update'] <= now]

    def calculate_next_updates(self, movie, weibull_lambda=1.5):
        """Build one update entry per applicable source for ``movie``.

        Sources other than 'criticker' need an IMDb id and are skipped when
        ``movie.imdbid`` is None. Each entry is a dict with the keys
        'source', 'crit_id', 'update_period', 'update_period_firsttime',
        'next_update' and 'actual_update_period'.

        Requires ``get_movies_stats`` to have been called first.
        """
        year_period_score = self.calculate_period_score(
            self.years['max'] - movie.year, self.years)
        # Movies without a popularity are treated as having the median one.
        crit_popularity = (self.crit_pop['median']
                           if movie.crit_popularity is None
                           else movie.crit_popularity)
        crit_pop_period_score = self.calculate_period_score(
            self.crit_pop['max'] - crit_popularity, self.crit_pop)
        base_update_period = self.calculate_update_period(
            year_period_score, crit_pop_period_score)

        entries = []
        for source in self.sources:
            needs_imdbid = (source in ('omdb', 'ptp')
                            or source.startswith('imdb'))
            if needs_imdbid and movie.imdbid is None:
                # Skip just this source; the remaining sources are still
                # considered regardless of their order in self.sources.
                continue
            entry = dict()
            entry['source'] = source
            entry['crit_id'] = movie.crit_id
            entry['update_period'] = \
                base_update_period * self.multipliers['multiplier_' + source]
            entry['update_period_firsttime'] = \
                base_update_period * self.multipliers['multiplier_' + source + '_firsttime']
            next_update, period = self.calculate_next_update(
                getattr(movie, source + '_updated'),
                entry['update_period'],
                entry['update_period_firsttime'],
                weibull_lambda=weibull_lambda)
            entry['next_update'] = next_update
            entry['actual_update_period'] = period
            entries.append(entry)
        return entries

    @staticmethod
    def calculate_period_score(feature, stats):
        """Return ``exp(a * feature ** b)`` using the a/b parameters in ``stats``."""
        return np.exp(stats['a_parameter'] *
                      np.power(feature, stats['b_parameter']))

    @staticmethod
    def calculate_update_period(year_period_score,
                                crit_pop_period_score,
                                year_power=2,
                                crit_pop_power=1):
        """Weighted geometric mean of the two period scores."""
        period = np.exp((year_power * np.log(year_period_score) +
                         crit_pop_power * np.log(crit_pop_period_score)) /
                        (year_power + crit_pop_power))
        return period

    def calculate_next_update(self,
                              date_updated,
                              period,
                              firsttime_period,
                              weibull_lambda=1.5,
                              min_period=500):
        """Draw the next update moment for one source of one movie.

        Args:
            date_updated: Moment of the last update, or None if never updated.
            period: Regular update period, in weeks.
            firsttime_period: Period used when never updated, in weeks,
                counted from ``self.earliest_date_added``.
            weibull_lambda: Shape of the Weibull draw.
            min_period: Despite its name this is applied as an upper bound
                (in weeks) on the period.

        Returns:
            Tuple ``(next_update, capped_period)``.
        """
        # Dividing by the Weibull median (ln 2) ** (1 / shape) gives a random
        # factor whose median is 1.
        weibull = (np.random.weibull(weibull_lambda, 1) /
                   np.power(np.log(2), 1 / weibull_lambda))[0]
        if date_updated is None:
            next_update = self.earliest_date_added.shift(
                weeks=min(firsttime_period * weibull, min_period))
            return next_update, min(firsttime_period, min_period)
        next_update = date_updated.shift(
            weeks=min(period * weibull, min_period))
        return next_update, min(period, min_period)

    def get_source_update_sequence(self, all_updates, source):
        """Spread the updates of one scraper out over time.

        Selects the entries whose 'source' starts with ``source``, orders
        them by 'next_update' and overwrites 'next_update' (in place) with
        moments starting from now, separated by exponentially distributed
        gaps sized to stay within ``max_connections_per_hour[source]``.

        Returns:
            The selected entries in their scheduled order.
        """
        updates = [u for u in all_updates if u['source'].startswith(source)]
        sorted_updates = sorted(updates, key=itemgetter('next_update'))
        if sorted_updates:
            uph = min(len(sorted_updates),
                      self.max_connections_per_hour[source])
            print("Updates needed per hour for {}: {:.0f}".format(source, uph))
            update_intervals = np.random.exponential(3600 / uph,
                                                     len(sorted_updates))
            # Each update is scheduled one random gap after the previous one.
            previous = arrow.now()
            for update, interval in zip(sorted_updates, update_intervals):
                previous = previous.shift(seconds=interval)
                update['next_update'] = previous
        return sorted_updates

    def update_source(self, db, source_to_update):
        """Refresh one movie from one source and store the result.

        Nothing is stored when the scraper returns None.
        """
        movie = db.movies[source_to_update['crit_id']]
        source = source_to_update['source']
        if source == 'criticker':
            movie = self.crit_scraper.refresh_movie(movie)
        elif source == 'omdb':
            movie = self.omdb_scraper.refresh_movie(movie)
        elif source.startswith('imdb'):
            # 'imdb_<infoset>' -> '<infoset>'
            infoset = re.search(r'imdb_(.*)', source).groups()[0]
            movie = self.imdb_scraper.refresh_movie(movie, infoset=infoset)
        elif source == 'ptp':
            movie = self.ptp_scraper.refresh_movie(movie)
        if movie is not None:
            db.set_movie(movie)

    def update_movie_completely(self, db, imdbid):
        """Refresh the movie with the given IMDb id from every scraper.

        Raises:
            IndexError: If no movie in ``db`` has this ``imdbid``.

        NOTE(review): each scraper receives the previous scraper's return
        value; ``update_source`` guards against a None result but this chain
        does not -- confirm the scrapers cope with None input.
        """
        movie = [
            db.movies[crit_id] for crit_id in db.movies
            if db.movies[crit_id].imdbid == imdbid
        ][0]
        movie = self.crit_scraper.refresh_movie(movie)
        movie = self.omdb_scraper.refresh_movie(movie)
        movie = self.imdb_scraper.refresh_movie(movie, infoset='main')
        movie = self.ptp_scraper.refresh_movie(movie)
        if movie is not None:
            db.set_movie(movie)
def test_get_year_from_movielist_title():
    """The year in trailing parentheses is extracted as an int."""
    scraper = CritickerScraper()
    extracted_year = scraper.get_year_from_movielist_title('The Matrix (1999)')
    assert extracted_year == 1999
def test_config_cookies():
    """A fresh scraper exposes its cookies as a dict containing 'uid2'."""
    cookies = CritickerScraper().cookies
    assert isinstance(cookies, dict)
    assert 'uid2' in cookies
from qmdb.database.database import MySQLDatabase
from qmdb.interfaces.omdb import OMDBScraper
from qmdb.interfaces.criticker import CritickerScraper
from qmdb.interfaces.updater import Updater
from qmdb.model.predictions import RatingModeler
from qmdb.interfaces.netflix import NetflixScraper
import time

# Long-running entry point: alternates between refreshing known movies and
# re-harvesting lists, ratings and predictions. Never returns.
# NOTE(review): the nesting below is reconstructed from a collapsed source
# line; the harvesting calls are assumed to follow the 12-hour inner loop
# rather than sit inside it -- confirm against the original file.
if __name__ == "__main__":
    db = MySQLDatabase(from_scratch=False)
    # NOTE(review): omdb_scraper is not used below; kept in case its
    # construction has side effects.
    omdb_scraper = OMDBScraper()
    crit_scraper = CritickerScraper(user='******')
    updater = Updater()
    modeler = RatingModeler(db)
    netflix_scraper = NetflixScraper(db)
    while True:
        print("\nRefreshing movie information from Criticker, IMDb and OMDB\n")
        time0 = time.time()
        # Keep running batches of 30 updates until 12 hours have passed.
        while time.time() - time0 <= 12 * 3600:
            updater.update_movies(db, n=30, weibull_lambda=3)
        # Then pick up new movies, Netflix genres, ratings and predictions.
        crit_scraper.get_movies(db, start_popularity=2)
        netflix_scraper.get_genre_ids()
        netflix_scraper.get_movies_for_genres()
        crit_scraper.get_ratings(db)
        modeler.get_predictions()