def test_process_release_info(mocker):
    """Release info for IMDb id '0133093' carries both dates and no original title.

    NOTE(review): the ``mocker`` fixture is requested but not used in the
    visible body -- presumably the scraper's data source is meant to be
    patched here; confirm.
    """
    scraper = IMDBScraper()
    release_info = scraper.process_release_info('0133093')

    assert isinstance(release_info, dict)
    expected_dates = (
        ('original_release_date', dt.datetime(1999, 3, 31)),
        ('dutch_release_date', dt.datetime(1999, 6, 17)),
    )
    for key, when in expected_dates:
        assert release_info[key] == arrow.get(when)
    assert release_info['original_title'] is None
def test_process_release_info_with_original_title(mocker):
    """Release info exposes the original and English titles next to the dates.

    NOTE(review): the ``mocker`` fixture is requested but not used in the
    visible body, and the IMDb id is the same one used by the test that
    expects no original title -- presumably the underlying data is meant to
    be patched per test; confirm.
    """
    scraper = IMDBScraper()
    release_info = scraper.process_release_info('0133093')

    assert isinstance(release_info, dict)
    expected_dates = (
        ('original_release_date', dt.datetime(1999, 3, 31)),
        ('dutch_release_date', dt.datetime(1999, 6, 17)),
    )
    for key, when in expected_dates:
        assert release_info[key] == arrow.get(when)
    expected_titles = (
        ('original_title', 'Intouchables'),
        ('english_title', 'The Intouchables'),
    )
    for key, title in expected_titles:
        assert release_info[key] == title
def test_process_title():
    """An AKA string 'Country (tag)::Title' is split into country, title and tag."""
    scraper = IMDBScraper()
    cases = (
        ('Argentina::Matrix',
         {'country': 'Argentina', 'title': 'Matrix', 'tag': None}),
        ('Canada (French title)::La matrice',
         {'country': 'Canada', 'title': 'La matrice', 'tag': 'French title'}),
        ('Finland (video box title)::Matrix',
         {'country': 'Finland', 'title': 'Matrix', 'tag': 'video box title'}),
    )
    for raw_title, expected in cases:
        assert scraper.process_title(raw_title) == expected
def test_get_movie_info_taglines_present():
    """Taglines are returned as a list of strings under the 'taglines' key.

    NOTE(review): another test with this exact name (expecting ``None``)
    exists in this file; if both live in the same module only the later
    definition is collected by pytest -- confirm and rename one of them.
    """
    scraper = IMDBScraper()
    taglines_info = scraper.process_taglines_info('0123456')

    assert isinstance(taglines_info, dict)
    expected_taglines = [
        'Free your mind',
        'In a world of 1s and 0s...are you a zero, or The One?',
    ]
    assert taglines_info['taglines'] == expected_taglines
def test_refresh_movie():
    """Refreshing a movie queries the main info once and updates the movie in place.

    NOTE(review): ``call_count`` and ``call_args_list`` only exist when
    ``process_main_info`` has been replaced by a mock; no patching is
    visible in this body -- confirm where the mock is installed.
    """
    scraper = IMDBScraper()
    matrix = Movie({'crit_id': 1234, 'imdbid': 133093})

    scraper.refresh_movie(matrix)

    # The numeric imdbid is expected to reach the scraper as a zero-padded string.
    assert scraper.process_main_info.call_count == 1
    assert scraper.process_main_info.call_args_list[0][0] == ('0133093', )
    # Values checked on the same Movie object that was passed in.
    assert matrix.imdb_votes == 2000
    assert matrix.crit_id == 1234
    assert matrix.imdb_title == 'The Matrix'
def test_process_vote_details_info():
    """Vote details are keyed by demographic, each with a vote count and a rating.

    NOTE(review): another test with this exact name (expecting ``None``)
    exists in this file; if both live in the same module only the later
    definition is collected by pytest -- confirm and rename one of them.
    """
    scraper = IMDBScraper()
    vote_info = scraper.process_vote_details_info('0133093')

    assert isinstance(vote_info, dict)
    expected_by_group = (
        ('imdb users', {'votes': 1379920, 'rating': 8.7}),
        ('non us users', {'votes': 604641, 'rating': 8.7}),
    )
    for group, expected in expected_by_group:
        assert vote_info['vote_details'][group] == expected
def test_person_to_dict():
    """A person object is flattened into name, canonical name and person id."""

    class Person:
        """Bare stand-in carrying only the attributes set below."""

    scraper = IMDBScraper()
    tijl = Person()
    tijl.data = {'name': 'Tijl Kindt'}
    tijl.personID = 1234

    expected = {
        'canonical_name': 'Kindt, Tijl',
        'name': 'Tijl Kindt',
        'person_id': 1234,
    }
    assert scraper.person_to_dict(tijl) == expected
def test_parse_store_load_persons():
    """Cast, director and writer survive a store/reload round trip through the test schema."""
    create_test_tables()
    writer_db = MySQLDatabase(schema='qmdb_test', env='test')
    scraper = IMDBScraper()

    logan_info = scraper.process_main_info('3315342')
    # Add the Criticker-side fields on top of the scraped IMDb info.
    logan_info.update(
        crit_id=1234,
        title='Logan',
        year=2017,
        crit_url='http://www.criticker.com/film/Logan/',
        date_added='2018-01-01',
    )
    writer_db.set_movie(Movie(logan_info))

    # A second database object is created so the assertions run against
    # that new object rather than the one used for writing.
    reader_db = MySQLDatabase(schema='qmdb_test', env='test')
    stored = reader_db.movies[1234]
    assert stored.cast[0]['name'] == 'Hugh Jackman'
    assert stored.director[0]['name'] == 'James Mangold'
    assert stored.writer[1]['name'] == 'Scott Frank'
def test_process_main_info():
    """Main info for IMDb id '0133093' contains the expected core fields."""
    scraper = IMDBScraper()
    main_info = scraper.process_main_info('0133093')

    assert isinstance(main_info, dict)
    assert main_info['imdb_year'] == 1999
    assert main_info['kind'] == 'movie'

    # For each people list: the expected name of the first entry and the list length.
    people = (
        ('cast', 'Keanu Reeves', 39),
        ('director', 'Lana Wachowski', 2),
        ('writer', 'Lilly Wachowski', 2),
    )
    for role, first_name, count in people:
        assert main_info[role][0]['name'] == first_name
        assert len(main_info[role]) == count

    scalar_fields = (
        ('genres', ['Action', 'Sci-Fi']),
        ('runtime', 136),
        ('countries', ['United States']),
        ('imdb_rating', 8.7),
        ('imdb_votes', 1379790),
    )
    for key, expected in scalar_fields:
        assert main_info[key] == expected

    assert main_info['plot_storyline'][:10] == 'Thomas A. '
    assert main_info['languages'] == ['English']
def test_remove_duplicate_dicts():
    """Repeated dicts are dropped while the first occurrences keep their order."""
    records = [
        {'a': 3, 'b': 4},
        {'a': 1, 'b': 2},
        {'a': 3, 'b': 4},
        {'a': 1, 'b': 2},
        {'a': 5, 'b': 6},
    ]
    scraper = IMDBScraper()

    deduplicated = scraper.remove_duplicate_dicts(records)

    assert deduplicated == [{'a': 3, 'b': 4}, {'a': 1, 'b': 2}, {'a': 5, 'b': 6}]
def __init__(self, **kwargs):
    """Set up sources, update-period multipliers, scrapers and rate limits.

    Any keyword argument overrides (or adds) an entry of
    ``self.multipliers`` before the per-source multipliers are derived.
    """
    self.sources = [
        'criticker', 'omdb', 'imdb_main', 'imdb_release',
        'imdb_metacritic', 'imdb_keywords', 'imdb_taglines',
        'imdb_vote_details', 'imdb_plot', 'ptp'
    ]

    multipliers = {
        'base_multiplier': 1,
        'base_multiplier_criticker': 1,
        'base_multiplier_omdb': 10,
        'base_multiplier_imdb_main': 1,
        'base_multiplier_imdb_release': 10,
        'base_multiplier_imdb_metacritic': 5,
        'base_multiplier_imdb_keywords': 10,
        'base_multiplier_imdb_taglines': 10,
        'base_multiplier_imdb_vote_details': 2,
        'base_multiplier_imdb_plot': 10,
        'base_multiplier_ptp': 1,
        'firsttime_speedup': 50000
    }
    multipliers.update(kwargs)
    self.multipliers = multipliers

    # Derive, per source, the regular multiplier and the much smaller one
    # used when the source has never been fetched for a movie.
    for name in self.sources:
        regular = multipliers['base_multiplier_' + name] * multipliers['base_multiplier']
        multipliers['multiplier_' + name] = regular
        multipliers['multiplier_' + name + '_firsttime'] = \
            regular / multipliers['firsttime_speedup']

    self.crit_scraper = CritickerScraper()
    self.omdb_scraper = OMDBScraper()
    self.imdb_scraper = IMDBScraper()
    self.ptp_scraper = PassThePopcornScraper()

    # Filled in later from the database contents.
    self.years = None
    self.crit_pop = None
    self.earliest_date_added = None

    self.max_connections_per_hour = {
        'criticker': 400,
        'omdb': 40,
        'imdb': 800,
        'ptp': 400
    }
def test_get_english_original_title():
    """The English and original titles are picked out of a list of AKA strings."""
    scraper = IMDBScraper()

    # Each case: (AKA list, expected English title, expected original title).
    cases = (
        ([
            'Argentina::Matrix', 'Belgium (French title)::Matrix',
            'Finland (video box title)::Matrix',
            'Japan (English title)::Matrix',
            'Panama (alternative title)::La matriz'
        ], None, None),
        ([
            '(original title)::Intouchables', 'Argentina::Amigos intocables',
            'Bulgaria (Bulgarian title)::Недосегаемите',
            'Europe (festival title) (English title)::Untouchable',
            'World-wide::The Intouchables'
        ], 'The Intouchables', 'Intouchables'),
        ([
            '(original title)::Le fils', 'Argentina::El hijo',
            'Bulgaria (Bulgarian title)::Синът',
            'World-wide (English title)::The Son'
        ], 'The Son', 'Le fils'),
        ([
            '(original title)::Teströl és lélekröl',
            'Argentina::En cuerpo y alma',
            'Bulgaria (Bulgarian title)::За тялото и душата',
            'World-wide (English title)::On Body and Soul'
        ], 'On Body and Soul', 'Teströl és lélekröl'),
        ([
            '(original title)::De lift', 'Argentina::El ascensor',
            'Philippines (English title)::The Lift',
            'Soviet Union (Russian title)::Лифт', 'USA::The Lift'
        ], 'The Lift', 'De lift'),
    )

    for akas, expected_english, expected_original in cases:
        english_title, original_title = scraper.get_english_original_title(akas)
        if expected_english is None:
            assert english_title is None
        else:
            assert english_title == expected_english
        if expected_original is None:
            assert original_title is None
        else:
            assert original_title == expected_original
def test_process_release_date():
    """A 'Country::date (tag)...' string is parsed into country, date and tags."""
    scraper = IMDBScraper()

    # Each case: (raw string, country, release date, tags or None).
    cases = (
        ('Czech Republic::5 August 1999',
         'Czech Republic', dt.datetime(1999, 8, 5), None),
        ('Argentina::2 October 2003 (re-release)',
         'Argentina', dt.datetime(2003, 10, 2), ['re-release']),
        ('Portugal::9 June 1999 (Oporto)\n (premiere)',
         'Portugal', dt.datetime(1999, 6, 9), ['Oporto', 'premiere']),
        ('USA::24 March 1999 (Westwood, California)\n (premiere)',
         'USA', dt.datetime(1999, 3, 24), ['Westwood, California', 'premiere']),
    )

    for position, (raw, country, released, tags) in enumerate(cases):
        parsed = scraper.process_release_date(raw)
        if position == 0:
            # The container type is only checked for the first case.
            assert isinstance(parsed, dict)
        assert parsed['country'] == country
        assert parsed['date'] == arrow.get(released)
        if tags is None:
            assert parsed['tags'] is None
        else:
            assert parsed['tags'] == tags
class Updater(object):
    """Decide when each movie's data sources are due for a refresh and run them.

    Update periods are expressed in weeks (they are passed to
    ``shift(weeks=...)``). A base period is derived per movie from its
    release year and Criticker popularity and then scaled per source by
    the entries of ``self.multipliers``.
    """

    def __init__(self, **kwargs):
        """Set up sources, multipliers, scrapers and per-hour connection caps.

        Args:
            **kwargs: Overrides (or additions) for ``self.multipliers``,
                applied before the per-source multipliers are derived.
        """
        self.sources = [
            'criticker', 'omdb', 'imdb_main', 'imdb_release',
            'imdb_metacritic', 'imdb_keywords', 'imdb_taglines',
            'imdb_vote_details', 'imdb_plot', 'ptp'
        ]
        self.multipliers = {
            'base_multiplier': 1,
            'base_multiplier_criticker': 1,
            'base_multiplier_omdb': 10,
            'base_multiplier_imdb_main': 1,
            'base_multiplier_imdb_release': 10,
            'base_multiplier_imdb_metacritic': 5,
            'base_multiplier_imdb_keywords': 10,
            'base_multiplier_imdb_taglines': 10,
            'base_multiplier_imdb_vote_details': 2,
            'base_multiplier_imdb_plot': 10,
            'base_multiplier_ptp': 1,
            'firsttime_speedup': 50000
        }
        self.multipliers.update(kwargs)
        # Per source: the regular multiplier, and a much smaller one that
        # applies while the source has never been fetched for a movie.
        for source in self.sources:
            regular = (self.multipliers['base_multiplier_' + source] *
                       self.multipliers['base_multiplier'])
            self.multipliers['multiplier_' + source] = regular
            self.multipliers['multiplier_' + source + '_firsttime'] = \
                regular / self.multipliers['firsttime_speedup']
        self.crit_scraper = CritickerScraper()
        self.omdb_scraper = OMDBScraper()
        self.imdb_scraper = IMDBScraper()
        self.ptp_scraper = PassThePopcornScraper()
        # Populated by get_movies_stats().
        self.years = None
        self.crit_pop = None
        self.earliest_date_added = None
        # Upper bound on scheduled updates per hour, keyed by source family
        # (all 'imdb_*' sources share the 'imdb' entry).
        self.max_connections_per_hour = {
            'criticker': 400,
            'omdb': 40,
            'imdb': 800,
            'ptp': 400
        }

    def update_movies(self, db, n=None, weibull_lambda=1.5):
        """Run the updates that are currently due, sleeping between them.

        Args:
            db: Database object exposing ``movies`` (mapping crit_id -> movie)
                and ``set_movie``.
            n: If given, only the first ``n`` scheduled updates are run.
            weibull_lambda: Shape parameter forwarded to the random jitter
                used when computing next-update times.
        """
        self.get_movies_stats(db)
        updates = self.get_all_next_updates(db, weibull_lambda=weibull_lambda)
        scheduled = []
        for source_family in ('criticker', 'omdb', 'imdb', 'ptp'):
            scheduled += self.get_source_update_sequence(updates, source_family)
        sorted_seq = sorted(scheduled, key=itemgetter('next_update'))
        sources_to_update = sorted_seq if n is None else sorted_seq[:n]
        for source_to_update in sources_to_update:
            movie = db.movies[source_to_update['crit_id']]
            # Always wait at least one second between two updates.
            time_to_sleep = max(
                1,
                (source_to_update['next_update'] - arrow.now()).total_seconds())
            last_updated = getattr(movie, source_to_update['source'] + '_updated')
            crit_popularity = movie.crit_popularity
            if crit_popularity is None:
                # Placeholder so the '{:.1f}' format below does not fail.
                crit_popularity = 5
            print(
                "{}: Updating {} info for '{}' ({}, popularity {:.1f}) {}. Last updated {}."
                .format(arrow.now().format('HH:mm:ss'),
                        source_to_update['source'],
                        movie.title,
                        movie.year,
                        crit_popularity,
                        arrow.now().shift(seconds=time_to_sleep).humanize(),
                        humanized_time(last_updated)))
            time.sleep(time_to_sleep)
            self.update_source(db, source_to_update)

    def _feature_stats(self, values):
        """Return min/median/max of *values* plus the fitted a/b parameters."""
        stats = {
            'min': np.min(values),
            'median': np.median(values),
            'max': np.max(values)
        }
        # Features are measured as the distance from the maximum.
        stats['b_parameter'] = self.b_parameter(stats['max'] - stats['median'],
                                                stats['max'] - stats['min'])
        stats['a_parameter'] = self.a_parameter(stats['max'] - stats['median'],
                                                stats['b_parameter'])
        return stats

    def get_movies_stats(self, db):
        """Compute release-year and popularity statistics over all movies.

        Stores the results in ``self.years``, ``self.crit_pop`` and
        ``self.earliest_date_added``. Movies without a Criticker
        popularity are left out of the popularity statistics.
        """
        years_numbers = [
            db.movies[crit_id].get_floating_release_year()
            for crit_id in db.movies
        ]
        years = self._feature_stats(years_numbers)
        crit_pop_nrs = [
            db.movies[crit_id].crit_popularity for crit_id in db.movies
            if db.movies[crit_id].crit_popularity is not None
        ]
        crit_pop = self._feature_stats(crit_pop_nrs)
        self.years = years
        self.crit_pop = crit_pop
        self.earliest_date_added = np.min(
            [db.movies[crit_id].date_added for crit_id in db.movies])

    @staticmethod
    def b_parameter(median_feature,
                    max_feature,
                    median_period=6,
                    max_period=36):
        """Return the exponent ``b`` of the score curve ``exp(a * feature ** b)``.

        Computed as ``log(log(max_period) / log(median_period)) /
        log(max_feature / median_feature)``.
        """
        return np.log(np.log(max_period) / np.log(median_period)) / np.log(
            max_feature / median_feature)

    @staticmethod
    def a_parameter(median_feature, b_parameter, median_period=8):
        """Return the scale ``a`` so the score at ``median_feature`` is ``median_period``.

        NOTE(review): the default ``median_period`` here is 8 while
        ``b_parameter`` defaults to 6; the two only describe the same
        curve when they agree -- confirm which value is intended.
        """
        return np.log(median_period) / np.power(median_feature, b_parameter)

    def get_all_next_updates(self, db, weibull_lambda=1.5):
        """Return the update records of all movies that are due now or earlier."""
        seq = []
        for _crit_id, movie in db.movies.items():
            seq.extend(
                self.calculate_next_updates(movie,
                                            weibull_lambda=weibull_lambda))
        now = arrow.now()
        return [u for u in seq if u['next_update'] <= now]

    def calculate_next_updates(self, movie, weibull_lambda=1.5):
        """Build one update record per applicable source for *movie*.

        Sources other than 'criticker' are skipped when the movie has no
        imdbid. Requires ``get_movies_stats`` to have been run.

        Returns:
            A list of dicts with the keys 'source', 'crit_id',
            'update_period', 'update_period_firsttime', 'next_update' and
            'actual_update_period'.
        """
        year_period_score = self.calculate_period_score(
            self.years['max'] - movie.year, self.years)
        # Movies without a popularity are treated as having the median one.
        crit_popularity = self.crit_pop[
            'median'] if movie.crit_popularity is None else movie.crit_popularity
        crit_pop_period_score = self.calculate_period_score(
            self.crit_pop['max'] - crit_popularity, self.crit_pop)
        base_update_period = self.calculate_update_period(
            year_period_score, crit_pop_period_score)
        records = []
        for source in self.sources:
            needs_imdbid = source in ('omdb', 'ptp') or source.startswith('imdb')
            if needs_imdbid and movie.imdbid is None:
                # Skip only this source; sources that do not need an imdbid
                # must still be scheduled wherever they appear in the list.
                continue
            record = {
                'source': source,
                'crit_id': movie.crit_id,
                'update_period':
                    base_update_period * self.multipliers['multiplier_' + source],
                'update_period_firsttime':
                    base_update_period *
                    self.multipliers['multiplier_' + source + '_firsttime'],
            }
            next_update, period = self.calculate_next_update(
                getattr(movie, source + '_updated'),
                record['update_period'],
                record['update_period_firsttime'],
                weibull_lambda=weibull_lambda)
            record['next_update'] = next_update
            record['actual_update_period'] = period
            records.append(record)
        return records

    @staticmethod
    def calculate_period_score(feature, stats):
        """Return ``exp(a * feature ** b)`` using the a/b parameters in *stats*."""
        return np.exp(stats['a_parameter'] *
                      np.power(feature, stats['b_parameter']))

    @staticmethod
    def calculate_update_period(year_period_score,
                                crit_pop_period_score,
                                year_power=2,
                                crit_pop_power=1):
        """Return the weighted geometric mean of the two period scores."""
        period = np.exp((year_power * np.log(year_period_score) +
                         crit_pop_power * np.log(crit_pop_period_score)) /
                        (year_power + crit_pop_power))
        return period

    def calculate_next_update(self,
                              date_updated,
                              period,
                              firsttime_period,
                              weibull_lambda=1.5,
                              min_period=500):
        """Return ``(next_update, period_used)`` for one source of one movie.

        Args:
            date_updated: When the source was last fetched, or None if never.
            period: Regular update period in weeks.
            firsttime_period: Period in weeks used when never fetched;
                counted from ``self.earliest_date_added``.
            weibull_lambda: Shape of the Weibull jitter.
            min_period: Despite its name this is applied with ``min`` and so
                acts as an upper cap (in weeks) on the period.
        """
        # Dividing by log(2) ** (1 / shape) scales the sample so that its
        # median is 1.
        weibull = (np.random.weibull(weibull_lambda, 1) /
                   np.power(np.log(2), 1 / weibull_lambda))[0]
        if date_updated is None:
            next_update = self.earliest_date_added.shift(
                weeks=min(firsttime_period * weibull, min_period))
            return next_update, min(firsttime_period, min_period)
        next_update = date_updated.shift(
            weeks=min(period * weibull, min_period))
        return next_update, min(period, min_period)

    def get_source_update_sequence(self, all_updates, source):
        """Spread the due updates of one source family over time.

        Selects the records whose 'source' starts with *source*, orders
        them by 'next_update' and overwrites 'next_update' in place with
        times starting from now, separated by exponentially distributed
        gaps whose mean respects ``self.max_connections_per_hour[source]``.

        Returns:
            The selected records in their new order (empty list if none).
        """
        updates = [u for u in all_updates if u['source'].startswith(source)]
        sorted_updates = sorted(updates, key=itemgetter('next_update'))
        if not sorted_updates:
            return sorted_updates
        uph = min(len(sorted_updates), self.max_connections_per_hour[source])
        print("Updates needed per hour for {}: {:.0f}".format(source, uph))
        update_intervals = np.random.exponential(3600 / uph,
                                                 len(sorted_updates))
        for i, update in enumerate(sorted_updates):
            # Each update is anchored on the previous one; the first on now.
            anchor = sorted_updates[i - 1]['next_update'] if i > 0 else arrow.now()
            update['next_update'] = anchor.shift(seconds=update_intervals[i])
        return sorted_updates

    def update_source(self, db, source_to_update):
        """Refresh one source for one movie and store the result if there is one."""
        movie = db.movies[source_to_update['crit_id']]
        source = source_to_update['source']
        if source == 'criticker':
            movie = self.crit_scraper.refresh_movie(movie)
        elif source == 'omdb':
            movie = self.omdb_scraper.refresh_movie(movie)
        elif source.startswith('imdb'):
            # 'imdb_<infoset>' -> '<infoset>'
            infoset = re.search(r'imdb_(.*)', source).groups()[0]
            movie = self.imdb_scraper.refresh_movie(movie, infoset=infoset)
        elif source == 'ptp':
            movie = self.ptp_scraper.refresh_movie(movie)
        if movie is not None:
            db.set_movie(movie)

    def update_movie_completely(self, db, imdbid):
        """Refresh the movie with the given imdbid from all four scrapers.

        Only the 'main' IMDb infoset is refreshed. Raises IndexError when
        no movie in ``db.movies`` has this imdbid.

        NOTE(review): each scraper's result is fed to the next one without
        a None check, although the final check suggests ``refresh_movie``
        can return None -- confirm whether the scrapers accept None.
        """
        movie = [
            db.movies[crit_id] for crit_id in db.movies
            if db.movies[crit_id].imdbid == imdbid
        ][0]
        movie = self.crit_scraper.refresh_movie(movie)
        movie = self.omdb_scraper.refresh_movie(movie)
        movie = self.imdb_scraper.refresh_movie(movie, infoset='main')
        movie = self.ptp_scraper.refresh_movie(movie)
        if movie is not None:
            db.set_movie(movie)
def test_parse_imdb_votes():
    """Vote strings are parsed to ints; None and unparseable input give None."""
    scraper = IMDBScraper()
    cases = (
        (None, None),
        ('1000', 1000),
        ('(2,345)', 2345),
        ('xdfw2,345)', None),
    )
    for raw_votes, expected in cases:
        parsed = scraper.parse_imdb_votes(raw_votes)
        if expected is None:
            assert parsed is None
        else:
            assert parsed == expected
def test_process_plot_info_present():
    """Plot info is a dict whose 'plot_summary' is None.

    NOTE(review): the name says "present" but the assertion expects
    ``None``, and a second test with this exact name (expecting a summary
    string for the same id) exists in this file; if both live in the same
    module only the later definition is collected by pytest -- confirm and
    rename, presumably to a "not present" variant.
    """
    scraper = IMDBScraper()
    plot_info = scraper.process_plot_info('0133093')

    assert isinstance(plot_info, dict)
    assert plot_info['plot_summary'] is None
def test_process_plot_info_present():
    """Plot info for IMDb id '0133093' carries the expected summary text.

    NOTE(review): another test with this exact name (expecting ``None``)
    exists in this file; if both live in the same module only the later
    definition is collected by pytest -- confirm and rename one of them.
    """
    scraper = IMDBScraper()
    plot_info = scraper.process_plot_info('0133093')

    expected_summary = (
        'A computer hacker learns from mysterious rebels about the true nature of his '
        'reality and his role in the war against its controllers.'
    )
    assert isinstance(plot_info, dict)
    assert plot_info['plot_summary'] == expected_summary
def test_process_metacritic_info_not_present():
    """Without a Metacritic score the result is a dict with a None score."""
    scraper = IMDBScraper()
    metacritic_info = scraper.process_metacritic_info('0123456')

    assert isinstance(metacritic_info, dict)
    assert metacritic_info['metacritic_score'] is None
def test_process_vote_details_info():
    """For IMDb id '1630029' the 'vote_details' entry is None.

    NOTE(review): another test with this exact name (expecting populated
    vote details) exists in this file; if both live in the same module
    only the later definition is collected by pytest -- confirm and rename
    one of them.
    """
    scraper = IMDBScraper()
    vote_info = scraper.process_vote_details_info('1630029')

    assert isinstance(vote_info, dict)
    assert vote_info['vote_details'] is None
def test_get_movie_info_taglines_present():
    """Taglines info is a dict whose 'taglines' entry is None.

    NOTE(review): the name says "present" but the assertion expects
    ``None``, and another test with this exact name (expecting a list of
    taglines for the same id) exists in this file; if both live in the
    same module only the later definition is collected by pytest --
    confirm and rename one of them.
    """
    scraper = IMDBScraper()
    taglines_info = scraper.process_taglines_info('0123456')

    assert isinstance(taglines_info, dict)
    assert taglines_info['taglines'] is None
def test_get_release_data():
    """The original and Dutch release dates are selected from raw release strings."""
    scraper = IMDBScraper()
    matrix_release_dates = load_obj(
        'imdb-release-dates-info-the-matrix')['data']['release dates']

    # Each case: (release strings, expected original date, expected Dutch date);
    # None means that no date is expected.
    cases = (
        (matrix_release_dates,
         dt.datetime(1999, 3, 31), dt.datetime(1999, 6, 17)),
        ([
            'USA::24 March 1999 (Westwood, California)\n (premiere)',
            'USA::28 March 1999 (Westwood, California)\n (limited)',
            'USA::30 March 1999'
        ], dt.datetime(1999, 3, 30), None),
        ([
            'Netherlands::4 April 1999 (premiere)',
            'USA::24 March 1999 (Westwood, California)\n (premiere)',
            'USA::28 March 1999 (Westwood, California)\n (limited)'
        ], dt.datetime(1999, 3, 28), dt.datetime(1999, 4, 4)),
        ([
            'Netherlands::4 April 1999 (premiere)',
            'Netherlands::8 April 1999', 'USA::25 March 1999'
        ], dt.datetime(1999, 3, 25), dt.datetime(1999, 4, 8)),
        ([
            'USA::24 March 1999 (Westwood, California)\n (premiere)',
            'Bangladesh::27 March 1999', 'Netherlands::5 April 1999 (limited)',
            'Netherlands::2 April 1999 (premiere)'
        ], dt.datetime(1999, 3, 27), dt.datetime(1999, 4, 5)),
        ([
            'Italy::21 December 1968', 'Italy::24 December 1968 (Turin)',
            'USA::28 May 1969 (New York, New York City)', 'USA::4 July 1969',
            'Netherlands::3 November 2016 (re-release)'
        ], dt.datetime(1968, 12, 21), None),
        (['Afghanistan::29 March 1999', 'Bangladesh::27 March 1999'],
         dt.datetime(1999, 3, 27), None),
        (None, None, None),
    )

    for release_info, expected_original, expected_dutch in cases:
        original_release_date, dutch_release_date = scraper.get_release_date(
            release_info)
        if expected_original is None:
            assert original_release_date is None
        else:
            assert original_release_date == arrow.get(expected_original)
        if expected_dutch is None:
            assert dutch_release_date is None
        else:
            assert dutch_release_date == arrow.get(expected_dutch)
def test_process_keywords_not_present():
    """Without keywords the result is a dict whose 'keywords' entry is None."""
    scraper = IMDBScraper()
    keywords_info = scraper.process_keywords_info('0123456')

    assert isinstance(keywords_info, dict)
    assert keywords_info['keywords'] is None
def test_process_keywords_present():
    """Keywords are returned as a list of strings under the 'keywords' key.

    NOTE(review): the IMDb id is the same one used by the "not present"
    keywords test -- presumably the underlying data is meant to be patched
    per test; confirm.
    """
    scraper = IMDBScraper()
    keywords_info = scraper.process_keywords_info('0123456')

    expected_keywords = ['artificial-reality', 'post-apocalypse']
    assert isinstance(keywords_info, dict)
    assert keywords_info['keywords'] == expected_keywords
def test_strip_author_from_plot():
    """A trailing '::author' is removed; a plot without one is returned unchanged."""
    scraper = IMDBScraper()
    plot = 'This is a plot.'
    for raw_plot in ('This is a plot.::Tijl Kindt', 'This is a plot.'):
        assert scraper.strip_author_from_plot(raw_plot) == plot
def test_process_metacritic_info_present():
    """Metacritic info for IMDb id '0133093' carries a score of 73."""
    scraper = IMDBScraper()
    metacritic_info = scraper.process_metacritic_info('0133093')

    assert isinstance(metacritic_info, dict)
    assert metacritic_info['metacritic_score'] == 73