class Test_DBClient(unittest.TestCase):
    """Integration tests for ``DBClient`` backed by a throwaway ``test.db``.

    Every test gets a fresh client in ``setUp``; ``tearDown`` closes it and
    deletes the database file so tests do not see each other's rows.
    """

    def setUp(self):
        """Open a client on ``test.db`` and build a fully populated game."""
        self.dbc = DBClient('test.db')
        self.mock_ideal_game = {
            'Name': 'sample',
            'RawgID': 20,
            'Metacritic': 100,
            'Genres': 'Action',
            'Indie': False,
            'Presence': 83,
            'Platform': 'Windows',
            'Graphics': '4gb GPU',
            'Storage': '180gb',
            'Memory': '8gb',
            'RatingsBreakdown': '34/45/15',
            'ReleaseDate': 'January 14, 2020',
            'Soundtrack': False,
            'Franchise': None,
            'OriginalCost': '$39.99',
            'DiscountedCost': None,
            'Players': 'singleplayer, multiplayer',
            'Controller': True,
            'Languages': 'English, Mandarin',
            'ESRB': 'Teen',
            'Achievements': 55,
            'Publisher': 'idSoftware',
            'Description': 'lots of stuff',
            'Tags': 'Fun, Violent',
            'SteamURL':
                'https://store.steampowered.com/app/42700/?snr=1_5_9__205',
        }

    def tearDown(self):
        """Close the client and remove the database file."""
        self.dbc.close()
        os.remove('test.db')

    def _table_names(self):
        """Return the rows of table names currently in the database."""
        self.dbc.cursor.execute(
            "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name")
        return self.dbc.cursor.fetchall()

    def _add_mock_games(self, count):
        """Insert ``count`` copies of the mock game."""
        for _ in range(count):
            self.dbc.add_game(self.mock_ideal_game)

    def _game_ids(self):
        """Return the ``id`` value of every row in ``games``."""
        self.dbc.cursor.execute("SELECT id FROM games;")
        return [row[0] for row in self.dbc.cursor.fetchall()]

    def test_create_table(self):
        """``create_table`` registers a ``games`` table."""
        self.dbc.create_table()
        self.assertIn(('games', ), self._table_names(),
                      'game not found in tables')

    def test_drop_table(self):
        """``drop_table`` removes an existing ``games`` table."""
        # Create the table first; dropping a table that never existed would
        # make the final assertion pass without exercising drop_table.
        self.dbc.create_table()
        self.assertIn(('games', ), self._table_names(),
                      'game not found in tables')
        self.dbc.drop_table()
        self.assertNotIn(('games', ), self._table_names(),
                         'games table still present after drop')

    def test_add_game(self):
        """An added game can be read back from the ``games`` table."""
        self.dbc.create_table()
        self.dbc.add_game(self.mock_ideal_game)
        self.dbc.cursor.execute("SELECT * FROM games")
        res = self.dbc.cursor.fetchone()
        self.assertIn(self.mock_ideal_game['SteamURL'], res,
                      'game with url not in table')

    def test_get_game(self):
        """``get_game`` returns the row whose first column is the id."""
        idx = 1
        self.dbc.create_table()
        self.dbc.add_game(self.mock_ideal_game)
        self.assertIn(idx, self._game_ids(), 'game with idx not in table')
        game = self.dbc.get_game(idx)
        # Equality, not identity: int identity is an interpreter detail.
        self.assertEqual(game[0], idx, 'game idx does not match')

    def test_get_game_by_url(self):
        """``get_game_by_url`` returns the row carrying the given URL."""
        url = self.mock_ideal_game['SteamURL']
        self.dbc.create_table()
        self.dbc.add_game(self.mock_ideal_game)
        self.dbc.cursor.execute("SELECT SteamURL FROM games;")
        res = [features[0] for features in self.dbc.cursor.fetchall()]
        self.assertIn(url, res, 'game with url not in table')
        game = self.dbc.get_game_by_url(url)
        self.assertIn(url, game, 'returned game url does not match')

    def test_get_all_games(self):
        """``get_all_games`` returns one entry per inserted game."""
        self.dbc.create_table()
        num_games = 5
        self._add_mock_games(num_games)
        games = self.dbc.get_all_games()
        self.assertEqual(num_games, len(games),
                         'returned game length should equal num_games')

    def test_to_csv(self):
        """``to_csv`` writes one CSV row per stored game."""
        self.dbc.create_table()
        num_games = 5
        self._add_mock_games(num_games)
        try:
            self.dbc.to_csv('test.csv')
            test_csv_df = pd.read_csv('test.csv')
            self.assertEqual(num_games, test_csv_df.shape[0],
                             'test csv should have num_games rows')
        finally:
            # Clean up even when the export or the assertion fails.
            if os.path.exists('test.csv'):
                os.remove('test.csv')

    def test_delete_game(self):
        """``delete_game`` removes the row with the given id."""
        self.dbc.create_table()
        delete_idx = 2
        num_games = 5
        self._add_mock_games(num_games)
        self.assertIn(delete_idx, self._game_ids(),
                      'no game present with delete_idx')
        self.dbc.delete_game(delete_idx)
        self.assertNotIn(delete_idx, self._game_ids(),
                         'game with delete_idx still present')
class SteamCrawl:
    """Crawl the Steam search listing and store features for each game.

    Game URLs are read from a saved copy of the search page
    (``game_list.html``); features gathered for each URL are written to
    ``games.db`` through ``DBClient`` and exported to ``games.csv``.
    """

    def __init__(self):
        """Set up the HTTP session, search URL and the ``games`` table."""
        self.session = requests.Session()
        self.base_url = 'https://store.steampowered.com/search'
        self.base_url += '/?category1=998&supportedlang=english'
        self.urls = []
        self.dbc = DBClient('games.db')
        self.dbc.create_table()

    def crawl(self, fetch_urls=False):
        """Collect features for every listed game not yet in the database.

        Args:
            fetch_urls: When true, re-download the search page with a
                browser before parsing it; otherwise the previously saved
                ``game_list.html`` is parsed as-is.

        Returns:
            The string ``'finished'`` once all URLs were processed and the
            CSV export was written.
        """
        # get list of urls
        if fetch_urls:
            self.__download_urls_page_source()
        self.__parse_urls()

        # loop through list
        for url in tqdm(self.urls):
            # Skip (rather than abort the whole crawl) so an interrupted
            # run can be resumed past the games already stored.
            if self.__already_downloaded(url):
                continue

            # get features for each url
            steam_features = self.__get_steam_features(url)
            if steam_features is None:
                # Store page could not be fetched; leave the game out so a
                # later run can retry it.
                continue
            game = {}
            game.update(steam_features)
            game.update(self.__get_rawg_features(url) or {})

            # save features in db
            self.dbc.add_game(game)

        self.dbc.to_csv('games.csv')
        return 'finished'

    def __download_urls_page_source(self):
        """Scroll the search page to the bottom and save its HTML."""
        self.browser = webdriver.Safari()
        try:
            self.browser.get(self.base_url)
            self.__short_pause()
            last_height = self.browser.execute_script(
                "return document.body.scrollHeight")
            # Keep scrolling until the page height stops growing.
            while True:
                self.browser.execute_script(
                    "window.scrollTo(0, document.body.scrollHeight);")
                self.__short_pause()
                new_height = self.browser.execute_script(
                    "return document.body.scrollHeight")
                if new_height == last_height:
                    break
                last_height = new_height
            self.__save_game_list_source()
        finally:
            # Always end the driver session, even if scrolling failed.
            self.browser.quit()

    def __parse_urls(self):
        """Fill ``self.urls`` with the links found in the saved listing."""
        html = self.__load_game_list_source()
        soup = BeautifulSoup(html, 'html.parser')
        results = soup.find('div', id='search_results')
        if results is None:
            # No results container in the saved page: nothing to crawl.
            self.urls = []
            return
        # href=True drops anchors that carry no link target.
        self.urls = [a_tag.get('href')
                     for a_tag in results.find_all('a', href=True)]

    def __save_game_list_source(self):
        """Write the browser's current page source to ``game_list.html``."""
        with open("game_list.html", "w", encoding="utf-8") as f:
            f.write(self.browser.page_source)

    def __load_game_list_source(self):
        """Return the contents of ``game_list.html``.

        Raises:
            FileNotFoundError: If the listing was never downloaded.
        """
        with open("game_list.html", "r", encoding="utf-8") as f:
            return f.read()

    def __short_pause(self):
        """Sleep for a random duration between 0 and 3 seconds."""
        duration = random.uniform(0, 3)
        time.sleep(duration)

    def __already_downloaded(self, url):
        """Return whether the database already holds a game for ``url``."""
        return bool(self.dbc.get_game_by_url(url))

    def __get_steam_features(self, url):
        """Return the features stripped from the store page at ``url``.

        Returns ``None`` when no HTML could be obtained for the page.
        """
        sgi = SteamGameInfo()
        html = sgi.get_game_html(url)
        if not html:
            return None
        return sgi.strip_features(html)

    @staticmethod
    def _game_name_from_url(url):
        """Return the game name encoded in the sixth ``/`` segment of ``url``.

        Underscores become spaces and any query string is dropped. An empty
        string is returned when the URL has no such segment or the segment
        holds only a query string.
        """
        segments = url.split('/')
        if len(segments) <= 5:
            return ''
        slug = segments[5].partition('?')[0]
        return slug.replace('_', ' ')

    def __get_rawg_features(self, url):
        """Return RAWG features for the game named in ``url``.

        Returns an empty dict when no name can be derived from the URL or
        the lookup yields nothing.
        """
        name = self._game_name_from_url(url)
        if not name:
            return {}
        rawg = RAWG()
        features = rawg.get_game(name)
        return features or {}