def steamUsers(pbar=False):
    """Download the Steam "users online" 24h series and upsert it into MongoDB.

    Fetches https://store.steampowered.com/stats/userdata.json, which returns
    a series of [epoch_milliseconds, online_user_count] pairs, and stores one
    document per sample in the 'steamusers' collection of the 'steam' db.

    Parameters:
        pbar (bool): when True, render a console progress bar.
    """
    logging = common.setupLogging()
    try:
        logging.info("Running Steam Users Online")
        client = MongoClient(host=config.mongodb_ip, port=config.mongodb_port)
        db = client['steam']
        collection = db['steamusers']
        # unique indexes double as de-duplication: re-running simply upserts
        # the same samples instead of inserting duplicates
        collection.create_index("epochint", unique=True)
        collection.create_index("date", unique=True)

        # pull Steam online users over the last 24 hours
        # https://store.steampowered.com/stats/
        r = requests.get("https://store.steampowered.com/stats/userdata.json")
        if (r.ok):
            data = r.json()[0]['data']

            if (pbar):
                bar = progressbar.ProgressBar(max_value=len(data)).start()

            for i, users in enumerate(data):
                if (pbar):
                    bar.update(i + 1)
                # users[0] is epoch *milliseconds*; convert straight to a
                # naive UTC datetime.  BUG FIX: the previous
                # fromtimestamp(mktime(gmtime(...))) round-trip depended on
                # the host timezone and could be off by an hour around DST
                # transitions.  Integer-divide to keep the old behavior of
                # truncating to whole seconds.
                conv_time = datetime.datetime.utcfromtimestamp(
                    int(users[0]) // 1000)
                # update_one(upsert=True) keeps whatever information already exists
                collection.update_one({'epochint': int(users[0])}, {
                    '$set': {
                        'numberonlineusers': int(users[1]),
                        'date': conv_time
                    }
                }, upsert=True)

            if (pbar):
                bar.finish()

            logging.info("Finished downloading Steam users online.")
            logging.info("Downloaded: " + common.sizeof_fmt(len(r.content)))
            common.writeBandwidth(db, len(r.content))
        else:
            logging.error("status code: " + str(r.status_code))
    except Exception as e:
        # top-level guard: log and back off briefly so a scheduler can retry
        logging.error(str(e))
        time.sleep(1)
def downloadAllAppIDs(pbar=False):
    """Fetch the complete Steam applist (appid + name) and upsert it into MongoDB.

    Pulls the GetAppList v2 endpoint once and bulk-upserts every entry into
    the 'apps' collection, batching writes to keep round-trips down.

    Parameters:
        pbar (bool): when True, render a console progress bar.
    """
    logging = common.setupLogging()
    try:
        logging.info("Downloading All AppIDs")
        # downloads a list of every appid and name from the API
        # and stores in MongoDB collection
        client = MongoClient(host=config.mongodb_ip, port=config.mongodb_port)
        db = client['steam']
        apps_coll = db['apps']

        resp = requests.get("https://api.steampowered.com/ISteamApps/GetAppList/v0002/")
        if not resp.ok:
            logging.error("status code: " + str(resp.status_code))
        else:
            payload = resp.json()
            apps = payload['applist']['apps']
            total = len(apps)

            # create an index for appid, this vastly improves performance
            apps_coll.create_index("appid", unique=True)

            progress = None
            if pbar:
                progress = progressbar.ProgressBar(max_value=total).start()

            pending_ops = []
            for idx, entry in enumerate(apps):
                if progress:
                    progress.update(idx + 1)
                # UpdateOne(upsert=True) keeps whatever information already exists
                pending_ops.append(
                    UpdateOne({'appid': int(entry['appid'])},
                              {'$set': entry},
                              upsert=True))
                # flush in batches instead of one write per document
                if idx % 1000 == 0 or idx + 1 == total:
                    try:
                        apps_coll.bulk_write(pending_ops)
                        pending_ops = []
                    except BulkWriteError as bwe:
                        logging.error(bwe)

            if progress:
                progress.finish()

            logging.info("Finished downloading AppIDs.")
            logging.info("Downloaded: " + common.sizeof_fmt(len(resp.content)))
            common.writeBandwidth(db, len(resp.content))
    except Exception as e:
        # log and back off briefly so a scheduler can retry
        logging.error(str(e))
        time.sleep(1)
def updateOpenCritic(refresh_type="OLDEST", pbar=False):
    """Refresh OpenCritic game and review data for stored entries.

    For refresh_type "OLDEST", picks the 25 stalest documents (by 'date') in
    the 'opencritic' collection, re-fetches each game's detail record and its
    review list from the OpenCritic API, and upserts both back into Mongo.

    Parameters:
        refresh_type (str): selection strategy; only "OLDEST" is handled here.
        pbar (bool): when True, render a console progress bar.
    """
    logging = common.setupLogging()
    try:
        logging.info("Updating OpenCritic games via " + refresh_type)
        client = MongoClient(host=config.mongodb_ip, port=config.mongodb_port)
        db = client['steam']
        collection_oc = db['opencritic']
        # create an index for id, this vastly improves performance
        collection_oc.create_index("id", unique=True)
        collection_oc.create_index("date")
        collection_oc.create_index("steamId")

        if (refresh_type == "OLDEST"):
            # find a sampling of OpenCritic IDs to work on ordered by date
            # will run on the oldest entries first
            names_cur = collection_oc.aggregate([
                { "$match": {} },
                { "$sort": { "date": 1 } },  # oldest first
                { "$limit": 25 },
                { "$project": { "id": 1, "_id": 0 } }
            ])

        # convert cursor to Python list
        # NOTE(review): if refresh_type is not "OLDEST", names_cur is unbound
        # here and the outer except logs a NameError — confirm only "OLDEST"
        # is ever passed.
        to_update = []
        for item in names_cur:
            to_update.append(item['id'])

        if (pbar):
            bar = progressbar.ProgressBar(max_value=len(to_update)).start()

        bytes_downloaded = 0
        for i, oc_id in enumerate(to_update):
            if (pbar):
                bar.update(i + 1)
            try:
                # OpenCritic Game API e.g.
                # https://opencritic.com/api/game/7592
                r = requests.get(
                    requests.Request(
                        'GET',
                        "https://opencritic.com/api/game/" + str(oc_id)).prepare().url)
                if (r.ok):
                    data = r.json()
                    bytes_downloaded = bytes_downloaded + len(r.content)
                    oc = data
                    # add current datetimestamp
                    oc['date'] = datetime.datetime.utcnow()
                    # update_one will keep whatever information already exists
                    collection_oc.update_one({'id': int(oc['id'])},
                                             {'$set': oc},
                                             upsert=True)
                else:
                    logging.error("status code: " + str(r.status_code))
                    logging.error("opencritic game id: " + str(oc_id))

                # sleep for a bit, there's no information on API throttling
                time.sleep(2)  # seconds

                # grab review information which is a separate URL
                # e.g. https://opencritic.com/api/review/game/7592
                r = requests.get(
                    requests.Request(
                        'GET',
                        "https://opencritic.com/api/review/game/" + str(oc_id)).prepare().url)
                if (r.ok):
                    data = r.json()
                    bytes_downloaded = bytes_downloaded + len(r.content)
                    # NOTE(review): if the first (game detail) request failed,
                    # `oc` is unbound here and this raises NameError, which the
                    # inner except below logs — confirm this fallthrough is
                    # acceptable.
                    oc['Reviews'] = data
                    # update_one will keep whatever information already exists
                    collection_oc.update_one({'id': int(oc['id'])},
                                             {'$set': oc},
                                             upsert=True)
                else:
                    logging.error("status code: " + str(r.status_code))
                    logging.error("opencritic game id: " + str(oc_id))
            except Exception as e:
                # per-id guard so one bad entry doesn't stop the batch
                logging.error(str(e) + " - id: " + str(oc_id))

            # sleep for a bit, there's no information on API throttling
            time.sleep(2)  # seconds

        if (pbar):
            bar.finish()

        logging.info("Finished updating OpenCritic games via " + refresh_type)
        logging.info("Downloaded: " + common.sizeof_fmt(bytes_downloaded))
        common.writeBandwidth(db, bytes_downloaded)
    except Exception as e:
        logging.error(str(e))
        time.sleep(1)
import schedule, time

import common  # common.py
import steamtopgames, steamusers, steamreviews, updatepricehistory, refreshsteam, downloadappids, opencriticsearch, opencriticgames, twitchtopgames  # *.py files

logging = common.setupLogging()
logging.info("Starting steam-analysis")

# run a few things right off the bat since they run infrequently
steamusers.steamUsers()
downloadappids.downloadAllAppIDs()

# schedule items to run
schedule.every(15).minutes.do(steamtopgames.steamTopGames)
schedule.every(23).hours.do(steamusers.steamUsers)
schedule.every(1).hours.do(updatepricehistory.updatePriceHistory, "PARTIAL", False)
schedule.every(30).minutes.do(steamreviews.steamReviews)
schedule.every(3).hours.do(refreshsteam.refreshSteamAppIDs, "SAMPLING", False)
schedule.every(6).hours.do(refreshsteam.refreshSteamAppIDs, "MISSING", False)
schedule.every(24).hours.do(downloadappids.downloadAllAppIDs)
schedule.every(1).hours.do(opencriticsearch.updateOpenCritic, "PARTIAL", False)
schedule.every(1).hours.do(opencriticgames.updateOpenCritic, "OLDEST", False)
schedule.every(9).hours.do(twitchtopgames.updateTwitchTopGames, "TOP", False)

sec = 0
while True:
    schedule.run_pending()
    # every roughly 2 hours save the scheduling information to the log
    if sec % 7200 == 0:
        for job in schedule.jobs:
            logging.info(str(job))
    # BUG FIX: the loop previously neither slept nor advanced `sec`, so it
    # busy-waited at 100% CPU and `sec % 7200 == 0` was always true (the
    # schedule was logged on every iteration).  Tick once per second.
    sec = sec + 1
    time.sleep(1)
def updatePriceHistory(refresh_type="FULL", pbar=False):
    """Append current price snapshots for paid games/dlc to 'pricehistory'.

    Queries the Steam storefront appdetails API (price_overview filter) in
    batches of 20 appids and inserts one price document per returned app.

    Parameters:
        refresh_type (str):
            "FULL"    - run over every eligible appid, in random order.
            "PARTIAL" - drop the appids with the freshest price history until
                        roughly 1200 stale ones remain, and refresh those.
        pbar (bool): when True, render a console progress bar.
    """
    logging = common.setupLogging()
    try:
        logging.info("Updating Price History via " + refresh_type)
        client = MongoClient(host=config.mongodb_ip, port=config.mongodb_port)
        db = client['steam']
        collection_hist = db['pricehistory']
        collection_apps = db['apps']
        # create an index for appid, this vastly improves performance
        collection_hist.create_index("appid")
        collection_hist.create_index("date")

        # e.g.: CS Source
        # https://store.steampowered.com/api/appdetails?appids=240&cc=us&l=en
        # https://wiki.teamfortress.com/wiki/User:RJackson/StorefrontAPI#Known_methods
        # https://stackoverflow.com/questions/13784059/how-to-get-the-price-of-an-app-in-steam-webapi
        # find prices for all games and dlc
        to_update = collection_apps.distinct(
            "appid", {
                "updated_date": { "$exists": True },
                "type": { "$in": ["game", "dlc"] },
                "is_free": False,
                "price_overview": { "$exists": True },
                "failureCount": { "$exists": False }
            })

        if (refresh_type == "PARTIAL"):
            # sort by newest to oldest updated in pricehistory
            appid_dict = collection_hist.aggregate([
                { "$group": { "_id": "$appid", "maxDate": { "$max": "$date" } } },
                { "$sort": { "maxDate": -1 } }  # newest first
            ])
            for item in appid_dict:
                if len(to_update) == 1200:
                    break
                else:
                    if item['_id'] in to_update:
                        # remove this fairly "new" appid from our list items to run on and refresh
                        to_update.remove(item['_id'])

        if (pbar):
            bar = progressbar.ProgressBar(max_value=len(to_update)).start()

        if (refresh_type == "FULL"):
            # shuffle the appids so we hit new ones each time
            random.shuffle(to_update)  # in-place

        bytes_downloaded = 0
        appids = []
        for i, appid in enumerate(to_update):
            appids.append(appid)
            if (pbar):
                bar.update(i + 1)
            # run 20 or so at a time
            if ((i + 1) % 20 == 0 or (i + 1) == len(to_update)):
                # BUG FIX: pre-bind names referenced by the except handler
                # below; previously `value` (and `appids_str`) could be
                # unbound, so the handler itself raised NameError and the
                # whole run aborted via the outer except.
                appids_str = ""
                value = None
                try:
                    # create a comma-delimited string of appids
                    appids_str = ','.join(map(str, appids))
                    # https://github.com/BrakeValve/dataflow/issues/5
                    # e.g. https://store.steampowered.com/api/appdetails?appids=662400,833310,...&cc=us&l=en&filters=price_overview
                    r = requests.get(
                        "https://store.steampowered.com/api/appdetails?appids="
                        + appids_str + "&cc=us&l=en&filters=price_overview")
                    if (r.ok):
                        data = r.json()
                        bytes_downloaded = bytes_downloaded + len(r.content)

                        for k, value in data.items():
                            if (value["success"] is True):
                                if (value['data']):
                                    price_hist = value['data']['price_overview']
                                    # set the appid based on the key
                                    price_hist['appid'] = int(k)
                                    # add current datetimestamp
                                    price_hist['date'] = datetime.datetime.utcnow()
                                    # remove formatted values, not needed
                                    # if they ever get added to the database, this will remove them
                                    # db.getCollection('pricehistory').update({},{"$unset": {"initial_formatted":1, "final_formatted":1, "currency":1}}, {multi: true})
                                    price_hist.pop('initial_formatted', None)
                                    price_hist.pop('final_formatted', None)
                                    price_hist.pop('currency', None)
                                    collection_hist.insert_one(price_hist)
                                else:
                                    # No price_overview information returned, remove it from the entry
                                    # to prevent future unnecessary calls. This is also an indicator
                                    # of stale app information.
                                    collection_apps.update_one(
                                        {'appid': int(k)},
                                        {"$unset": { "price_overview": "" }})
                                    logging.info(
                                        "No price information returned for appid: "
                                        + str(k) + " - clearing app price info.")
                    else:
                        logging.error("status code: " + str(r.status_code))
                        logging.error("price history appids: " + appids_str)
                except Exception as e:
                    logging.error(
                        str(e) + " - appids: " + str(appids_str) + " - data: "
                        + str(value))

                appids = []
                # sleep for a bit, the API is throttled
                # limited to 200 requests every 5 minutes or so...
                # 10 requests every 10 seconds
                # 100,000 requests per day
                time.sleep(1.75)  # seconds

        if (pbar):
            bar.finish()

        logging.info("Finished updating price history via " + refresh_type)
        logging.info("Downloaded: " + common.sizeof_fmt(bytes_downloaded))
        common.writeBandwidth(db, bytes_downloaded)
    except Exception as e:
        logging.error(str(e))
        time.sleep(1)
def updateTwitchTopGames(refresh_type="TOP", pbar=False):
    """Snapshot the top Twitch games and their live streams into MongoDB.

    Pages through the Helix "top games" endpoint, fetches up to `num_streams`
    live streams per game, links each stream to a Steam appid when possible
    (via getSteamId), and inserts every stream document into
    'twitchhistorical'.

    Parameters:
        refresh_type (str): label used only for log messages.
        pbar (bool): when True, render a console progress bar.
    """
    logging = common.setupLogging()
    try:
        logging.info("Updating Twitch top games via " + refresh_type)
        client = MongoClient(host=config.mongodb_ip, port=config.mongodb_port)
        db = client['steam']
        collection_twitchhistorical = db['twitchhistorical']
        collection_apps = db['apps']
        # create an index for id, this vastly improves performance
        collection_twitchhistorical.create_index("id")
        collection_twitchhistorical.create_index("date")
        collection_twitchhistorical.create_index("steamId")

        # API page w/examples
        # https://dev.twitch.tv/docs/api/

        # grab the top X number of games on Twitch
        top_x = 100
        # number of results to return in each top games request
        first_x = 50
        # number of streams to return for each game, max 100
        num_streams = 100

        access_token = getTwitchToken(logging)

        if (pbar):
            bar = progressbar.ProgressBar(max_value=int(top_x * num_streams)).start()

        bytes_downloaded = 0
        game_rank = 1  # for game rank/order returned via Twitch
        i = 1  # for progress bar
        while (i < top_x * num_streams):
            try:
                # Twitch Top Games
                # https://dev.twitch.tv/docs/api/reference/#get-top-games
                params = {'first': first_x}
                if i != 1:
                    params = {'first': first_x, 'after': pagination}
                r = requests.get("https://api.twitch.tv/helix/games/top",
                                 headers={'Client-ID': config.twitch_client_id,
                                          'Authorization': "Bearer " + access_token},
                                 params=params)
                if (r.ok):
                    if (int(r.headers['Ratelimit-Remaining']) < 4):
                        logging.info("rate limit: " + r.headers['Ratelimit-Limit'])
                        logging.info("rate limit remaining: " + r.headers['Ratelimit-Remaining'])

                    data = r.json()
                    bytes_downloaded = bytes_downloaded + len(r.content)

                    # BUG FIX: use .get() so a missing/empty cursor (final
                    # page) reaches the intended `break` instead of raising
                    # KeyError, which the except below swallowed without
                    # advancing `i`, re-requesting the same page forever.
                    if (data.get('pagination', {}).get('cursor')):
                        pagination = data['pagination']['cursor']
                    else:
                        logging.error("Unable to find pagination cursor")
                        break  # out of while loop

                    for value in data['data']:
                        # add to our historical listing
                        # https://dev.twitch.tv/docs/api/reference/#get-streams
                        r_g = requests.get("https://api.twitch.tv/helix/streams",
                                           headers={'Client-ID': config.twitch_client_id,
                                                    'Authorization': "Bearer " + access_token},
                                           params={'first': num_streams,
                                                   'game_id': int(value['id'])})
                        if (r_g.ok):
                            if (int(r_g.headers['Ratelimit-Remaining']) < 4):
                                logging.info("rate limit: " + r_g.headers['Ratelimit-Limit'])
                                logging.info("rate limit remaining: " + r_g.headers['Ratelimit-Remaining'])
                            data_g = r_g.json()
                            for v in data_g['data']:
                                v['date'] = datetime.datetime.utcnow()
                                # thumbnails churn constantly; not worth storing
                                v.pop('thumbnail_url', None)
                                v['name'] = value['name']  # pull the game name from our top games listing
                                v['gamerank'] = game_rank
                                appid = getSteamId(value['name'], collection_apps)
                                if (appid):
                                    v['steamId'] = appid
                                collection_twitchhistorical.insert_one(v)
                                if (pbar):
                                    bar.update(i)
                                i = i + 1
                        else:
                            # BUG FIX: log the streams response's status code;
                            # this previously logged the top-games response's
                            # status (the 401 check below already used r_g).
                            logging.error("status code: " + str(r_g.status_code))
                            # check OAuth and tokens
                            if (r_g.status_code == 401):
                                sys.exit(1)
                        game_rank = game_rank + 1
                        # https://dev.twitch.tv/docs/api/guide/#rate-limits
                        time.sleep(2)  # seconds
                else:
                    logging.error("status code: " + str(r.status_code))
                    # check OAuth and tokens
                    if (r.status_code == 401):
                        sys.exit(1)

                # sleep for a bit
                # https://dev.twitch.tv/docs/api/guide/#rate-limits
                time.sleep(2)  # seconds
                # in some cases, there aren't the max number of streams for a game, thus we can jump ahead
                i = int(game_rank * num_streams)
            except Exception as e:
                logging.error(str(e))
                time.sleep(1)

        if (pbar):
            bar.finish()

        logging.info("Finished updating Twitch top games via " + refresh_type)
        logging.info("Downloaded: " + common.sizeof_fmt(bytes_downloaded))
        common.writeBandwidth(db, bytes_downloaded)
    except Exception as e:
        logging.error(str(e))
        time.sleep(1)
def steamReviews(pbar=False): logging = common.setupLogging() try: logging.info("Running Steam Reviews") client = MongoClient(host=config.mongodb_ip, port=config.mongodb_port) db = client['steam'] collection = db['apps'] to_update = collection.aggregate([ { "$match": { "type": { "$in": ["game", "dlc"] } } }, { "$sort": { "reviews.last_updated": 1 } }, # oldest first { "$limit": 50 }, { "$project": { "appid": 1, "_id": 0 } } ]) to_update = ([item['appid'] for item in to_update]) if (pbar): bar = progressbar.ProgressBar(max_value=len(to_update)).start() bytes_downloaded = 0 for i, appid in enumerate(to_update): if (pbar): bar.update(i + 1) #logging.info("Running on appid: " + str(appid)) r = requests.get( "https://store.steampowered.com/appreviewhistogram/" + str(appid) + "?l=english&review_score_preference=0") if (r.ok): bytes_downloaded = bytes_downloaded + len(r.content) data = r.json()['results'] # add current datetimestamp data['last_updated'] = datetime.datetime.utcnow() # convert Epoch seconds to UTC time # https://stackoverflow.com/questions/1697815/how-do-you-convert-a-python-time-struct-time-object-into-a-datetime-object if ('start_date' in data and data['start_date']): data['start_date'] = datetime.datetime.fromtimestamp( time.mktime( time.gmtime(round(float(data['start_date']))))) if ('end_date' in data and data['end_date']): data['end_date'] = datetime.datetime.fromtimestamp( time.mktime(time.gmtime(round(float( data['end_date']))))) if ('recent_events' in data): for k, event in enumerate(data['recent_events']): if (event['start_date']): data['recent_events'][k][ 'start_date'] = datetime.datetime.fromtimestamp( time.mktime( time.gmtime( round(float( event['start_date']))))) data['recent_events'][k][ 'end_date'] = datetime.datetime.fromtimestamp( time.mktime( time.gmtime( round(float(event['end_date']))))) if ('rollups' in data): for k, event in enumerate(data['rollups']): if (event['date']): data['rollups'][k][ 'date'] = datetime.datetime.fromtimestamp( 
time.mktime( time.gmtime(round(float( event['date']))))) if ('recent' in data): for k, event in enumerate(data['recent']): if (event['date']): data['recent'][k][ 'date'] = datetime.datetime.fromtimestamp( time.mktime( time.gmtime(round(float( event['date']))))) #update_one will keep whatever information already exists collection.update_one({'appid': int(appid)}, {'$set': { 'reviews': data }}, upsert=True) else: logging.error("status code: " + str(r.status_code)) if (pbar): bar.update(i + 1) time.sleep(1) if (pbar): bar.finish() logging.info("Finished downloading Steam reviews.") logging.info("Downloaded: " + common.sizeof_fmt(bytes_downloaded)) common.writeBandwidth(db, bytes_downloaded) except Exception as e: logging.error(str(e)) time.sleep(3)
def steamTopGames(pbar=False):
    """Scrape the Steam top-100 stats page and store one snapshot per game.

    Parses https://store.steampowered.com/stats/ and inserts a document per
    table row (game, link, appid, current players, today's peak) into the
    'topgames' collection, all stamped with the same capture time.

    Parameters:
        pbar (bool): when True, render a console progress bar.
    """
    logging = common.setupLogging()
    try:
        logging.info("Running Steam Top Games")
        client = MongoClient(host=config.mongodb_ip, port=config.mongodb_port)
        db = client['steam']
        top_coll = db['topgames']
        top_coll.create_index("appid", unique=False)
        top_coll.create_index("date", unique=False)

        # pull Steam top 100 games
        # https://store.steampowered.com/stats/
        # also see here for historical charting using the same data
        # https://steamcharts.com/
        resp = requests.get("https://store.steampowered.com/stats/")
        if not resp.ok:
            logging.error("status code: " + str(resp.status_code))
        else:
            parsed = BeautifulSoup(resp.text, 'html.parser')
            table_rows = parsed.find_all('tr', class_="player_count_row")

            progress = None
            if pbar:
                progress = progressbar.ProgressBar(max_value=len(table_rows)).start()

            # one shared timestamp so the whole snapshot groups together
            snapshot_time = datetime.datetime.utcnow()
            for idx, table_row in enumerate(table_rows):
                if progress:
                    progress.update(idx + 1)

                doc = {'date': snapshot_time}

                anchors = table_row.find_all('a', class_="gameLink")
                doc['game'] = anchors[0].text
                doc['link'] = anchors[0].get('href')

                # pull the numeric appid out of the store URL
                match = re.search(r'\/app\/(\d*)', anchors[0].get('href'), re.I)
                if match and match.group(1):
                    doc['appid'] = match.group(1)
                else:
                    logging.info("No appID found in URL: " + anchors[0].get('href'))

                counts = table_row.find_all('span', class_="currentServers")
                doc['currentplayers'] = int(counts[0].text.replace(",", ""))
                doc['peaktoday'] = int(counts[1].text.replace(",", ""))

                top_coll.insert_one(doc)

            if progress:
                progress.finish()

            logging.info("Finished downloading top games.")
            logging.info("Downloaded: " + common.sizeof_fmt(len(resp.content)))
            common.writeBandwidth(db, len(resp.content))
    except Exception as e:
        logging.error(str(e))
        time.sleep(1)
def refreshSteamAppIDs(refresh_type="SAMPLING_GAMES", pbar=False):
    """Re-fetch full appdetails records for a selection of Steam appids.

    Selection modes:
        "FULL"            - every known appid, shuffled.
        "ALL_NON_FAILURE" - every appid with failureCount < 3 (or none),
                            stalest first.
        "SAMPLING"        - same as ALL_NON_FAILURE but capped at 500.
        "MISSING"         - appids with no updated_date yet (and < 3 failures).
        "GAMES"           - only game/dlc types, stalest first.
        "SAMPLING_GAMES"  - same as GAMES but capped at 500.

    Each fetched record fully replaces the stored app document; a price
    snapshot is also appended to 'pricehistory' when present.  Failures bump
    a per-app failureCount so persistently bad appids are eventually skipped.

    Parameters:
        refresh_type (str): one of the modes above.
        pbar (bool): when True, render a console progress bar.
    """
    logging = common.setupLogging()
    try:
        logging.info("Updating AppIDs via " + refresh_type)
        client = MongoClient(host=config.mongodb_ip, port=config.mongodb_port)
        db = client['steam']
        collection = db['apps']
        collection_hist = db['pricehistory']
        # create an index for appid, this vastly improves performance
        collection.create_index("appid", unique=True)
        collection.create_index("updated_date")
        collection.create_index("name")

        # e.g.: CS Source
        # https://store.steampowered.com/api/appdetails?appids=240&cc=us&l=en
        # https://wiki.teamfortress.com/wiki/User:RJackson/StorefrontAPI#Known_methods
        # https://stackoverflow.com/questions/13784059/how-to-get-the-price-of-an-app-in-steam-webapi

        to_update = []
        if (refresh_type == "FULL"):
            to_update = collection.distinct("appid", {})
            # shuffle the appids so we hit new ones each time
            random.shuffle(to_update)  # in-place
        elif (refresh_type == "ALL_NON_FAILURE" or refresh_type == "SAMPLING"):
            # see all appids that have had failures in descending order
            # db.getCollection('apps').find({"failureCount": {"$exists": true}}).sort({"failureCount":-1})
            # when the failureCount gets to 3 or higher, stop trying to pull data any more
            # pull the oldest most "stale" entries first
            to_update = collection.find(
                {
                    "$or": [{ "failureCount": { "$lt": 3 } },
                            { "failureCount": { "$exists": False } }]
                },
                { "appid": 1, "updated_date": 1, "_id": False }
            ).sort("updated_date", 1)
            to_update = ([item['appid'] for item in to_update])
        elif (refresh_type == "MISSING"):
            # count of missing entries
            # db.getCollection('apps').count({"updated_date": {"$exists": false}})
            to_update = collection.distinct(
                "appid", {
                    "updated_date": { "$exists": False },
                    "$or": [{ "failureCount": { "$lt": 3 } },
                            { "failureCount": { "$exists": False } }]
                })
        elif (refresh_type == "GAMES" or refresh_type == "SAMPLING_GAMES"):
            # when the failureCount gets to 3 or higher, stop trying to pull data any more
            # pull the oldest most "stale" entries first
            to_update = collection.find(
                {
                    "type": { "$in": ["game", "dlc"] },
                    "$or": [{ "failureCount": { "$lt": 3 } },
                            { "failureCount": { "$exists": False } }]
                },
                { "appid": 1, "updated_date": 1, "_id": False }
            ).sort("updated_date", 1)
            to_update = ([item['appid'] for item in to_update])

        # NOTE(review): the progress bar is sized before the sampling cut
        # below, so for SAMPLING modes it shows the pre-cut total — confirm
        # this is intentional.
        if (pbar):
            bar = progressbar.ProgressBar(max_value=len(to_update)).start()

        if (refresh_type == "SAMPLING" or refresh_type == "SAMPLING_GAMES"):
            # take only a small sampling of appids
            to_update = to_update[:500]

        bytes_downloaded = 0
        for i, appid in enumerate(to_update):
            if (pbar):
                bar.update(i + 1)
            r = requests.get(
                "https://store.steampowered.com/api/appdetails?appids=" +
                str(appid) + "&cc=us&l=en")
            if (r.ok):
                try:
                    data = r.json()
                    bytes_downloaded = bytes_downloaded + len(r.content)
                    for k, value in data.items():
                        # for some reason, querying an appid sometimes yields a different number, e.g. 100 yields 80
                        # it appears that "stale" records/appids can be re-pointed to existing working records
                        if (value["success"] is True
                                and appid == value['data']['steam_appid']):
                            # rename "steam_appid" to "appid" so we insert properly into Mongo
                            value['data']['appid'] = int(
                                value['data'].pop('steam_appid'))
                            # add current datetimestamp
                            value['data']['updated_date'] = datetime.datetime.utcnow()
                            try:
                                if (value['data']['release_date']['date'] != ""):
                                    # fix release_date -> date, change from string to ISODate() for Mongo
                                    value['data']['release_date']['date'] = \
                                        datetime.datetime.strptime(
                                            value['data']['release_date']['date'],
                                            "%b %d, %Y")
                            except ValueError as ve:
                                # do nothing, we couldn't parse the date
                                logging.warning(ve)
                            # replace_one will completely replace the record, this is different than update_one
                            collection.replace_one(
                                {'appid': int(value['data']['appid'])},
                                value['data'],
                                upsert=True)
                            if ('price_overview' in value['data']):
                                # add a record to the price history since we grabbed it
                                price_hist = value['data']['price_overview']
                                # set the appid
                                price_hist['appid'] = int(value['data']['appid'])
                                # add current datetimestamp
                                price_hist['date'] = datetime.datetime.utcnow()
                                # remove formatted values, not needed
                                # if they ever get added to the database, this will remove them
                                # db.getCollection('pricehistory').update({},{"$unset": {"initial_formatted":1, "final_formatted":1, "currency":1}}, {multi: true})
                                # and to validate that it worked, this should return nothing:
                                # db.getCollection('pricehistory').find({"$or": [{"initial_formatted":{"$exists":true}}, {"final_formatted":{"$exists":true}}, {"currency":{"$exists":true}} ]})
                                price_hist.pop('initial_formatted', None)
                                price_hist.pop('final_formatted', None)
                                price_hist.pop('currency', None)
                                collection_hist.insert_one(price_hist)
                        else:
                            # increment the failure record count so we can start pruning off bad data
                            collection.update_one(
                                {'appid': int(appid)},
                                {"$inc": { "failureCount": 1 }},
                                upsert=True)
                            logging.info("Failed to get data for appid: " +
                                         str(appid) +
                                         " - incrementing failureCount.")
                except ValueError:
                    logging.error("Malformed JSON for appid: " + str(appid))
            else:
                logging.error("status code: " + str(r.status_code))
                logging.error("appid: " + str(appid))
            # sleep for a bit, the API is throttled
            # limited to 200 requests every 5 minutes or so...
            # 10 requests every 10 seconds
            # 100,000 requests per day
            time.sleep(1.75)  # seconds

        if (pbar):
            bar.finish()

        logging.info("Finished updating AppIDs via " + refresh_type)
        logging.info("Downloaded: " + common.sizeof_fmt(bytes_downloaded))
        common.writeBandwidth(db, bytes_downloaded)
    except Exception as e:
        logging.error(str(e))
        time.sleep(1)
def updateOpenCritic(refresh_type="PARTIAL", pbar=False):
    """Search OpenCritic for a random sample of Steam games and cache results.

    For refresh_type "PARTIAL", samples 150 game/dlc names from the 'apps'
    collection, queries the OpenCritic search API for each name not already
    linked (via entryExistsSteam), and inserts any previously unseen search
    hits (via entryExistsId) into the 'opencritic' collection.

    Parameters:
        refresh_type (str): selection strategy; only "PARTIAL" is handled here.
        pbar (bool): when True, render a console progress bar.
    """
    logging = common.setupLogging()
    try:
        logging.info("Updating OpenCritic search via " + refresh_type)
        client = MongoClient(host=config.mongodb_ip, port=config.mongodb_port)
        db = client['steam']
        collection_oc = db['opencritic']
        collection_apps = db['apps']
        # create an index for id, this vastly improves performance
        collection_oc.create_index("id", unique=True)
        collection_oc.create_index("date")
        collection_oc.create_index("steamId")

        # API page w/examples
        # https://api.opencritic.com/

        if (refresh_type == "PARTIAL"):
            # find appids for all games and dlc
            # https://stackoverflow.com/questions/54440636/the-field-name-must-be-an-accumulator-object
            names_cur = collection_apps.aggregate([
                {"$match": {"updated_date": {"$exists": True},
                            "type": {"$in": ["game", "dlc"]},
                            "failureCount": {"$exists": False}}},
                {"$group": {"_id": "$appid",
                            "name": {"$first": "$name"}}},
                {"$sample": {"size": 150}}
            ])

        # convert cursor to Python list
        # NOTE(review): if refresh_type is not "PARTIAL", names_cur is unbound
        # here and the outer except logs a NameError — confirm only "PARTIAL"
        # is ever passed.
        to_update = []   # sampled game names, parallel to appids
        appids = []      # matching Steam appids, same index as to_update
        for k, item in enumerate(names_cur):
            to_update.append(item['name'])
            appids.append(item['_id'])

        if (pbar):
            bar = progressbar.ProgressBar(max_value=len(to_update)).start()

        search_count = 0
        bytes_downloaded = 0
        for i, name in enumerate(to_update):
            if (pbar):
                bar.update(i + 1)
            try:
                # if we already have a record for that steamId, don't bother doing the search, we already have a link between
                # the OpenCritic 'id' and the 'appid'
                if (not entryExistsSteam(appids[i], to_update[i], collection_oc)):
                    # OpenCritic Game API e.g.
                    # https://api.opencritic.com/api/game/search?criteria=steel%20division%202R
                    r = requests.get(requests.Request(
                        'GET',
                        "https://api.opencritic.com/api/game/search",
                        params={'criteria': name}).prepare().url)
                    if (r.ok):
                        search_count = search_count + 1
                        data = r.json()
                        bytes_downloaded = bytes_downloaded + len(r.content)
                        for value in data:
                            # we don't have an existing record, insert one
                            if (not entryExistsId(value['id'], collection_oc)):
                                oc = value
                                # add current datetimestamp
                                oc['date'] = datetime.datetime.utcnow()
                                # remove "dist" value which shows proximity match via the search entry
                                oc.pop('dist', None)
                                collection_oc.insert_one(oc)
                            #else:
                                #logging.info("id: " + str(oc['id']) + " already exists in the database")
                    else:
                        logging.error("status code: " + str(r.status_code))
                        logging.error("opencritic search name: " + name)

                    # sleep for a bit, there's no information on API throttling
                    time.sleep(2)  # seconds
                #else:
                    #logging.info("appid: " + appids[i] + " found already in OpenCritic as an entry")
            except Exception as e:
                # per-name guard so one bad search doesn't stop the batch
                logging.error(str(e) + " - name: " + str(name))
                time.sleep(1)

        if (pbar):
            bar.finish()

        logging.info("Searched for " + str(search_count) + " games in OpenCritic.")
        logging.info("Finished updating OpenCritic search via " + refresh_type)
        logging.info("Downloaded: " + common.sizeof_fmt(bytes_downloaded))
        common.writeBandwidth(db, bytes_downloaded)
    except Exception as e:
        logging.error(str(e))
        time.sleep(1)