def steamUsers(pbar=False):
    """Download the Steam "users online" 24h series and upsert it into MongoDB.

    Fetches https://store.steampowered.com/stats/userdata.json, which returns
    a series of [epoch_milliseconds, online_user_count] pairs, and stores one
    document per sample in the 'steamusers' collection of the 'steam' db.

    Parameters:
        pbar (bool): when True, render a console progress bar.
    """
    logging = common.setupLogging()
    try:
        logging.info("Running Steam Users Online")
        client = MongoClient(host=config.mongodb_ip, port=config.mongodb_port)
        db = client['steam']
        collection = db['steamusers']
        # unique indexes double as de-duplication: re-running simply upserts
        # the same samples instead of inserting duplicates
        collection.create_index("epochint", unique=True)
        collection.create_index("date", unique=True)

        # pull Steam online users over the last 24 hours
        # https://store.steampowered.com/stats/
        r = requests.get("https://store.steampowered.com/stats/userdata.json")
        if (r.ok):
            data = r.json()[0]['data']

            if (pbar):
                bar = progressbar.ProgressBar(max_value=len(data)).start()

            for i, users in enumerate(data):
                if (pbar):
                    bar.update(i + 1)
                # users[0] is epoch *milliseconds*; convert straight to a
                # naive UTC datetime.  BUG FIX: the previous
                # fromtimestamp(mktime(gmtime(...))) round-trip depended on
                # the host timezone and could be off by an hour around DST
                # transitions.  Integer-divide to keep the old behavior of
                # truncating to whole seconds.
                conv_time = datetime.datetime.utcfromtimestamp(
                    int(users[0]) // 1000)
                # update_one(upsert=True) keeps whatever information already exists
                collection.update_one({'epochint': int(users[0])}, {
                    '$set': {
                        'numberonlineusers': int(users[1]),
                        'date': conv_time
                    }
                }, upsert=True)

            if (pbar):
                bar.finish()

            logging.info("Finished downloading Steam users online.")
            logging.info("Downloaded: " + common.sizeof_fmt(len(r.content)))
            common.writeBandwidth(db, len(r.content))
        else:
            logging.error("status code: " + str(r.status_code))
    except Exception as e:
        # top-level guard: log and back off briefly so a scheduler can retry
        logging.error(str(e))
        time.sleep(1)
def downloadAllAppIDs(pbar=False):
    """Fetch the complete Steam applist (appid + name) and upsert it into MongoDB.

    Pulls the GetAppList v2 endpoint once and bulk-upserts every entry into
    the 'apps' collection, batching writes to keep round-trips down.

    Parameters:
        pbar (bool): when True, render a console progress bar.
    """
    logging = common.setupLogging()
    try:
        logging.info("Downloading All AppIDs")
        # downloads a list of every appid and name from the API
        # and stores in MongoDB collection
        client = MongoClient(host=config.mongodb_ip, port=config.mongodb_port)
        db = client['steam']
        apps_coll = db['apps']

        resp = requests.get("https://api.steampowered.com/ISteamApps/GetAppList/v0002/")
        if not resp.ok:
            logging.error("status code: " + str(resp.status_code))
        else:
            payload = resp.json()
            apps = payload['applist']['apps']
            total = len(apps)

            # create an index for appid, this vastly improves performance
            apps_coll.create_index("appid", unique=True)

            progress = None
            if pbar:
                progress = progressbar.ProgressBar(max_value=total).start()

            pending_ops = []
            for idx, entry in enumerate(apps):
                if progress:
                    progress.update(idx + 1)
                # UpdateOne(upsert=True) keeps whatever information already exists
                pending_ops.append(
                    UpdateOne({'appid': int(entry['appid'])},
                              {'$set': entry},
                              upsert=True))
                # flush in batches instead of one write per document
                if idx % 1000 == 0 or idx + 1 == total:
                    try:
                        apps_coll.bulk_write(pending_ops)
                        pending_ops = []
                    except BulkWriteError as bwe:
                        logging.error(bwe)

            if progress:
                progress.finish()

            logging.info("Finished downloading AppIDs.")
            logging.info("Downloaded: " + common.sizeof_fmt(len(resp.content)))
            common.writeBandwidth(db, len(resp.content))
    except Exception as e:
        # log and back off briefly so a scheduler can retry
        logging.error(str(e))
        time.sleep(1)
def updateOpenCritic(refresh_type="OLDEST", pbar=False):
    """Refresh OpenCritic game and review data for stored entries.

    For refresh_type "OLDEST", picks the 25 stalest documents (by 'date') in
    the 'opencritic' collection, re-fetches each game's detail record and its
    review list from the OpenCritic API, and upserts both back into Mongo.

    Parameters:
        refresh_type (str): selection strategy; only "OLDEST" is handled here.
        pbar (bool): when True, render a console progress bar.
    """
    logging = common.setupLogging()
    try:
        logging.info("Updating OpenCritic games via " + refresh_type)
        client = MongoClient(host=config.mongodb_ip, port=config.mongodb_port)
        db = client['steam']
        collection_oc = db['opencritic']
        # create an index for id, this vastly improves performance
        collection_oc.create_index("id", unique=True)
        collection_oc.create_index("date")
        collection_oc.create_index("steamId")

        if (refresh_type == "OLDEST"):
            # find a sampling of OpenCritic IDs to work on ordered by date
            # will run on the oldest entries first
            names_cur = collection_oc.aggregate([
                { "$match": {} },
                { "$sort": { "date": 1 } },  # oldest first
                { "$limit": 25 },
                { "$project": { "id": 1, "_id": 0 } }
            ])

        # convert cursor to Python list
        # NOTE(review): if refresh_type is not "OLDEST", names_cur is unbound
        # here and the outer except logs a NameError — confirm only "OLDEST"
        # is ever passed.
        to_update = []
        for item in names_cur:
            to_update.append(item['id'])

        if (pbar):
            bar = progressbar.ProgressBar(max_value=len(to_update)).start()

        bytes_downloaded = 0
        for i, oc_id in enumerate(to_update):
            if (pbar):
                bar.update(i + 1)
            try:
                # OpenCritic Game API e.g.
                # https://opencritic.com/api/game/7592
                r = requests.get(
                    requests.Request(
                        'GET',
                        "https://opencritic.com/api/game/" + str(oc_id)).prepare().url)
                if (r.ok):
                    data = r.json()
                    bytes_downloaded = bytes_downloaded + len(r.content)
                    oc = data
                    # add current datetimestamp
                    oc['date'] = datetime.datetime.utcnow()
                    # update_one will keep whatever information already exists
                    collection_oc.update_one({'id': int(oc['id'])},
                                             {'$set': oc},
                                             upsert=True)
                else:
                    logging.error("status code: " + str(r.status_code))
                    logging.error("opencritic game id: " + str(oc_id))

                # sleep for a bit, there's no information on API throttling
                time.sleep(2)  # seconds

                # grab review information which is a separate URL
                # e.g. https://opencritic.com/api/review/game/7592
                r = requests.get(
                    requests.Request(
                        'GET',
                        "https://opencritic.com/api/review/game/" + str(oc_id)).prepare().url)
                if (r.ok):
                    data = r.json()
                    bytes_downloaded = bytes_downloaded + len(r.content)
                    # NOTE(review): if the first (game detail) request failed,
                    # `oc` is unbound here and this raises NameError, which the
                    # inner except below logs — confirm this fallthrough is
                    # acceptable.
                    oc['Reviews'] = data
                    # update_one will keep whatever information already exists
                    collection_oc.update_one({'id': int(oc['id'])},
                                             {'$set': oc},
                                             upsert=True)
                else:
                    logging.error("status code: " + str(r.status_code))
                    logging.error("opencritic game id: " + str(oc_id))
            except Exception as e:
                # per-id guard so one bad entry doesn't stop the batch
                logging.error(str(e) + " - id: " + str(oc_id))

            # sleep for a bit, there's no information on API throttling
            time.sleep(2)  # seconds

        if (pbar):
            bar.finish()

        logging.info("Finished updating OpenCritic games via " + refresh_type)
        logging.info("Downloaded: " + common.sizeof_fmt(bytes_downloaded))
        common.writeBandwidth(db, bytes_downloaded)
    except Exception as e:
        logging.error(str(e))
        time.sleep(1)
import schedule, time

import common  # common.py
import steamtopgames, steamusers, steamreviews, updatepricehistory, refreshsteam, downloadappids, opencriticsearch, opencriticgames, twitchtopgames  # *.py files

logging = common.setupLogging()
logging.info("Starting steam-analysis")

# run a few things right off the bat since they run infrequently
steamusers.steamUsers()
downloadappids.downloadAllAppIDs()

# schedule items to run
schedule.every(15).minutes.do(steamtopgames.steamTopGames)
schedule.every(23).hours.do(steamusers.steamUsers)
schedule.every(1).hours.do(updatepricehistory.updatePriceHistory, "PARTIAL", False)
schedule.every(30).minutes.do(steamreviews.steamReviews)
schedule.every(3).hours.do(refreshsteam.refreshSteamAppIDs, "SAMPLING", False)
schedule.every(6).hours.do(refreshsteam.refreshSteamAppIDs, "MISSING", False)
schedule.every(24).hours.do(downloadappids.downloadAllAppIDs)
schedule.every(1).hours.do(opencriticsearch.updateOpenCritic, "PARTIAL", False)
schedule.every(1).hours.do(opencriticgames.updateOpenCritic, "OLDEST", False)
schedule.every(9).hours.do(twitchtopgames.updateTwitchTopGames, "TOP", False)

sec = 0
while True:
    schedule.run_pending()
    # every roughly 2 hours save the scheduling information to the log
    if sec % 7200 == 0:
        for job in schedule.jobs:
            logging.info(str(job))
    # BUG FIX: the loop previously neither slept nor advanced `sec`, so it
    # busy-waited at 100% CPU and `sec % 7200 == 0` was always true (the
    # schedule was logged on every iteration).  Tick once per second.
    sec = sec + 1
    time.sleep(1)
def updatePriceHistory(refresh_type="FULL", pbar=False):
    """Append current price snapshots for paid games/dlc to 'pricehistory'.

    Queries the Steam storefront appdetails API (price_overview filter) in
    batches of 20 appids and inserts one price document per returned app.

    Parameters:
        refresh_type (str):
            "FULL"    - run over every eligible appid, in random order.
            "PARTIAL" - drop the appids with the freshest price history until
                        roughly 1200 stale ones remain, and refresh those.
        pbar (bool): when True, render a console progress bar.
    """
    logging = common.setupLogging()
    try:
        logging.info("Updating Price History via " + refresh_type)
        client = MongoClient(host=config.mongodb_ip, port=config.mongodb_port)
        db = client['steam']
        collection_hist = db['pricehistory']
        collection_apps = db['apps']
        # create an index for appid, this vastly improves performance
        collection_hist.create_index("appid")
        collection_hist.create_index("date")

        # e.g.: CS Source
        # https://store.steampowered.com/api/appdetails?appids=240&cc=us&l=en
        # https://wiki.teamfortress.com/wiki/User:RJackson/StorefrontAPI#Known_methods
        # https://stackoverflow.com/questions/13784059/how-to-get-the-price-of-an-app-in-steam-webapi
        # find prices for all games and dlc
        to_update = collection_apps.distinct(
            "appid", {
                "updated_date": { "$exists": True },
                "type": { "$in": ["game", "dlc"] },
                "is_free": False,
                "price_overview": { "$exists": True },
                "failureCount": { "$exists": False }
            })

        if (refresh_type == "PARTIAL"):
            # sort by newest to oldest updated in pricehistory
            appid_dict = collection_hist.aggregate([
                { "$group": { "_id": "$appid", "maxDate": { "$max": "$date" } } },
                { "$sort": { "maxDate": -1 } }  # newest first
            ])
            for item in appid_dict:
                if len(to_update) == 1200:
                    break
                else:
                    if item['_id'] in to_update:
                        # remove this fairly "new" appid from our list items to run on and refresh
                        to_update.remove(item['_id'])

        if (pbar):
            bar = progressbar.ProgressBar(max_value=len(to_update)).start()

        if (refresh_type == "FULL"):
            # shuffle the appids so we hit new ones each time
            random.shuffle(to_update)  # in-place

        bytes_downloaded = 0
        appids = []
        for i, appid in enumerate(to_update):
            appids.append(appid)
            if (pbar):
                bar.update(i + 1)
            # run 20 or so at a time
            if ((i + 1) % 20 == 0 or (i + 1) == len(to_update)):
                # BUG FIX: pre-bind names referenced by the except handler
                # below; previously `value` (and `appids_str`) could be
                # unbound, so the handler itself raised NameError and the
                # whole run aborted via the outer except.
                appids_str = ""
                value = None
                try:
                    # create a comma-delimited string of appids
                    appids_str = ','.join(map(str, appids))
                    # https://github.com/BrakeValve/dataflow/issues/5
                    # e.g. https://store.steampowered.com/api/appdetails?appids=662400,833310,...&cc=us&l=en&filters=price_overview
                    r = requests.get(
                        "https://store.steampowered.com/api/appdetails?appids="
                        + appids_str + "&cc=us&l=en&filters=price_overview")
                    if (r.ok):
                        data = r.json()
                        bytes_downloaded = bytes_downloaded + len(r.content)

                        for k, value in data.items():
                            if (value["success"] is True):
                                if (value['data']):
                                    price_hist = value['data']['price_overview']
                                    # set the appid based on the key
                                    price_hist['appid'] = int(k)
                                    # add current datetimestamp
                                    price_hist['date'] = datetime.datetime.utcnow()
                                    # remove formatted values, not needed
                                    # if they ever get added to the database, this will remove them
                                    # db.getCollection('pricehistory').update({},{"$unset": {"initial_formatted":1, "final_formatted":1, "currency":1}}, {multi: true})
                                    price_hist.pop('initial_formatted', None)
                                    price_hist.pop('final_formatted', None)
                                    price_hist.pop('currency', None)
                                    collection_hist.insert_one(price_hist)
                                else:
                                    # No price_overview information returned, remove it from the entry
                                    # to prevent future unnecessary calls. This is also an indicator
                                    # of stale app information.
                                    collection_apps.update_one(
                                        {'appid': int(k)},
                                        {"$unset": { "price_overview": "" }})
                                    logging.info(
                                        "No price information returned for appid: "
                                        + str(k) + " - clearing app price info.")
                    else:
                        logging.error("status code: " + str(r.status_code))
                        logging.error("price history appids: " + appids_str)
                except Exception as e:
                    logging.error(
                        str(e) + " - appids: " + str(appids_str) + " - data: "
                        + str(value))

                appids = []
                # sleep for a bit, the API is throttled
                # limited to 200 requests every 5 minutes or so...
                # 10 requests every 10 seconds
                # 100,000 requests per day
                time.sleep(1.75)  # seconds

        if (pbar):
            bar.finish()

        logging.info("Finished updating price history via " + refresh_type)
        logging.info("Downloaded: " + common.sizeof_fmt(bytes_downloaded))
        common.writeBandwidth(db, bytes_downloaded)
    except Exception as e:
        logging.error(str(e))
        time.sleep(1)
def updateTwitchTopGames(refresh_type="TOP", pbar=False):
    """Snapshot the top Twitch games and their live streams into MongoDB.

    Pages through the Helix "top games" endpoint, fetches up to `num_streams`
    live streams per game, links each stream to a Steam appid when possible
    (via getSteamId), and inserts every stream document into
    'twitchhistorical'.

    Parameters:
        refresh_type (str): label used only for log messages.
        pbar (bool): when True, render a console progress bar.
    """
    logging = common.setupLogging()
    try:
        logging.info("Updating Twitch top games via " + refresh_type)
        client = MongoClient(host=config.mongodb_ip, port=config.mongodb_port)
        db = client['steam']
        collection_twitchhistorical = db['twitchhistorical']
        collection_apps = db['apps']
        # create an index for id, this vastly improves performance
        collection_twitchhistorical.create_index("id")
        collection_twitchhistorical.create_index("date")
        collection_twitchhistorical.create_index("steamId")

        # API page w/examples
        # https://dev.twitch.tv/docs/api/

        # grab the top X number of games on Twitch
        top_x = 100
        # number of results to return in each top games request
        first_x = 50
        # number of streams to return for each game, max 100
        num_streams = 100

        access_token = getTwitchToken(logging)

        if (pbar):
            bar = progressbar.ProgressBar(max_value=int(top_x * num_streams)).start()

        bytes_downloaded = 0
        game_rank = 1  # for game rank/order returned via Twitch
        i = 1  # for progress bar
        while (i < top_x * num_streams):
            try:
                # Twitch Top Games
                # https://dev.twitch.tv/docs/api/reference/#get-top-games
                params = {'first': first_x}
                if i != 1:
                    params = {'first': first_x, 'after': pagination}
                r = requests.get("https://api.twitch.tv/helix/games/top",
                                 headers={'Client-ID': config.twitch_client_id,
                                          'Authorization': "Bearer " + access_token},
                                 params=params)
                if (r.ok):
                    if (int(r.headers['Ratelimit-Remaining']) < 4):
                        logging.info("rate limit: " + r.headers['Ratelimit-Limit'])
                        logging.info("rate limit remaining: " + r.headers['Ratelimit-Remaining'])

                    data = r.json()
                    bytes_downloaded = bytes_downloaded + len(r.content)

                    # BUG FIX: use .get() so a missing/empty cursor (final
                    # page) reaches the intended `break` instead of raising
                    # KeyError, which the except below swallowed without
                    # advancing `i`, re-requesting the same page forever.
                    if (data.get('pagination', {}).get('cursor')):
                        pagination = data['pagination']['cursor']
                    else:
                        logging.error("Unable to find pagination cursor")
                        break  # out of while loop

                    for value in data['data']:
                        # add to our historical listing
                        # https://dev.twitch.tv/docs/api/reference/#get-streams
                        r_g = requests.get("https://api.twitch.tv/helix/streams",
                                           headers={'Client-ID': config.twitch_client_id,
                                                    'Authorization': "Bearer " + access_token},
                                           params={'first': num_streams,
                                                   'game_id': int(value['id'])})
                        if (r_g.ok):
                            if (int(r_g.headers['Ratelimit-Remaining']) < 4):
                                logging.info("rate limit: " + r_g.headers['Ratelimit-Limit'])
                                logging.info("rate limit remaining: " + r_g.headers['Ratelimit-Remaining'])
                            data_g = r_g.json()
                            for v in data_g['data']:
                                v['date'] = datetime.datetime.utcnow()
                                # thumbnails churn constantly; not worth storing
                                v.pop('thumbnail_url', None)
                                v['name'] = value['name']  # pull the game name from our top games listing
                                v['gamerank'] = game_rank
                                appid = getSteamId(value['name'], collection_apps)
                                if (appid):
                                    v['steamId'] = appid
                                collection_twitchhistorical.insert_one(v)
                                if (pbar):
                                    bar.update(i)
                                i = i + 1
                        else:
                            # BUG FIX: log the streams response's status code;
                            # this previously logged the top-games response's
                            # status (the 401 check below already used r_g).
                            logging.error("status code: " + str(r_g.status_code))
                            # check OAuth and tokens
                            if (r_g.status_code == 401):
                                sys.exit(1)
                        game_rank = game_rank + 1
                        # https://dev.twitch.tv/docs/api/guide/#rate-limits
                        time.sleep(2)  # seconds
                else:
                    logging.error("status code: " + str(r.status_code))
                    # check OAuth and tokens
                    if (r.status_code == 401):
                        sys.exit(1)

                # sleep for a bit
                # https://dev.twitch.tv/docs/api/guide/#rate-limits
                time.sleep(2)  # seconds
                # in some cases, there aren't the max number of streams for a game, thus we can jump ahead
                i = int(game_rank * num_streams)
            except Exception as e:
                logging.error(str(e))
                time.sleep(1)

        if (pbar):
            bar.finish()

        logging.info("Finished updating Twitch top games via " + refresh_type)
        logging.info("Downloaded: " + common.sizeof_fmt(bytes_downloaded))
        common.writeBandwidth(db, bytes_downloaded)
    except Exception as e:
        logging.error(str(e))
        time.sleep(1)
def steamReviews(pbar=False): logging = common.setupLogging() try: logging.info("Running Steam Reviews") client = MongoClient(host=config.mongodb_ip, port=config.mongodb_port) db = client['steam'] collection = db['apps'] to_update = collection.aggregate([ { "$match": { "type": { "$in": ["game", "dlc"] } } }, { "$sort": { "reviews.last_updated": 1 } }, # oldest first { "$limit": 50 }, { "$project": { "appid": 1, "_id": 0 } } ]) to_update = ([item['appid'] for item in to_update]) if (pbar): bar = progressbar.ProgressBar(max_value=len(to_update)).start() bytes_downloaded = 0 for i, appid in enumerate(to_update): if (pbar): bar.update(i + 1) #logging.info("Running on appid: " + str(appid)) r = requests.get( "https://store.steampowered.com/appreviewhistogram/" + str(appid) + "?l=english&review_score_preference=0") if (r.ok): bytes_downloaded = bytes_downloaded + len(r.content) data = r.json()['results'] # add current datetimestamp data['last_updated'] = datetime.datetime.utcnow() # convert Epoch seconds to UTC time # https://stackoverflow.com/questions/1697815/how-do-you-convert-a-python-time-struct-time-object-into-a-datetime-object if ('start_date' in data and data['start_date']): data['start_date'] = datetime.datetime.fromtimestamp( time.mktime( time.gmtime(round(float(data['start_date']))))) if ('end_date' in data and data['end_date']): data['end_date'] = datetime.datetime.fromtimestamp( time.mktime(time.gmtime(round(float( data['end_date']))))) if ('recent_events' in data): for k, event in enumerate(data['recent_events']): if (event['start_date']): data['recent_events'][k][ 'start_date'] = datetime.datetime.fromtimestamp( time.mktime( time.gmtime( round(float( event['start_date']))))) data['recent_events'][k][ 'end_date'] = datetime.datetime.fromtimestamp( time.mktime( time.gmtime( round(float(event['end_date']))))) if ('rollups' in data): for k, event in enumerate(data['rollups']): if (event['date']): data['rollups'][k][ 'date'] = datetime.datetime.fromtimestamp( 
time.mktime( time.gmtime(round(float( event['date']))))) if ('recent' in data): for k, event in enumerate(data['recent']): if (event['date']): data['recent'][k][ 'date'] = datetime.datetime.fromtimestamp( time.mktime( time.gmtime(round(float( event['date']))))) #update_one will keep whatever information already exists collection.update_one({'appid': int(appid)}, {'$set': { 'reviews': data }}, upsert=True) else: logging.error("status code: " + str(r.status_code)) if (pbar): bar.update(i + 1) time.sleep(1) if (pbar): bar.finish() logging.info("Finished downloading Steam reviews.") logging.info("Downloaded: " + common.sizeof_fmt(bytes_downloaded)) common.writeBandwidth(db, bytes_downloaded) except Exception as e: logging.error(str(e)) time.sleep(3)
def steamTopGames(pbar=False):
    """Scrape the Steam top-100 stats page and store one snapshot per game.

    Parses https://store.steampowered.com/stats/ and inserts a document per
    table row (game, link, appid, current players, today's peak) into the
    'topgames' collection, all stamped with the same capture time.

    Parameters:
        pbar (bool): when True, render a console progress bar.
    """
    logging = common.setupLogging()
    try:
        logging.info("Running Steam Top Games")
        client = MongoClient(host=config.mongodb_ip, port=config.mongodb_port)
        db = client['steam']
        top_coll = db['topgames']
        top_coll.create_index("appid", unique=False)
        top_coll.create_index("date", unique=False)

        # pull Steam top 100 games
        # https://store.steampowered.com/stats/
        # also see here for historical charting using the same data
        # https://steamcharts.com/
        resp = requests.get("https://store.steampowered.com/stats/")
        if not resp.ok:
            logging.error("status code: " + str(resp.status_code))
        else:
            parsed = BeautifulSoup(resp.text, 'html.parser')
            table_rows = parsed.find_all('tr', class_="player_count_row")

            progress = None
            if pbar:
                progress = progressbar.ProgressBar(max_value=len(table_rows)).start()

            # one shared timestamp so the whole snapshot groups together
            snapshot_time = datetime.datetime.utcnow()
            for idx, table_row in enumerate(table_rows):
                if progress:
                    progress.update(idx + 1)

                doc = {'date': snapshot_time}

                anchors = table_row.find_all('a', class_="gameLink")
                doc['game'] = anchors[0].text
                doc['link'] = anchors[0].get('href')

                # pull the numeric appid out of the store URL
                match = re.search(r'\/app\/(\d*)', anchors[0].get('href'), re.I)
                if match and match.group(1):
                    doc['appid'] = match.group(1)
                else:
                    logging.info("No appID found in URL: " + anchors[0].get('href'))

                counts = table_row.find_all('span', class_="currentServers")
                doc['currentplayers'] = int(counts[0].text.replace(",", ""))
                doc['peaktoday'] = int(counts[1].text.replace(",", ""))

                top_coll.insert_one(doc)

            if progress:
                progress.finish()

            logging.info("Finished downloading top games.")
            logging.info("Downloaded: " + common.sizeof_fmt(len(resp.content)))
            common.writeBandwidth(db, len(resp.content))
    except Exception as e:
        logging.error(str(e))
        time.sleep(1)
def refreshSteamAppIDs(refresh_type="SAMPLING_GAMES", pbar=False):
    """Re-fetch full appdetails records for a selection of Steam appids.

    Selection modes:
        "FULL"            - every known appid, shuffled.
        "ALL_NON_FAILURE" - every appid with failureCount < 3 (or none),
                            stalest first.
        "SAMPLING"        - same as ALL_NON_FAILURE but capped at 500.
        "MISSING"         - appids with no updated_date yet (and < 3 failures).
        "GAMES"           - only game/dlc types, stalest first.
        "SAMPLING_GAMES"  - same as GAMES but capped at 500.

    Each fetched record fully replaces the stored app document; a price
    snapshot is also appended to 'pricehistory' when present.  Failures bump
    a per-app failureCount so persistently bad appids are eventually skipped.

    Parameters:
        refresh_type (str): one of the modes above.
        pbar (bool): when True, render a console progress bar.
    """
    logging = common.setupLogging()
    try:
        logging.info("Updating AppIDs via " + refresh_type)
        client = MongoClient(host=config.mongodb_ip, port=config.mongodb_port)
        db = client['steam']
        collection = db['apps']
        collection_hist = db['pricehistory']
        # create an index for appid, this vastly improves performance
        collection.create_index("appid", unique=True)
        collection.create_index("updated_date")
        collection.create_index("name")

        # e.g.: CS Source
        # https://store.steampowered.com/api/appdetails?appids=240&cc=us&l=en
        # https://wiki.teamfortress.com/wiki/User:RJackson/StorefrontAPI#Known_methods
        # https://stackoverflow.com/questions/13784059/how-to-get-the-price-of-an-app-in-steam-webapi

        to_update = []
        if (refresh_type == "FULL"):
            to_update = collection.distinct("appid", {})
            # shuffle the appids so we hit new ones each time
            random.shuffle(to_update)  # in-place
        elif (refresh_type == "ALL_NON_FAILURE" or refresh_type == "SAMPLING"):
            # see all appids that have had failures in descending order
            # db.getCollection('apps').find({"failureCount": {"$exists": true}}).sort({"failureCount":-1})
            # when the failureCount gets to 3 or higher, stop trying to pull data any more
            # pull the oldest most "stale" entries first
            to_update = collection.find(
                {
                    "$or": [{ "failureCount": { "$lt": 3 } },
                            { "failureCount": { "$exists": False } }]
                },
                { "appid": 1, "updated_date": 1, "_id": False }
            ).sort("updated_date", 1)
            to_update = ([item['appid'] for item in to_update])
        elif (refresh_type == "MISSING"):
            # count of missing entries
            # db.getCollection('apps').count({"updated_date": {"$exists": false}})
            to_update = collection.distinct(
                "appid", {
                    "updated_date": { "$exists": False },
                    "$or": [{ "failureCount": { "$lt": 3 } },
                            { "failureCount": { "$exists": False } }]
                })
        elif (refresh_type == "GAMES" or refresh_type == "SAMPLING_GAMES"):
            # when the failureCount gets to 3 or higher, stop trying to pull data any more
            # pull the oldest most "stale" entries first
            to_update = collection.find(
                {
                    "type": { "$in": ["game", "dlc"] },
                    "$or": [{ "failureCount": { "$lt": 3 } },
                            { "failureCount": { "$exists": False } }]
                },
                { "appid": 1, "updated_date": 1, "_id": False }
            ).sort("updated_date", 1)
            to_update = ([item['appid'] for item in to_update])

        # NOTE(review): the progress bar is sized before the sampling cut
        # below, so for SAMPLING modes it shows the pre-cut total — confirm
        # this is intentional.
        if (pbar):
            bar = progressbar.ProgressBar(max_value=len(to_update)).start()

        if (refresh_type == "SAMPLING" or refresh_type == "SAMPLING_GAMES"):
            # take only a small sampling of appids
            to_update = to_update[:500]

        bytes_downloaded = 0
        for i, appid in enumerate(to_update):
            if (pbar):
                bar.update(i + 1)
            r = requests.get(
                "https://store.steampowered.com/api/appdetails?appids=" +
                str(appid) + "&cc=us&l=en")
            if (r.ok):
                try:
                    data = r.json()
                    bytes_downloaded = bytes_downloaded + len(r.content)
                    for k, value in data.items():
                        # for some reason, querying an appid sometimes yields a different number, e.g. 100 yields 80
                        # it appears that "stale" records/appids can be re-pointed to existing working records
                        if (value["success"] is True
                                and appid == value['data']['steam_appid']):
                            # rename "steam_appid" to "appid" so we insert properly into Mongo
                            value['data']['appid'] = int(
                                value['data'].pop('steam_appid'))
                            # add current datetimestamp
                            value['data']['updated_date'] = datetime.datetime.utcnow()
                            try:
                                if (value['data']['release_date']['date'] != ""):
                                    # fix release_date -> date, change from string to ISODate() for Mongo
                                    value['data']['release_date']['date'] = \
                                        datetime.datetime.strptime(
                                            value['data']['release_date']['date'],
                                            "%b %d, %Y")
                            except ValueError as ve:
                                # do nothing, we couldn't parse the date
                                logging.warning(ve)
                            # replace_one will completely replace the record, this is different than update_one
                            collection.replace_one(
                                {'appid': int(value['data']['appid'])},
                                value['data'],
                                upsert=True)
                            if ('price_overview' in value['data']):
                                # add a record to the price history since we grabbed it
                                price_hist = value['data']['price_overview']
                                # set the appid
                                price_hist['appid'] = int(value['data']['appid'])
                                # add current datetimestamp
                                price_hist['date'] = datetime.datetime.utcnow()
                                # remove formatted values, not needed
                                # if they ever get added to the database, this will remove them
                                # db.getCollection('pricehistory').update({},{"$unset": {"initial_formatted":1, "final_formatted":1, "currency":1}}, {multi: true})
                                # and to validate that it worked, this should return nothing:
                                # db.getCollection('pricehistory').find({"$or": [{"initial_formatted":{"$exists":true}}, {"final_formatted":{"$exists":true}}, {"currency":{"$exists":true}} ]})
                                price_hist.pop('initial_formatted', None)
                                price_hist.pop('final_formatted', None)
                                price_hist.pop('currency', None)
                                collection_hist.insert_one(price_hist)
                        else:
                            # increment the failure record count so we can start pruning off bad data
                            collection.update_one(
                                {'appid': int(appid)},
                                {"$inc": { "failureCount": 1 }},
                                upsert=True)
                            logging.info("Failed to get data for appid: " +
                                         str(appid) +
                                         " - incrementing failureCount.")
                except ValueError:
                    logging.error("Malformed JSON for appid: " + str(appid))
            else:
                logging.error("status code: " + str(r.status_code))
                logging.error("appid: " + str(appid))
            # sleep for a bit, the API is throttled
            # limited to 200 requests every 5 minutes or so...
            # 10 requests every 10 seconds
            # 100,000 requests per day
            time.sleep(1.75)  # seconds

        if (pbar):
            bar.finish()

        logging.info("Finished updating AppIDs via " + refresh_type)
        logging.info("Downloaded: " + common.sizeof_fmt(bytes_downloaded))
        common.writeBandwidth(db, bytes_downloaded)
    except Exception as e:
        logging.error(str(e))
        time.sleep(1)
def updateOpenCritic(refresh_type="PARTIAL", pbar=False):
    """Search OpenCritic for a random sample of Steam games and cache results.

    For refresh_type "PARTIAL", samples 150 game/dlc names from the 'apps'
    collection, queries the OpenCritic search API for each name not already
    linked (via entryExistsSteam), and inserts any previously unseen search
    hits (via entryExistsId) into the 'opencritic' collection.

    Parameters:
        refresh_type (str): selection strategy; only "PARTIAL" is handled here.
        pbar (bool): when True, render a console progress bar.
    """
    logging = common.setupLogging()
    try:
        logging.info("Updating OpenCritic search via " + refresh_type)
        client = MongoClient(host=config.mongodb_ip, port=config.mongodb_port)
        db = client['steam']
        collection_oc = db['opencritic']
        collection_apps = db['apps']
        # create an index for id, this vastly improves performance
        collection_oc.create_index("id", unique=True)
        collection_oc.create_index("date")
        collection_oc.create_index("steamId")

        # API page w/examples
        # https://api.opencritic.com/

        if (refresh_type == "PARTIAL"):
            # find appids for all games and dlc
            # https://stackoverflow.com/questions/54440636/the-field-name-must-be-an-accumulator-object
            names_cur = collection_apps.aggregate([
                {"$match": {"updated_date": {"$exists": True},
                            "type": {"$in": ["game", "dlc"]},
                            "failureCount": {"$exists": False}}},
                {"$group": {"_id": "$appid",
                            "name": {"$first": "$name"}}},
                {"$sample": {"size": 150}}
            ])

        # convert cursor to Python list
        # NOTE(review): if refresh_type is not "PARTIAL", names_cur is unbound
        # here and the outer except logs a NameError — confirm only "PARTIAL"
        # is ever passed.
        to_update = []   # sampled game names, parallel to appids
        appids = []      # matching Steam appids, same index as to_update
        for k, item in enumerate(names_cur):
            to_update.append(item['name'])
            appids.append(item['_id'])

        if (pbar):
            bar = progressbar.ProgressBar(max_value=len(to_update)).start()

        search_count = 0
        bytes_downloaded = 0
        for i, name in enumerate(to_update):
            if (pbar):
                bar.update(i + 1)
            try:
                # if we already have a record for that steamId, don't bother doing the search, we already have a link between
                # the OpenCritic 'id' and the 'appid'
                if (not entryExistsSteam(appids[i], to_update[i], collection_oc)):
                    # OpenCritic Game API e.g.
                    # https://api.opencritic.com/api/game/search?criteria=steel%20division%202R
                    r = requests.get(requests.Request(
                        'GET',
                        "https://api.opencritic.com/api/game/search",
                        params={'criteria': name}).prepare().url)
                    if (r.ok):
                        search_count = search_count + 1
                        data = r.json()
                        bytes_downloaded = bytes_downloaded + len(r.content)
                        for value in data:
                            # we don't have an existing record, insert one
                            if (not entryExistsId(value['id'], collection_oc)):
                                oc = value
                                # add current datetimestamp
                                oc['date'] = datetime.datetime.utcnow()
                                # remove "dist" value which shows proximity match via the search entry
                                oc.pop('dist', None)
                                collection_oc.insert_one(oc)
                            #else:
                                #logging.info("id: " + str(oc['id']) + " already exists in the database")
                    else:
                        logging.error("status code: " + str(r.status_code))
                        logging.error("opencritic search name: " + name)

                    # sleep for a bit, there's no information on API throttling
                    time.sleep(2)  # seconds
                #else:
                    #logging.info("appid: " + appids[i] + " found already in OpenCritic as an entry")
            except Exception as e:
                # per-name guard so one bad search doesn't stop the batch
                logging.error(str(e) + " - name: " + str(name))
                time.sleep(1)

        if (pbar):
            bar.finish()

        logging.info("Searched for " + str(search_count) + " games in OpenCritic.")
        logging.info("Finished updating OpenCritic search via " + refresh_type)
        logging.info("Downloaded: " + common.sizeof_fmt(bytes_downloaded))
        common.writeBandwidth(db, bytes_downloaded)
    except Exception as e:
        logging.error(str(e))
        time.sleep(1)