def usersList(): """ Parse through data from /thepub to get unique usernames, user ids, and locations. Stores this information in a csv file to be used in later api requests. Limited to 100 api calls per hour requiring sleep method. May be run multiple times to retrieve Continuously run until user stops script. """ usersList = files.readUsers() apiCount = 0 userNameCountAdditions = 0 while (True): # get 25 most recent updates data = untappd.getPubFeed() apiCount += 1 print 'apiCount: ' + str(apiCount) checkins = data['response']['checkins']['items'] # each response has 25 items, each with a username for checkin in checkins: userId = checkin['user']['uid'] username = checkin['user']['user_name'] userLocation = checkin['user']['location'] if hash(str(userId)) not in usersList: if userLocation != '': userNameCountAdditions += 1 userAttribs = {'uid': str(userId), 'username': username, 'location': {'name': unicode(userLocation).encode("utf-8")}, 'ratings': {}} user = UT.UntappdUser(userAttribs) usersList[hash(str(userId))] = user writeJSONFile('../data/users.json', usersList) userCount = len(usersList) print 'Total Users: ' + str(userCount) # Untappd only allows 100 api requests per hour. Sleep for 38 # seconds between requests sleep(37)
def createDataPoints(): """Make the data points of user locations for the map generation.""" usersList = files.readUsers() beersList = files.readBeers() points = [] i = 1 for hashId, user in usersList.iteritems(): if 'lat' in user.location and user.ratings: for bid, rating in user.ratings.iteritems(): country = None if 'country' in user.location: country = user.location['country'] pointAttribs = { 'lat': user.location['lat'], 'lng': user.location['lng'], 'country': country, 'abv': beersList[str(hash(bid))].abv, 'rating': rating, 'style': beersList[str(hash(bid))].style } point = dp.dataPoint(pointAttribs) points.append(point) if i % 1000 == 0: print "Points added: " + str(i) i += 1 data = dp.dataPoints(points) writeJSONFile('../data/dataPoints.json', data)
def normalizeUsers(): """ Change the user ids so the information can be made public and use the googlemaps module to determine the user's location. """ usersList = files.readUsers() newUsersList = {} i = 1 newUid = 1 for hashId, user in usersList.iteritems(): uid = user.uid user.uid = str(newUid) location = user.location if location['name'] != "" and 'lat' not in location: if isinstance(location['name'], unicode): location = location['name'].encode('utf-8') else: location = location['name'] mapInfo = PBAMap.getLatLong(location, i) i += 1 if mapInfo == 'apiLimit': print str( i ) + " At daily API limit. Update script and repeat tomorrow" elif mapInfo != '': user.location = { 'name': location, 'lat': mapInfo['lat'], 'lng': mapInfo['lng'], } if 'country' in mapInfo: user.location['country'] = mapInfo['country'] print str(i), user.location else: print str(i), "checked: none" user.location = {'name': ''} newUid += 1 newUsersList[hash(str(uid))] = user writeJSONFile('../data/users.json', newUsersList) print "User ids, usernames, and locations updated\n"
def usersList(): """ Parse through data from /thepub to get unique usernames, user ids, and locations. Stores this information in a csv file to be used in later api requests. Limited to 100 api calls per hour requiring sleep method. May be run multiple times to retrieve Continuously run until user stops script. """ usersList = files.readUsers() apiCount = 0 userNameCountAdditions = 0 while (True): # get 25 most recent updates data = untappd.getPubFeed() apiCount += 1 print 'apiCount: ' + str(apiCount) checkins = data['response']['checkins']['items'] # each response has 25 items, each with a username for checkin in checkins: userId = checkin['user']['uid'] username = checkin['user']['user_name'] userLocation = checkin['user']['location'] if hash(str(userId)) not in usersList: if userLocation != '': userNameCountAdditions += 1 userAttribs = { 'uid': str(userId), 'username': username, 'location': { 'name': unicode(userLocation).encode("utf-8") }, 'ratings': {} } user = UT.UntappdUser(userAttribs) usersList[hash(str(userId))] = user writeJSONFile('../data/users.json', usersList) userCount = len(usersList) print 'Total Users: ' + str(userCount) # Untappd only allows 100 api requests per hour. Sleep for 38 # seconds between requests sleep(37)
def normalizeUsers(): """ Change the user ids so the information can be made public and use the googlemaps module to determine the user's location. """ usersList = files.readUsers() newUsersList = {} i = 1 newUid = 1 for hashId, user in usersList.iteritems(): uid = user.uid user.uid = str(newUid) location = user.location if location['name'] != "" and 'lat' not in location: if isinstance(location['name'], unicode): location = location['name'].encode('utf-8') else: location = location['name'] mapInfo = PBAMap.getLatLong(location, i) i += 1 if mapInfo == 'apiLimit': print str(i) + " At daily API limit. Update script and repeat tomorrow" elif mapInfo != '': user.location = { 'name': location, 'lat': mapInfo['lat'], 'lng': mapInfo['lng'], } if 'country' in mapInfo: user.location['country'] = mapInfo['country'] print str(i), user.location else: print str(i), "checked: none" user.location = {'name': ''} newUid += 1 newUsersList[hash(str(uid))] = user writeJSONFile('../data/users.json', newUsersList) print "User ids, usernames, and locations updated\n"
def createDataPoints(): """Make the data points of user locations for the map generation.""" usersList = files.readUsers() beersList = files.readBeers() points = [] i = 1 for hashId, user in usersList.iteritems(): if 'lat' in user.location and user.ratings: for bid, rating in user.ratings.iteritems(): country = None if 'country' in user.location: country = user.location['country'] pointAttribs = {'lat': user.location['lat'], 'lng': user.location['lng'], 'country': country, 'abv': beersList[str(hash(bid))].abv, 'rating': rating, 'style': beersList[str(hash(bid))].style} point = dp.dataPoint(pointAttribs) points.append(point) if i % 1000 == 0: print "Points added: " + str(i) i += 1 data = dp.dataPoints(points) writeJSONFile('../data/dataPoints.json', data)
Single-purpose script for easy monitoring of data quantity. Load each json data file, find its size and generate a plot for presentation. """ import fileReader as files import matplotlib.pyplot as plt import os import numpy as np # Load files print "Loading beers..." beersList = files.readBeers() print "Loading users..." usersList = files.readUsers() print "Loading breweries..." breweriesList = files.readBreweries() # Path for saving the images path = "../data/labels/" fileList = os.listdir(path) # Data gathering labels = ('Beers', 'Reviews', 'Users', 'Breweries', 'Labels') index = np.arange(len(labels)) quantities = (len(beersList), sum([len(x.ratings) for x in usersList.values()]), len(usersList), len(breweriesList), len(fileList)) # Plot the quantities plt.figure(1)
def userReviews():
    """Fetch beer reviews for every user via /user/beers/{username}.

    Retrieves at most 50 reviews per user (two api pages of 25), and
    fills in beer, brewery, and brewery-to-beers lookup tables as new
    items appear.  After querying the api the username is cleared to
    lessen privacy concerns with the untappd data; users with no
    username (already processed / normalized) are skipped.  All four
    tables are written back to ../data/*.json after each user so an
    interrupt loses at most one user's work.
    """
    usersList = files.readUsers()
    beersList = files.readBeers()
    breweryList = files.readBreweries()
    breweryToBeers = files.readBreweryToBeers()
    total = 0                # total ratings processed across all users
    totalUsersComplete = 0
    for userHash, user in usersList.iteritems():
        totalUsersComplete += 1
        # if the data has been normalized, old data will not
        # have usernames. Ignore older users which may have
        # already gotten reviews
        if user.username:
            userId = user.uid
            username = user.username
            user.username = None   # strip the username for privacy
            userReviewCount = 0    # api pages fetched for this user (max 2)
            offsetTotal = 0        # pagination offset passed to the api
            ratings = {}           # bid -> rating_score for this user
            print 'Processing ' + str(userId) + ': ' + username
            # each response returns at most 25 reviews. To get more user
            # reviews, call again with an offset; get at most 50 reviews
            # from the same user
            while (userReviewCount < 2):
                print username + ': ' + str(userReviewCount + 1)
                data = untappd.getUserReviewData(username, offsetTotal)
                offset = data['response']['beers']['count']
                offsetTotal += offset
                reviews = data['response']['beers']['items']
                for review in reviews:
                    userRating = review['rating_score']
                    # zero ratings are unrated checkins; skip them
                    if userRating > 0:
                        beerInfo = review['beer']
                        breweryInfo = review['brewery']
                        # fill in beer information (first sighting only)
                        if hash(str(beerInfo['bid'])) not in beersList:
                            # split combined style strings like "IPA / Imperial"
                            # into a title-cased list of individual styles
                            stylesList = []
                            style = unicode(beerInfo['beer_style']).encode("utf-8")
                            styles = style.lower().title().split('/')
                            for style in styles:
                                style = style.strip()
                                stylesList.append(style)
                            beerAttribs = {
                                'bid': str(beerInfo['bid']),
                                'name': unicode(beerInfo['beer_name']).encode("utf-8"),
                                'label': beerInfo['beer_label'],
                                'abv': beerInfo['beer_abv'],
                                'ibu': beerInfo['beer_ibu'],
                                'style': stylesList,
                                'description': unicode(beerInfo['beer_description']).encode("utf-8"),
                                'rating': beerInfo['rating_score'],
                                'numRatings': 1,
                                'brewery': str(breweryInfo['brewery_id'])
                            }
                            beer = UT.UntappdBeer(beerAttribs)
                            beersList[hash(beer.bid)] = beer
                        else:
                            # known beer: just bump its review counter
                            beersList[hash(str(beerInfo['bid']))].numRatings += 1
                        # fill in brewery information (first sighting only)
                        if hash(str(breweryInfo['brewery_id'])) not in breweryList:
                            breweryAttribs = {
                                'breweryId': str(breweryInfo['brewery_id']),
                                'name': unicode(breweryInfo['brewery_name']).encode("utf-8"),
                                'label': breweryInfo['brewery_label'],
                                'country': unicode(breweryInfo['country_name']).encode("utf-8"),
                                'location': unicode(breweryInfo['location']).encode("utf-8")
                            }
                            brewery = UT.UntappdBrewery(breweryAttribs)
                            breweryList[hash(brewery.breweryId)] = brewery
                        # map brewery_id to a list of beers produced there
                        if hash(str(breweryInfo['brewery_id'])) not in breweryToBeers:
                            # store the current beer in a list of beers of
                            # the brewery
                            breweryToBeers[hash(str(breweryInfo['brewery_id']))] = {str(breweryInfo['brewery_id']): [str(beerInfo['bid'])]}
                        else:
                            # add current beer to brewery's list of beers
                            breweryToBeers[hash(str(breweryInfo['brewery_id']))][str(breweryInfo['brewery_id'])].append(str(beerInfo['bid']))
                        # add list of beer ratings to user
                        ratings[str(beerInfo['bid'])] = userRating
                userReviewCount += 1
                user.ratings = ratings
                # store the dictionaries after new data so user doesn't kill process before writing
                # with open('../data/users.json', 'wb') as usersFile:
                #     json = jpickle.encode(usersList)
                #     usersFile.write(json)
                # with open('../data/beers.json', 'wb') as beersFile:
                #     json = jpickle.encode(beersList)
                #     beersFile.write(json)
                # with open('../data/breweries.json', 'wb') as breweriesFile:
                #     json = jpickle.encode(breweryList)
                #     breweriesFile.write(json)
                # with open('../data/breweryToBeers.json', 'wb') as breweryToBeersFile:
                #     json = jpickle.encode(breweryToBeers)
                #     breweryToBeersFile.write(json)
                # if the offset is less than 25, then there are no more reviews to retrieve
                if offset < 25:
                    break
            # persist everything after each user so progress survives an interrupt
            writeJSONFile('../data/users.json', usersList)
            writeJSONFile('../data/beers.json', beersList)
            writeJSONFile('../data/breweries.json', breweryList)
            writeJSONFile('../data/breweryToBeers.json', breweryToBeers)
            total += len(ratings)
            print str(userId) + ': ' + username + ', Processed: ' + str(len(ratings)) + ' reviews'
            print 'Total Reviews: ' + str(total)
            print 'Total Users Completed: ' + str(totalUsersComplete)
            # rate limit: ~37s per api call made for this user
            sleep(37 * (userReviewCount))
        else:
            # user already processed earlier; count existing ratings
            total += len(user.ratings)
def userReviews():
    """Parse user reviews from /user/beers/{username}.

    Retrieves at most 50 reviews per user (two pages of 25), retaining
    review, beer, and brewery information in shared lookup dicts.  After
    querying the api the username is removed to lessen privacy concerns
    with untappd data; users without a username are assumed already
    processed and only contribute their stored rating count.  Results
    are flushed to ../data/*.json after every user.
    """
    usersList = files.readUsers()
    beersList = files.readBeers()
    breweryList = files.readBreweries()
    breweryToBeers = files.readBreweryToBeers()
    total = 0                 # grand total of ratings seen
    totalUsersComplete = 0    # users visited (processed or skipped)
    for userHash, user in usersList.iteritems():
        totalUsersComplete += 1
        # if the data has been normalized, old data will not
        # have usernames. Ignore older users which may have
        # already gotten reviews
        if user.username:
            userId = user.uid
            username = user.username
            user.username = None    # drop username for privacy
            userReviewCount = 0     # pages fetched (cap: 2)
            offsetTotal = 0         # api pagination offset
            ratings = {}            # this user's bid -> rating map
            print 'Processing ' + str(userId) + ': ' + username
            # each response returns at most 25 reviews. To get more user
            # reviews, call again with an offset; at most 50 reviews total
            # from the same user
            while (userReviewCount < 2):
                print username + ': ' + str(userReviewCount + 1)
                data = untappd.getUserReviewData(username, offsetTotal)
                offset = data['response']['beers']['count']
                offsetTotal += offset
                reviews = data['response']['beers']['items']
                for review in reviews:
                    userRating = review['rating_score']
                    # skip unrated checkins (score 0)
                    if userRating > 0:
                        beerInfo = review['beer']
                        breweryInfo = review['brewery']
                        # fill in beer information on first encounter
                        if hash(str(beerInfo['bid'])) not in beersList:
                            # normalize "style / substyle" strings into a
                            # list of trimmed, title-cased style names
                            stylesList = []
                            style = unicode(beerInfo['beer_style']).encode("utf-8")
                            styles = style.lower().title().split('/')
                            for style in styles:
                                style = style.strip()
                                stylesList.append(style)
                            beerAttribs = {
                                'bid': str(beerInfo['bid']),
                                'name': unicode(beerInfo['beer_name']).encode("utf-8"),
                                'label': beerInfo['beer_label'],
                                'abv': beerInfo['beer_abv'],
                                'ibu': beerInfo['beer_ibu'],
                                'style': stylesList,
                                'description': unicode(beerInfo['beer_description']).encode("utf-8"),
                                'rating': beerInfo['rating_score'],
                                'numRatings': 1,
                                'brewery': str(breweryInfo['brewery_id'])
                            }
                            beer = UT.UntappdBeer(beerAttribs)
                            beersList[hash(beer.bid)] = beer
                        else:
                            # already known beer: count one more review
                            beersList[hash(str(beerInfo['bid']))].numRatings += 1
                        # fill in brewery information on first encounter
                        if hash(str(breweryInfo['brewery_id'])) not in breweryList:
                            breweryAttribs = {
                                'breweryId': str(breweryInfo['brewery_id']),
                                'name': unicode(breweryInfo['brewery_name']).encode("utf-8"),
                                'label': breweryInfo['brewery_label'],
                                'country': unicode(breweryInfo['country_name']).encode("utf-8"),
                                'location': unicode(breweryInfo['location']).encode("utf-8")
                            }
                            brewery = UT.UntappdBrewery(breweryAttribs)
                            breweryList[hash(brewery.breweryId)] = brewery
                        # map brewery_id to the list of beers produced there
                        if hash(str(breweryInfo['brewery_id'])) not in breweryToBeers:
                            # store the current beer in a list of beers of
                            # the brewery
                            breweryToBeers[hash(str(breweryInfo['brewery_id']))] = {str(breweryInfo['brewery_id']): [str(beerInfo['bid'])]}
                        else:
                            # add current beer to brewery's list of beers
                            breweryToBeers[hash(str(breweryInfo['brewery_id']))][str(breweryInfo['brewery_id'])].append(str(beerInfo['bid']))
                        # add list of beer ratings to user
                        ratings[str(beerInfo['bid'])] = userRating
                userReviewCount += 1
                user.ratings = ratings
                # store the dictionaries after new data so user doesn't kill process before writing
                # with open('../data/users.json', 'wb') as usersFile:
                #     json = jpickle.encode(usersList)
                #     usersFile.write(json)
                # with open('../data/beers.json', 'wb') as beersFile:
                #     json = jpickle.encode(beersList)
                #     beersFile.write(json)
                # with open('../data/breweries.json', 'wb') as breweriesFile:
                #     json = jpickle.encode(breweryList)
                #     breweriesFile.write(json)
                # with open('../data/breweryToBeers.json', 'wb') as breweryToBeersFile:
                #     json = jpickle.encode(breweryToBeers)
                #     breweryToBeersFile.write(json)
                # if the offset is less than 25, then there are no more reviews to retrieve
                if offset < 25:
                    break
            # checkpoint all four tables so a kill loses at most one user
            writeJSONFile('../data/users.json', usersList)
            writeJSONFile('../data/beers.json', beersList)
            writeJSONFile('../data/breweries.json', breweryList)
            writeJSONFile('../data/breweryToBeers.json', breweryToBeers)
            total += len(ratings)
            print str(userId) + ': ' + username + ', Processed: ' + str(len(ratings)) + ' reviews'
            print 'Total Reviews: ' + str(total)
            print 'Total Users Completed: ' + str(totalUsersComplete)
            # respect the 100-calls/hour limit: ~37s per page fetched
            sleep(37 * (userReviewCount))
        else:
            # already-normalized user: just tally the stored ratings
            total += len(user.ratings)