def unzip_file(filename):
    """ Take in a .gz file and unzip it, saving it with the same file name

    Args:
        filename: path to the gzipped feed file (expected to end in '.gz')

    Returns:
        The path of the decompressed file, or None if today's feed
        hasn't been downloaded yet.
    """
    # The output name is the input name minus the '.gz' suffix.
    # BUG FIX: the previous code used filename.strip('.gz'), but str.strip
    # removes any of the characters '.', 'g', 'z' from *both ends* of the
    # string (e.g. 'gzip_data.gz' -> 'ip_data'), not just the extension.
    if filename.endswith('.gz'):
        rawfile = filename[:-len('.gz')]
    else:
        rawfile = filename

    # first make sure we haven't already unzipped it
    db = get_db_session()
    ydh = db.query(YelpDownloadHistory).filter(
        YelpDownloadHistory.date == date.today()).scalar()

    # if it doesn't exist, error out because we need to download it
    if not ydh or not ydh.date == date.today() or not ydh.downloaded:
        logger.critical("Cannot unzip file for today because it wasn't downloaded")
        return

    if ydh.unzipped:
        logger.info("Today's feed already unzipped, skipping the unzip")
        return rawfile

    logger.info("Extracting file: %s" % filename)
    with gzip.open(filename, 'rb') as infile:
        with open(rawfile, 'wb') as outfile:
            i = 1
            for line in infile:
                outfile.write(line)
                # progress logging every 1000 businesses
                # (replaces the Py2-only `print` statement)
                if i % 1000 == 0:
                    logger.info("Extracted %i businesses so far" % i)
                i += 1

    logger.info("Done extracting file: %s" % rawfile)
    # record the unzip in the download history so reruns are no-ops
    ydh.unzipped = True
    db.commit()
    return rawfile
def geocodeUnknownLocations(wait_time=2, run_time=240):
    """ Geocode any locations that don't have Lat/Lons

    Only do so for up to `run_time` minutes, because this can take a
    very long time if most are unknown.  Also shuffle so that they all
    get equal probability of being tried.

    Args:
        wait_time: per-request geocoder timeout, in seconds
        run_time: maximum total time to spend geocoding, in minutes

    Returns:
        None
    """
    geoLocator = Nominatim()
    db = get_db_session()
    unknowns = db.query(Location).filter(Location.latitude==None).all()
    shuffle(unknowns) # give them all a fighting chance
    logger.info("Attempting to geocode random unknown locations for %i minutes" % run_time)
    logger.info("%i Unkown locations to geocode" % len(unknowns))
    locations = []        # batch of updated Location rows awaiting commit
    upload_mod = 100      # upload batch size
    start_time = time.time()
    run_time *= 60        # turn it into seconds
    for i, location in enumerate(unknowns):
        # max try time stopping criterion
        if (time.time() - start_time) > run_time:
            logger.info("Max geocoding time has elapsed... Stopping for this run")
            break
        logger.info("Geocoding location %i..." % i)
        try:
            geo = geoLocator.geocode(location.street_address, timeout=wait_time)
            lat = geo.latitude
            lon = geo.longitude
            logger.info("\tSuccess!")
        except Exception:
            # geopy raises several exception types (timeout, service error,
            # and AttributeError when geocode returns None) -- treat all as
            # "unknown" and record NULLs so we don't retry forever
            logger.warning("\tGeocode failed, assigning NULL Lat/Long")
            lat = None
            lon = None
        location.latitude = lat
        location.longitude = lon
        locations.append(location)
        if i % upload_mod == 0:
            db.add_all(locations)
            db.commit()
            locations = []
    # BUG FIX: flush the final partial batch.  Previously results were only
    # committed every `upload_mod` iterations or on the timeout break, so up
    # to `upload_mod - 1` geocoded locations were silently dropped whenever
    # the loop ran to completion.
    if locations:
        db.add_all(locations)
        db.commit()
    logger.info("Finished geocode attempts")
def get_yelp_score_distribution():
    """Return document counts bucketed by sick-prediction score.

    Buckets are [0.0, 0.1), [0.1, 0.2), ..., [1.0, 1.1): eleven buckets,
    where the last one catches documents whose fp_pred is exactly 1.0
    (matching the bucket count the float-accumulation loop produced).

    Returns:
        list of 11 ints: count of documents in each score bucket
    """
    # NOTE(review): `echo` is not defined locally; it presumably comes from
    # module scope -- verify, otherwise this raises NameError.
    db = get_db_session(echo=echo, autoflush=False, autocommit=True)
    counts = []
    # BUG FIX: iterate an integer index instead of repeatedly adding 0.1 to
    # a float.  The accumulated rounding error made the bucket edges drift
    # (0.30000000000000004, ...) and left loop termination dependent on
    # float rounding.
    for step in range(11):
        lower = step / 10.0
        bucket_count = db.execute(
            select([func.count(documents.c.id)]).where(
                and_(
                    documents.c.fp_pred >= lower,
                    documents.c.fp_pred < lower + 0.1,
                )
            )
        ).scalar()
        counts.append(bucket_count)
    return counts
def copy_tables():
    """ Copies a few items from each table into a test database

    Should not be called in the same session after reset_test_db(); you
    will get a mapper error for some reason.  Instead, call
    reset_test_db(), then close the python session, then, in a new
    session, call copy_tables(), to update the tables for a schema change.
    """
    # `toy` is the test-database session; `db` is the real database session
    toy = get_db_session()
    db = models.get_db_session()
    logger.info("Populating test tables")
    # sample the first five businesses plus their related rows
    businesses = db.query(Business).order_by(Business.id)[0:5]
    locations = [b.location for b in businesses]
    # [b.categories ...] is a list of lists, so we need a little more processing
    categories = set().union(*[b.categories for b in businesses])
    tweets = db.query(Tweet).order_by(Tweet.id)[0:5]
    reviews = []
    for b in businesses:
        # up to five reviews per sampled business
        reviews.extend(
            db.query(YelpReview).filter(YelpReview.business_id == b.id)[0:5]
        )
    documents = [r.document for r in reviews] + [t.document for t in tweets]
    doc_assoc = [r.document_rel for r in reviews] + \
                [t.document_rel for t in tweets]
    tables = [businesses, locations, categories, reviews, tweets, \
              documents, doc_assoc]
    # detach all objects from db session before putting them in toy
    for t in tables:
        for obj in t:
            make_transient(obj)
    # only after *everything* is transient do we add anything
    for t in tables:
        toy.add_all(t)
    # in addition we add the junction table for business categories
    b_ids = [b.id for b in businesses]
    business_cat = db.execute(business_category_table.select().
        where(business_category_table.c.business_id.in_(
            [b.id for b in businesses]
        )))
    # copy the association rows into the test db one at a time
    for row in business_cat:
        toy.execute(business_category_table.insert(), row)
    toy.commit()
from twython import Twython

from foodbornenyc.db_settings import twitter_config
from foodbornenyc.models.documents import Tweet
from foodbornenyc.models.users import TwitterUser
from foodbornenyc.models.locations import Location
from foodbornenyc.models.models import get_db_session
import foodbornenyc.sources.foursquare_geo as geo
from foodbornenyc.util.util import get_logger, xuni

logger = get_logger(__name__, level="INFO")

# TODO @teffland: Read this link and make sure location tracking gets the lat/lon pair order correctly
# http://support.gnip.com/articles/filtering-twitter-data-by-location.html

# module-level database session and authenticated Twitter API client,
# shared by every function in this module
db = get_db_session()
twitter = Twython(twitter_config['consumer_key'],
                  twitter_config['consumer_secret'],
                  twitter_config['access_token'],
                  twitter_config['access_token_secret'])

# all possible fields from twitter that we want to import directly
user_fields = ['id_str', 'name', 'screen_name', 'location', 'description']
tweet_fields = [
    # 'contributors', #<type 'NoneType'>
    # 'truncated', #<type 'bool'>
    'text', #<type 'unicode'>
    # 'is_quote_status', #<type 'bool'>
    # 'in_reply_to_status_id', #<type 'NoneType'>
    # 'id', #<type 'int'>
    # 'favorite_count', #<type 'int'>
    # 'source', #<type 'unicode'>
def download_latest_yelp_data():
    """Attempt to download the latest gzip file from the Yelp Syndication.

    Args:
        None

    Returns:
        local_file: the name of where the yelp feed was downloaded to

    Notes:
        Yelp doesn't let us look at the bucket, so we just try exact
        filenames with presigned urls for the past month
    """
    # get where to save the feed from config
    local_file = config['rawdata_dir'] + config['local_file']

    # first make sure we haven't already downloaded the file
    db = get_db_session()
    ydh = db.query(YelpDownloadHistory).filter(YelpDownloadHistory.date==date.today()).scalar()

    # if it doesn't exist or it's old, create the new one
    # (the date check is belt-and-braces: the query already filters on today)
    if not ydh or not ydh.date == date.today():
        ydh = YelpDownloadHistory()
        db.add(ydh)
        db.commit()
        logger.info("Creating new download history for today")

    # if we already downloaded, log and return
    if ydh.downloaded:
        logger.info("Already downloaded the Yelp feed for today...")
        return local_file

    # set up botocore client
    session = botocore.session.get_session()
    client = session.create_client('s3')

    # try to download most recent data (go up to one month back)
    for day_delta in range(31):
        # generate the correct filename for a day
        dateformat = '%Y%m%d'
        day = date.today() - timedelta(day_delta)
        day_str = day.strftime(dateformat) # eg '20151008'
        ext = '_businesses.json.gz'
        filename = day_str + ext
        logger.info("Attempting to get Yelp Reviews from %s.....",
                    day.strftime("%m/%d/%Y"))

        # generate a presigned url for the file,
        # since yelp doesn't give us bucket access
        url = client.generate_presigned_url(
            'get_object',
            Params={'Bucket': config['bucket_name'],
                    'Key':config['bucket_dir'] +'/'+ filename },
            ExpiresIn=3600 # 1 hour in seconds
        )

        # do the downloading
        logger.info("Feed URL: %s", url)
        try:
            download_url_to_file(url, config['rawdata_dir'], config['local_file'])
            # if we succeed, move on
            break
        except Exception:
            # TODO: Throw more explicit exceptions from `download_url_to_file`
            # so we can handle it more explicitly, currently it can be misleading
            if day_delta == 30:  # 30 is the last index yielded by range(31)
                logger.warning("NO YELP DATA AVAILABLE FOR THE PAST MONTH!")
                return
            else:
                logger.warning("no data for date: %s\n\
                Trying the day before." % day.strftime("%m/%d/%Y"))

    logger.info("Latest Yelp Data successfully downloaded from feed.")

    # save the success to download history so later stages can proceed
    ydh.downloaded = True
    db.commit()
    return local_file
def upsert_yelpfile_to_db(filename, geocode=True):
    """ This takes in the JSON file of all of the Yelp businesses and all
    the affiliate data (reviews, categories, etc.) and upserts them to the DB

    It follows the db schema used by the ORM, but doesn't use the ORM directly
    for optimization purposes.  Where the ORM would take a week or more
    (estimated) to upload a completely new file of 35k businesses, this
    version does so in ~= 45 min (w/o geocode and over ethernet)

    DON'T mess with this code unless you know what you're doing.
    Hopefully it's well commented enough for you to figure it out if you need
    to.  But it is sensitive (relative to normal model code) because of the
    amount of data that must be transferred to the DB.

    Args:
        filename: the name of the unzipped Yelp JSON filename
        geocode: whether or not to geocode locations missing a Lat/Lon
            - Can slow down the code significantly if it's the first
              geocode attempt
            - Most Yelp locations don't have Lat/Lons
            - On first upload consider calling the geocode db function after

    Returns:
        None. But the database will be updated :)

    TODO:
        - Add a geocode unkown locations function
        - Check if syndication has already been uploaded today
    """
    # database handler object
    db = get_db_session(echo=False, autoflush=False, autocommit=True)

    # check YelpDownloadHistory to see if we've already uploaded the feed
    ydh = db.query(YelpDownloadHistory).filter(YelpDownloadHistory.date==date.today()).scalar()

    # if it doesn't exist, isnt today, hasn't been downloaded or unzipped
    if not ydh or not ydh.date == date.today() or not ydh.downloaded or not ydh.unzipped:
        logger.critical("Can't upload today's feed if it hasn't been downloaded and unzipped")
        return

    if ydh.uploaded:
        logger.info("Already upserted today's Yelp feed. Skipping")
        return

    # geocoder for businesses w/o lat longs
    geoLocator = Nominatim()
    logger.info("Updating Yelp DB..........")

    # setup list of businesses to skip updating
    # because we've uploaded them since yelp updated them
    # this part gets the skip condition
    newest = db.query(Business).order_by(desc(Business.updated_at)).first()

    # check for if there's a business in the db
    if newest:
        most_recent = newest.updated_at
        init_db = False
        logger.info("Last updated: %r" % most_recent.strftime('%m/%d/%Y:%M:%H:%S'))
    # if not, then it's the first time we're populating it
    else:
        logger.info("First Database Population: This could take a looong time...")
        most_recent = None
        init_db = True

    # if we're initializing the db, disable the foreign key constraints
    # this will improve upload speeds
    if init_db and 'mssql' in dbconfig['dbbackend']:
        disable_fk = """
        ALTER TABLE dbo.%s NOCHECK CONSTRAINT fk_loc_businesses;
        ALTER TABLE dbo.%s NOCHECK CONSTRAINT fk_biz_id;
        ALTER TABLE dbo.%s NOCHECK CONSTRAINT fk_cat_alias;
        ALTER TABLE dbo.%s NOCHECK CONSTRAINT fk_rev_biz_id;
        """ % ( businesses.name,
                business_category_table.name,
                business_category_table.name,
                yelp_reviews.name)
        with db.begin():
            db.execute(disable_fk)

    start_time = time.time() # for timing the whole process

    # get sets of uids for each data model,
    # so we can quickly determine if a datum needs to be updated or inserted
    # this is much faster than querying the db every time
    with db.begin():
        db_biz_ids = set([ b.id for b in db.query(Business.id).all() ])
        db_review_ids = set([ r.id for r in db.query(YelpReview.id).all() ])
        db_locations = set([ l.id for l in db.query(Location.id).all() ])
        db_categories = set([ c.alias for c in db.query(YelpCategory.alias).all() ])
        db_biz_categories = set([ (assoc.business_id, assoc.category_alias)
                                  for assoc in db.query(business_category_table).all()])

    # batch upsert data structures
    # dicts are keyed by uid so duplicates within a batch collapse
    unloaded_locations = {}
    unloaded_categories = {}
    insert_businesses = []
    insert_reviews = []
    insert_documents = []
    insert_doc_associations = []
    update_businesses = []
    update_reviews = []
    update_documents = []
    biz_cats = []

    # loop over json file ans upsert all data
    with open(filename, 'rb') as infile: # for unzipped files
        biz_num = 0       # total lines seen (for logging)
        biz_count = 0     # lines counted toward the current batch
        review_count = 0
        upload_mod = 500  # size of batch upload
        # each business is one line
        for line in infile:
            biz_num += 1
            biz_count +=1
            logger.info("Updating Restaurant #%i...." % biz_num)
            current = time.time()-start_time
            m, s = divmod(current, 60)
            h, m = divmod(m, 60)
            logger.info("Time so far: %d:%02d:%02d" % (h, m, s))

            # if business doesn't load correctly, skip it
            try:
                biz = loads(line)
            except ValueError:
                logger.warning("Broken JSON Element. Skipping...")
                continue

            bdate = datetime.strptime(biz['time_updated'], '%Y-%m-%dT%H:%M:%S')#2015-10-08T20:17:50

            # skip this business if it hasn't ben updated since we last updated
            # (only works when we aren't initializing the db)
            if most_recent and not init_db:
                if bdate <= most_recent:
                    logger.info("SKIPPING (NOT NEW): %s" % biz['name'])
                    biz_count -=1
                    continue

            # make note of new Locations
            location = location_dict_yelp(biz['location'])
            if location['id'] not in db_locations and location['id'] not in unloaded_locations.keys():
                if not location['latitude'] and not location['longitude'] and geocode:
                    # try to reverse-geocode missing coords (if enabled)
                    # NOTE(review): the inner `if geocode` is redundant with
                    # the condition above, and the bare except swallows the
                    # deliberate "not implemented" exception
                    if geocode:
                        try:
                            logger.info("No Lat/Long for restaurant, attempting to geocode...")
                            # TODO(shao): replace with foursquare geocoder
                            raise Exception('geocode not implemented')
                        except:
                            logger.warning("Geocode failed, assigning NULL Lat/Long")
                # add to running list of unloaded locations
                unloaded_locations[location['id']] = location

            # update or insert business depending on if it's already in db
            business = business_yelp_dict(biz, location)
            if biz['id'] in db_biz_ids:
                update_businesses.append(business)
            else:
                insert_businesses.append(business)

            # update/create all the new Reviews
            for i, rev in enumerate(biz['reviews']):
                # if the review isn't new, don't do anything
                # uncomment this code to update it (significant slowdown)
                if rev['id'] in db_review_ids:
                    pass
                    # review = review_dict_yelp(biz, rev)
                    # document = document_dict_yelp(rev)
                    # update_reviews.append(review)
                    # update_documents.append(document)
                # else create a new one
                else:
                    review = review_dict_yelp(biz, rev)
                    document = document_dict_yelp(rev)
                    doc_assoc = doc_assoc_dict_yelp(rev)
                    insert_reviews.append(review)
                    insert_documents.append(document)
                    insert_doc_associations.append(doc_assoc)
            review_count += len(biz['reviews'])

            # create the Categories
            for category in biz['categories']:
                # if it's new create it, provided we haven't already
                if (category['alias'] not in db_categories
                    and category['alias'] not in unloaded_categories.keys()):
                    # some aliases are bad, so skip them
                    if (xstr(category['alias']) == ''
                        or xstr(category['alias']) == None):
                        logger.warning("BAD CATEGORY %r... Skipping" % xstr(category['alias']))
                        continue
                    cat = {'alias':xstr(category['alias']),
                           'title':xstr(category['title']) }
                    unloaded_categories[category['alias']] = cat

                # create the business association link
                assoc = { 'business_id':biz['id'],
                          'category_alias':category['alias'] }
                if (assoc['business_id'], assoc['category_alias']) not in db_biz_categories:
                    biz_cats.append(assoc)

            # if we've reached batch size, perform the actual transactions
            if biz_count % upload_mod == 0:
                with db.begin():
                    logger.info("Uploading Batch of %i to DB...." % upload_mod)
                    logger.info("Uploading Locations to DB....")
                    db.bulk_insert_mappings(Location, unloaded_locations.values())
                    logger.info("Uploading Yelp Categories to DB....")
                    db.bulk_insert_mappings(YelpCategory, unloaded_categories.values())
                    bizlen = len(insert_businesses) + len(update_businesses)
                    logger.info("Uploading %i Businesses to DB...." %bizlen)
                    db.execute(businesses.insert(), insert_businesses)
                    db.bulk_update_mappings(Business, update_businesses)
                    revlen = len(insert_reviews) + len(update_reviews)
                    logger.info("Uploading %i Documents to DB...." % revlen)
                    # sorted inserts keep row order deterministic per batch
                    db.execute(document_associations.insert(),
                               sorted(insert_doc_associations, key=lambda x:x['assoc_id']))
                    db.execute(documents.insert(),
                               sorted(insert_documents, key=lambda x:x['id']))
                    # db.bulk_update_mappings(Document, update_documents)
                    logger.info("Uploading %i Business Reviews to DB...." % revlen)
                    db.execute(yelp_reviews.insert(),
                               sorted(insert_reviews, key=lambda x:x['id']))
                    # db.bulk_update_mappings(YelpReview, update_reviews)
                    #there seem to be duplicate categories for a business
                    #so make the associations unique
                    logger.info("Uploading Business Category associations to DB....")
                    biz_cats = [dict(tupleized)
                                for tupleized in set(tuple(assoc.items())
                                                     for assoc in biz_cats)]
                    db.execute(business_category_table.insert(), biz_cats)

                # reset the lists for the next batch
                db_categories.update(unloaded_categories.keys())
                db_locations.update(unloaded_locations.keys())
                unloaded_categories = {}
                unloaded_locations = {}
                insert_businesses = []
                insert_reviews = []
                insert_documents = []
                insert_doc_associations = []
                update_businesses = []
                update_reviews = []
                update_documents = []
                biz_cats = []

        # upload the final batch
        # NOTE(review): this block duplicates the batch-upload block above;
        # a future refactor could extract a shared helper
        bizlen = len(insert_businesses) + len(update_businesses)
        if bizlen > 0:
            with db.begin():
                logger.info("Uploading Batch of %i to DB...." % upload_mod)
                logger.info("Uploading Locations to DB....")
                db.bulk_insert_mappings(Location, unloaded_locations.values())
                logger.info("Uploading Yelp Categories to DB....")
                db.bulk_insert_mappings(YelpCategory, unloaded_categories.values())
                bizlen = len(insert_businesses) + len(update_businesses)
                logger.info("Uploading %i Businesses to DB...." %bizlen)
                db.execute(businesses.insert(), insert_businesses)
                db.bulk_update_mappings(Business, update_businesses)
                revlen = len(insert_reviews) + len(update_reviews)
                logger.info("Uploading %i Documents to DB...." % revlen)
                db.execute(document_associations.insert(),
                           sorted(insert_doc_associations, key=lambda x:x['assoc_id']))
                db.execute(documents.insert(),
                           sorted(insert_documents, key=lambda x:x['id']))
                # db.bulk_update_mappings(Document, update_documents)
                logger.info("Uploading %i Business Reviews to DB...." % revlen)
                db.execute(yelp_reviews.insert(),
                           sorted(insert_reviews, key=lambda x:x['id']))
                # db.bulk_update_mappings(YelpReview, update_reviews)
                #there seem to be duplicate categories for a business
                #so make the associations unique
                logger.info("Uploading Business Category associations to DB....")
                biz_cats = [dict(tupleized)
                            for tupleized in set(tuple(assoc.items())
                                                 for assoc in biz_cats)]
                db.execute(business_category_table.insert(), biz_cats)

    # if we are initializing the db, we need to reenable the fk constraints
    # because we put in all the data correctly, we are sure the fks are correct
    # this will error if they aren't
    if init_db and 'mssql' in dbconfig['dbbackend']:
        # put back all the constraints
        logger.info("Cheking Constraints...")
        enable_fk = """
        ALTER TABLE dbo.%s CHECK CONSTRAINT ALL;
        ALTER TABLE dbo.%s CHECK CONSTRAINT ALL;
        ALTER TABLE dbo.%s CHECK CONSTRAINT ALL;
        ALTER TABLE dbo.%s CHECK CONSTRAINT ALL;
        """ % ( businesses.name,
                business_category_table.name,
                business_category_table.name,
                yelp_reviews.name)
        with db.begin():
            db.execute(enable_fk)

    total_time = float(time.time() - start_time)
    logger.info("Upserted %i businesses and %i total reviews in %d seconds = %.2f minutes" %\
                (biz_num, review_count, total_time, total_time/60.))

    # update the download history
    with db.begin():
        ydh.uploaded = True
def get_db_session(**kwargs):
    """Return a database session bound to the test configuration.

    Any keyword arguments are forwarded unchanged to
    ``models.get_db_session``; the ``config`` is always ``test_config``.
    """
    session_kwargs = dict(kwargs)
    return models.get_db_session(config=test_config, **session_kwargs)
def classify_reviews(self, every=False, unseen=False, since=30, yield_per=1000, verbose=0):
    """Classify some set of `YelpReview`s' `Docuement` in the database

    Args:
        every (bool): Whether to just do them all. Trumps other flags.
            Likely to __very__ slow.
        unseen (bool): If not `every`, classify all that don't yet have
            predictions. Trumps `since`.
        since (int): Number of past days to classify reviews.
        yield_per (int): Will work with database in batches of that size.
        verbose (int): Degree of verbosity v.
            - v = 0 Only specify number of reviews being classified
            - v >= 1 Log when eaches review has been classified
            - v >= 2 Echo SQL statements

    Returns:
        None
    """
    echo = True if verbose >= 2 else False
    db = get_db_session(echo=echo, autoflush=False, autocommit=True)
    # build the query + count for whichever selection mode was requested
    with db.begin():
        if every:
            logger.info("Classifying all reviews. This could take a very long time")
            query = db.query(YelpReview).order_by(YelpReview.id.asc())
            count = db.query(func.count(YelpReview.id)).scalar()
        elif unseen:
            logger.info("Classifying all unclassified reviews")
            # this requires running some special core level queries because
            # of the dynamic document association
            # this way is actually faster anyways
            unseen_q = select([documents.c.id]).where(documents.c.fp_pred.is_(None))
            query = (db.query(YelpReview)
                       .filter(YelpReview.id.in_(unseen_q))
                       .order_by(YelpReview.id.asc()))
            count = (db.execute(
                        select([func.count(documents.c.id)])
                        .where(documents.c.fp_pred.is_(None)))
                     .scalar()) # instead of tuple
        else:
            logger.info("Classifying all reviews from the past %i days", since)
            backdate = datetime.datetime.now() - datetime.timedelta(since)
            query = (db.query(YelpReview)
                       .filter(YelpReview.created >= backdate)
                       .order_by(YelpReview.created.desc()))
            count = (db.query(func.count(YelpReview.created))
                       .filter(YelpReview.created >= backdate)
                       .scalar()) # instead of tuple

    logger.info("Classifying %i total reviews", count)
    start = time()
    offset = 0
    # page through the result set in `yield_per` chunks; each chunk is
    # scored and committed in its own transaction
    while True:
        returned = False
        try:
            with db.begin():
                for i, review in enumerate(query.limit(yield_per).offset(offset)):
                    returned = True
                    if verbose:
                        logger.info("Classified Review #%i/%i", offset+i+1, count)
                    self.score_review(review)
                logger.info("Commiting Predictions")
        except OperationalError:
            # if commit error, try again with same offeset
            # NOTE(review): a persistent OperationalError makes this loop
            # retry forever -- consider a bounded retry count
            continue
        offset += yield_per
        # an empty page means we've consumed the whole result set
        if not returned:
            break
    logger.info("Classified %i reviews in %i:%i:%i (h:m:s)",
                count, *sec_to_hms(time()-start))
def get_twitter_sick_reviews(echo, search_params):
    """Fetch tweets for the UI, filtered by date range and sorted.

    Args:
        echo: whether the db session should echo SQL statements
        search_params: tuple of (threshold, sortby, num_results, page,
            start_date, end_date); dates are "%Y-%m-%d" strings or ''

    Returns:
        list of UITwitterReview objects, sorted by score or created date
    """
    (threshold, sortby, num_results, page, start_date, end_date) = search_params
    # NOTE(review): `threshold` is converted here but never used below --
    # the score filter appears to be unimplemented; confirm intent
    threshold = (1.0 / 10) * threshold ## convert to tenths

    # Reformat date params for comparison later
    if len(start_date) > 0:
        start_date = dt.strptime(start_date, "%Y-%m-%d")
    else:
        start_date = None
    if len(end_date) > 0:
        end_date = dt.strptime(end_date, "%Y-%m-%d")
    else:
        end_date = None

    ## Record whether a limit was given
    is_limit = (num_results != -1)
    # If no limit, set num_results to 100 for querying in a loop
    if not is_limit:
        num_results = 100

    db = get_db_session(echo=echo, autoflush=False, autocommit=True)
    count = 0  # NOTE(review): unused

    ## Construct query to get all positive reviews
    with db.begin():
        all_tweets = select([documents.c.id])
        query = (db.query(Tweet).filter(Tweet.id.in_(all_tweets)).order_by(Tweet.id.asc()))
    offset = num_results * (page - 1)
    tweets = []

    ## Collect all reviews meeting query criteria
    while True:
        returned = False
        try:
            with db.begin():
                for i, review in enumerate(query.limit(num_results).offset(offset)):
                    returned = True
                    ## only proceed if review's date follows start_date, if applicable
                    if (start_date == None or (review.document.created >= start_date)):
                        ## only proceed if review's date precedes end_date, if applicable
                        if (end_date == None or (review.document.created <= end_date)):
                            ## The DB doesn't yet have functionality for Twitter reviews to be
                            ## linked to businesses, but once that happens we should adapt this.
                            business = UIBusiness("Unknown business", "XXX", "XXX", "X",
                                                  "twitter.com", "twitter.com")
                            tweets.append(UITwitterReview(business,
                                                          review.text,
                                                          review.document.fp_pred,
                                                          review.document.created,
                                                          "Twitter",
                                                          review.user_id,
                                                          review.id))
        except OperationalError:
            # NOTE(review): this gives up on a transient db error, whereas
            # classify_reviews retries (`continue`) -- confirm which is intended
            break
        if is_limit or (not returned):
            break
        # Otherwise, continue until there are no more reviews
        offset += 100 ## when no limit, loop on sets of 100

    ## Sort reviews
    if (sortby == "severity"):
        tweets = sorted(tweets, key=attrgetter('score'), reverse=True)
    else:
        tweets = sorted(tweets, key=attrgetter('created'))
    return tweets