Example #1
    def __init__(
        self,
        latitude=None,
        longitude=None,
        bbox_width=None,
        bbox_height=None,
        line1=None,
        line2=None,
        line3=None,
        city=None,
        country=None,
        postal_code=None,
        state=None,
    ):
        self.latitude = latitude
        self.longitude = longitude
        self.bbox_width = bbox_width
        self.bbox_height = bbox_height

        self.line1 = xstr(line1)
        self.line2 = xstr(line2)
        self.line3 = xstr(line3)
        self.city = xstr(city)
        self.country = xstr(country)
        self.postal_code = xstr(postal_code)
        self.state = xstr(state)

        self.id = self.identifier()
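
This snippet and the ones below lean on an xstr helper that this page doesn't show. A minimal sketch of what it presumably does (coerce None to an empty string so optional text fields are always strings); the real implementation may differ:

def xstr(s):
    # Assumed helper: None becomes '', everything else is cast to str.
    return '' if s is None else str(s)

The identifier() call on the last line isn't shown here either; presumably it derives a stable id from the address fields, much like the location_id function assumed in Example #2 below.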
Example #2
def location_dict_yelp(loc):
    location = {
        'latitude': loc['coordinate']['latitude'],
        'longitude': loc['coordinate']['longitude'],
        'line1': xstr(loc['address'][0]),
        'line2': xstr(loc['address'][1]),
        'line3': xstr(loc['address'][2]),
        'city': xstr(loc['city']),
        'country': xstr(loc['country']),
        'postal_code': xstr(loc['postal_code']),
        'state': xstr(loc['state']),
    }
    if location['latitude'] is not None:
        location['bbox_width'] = 0
        location['bbox_height'] = 0
    else:
        location['bbox_width'] = None
        location['bbox_height'] = None
    location['id'] = location_id(location)
    return location
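
Example #2 also calls a location_id function that isn't shown. A hedged sketch of one plausible implementation (hash the normalized address fields into a stable key), plus sample usage on a minimal Yelp-style dict; the field values are only illustrative:

import hashlib

def location_id(location):
    # Hypothetical: derive a stable id by hashing the address fields.
    key = '|'.join([location['line1'], location['line2'], location['line3'],
                    location['city'], location['state'],
                    location['postal_code'], location['country']])
    return hashlib.md5(key.encode('utf-8')).hexdigest()

# Illustrative input shaped like the Yelp feed (three address lines assumed):
loc = {
    'coordinate': {'latitude': 40.7128, 'longitude': -74.0060},
    'address': ['123 Main St', '', ''],
    'city': 'New York', 'country': 'US', 'postal_code': '10001', 'state': 'NY',
}
print(location_dict_yelp(loc))  # bbox_width/bbox_height come out 0 since lat is set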
Example #3
def upsert_yelpfile_to_db(filename, geocode=True):
    """
    This takes in the JSON file of all of the Yelp businesses and
    all the associated data (reviews, categories, etc.) and upserts them to the DB.

    It follows the db schema used by the ORM, but doesn't use the ORM directly
    for optimization purposes.  

    Where the ORM would take an estimated week or more to upload
    a completely new file of 35k businesses,
    this version does so in ~45 min (without geocoding, over ethernet).

    DON'T mess with this code unless you know what you're doing.
    Hopefully it's well commented enough for you to figure it out if you need to.
    But it is sensitive (relative to normal model code) because of the amount of
    data that must be transferred to the DB.

    Args:
        filename: the name of the unzipped Yelp JSON filename

        geocode: whether or not to geocode locations missing a Lat/Lon
         - Can slow down the code significantly if it's the first geocode attempt
         - Most Yelp locations don't have Lat/Lons
         - On a first upload, consider calling the geocode DB function afterwards instead

    Returns:
        None. But the database will be updated :)

    TODO:
        - Add a function to geocode unknown locations
        - Check if syndication has already been uploaded today

    """
    # database handler object
    db = get_db_session(echo=False, autoflush=False, autocommit=True)

    # check YelpDownloadHistory to see if we've already uploaded the feed
    ydh = db.query(YelpDownloadHistory).filter(YelpDownloadHistory.date==date.today()).scalar()

    # if it doesn't exist, isn't from today, or hasn't been downloaded or unzipped
    if not ydh or not ydh.date == date.today() or not ydh.downloaded or not ydh.unzipped:
        logger.critical("Can't upload today's feed if it hasn't been downloaded and unzipped")
        return

    if ydh.uploaded:
        logger.info("Already upserted today's Yelp feed. Skipping")
        return

    # geocoder for businesses w/o lat longs
    geoLocator = Nominatim()

    logger.info("Updating Yelp DB..........")

    # setup list of businesses to skip updating
    # because we've uploaded them since yelp updated them
    # this part gets the skip condition
    newest = db.query(Business).order_by(desc(Business.updated_at)).first()

    # check for if there's a business in the db
    if newest:
        most_recent = newest.updated_at
        init_db = False
        logger.info("Last updated: %r" % most_recent.strftime('%m/%d/%Y:%M:%H:%S'))

    # if not, then it's the first time we're populating it
    else:
        logger.info("First Database Population: This could take a looong time...")
        most_recent = None
        init_db = True

    # if we're initializing the db, disable the foreign key constraints
    # this will improve upload speeds
    if init_db and 'mssql' in dbconfig['dbbackend']:
        disable_fk = """
        ALTER TABLE dbo.%s NOCHECK CONSTRAINT fk_loc_businesses;
        ALTER TABLE dbo.%s NOCHECK CONSTRAINT fk_biz_id;
        ALTER TABLE dbo.%s NOCHECK CONSTRAINT fk_cat_alias;
        ALTER TABLE dbo.%s NOCHECK CONSTRAINT fk_rev_biz_id;
        """ % ( businesses.name, business_category_table.name,
                business_category_table.name, yelp_reviews.name)
        with db.begin():
            db.execute(disable_fk)


    start_time = time.time() # for timing the whole process

    # get sets of uids for each data model, 
    # so we can quickly determine if a datum needs to be updated or inserted
    # this is much faster than querying the db every time
    with db.begin():
        db_biz_ids = {b.id for b in db.query(Business.id).all()}
        db_review_ids = {r.id for r in db.query(YelpReview.id).all()}
        db_locations = {l.id for l in db.query(Location.id).all()}
        db_categories = {c.alias for c in db.query(YelpCategory.alias).all()}
        db_biz_categories = {(assoc.business_id, assoc.category_alias)
                             for assoc in db.query(business_category_table).all()}

    # batch upsert data structures
    unloaded_locations = {}
    unloaded_categories = {}
    insert_businesses = []
    insert_reviews = []
    insert_documents = []
    insert_doc_associations = []
    update_businesses = []
    update_reviews = []
    update_documents = []
    biz_cats = []

    # loop over the JSON file and upsert all data
    with open(filename, 'rb') as infile: # for unzipped files
        biz_num = 0
        biz_count = 0
        review_count = 0
        upload_mod = 500 # size of batch upload 

        # each business is one line
        for line in infile:
            biz_num += 1
            biz_count += 1
            logger.info("Updating Restaurant #%i...." % biz_num)

            current = time.time()-start_time
            m, s = divmod(current, 60)  
            h, m = divmod(m, 60)
            logger.info("Time so far: %d:%02d:%02d" % (h, m, s))

            # if business doesn't load correctly, skip it
            try:
                biz = loads(line)
            except ValueError:
                logger.warning("Broken JSON Element. Skipping...")
                continue
            bdate = datetime.strptime(biz['time_updated'], '%Y-%m-%dT%H:%M:%S')  # e.g. 2015-10-08T20:17:50

            # skip this business if it hasn't been updated since our last update
            # (only works when we aren't initializing the db)
            if most_recent and not init_db:
                if bdate <= most_recent:
                    logger.info("SKIPPING (NOT NEW): %s" % biz['name'])
                    biz_count -= 1
                    continue

            # make note of new Locations
            location = location_dict_yelp(biz['location'])
            if location['id'] not in db_locations and location['id'] not in unloaded_locations:
                # try to reverse-geocode missing coordinates (if enabled)
                if geocode and not location['latitude'] and not location['longitude']:
                    try:
                        logger.info("No Lat/Long for restaurant, attempting to geocode...")
                        # TODO(shao): replace with foursquare geocoder
                        raise Exception('geocode not implemented')
                    except Exception:
                        logger.warning("Geocode failed, assigning NULL Lat/Long")
                # add to running list of unloaded locations
                unloaded_locations[location['id']] = location

            # update or insert business depending on if it's already in db
            business = business_yelp_dict(biz, location)
            if biz['id'] in db_biz_ids:
                update_businesses.append(business)
            else:
                insert_businesses.append(business)

            # update/create all the new Reviews
            for i, rev in enumerate(biz['reviews']):
                # if the review isn't new, don't do anything
                # uncomment this code to update it (significant slowdown)
                if rev['id'] in db_review_ids:
                    pass
                    # review = review_dict_yelp(biz, rev)
                    # document = document_dict_yelp(rev)
                    # update_reviews.append(review)
                    # update_documents.append(document)
                # else create a new one
                else:
                    review = review_dict_yelp(biz, rev)
                    document = document_dict_yelp(rev)
                    doc_assoc = doc_assoc_dict_yelp(rev)

                    insert_reviews.append(review)
                    insert_documents.append(document)
                    insert_doc_associations.append(doc_assoc)
            review_count += len(biz['reviews'])

            # create the Categories
            for category in biz['categories']:
                # if it's new create it, provided we haven't already
                if (category['alias'] not in db_categories
                        and category['alias'] not in unloaded_categories):
                    # some aliases are bad (empty or None), so skip them
                    alias = xstr(category['alias'])
                    if not alias:
                        logger.warning("BAD CATEGORY %r... Skipping" % alias)
                        continue
                    cat = {'alias': alias,
                           'title': xstr(category['title'])}
                    unloaded_categories[category['alias']] = cat
                # create the business association link
                assoc = {
                         'business_id':biz['id'], 
                         'category_alias':category['alias']
                         }
                if (assoc['business_id'], assoc['category_alias']) not in db_biz_categories:
                    biz_cats.append(assoc)

            # if we've reached batch size, perform the actual transactions
            if biz_count % upload_mod == 0:
                with db.begin():
                    logger.info("Uploading Batch of %i to DB...." % upload_mod)
                    logger.info("Uploading Locations to DB....")
                    db.bulk_insert_mappings(Location, unloaded_locations.values())
                    logger.info("Uploading Yelp Categories to DB....")
                    db.bulk_insert_mappings(YelpCategory, unloaded_categories.values())
                    bizlen = len(insert_businesses) + len(update_businesses)
                    logger.info("Uploading %i Businesses to DB...." %bizlen)
                    db.execute(businesses.insert(), insert_businesses)
                    db.bulk_update_mappings(Business, update_businesses)
                    revlen = len(insert_reviews) + len(update_reviews)
                    logger.info("Uploading %i Documents to DB...." % revlen)
                    db.execute(document_associations.insert(), sorted(insert_doc_associations, key=lambda x:x['assoc_id']))
                    db.execute(documents.insert(), sorted(insert_documents, key=lambda x:x['id']))
                    # db.bulk_update_mappings(Document, update_documents)
                    logger.info("Uploading %i Business Reviews to DB...." % revlen)
                    db.execute(yelp_reviews.insert(), sorted(insert_reviews, key=lambda x:x['id']))
                    # db.bulk_update_mappings(YelpReview, update_reviews)
                    #there seem to be duplicate categories for a business
                    #so make the associations unique
                    logger.info("Uploading Business Category associations to DB....")
                    biz_cats = [dict(tupleized) for tupleized in set(tuple(assoc.items()) for assoc in biz_cats)]
                    db.execute(business_category_table.insert(), biz_cats)

                # reset the lists for the next batch
                db_categories.update(unloaded_categories.keys())
                db_locations.update(unloaded_locations.keys())
                unloaded_categories = {}
                unloaded_locations = {}
                insert_businesses = []
                insert_reviews = []
                insert_documents = []
                insert_doc_associations = []
                update_businesses = []
                update_reviews = []
                update_documents = []
                biz_cats = []

    # upload the final batch
    bizlen = len(insert_businesses) + len(update_businesses)
    if bizlen > 0:
        with db.begin():
            logger.info("Uploading Batch of %i to DB...." % upload_mod)
            logger.info("Uploading Locations to DB....")
            db.bulk_insert_mappings(Location, unloaded_locations.values())
            logger.info("Uploading Yelp Categories to DB....")
            db.bulk_insert_mappings(YelpCategory, unloaded_categories.values())
            bizlen = len(insert_businesses) + len(update_businesses)
            logger.info("Uploading %i Businesses to DB...." %bizlen)
            db.execute(businesses.insert(), insert_businesses)
            db.bulk_update_mappings(Business, update_businesses)
            revlen = len(insert_reviews) + len(update_reviews)
            logger.info("Uploading %i Documents to DB...." % revlen)
            db.execute(document_associations.insert(), sorted(insert_doc_associations, key=lambda x:x['assoc_id']))
            db.execute(documents.insert(), sorted(insert_documents, key=lambda x:x['id']))
            # db.bulk_update_mappings(Document, update_documents)
            logger.info("Uploading %i Business Reviews to DB...." % revlen)
            db.execute(yelp_reviews.insert(), sorted(insert_reviews, key=lambda x:x['id']))
            # db.bulk_update_mappings(YelpReview, update_reviews)
            #there seem to be duplicate categories for a business
            #so make the associations unique
            logger.info("Uploading Business Category associations to DB....")
            biz_cats = [dict(tupleized) for tupleized in set(tuple(assoc.items()) for assoc in biz_cats)]
            db.execute(business_category_table.insert(), biz_cats)
    
    # if we are initializing the db, we need to reenable the fk constraints
    # because we put in all the data correctly, we are sure the fks are correct
    # this will error if they aren't
    if init_db and 'mssql' in dbconfig['dbbackend']:
        # put back all the constraints
        logger.info("Cheking Constraints...")
        enable_fk = """
        ALTER TABLE dbo.%s CHECK CONSTRAINT ALL;
        ALTER TABLE dbo.%s CHECK CONSTRAINT ALL;
        ALTER TABLE dbo.%s CHECK CONSTRAINT ALL;
        ALTER TABLE dbo.%s CHECK CONSTRAINT ALL;
        """ % ( businesses.name, business_category_table.name,
                business_category_table.name, yelp_reviews.name)
        with db.begin():
            db.execute(enable_fk)
    total_time = float(time.time() - start_time)
    logger.info("Upserted %i businesses and %i total reviews in %d seconds = %.2f minutes" %\
                 (biz_num, review_count, total_time,  total_time/60.))

    # update the download history
    with db.begin():
        ydh.uploaded = True
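
The core optimization in Example #3 is the batch-flush pattern: accumulate plain dicts, then push each batch inside a single transaction via bulk_insert_mappings / Core insert() instead of going through ORM objects one at a time. A self-contained sketch of that pattern against a throwaway SQLite table, assuming the legacy SQLAlchemy 1.x autocommit style the example uses (table and names are illustrative):

from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String
from sqlalchemy.orm import sessionmaker

engine = create_engine('sqlite://')  # throwaway in-memory DB
metadata = MetaData()
items = Table('items', metadata,
              Column('id', Integer, primary_key=True),
              Column('name', String))
metadata.create_all(engine)

Session = sessionmaker(bind=engine, autocommit=True)  # legacy 1.x style
session = Session()

batch, batch_size = [], 500
for i in range(1200):
    batch.append({'id': i, 'name': 'item-%d' % i})
    if len(batch) == batch_size:
        with session.begin():  # one transaction per batch
            session.execute(items.insert(), batch)  # single executemany INSERT
        batch = []

if batch:  # final partial batch, mirroring the tail of upsert_yelpfile_to_db
    with session.begin():
        session.execute(items.insert(), batch)

The same trick that dedupes biz_cats works here too: converting each dict to a tuple of its items makes it hashable, so a set collapses duplicates before the final insert.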