Example #1
def unzip_file(filename):
    """
    Take in a .gz file and unzip it, saving it with the same file name
    """
    # first make sure we haven't already unzipped it
    db = get_db_session()
    ydh = db.query(YelpDownloadHistory).filter(YelpDownloadHistory.date==date.today()).scalar()

    # if it doesn't exist, error out because we need to download it
    if not ydh or not ydh.date == date.today() or not ydh.downloaded:
        logger.critical("Cannot unzip file for today because it wasn't downloaded")
        return

    if ydh.unzipped:
        logger.info("Today's feed already unzipped, skipoing the unzip")
        rawfile = filename.strip('.gz')
        return rawfile

    logger.info("Extracting file: %s" % filename)
    with gzip.open(filename, 'rb') as infile:
        rawfile = filename[:-len('.gz')]  # drop the '.gz' suffix safely
        with open(rawfile, 'wb') as outfile:
            i=1
            for line in infile:
                outfile.write(line)
                if i % 1000 == 0 :
                    print "\r Extracted %i businesses so far" % i
                i += 1
    logger.info("Done extracting file: %s" % rawfile)
    ydh.unzipped = True
    db.commit()
    return rawfile
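
A minimal usage sketch (not from the source); the feed path below is illustrative and would normally come from the same config used by the download step:

# hedged usage sketch -- the filename is hypothetical
gz_path = '20151008_businesses.json.gz'
raw_path = unzip_file(gz_path)
if raw_path:
    logger.info("Unzipped feed available at %s" % raw_path)
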
Example #2
def geocodeUnknownLocations(wait_time=2, run_time=240):
    """
    Geocode any locations that don't have Lat/Lons

    Only do so for up to `run_time` minutes, because this can take a very long time if most are unknown

    Also shuffle so that they all get equal probability of being tried

    Args:
        wait_time: seconds to wait for each geocode request before timing out
        run_time: maximum number of minutes to spend geocoding in this run

    Returns:
        None

    """
    geoLocator = Nominatim()
    # print geoLocator.geocode("548 riverside dr., NY, NY, 10027") # test
    db = get_db_session()
    unknowns = db.query(Location).filter(Location.latitude==None).all()
    shuffle(unknowns) # give them all a fighting chance
    logger.info("Attempting to geocode random unknown locations for %i minutes" % run_time)
    logger.info("%i Unkown locations to geocode" % len(unknowns))
    locations = []
    upload_mod = 100 # upload batch size

    start_time = time.time()
    run_time *= 60 # turn it into seconds

    for i, location in enumerate(unknowns):
        # max try time stopping criterion
        if (time.time() - start_time) > run_time:
            logger.info("Max geocoding time has elapsed... Stopping for this run")
            db.add_all(locations)
            db.commit()
            break
        # print location.street_address
        logger.info("Geocoding location %i..." % i)
        try:
            geo = geoLocator.geocode(location.street_address, timeout=wait_time)
            lat = geo.latitude
            lon = geo.longitude
            logger.info("\tSuccess!")
        except Exception as e:
            # print  "Exception: ", e
            logger.warning("\tGeocode failed, assigning NULL Lat/Long")
            lat = None
            lon = None
        location.latitude = lat
        location.longitude = lon
        locations.append(location)
        if i % upload_mod == 0:
            db.add_all(locations)
            db.commit()
            locations = []
    logger.info("Finished geocode attempts")
Example #3
def get_yelp_score_distribution():
    """Count documents whose fp_pred score falls into each 0.1-wide bucket."""
    db = get_db_session(echo=echo, autoflush=False, autocommit=True)
    counts = []
    # step over integer tenths to avoid floating-point drift in the bucket edges
    for tenth in range(11):  # 11 buckets, so a score of exactly 1.0 still lands in the last one
        lower = tenth / 10.0
        bucket_count = db.execute(select([func.count(documents.c.id)]).where(
            and_(
                documents.c.fp_pred >= lower,
                documents.c.fp_pred < lower + 0.1
                )
            )).scalar()
        counts.append(bucket_count)
    return counts
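
A sketch of a single-query alternative, assuming the same documents table and the old-style SQLAlchemy core imports used above; the floor-based bucket expression is an assumption, not project code:

def get_yelp_score_distribution_grouped(db):
    # group documents by floor(fp_pred * 10), i.e. their tenths bucket, in one round trip
    bucket = func.floor(documents.c.fp_pred * 10).label('bucket')
    rows = db.execute(
        select([bucket, func.count(documents.c.id)])
        .where(documents.c.fp_pred.isnot(None))
        .group_by(bucket)
        .order_by(bucket)
    ).fetchall()
    return dict(rows)  # {bucket index: count}
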
Example #4
def copy_tables():
    """ Copies a few items from each table into a test database

    Should not be called in the same session after reset_test_db(); you will get
    a mapper error for some reason. Instead, call reset_test_db(), then close
    the python session, then, in a new session, call copy_tables(), to update
    the tables for a schema change. """
    toy = get_db_session()
    db = models.get_db_session()

    logger.info("Populating test tables")

    businesses = db.query(Business).order_by(Business.id)[0:5]
    locations = [b.location for b in businesses]
    # [b.categories ...] is a list of lists, so we need a little more processing
    categories = set().union(*[b.categories for b in businesses])

    tweets = db.query(Tweet).order_by(Tweet.id)[0:5]
    reviews = []
    for b in businesses:
        reviews.extend(
            db.query(YelpReview).filter(YelpReview.business_id == b.id)[0:5]
        )
    documents = [r.document for r in reviews] + [t.document for t in tweets]
    doc_assoc = [r.document_rel for r in reviews] + \
                [t.document_rel for t in tweets]

    tables = [businesses, locations, categories, reviews, tweets, \
              documents, doc_assoc]

    # detach all objects from db session before putting them in toy
    for t in tables:
        for obj in t: make_transient(obj)

    # only after *everything* is transient do we add anything
    for t in tables: toy.add_all(t)

    # in addition we add the junction table for business categories
    b_ids = [b.id for b in businesses]
    business_cat = db.execute(business_category_table.select().
            where(business_category_table.c.business_id.in_(b_ids)))

    for row in business_cat:
        toy.execute(business_category_table.insert(), row)

    toy.commit()
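
A minimal sketch of the workflow the docstring describes; reset_test_db() is referenced there but not shown here, and the two steps are meant to run in separate Python sessions:

# session 1: rebuild the empty test schema
reset_test_db()

# session 2 (a fresh interpreter): copy a handful of rows into the test database
copy_tables()
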
Example #5
from twython import Twython
from foodbornenyc.db_settings import twitter_config
from foodbornenyc.models.documents import Tweet
from foodbornenyc.models.users import TwitterUser
from foodbornenyc.models.locations import Location
from foodbornenyc.models.models import get_db_session

import foodbornenyc.sources.foursquare_geo as geo

from foodbornenyc.util.util import get_logger, xuni
logger = get_logger(__name__, level="INFO")

# TODO @teffland: Read this link and make sure location tracking gets the lat/lon pair order correctly 
# http://support.gnip.com/articles/filtering-twitter-data-by-location.html

db = get_db_session()
twitter = Twython(twitter_config['consumer_key'],
    twitter_config['consumer_secret'],
    twitter_config['access_token'],
    twitter_config['access_token_secret'])
# all possible fields from twitter that we want to import directly
user_fields = ['id_str', 'name', 'screen_name', 'location', 'description']
tweet_fields = [
#         'contributors', #<type 'NoneType'>
#         'truncated', #<type 'bool'>
        'text', #<type 'unicode'>
#         'is_quote_status', #<type 'bool'>
#         'in_reply_to_status_id', #<type 'NoneType'>
#         'id', #<type 'int'>
#         'favorite_count', #<type 'int'>
#         'source', #<type 'unicode'>
#         ... (remaining fields omitted in this excerpt)
]
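
A sketch of how such field whitelists are typically applied to a raw tweet payload; the raw_tweet argument and make_tweet_row helper are hypothetical, not part of the module above:

def make_tweet_row(raw_tweet):
    # keep only the whitelisted top-level tweet fields
    row = {field: raw_tweet.get(field) for field in tweet_fields}
    # pull the whitelisted user fields out of the nested 'user' object
    row['user'] = {field: raw_tweet.get('user', {}).get(field) for field in user_fields}
    return row
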
Example #6
def download_latest_yelp_data():
    """Attempt to download the latest gzip file from the Yelp Syndication.

        Args:
            None
        
        Returns:
            local_file: the name of where the yelp feed was downloaded to

        Notes: 
            Yelp doesn't let us look at the bucket, 
            so we just try exact filenames with presigned urls for the past month
    """
    # get where to save the feed from config
    local_file = config['rawdata_dir'] + config['local_file']

    # first make sure we haven't already downloaded the file
    db = get_db_session()
    ydh = db.query(YelpDownloadHistory).filter(YelpDownloadHistory.date==date.today()).scalar()

    # if it doesn't exist or it's old, create the new one
    if not ydh or not ydh.date == date.today():
        ydh = YelpDownloadHistory()
        db.add(ydh)
        db.commit()
        logger.info("Creating new download history for today")

    # if we already downloaded, log and return
    if ydh.downloaded:
        logger.info("Already downloaded the Yelp feed for today...")
        return local_file

    # set up botocore client
    session = botocore.session.get_session()
    client = session.create_client('s3')

    # try to download most recent data (go up to one month back)
    for day_delta in range(31):
        # generate the correct filename for a day
        dateformat = '%Y%m%d'
        day = date.today() - timedelta(day_delta)
        day_str = day.strftime(dateformat) # eg '20151008'
        ext = '_businesses.json.gz'
        filename =  day_str + ext
        logger.info("Attempting to get Yelp Reviews from %s.....", day.strftime("%m/%d/%Y"))

        # generate a presigned url for the file, 
        # since yelp doesn't give us bucket access
        url = client.generate_presigned_url(
                'get_object',
                Params={'Bucket': config['bucket_name'],
                'Key':config['bucket_dir'] +'/'+ filename },
                ExpiresIn=3600 # 1 hour in seconds
                )
        # do the downloading
        logger.info("Feed URL: %s", url)
        try:
            download_url_to_file(url, config['rawdata_dir'], config['local_file'])
            # if we succeed, move on
            break

        except Exception:
            # TODO: Throw more explicit exceptions from `download_url_to_file`
            # so we can handle them explicitly; currently this can be misleading
            if day_delta == 30: 
                logger.warning("NO YELP DATA AVAILABLE FOR THE PAST MONTH!")
                return
            else:
                logger.warning("no data for date: %s\n\
                     Trying the day before." % day.strftime("%m/%d/%Y"))

    logger.info("Latest Yelp Data successfully downloaded from feed.")
    # save the success to download history
    ydh.downloaded = True
    db.commit()
    return local_file
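
download_url_to_file is a project helper imported elsewhere; a minimal sketch of what it presumably does, streaming the presigned URL to disk with requests (the signature and error behavior are assumptions):

import os
import requests

def download_url_to_file(url, directory, local_name, chunk_size=1024 * 1024):
    """Stream `url` to directory/local_name, raising on a non-200 response."""
    response = requests.get(url, stream=True)
    response.raise_for_status()  # an expired or missing presigned object raises here
    with open(os.path.join(directory, local_name), 'wb') as outfile:
        for chunk in response.iter_content(chunk_size=chunk_size):
            if chunk:
                outfile.write(chunk)
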
Example #7
def upsert_yelpfile_to_db(filename, geocode=True):
    """
    This takes in the JSON file of all of the Yelp businesses and
    all the affiliate data (reviews, categories, etc.) and upserts them to the DB

    It follows the db schema used by the ORM, but doesn't use the ORM directly,
    for performance reasons.

    Where the ORM would take a week or more (estimated) to upload
    a completely new file of 35k businesses,
    this version does so in ~45 min (w/o geocoding, over ethernet).

    DON'T mess with this code unless you know what you're doing.
    Hopefully it's well commented enough for you to figure it out if you need to.
    But it is sensitive (relative to normal model code) because of the amount of
    data that must be transferred to the DB.

    Args:
        filename: the name of the unzipped Yelp JSON filename

        geocode: whether or not to geocode locations missing a Lat/Lon
         - Can slow down the code significantly if it's the first geocode attempt
         - Most Yelp locations don't have Lat/Lons
         - On first upload consider calling the geocode db function after

    Returns:
        None. But the database will be updated :)

    TODO:
        - Add a geocode unknown locations function
        - Check if syndication has already been uploaded today

    """
    # database handler object
    db = get_db_session(echo=False, autoflush=False, autocommit=True)

    # check YelpDownloadHistory to see if we've already uploaded the feed
    ydh = db.query(YelpDownloadHistory).filter(YelpDownloadHistory.date==date.today()).scalar()

    # if it doesn't exist, isn't today's, or hasn't been downloaded or unzipped
    if not ydh or not ydh.date == date.today() or not ydh.downloaded or not ydh.unzipped:
        logger.critical("Can't upload today's feed if it hasn't been downloaded and unzipped")
        return

    if ydh.uploaded:
        logger.info("Already upserted today's Yelp feed. Skipping")
        return

    # geocoder for businesses w/o lat longs
    geoLocator = Nominatim()

    logger.info("Updating Yelp DB..........")

    # set up the skip condition: businesses we've already uploaded
    # since Yelp last updated them don't need to be touched again
    newest = db.query(Business).order_by(desc(Business.updated_at)).first()

    # check for if there's a business in the db
    if newest:
        most_recent = newest.updated_at
        init_db = False
        logger.info("Last updated: %r" % most_recent.strftime('%m/%d/%Y:%M:%H:%S'))

    # if not, then it's the first time we're populating it
    else:
        logger.info("First Database Population: This could take a looong time...")
        most_recent = None
        init_db = True

    # if we're initializing the db, disable the foreign key constraints
    # this will improve upload speeds
    if init_db and 'mssql' in dbconfig['dbbackend']:
        disable_fk = """
        ALTER TABLE dbo.%s NOCHECK CONSTRAINT fk_loc_businesses;
        ALTER TABLE dbo.%s NOCHECK CONSTRAINT fk_biz_id;
        ALTER TABLE dbo.%s NOCHECK CONSTRAINT fk_cat_alias;
        ALTER TABLE dbo.%s NOCHECK CONSTRAINT fk_rev_biz_id;
        """ % ( businesses.name, business_category_table.name,
                business_category_table.name, yelp_reviews.name)
        with db.begin():
            db.execute(disable_fk)


    start_time = time.time() # for timing the whole process

    # get sets of uids for each data model, 
    # so we can quickly determine if a datum needs to be updated or inserted
    # this is much faster than querying the db every time
    with db.begin():
        db_biz_ids = set([ b.id 
                        for b in db.query(Business.id).all() ])
        db_review_ids = set([ r.id 
                            for r in db.query(YelpReview.id).all() ])
        db_locations = set([ l.id 
                            for l in db.query(Location.id).all() ])
        db_categories = set([ c.alias 
                            for c in db.query(YelpCategory.alias).all() ])
        db_biz_categories = set([ (assoc.business_id, assoc.category_alias) 
                                for assoc in db.query(business_category_table).all()])

    # batch upsert data structures
    unloaded_locations = {}
    unloaded_categories = {}
    insert_businesses = []
    insert_reviews = []
    insert_documents = []
    insert_doc_associations = []
    update_businesses = []
    update_reviews = []
    update_documents = []
    biz_cats = []

    # loop over the JSON file and upsert all data
    with open(filename, 'rb') as infile: # for unzipped files
        biz_num = 0
        biz_count = 0
        review_count = 0
        upload_mod = 500 # size of batch upload 

        # each business is one line
        for line in infile:
            biz_num += 1
            biz_count +=1
            logger.info("Updating Restaurant #%i...." % biz_num)

            current = time.time()-start_time
            m, s = divmod(current, 60)  
            h, m = divmod(m, 60)
            logger.info("Time so far: %d:%02d:%02d" % (h, m, s))

            # if business doesn't load correctly, skip it
            try:
                biz = loads(line)
            except ValueError:
                logger.warning("Broken JSON Element. Skipping...")
                continue
            bdate = datetime.strptime(biz['time_updated'], '%Y-%m-%dT%H:%M:%S')#2015-10-08T20:17:50

            # skip this business if it hasn't been updated since our last update
            # (only works when we aren't initializing the db)
            if most_recent and not init_db:
                if bdate <= most_recent:
                    logger.info("SKIPPING (NOT NEW): %s" % biz['name'])
                    biz_count -=1
                    # print biz_count
                    continue

            # make note of new Locations
            location = location_dict_yelp(biz['location'])
            if location['id'] not in db_locations and location['id'] not in unloaded_locations.keys():
                # try to geocode missing coords (if enabled)
                if geocode and not location['latitude'] and not location['longitude']:
                    try:
                        logger.info("No Lat/Long for restaurant, attempting to geocode...")
                        # TODO(shao): replace with foursquare geocoder
                        raise Exception('geocode not implemented')
                    except Exception:
                        logger.warning("Geocode failed, assigning NULL Lat/Long")
                # add to running list of unloaded locations
                unloaded_locations[location['id']] = location

            # update or insert business depending on if it's already in db
            business = business_yelp_dict(biz, location)
            if biz['id'] in db_biz_ids:
                update_businesses.append(business)
            else:
                insert_businesses.append(business)

            # update/create all the new Reviews
            for i, rev in enumerate(biz['reviews']):
                # if the review isn't new, don't do anything
                # uncomment this code to update it (significant slowdown)
                if rev['id'] in db_review_ids:
                    pass
                    # review = review_dict_yelp(biz, rev)
                    # document = document_dict_yelp(rev)
                    # update_reviews.append(review)
                    # update_documents.append(document)
                # else create a new one
                else:
                    review = review_dict_yelp(biz, rev)
                    document = document_dict_yelp(rev)
                    doc_assoc = doc_assoc_dict_yelp(rev)

                    insert_reviews.append(review)
                    insert_documents.append(document)
                    insert_doc_associations.append(doc_assoc)
            review_count += len(biz['reviews'])

            # create the Categories
            for category in biz['categories']:
                # if it's new create it, provided we haven't already
                if (category['alias'] not in db_categories 
                and category['alias'] not in unloaded_categories.keys()):
                    # some aliases are bad, so skip them
                    if (xstr(category['alias']) == ''
                    or xstr(category['alias']) is None):
                        logger.warning("BAD CATEGORY %r... Skipping" % xstr(category['alias']))
                        continue
                    cat = {'alias':xstr(category['alias']),
                            'title':xstr(category['title'])
                          }
                    unloaded_categories[category['alias']] = cat
                # create the business association link
                assoc = {
                         'business_id':biz['id'], 
                         'category_alias':category['alias']
                         }
                if (assoc['business_id'], assoc['category_alias']) not in db_biz_categories:
                    biz_cats.append(assoc)

            # if we've reached batch size, perform the actual transactions
            if biz_count % upload_mod == 0:
                with db.begin():
                    logger.info("Uploading Batch of %i to DB...." % upload_mod)
                    logger.info("Uploading Locations to DB....")
                    db.bulk_insert_mappings(Location, unloaded_locations.values())
                    logger.info("Uploading Yelp Categories to DB....")
                    db.bulk_insert_mappings(YelpCategory, unloaded_categories.values())
                    bizlen = len(insert_businesses) + len(update_businesses)
                    logger.info("Uploading %i Businesses to DB...." %bizlen)
                    db.execute(businesses.insert(), insert_businesses)
                    db.bulk_update_mappings(Business, update_businesses)
                    revlen = len(insert_reviews) + len(update_reviews)
                    logger.info("Uploading %i Documents to DB...." % revlen)
                    db.execute(document_associations.insert(), sorted(insert_doc_associations, key=lambda x:x['assoc_id']))
                    db.execute(documents.insert(), sorted(insert_documents, key=lambda x:x['id']))
                    # db.bulk_update_mappings(Document, update_documents)
                    logger.info("Uploading %i Business Reviews to DB...." % revlen)
                    db.execute(yelp_reviews.insert(), sorted(insert_reviews, key=lambda x:x['id']))
                    # db.bulk_update_mappings(YelpReview, update_reviews)
                    #there seem to be duplicate categories for a business
                    #so make the associations unique
                    logger.info("Uploading Business Category associations to DB....")
                    biz_cats = [dict(tupleized) for tupleized in set(tuple(assoc.items()) for assoc in biz_cats)]
                    db.execute(business_category_table.insert(), biz_cats)

                # reset the lists for the next batch
                db_categories.update(unloaded_categories.keys())
                db_locations.update(unloaded_locations.keys())
                unloaded_categories = {}
                unloaded_locations = {}
                insert_businesses = []
                insert_reviews = []
                insert_documents = []
                insert_doc_associations = []
                update_businesses = []
                update_reviews = []
                update_documents = []
                biz_cats = []

    # upload the final batch
    bizlen = len(insert_businesses) + len(update_businesses)
    if bizlen > 0:
        with db.begin():
            logger.info("Uploading Batch of %i to DB...." % upload_mod)
            logger.info("Uploading Locations to DB....")
            db.bulk_insert_mappings(Location, unloaded_locations.values())
            logger.info("Uploading Yelp Categories to DB....")
            db.bulk_insert_mappings(YelpCategory, unloaded_categories.values())
            bizlen = len(insert_businesses) + len(update_businesses)
            logger.info("Uploading %i Businesses to DB...." %bizlen)
            db.execute(businesses.insert(), insert_businesses)
            db.bulk_update_mappings(Business, update_businesses)
            revlen = len(insert_reviews) + len(update_reviews)
            logger.info("Uploading %i Documents to DB...." % revlen)
            db.execute(document_associations.insert(), sorted(insert_doc_associations, key=lambda x:x['assoc_id']))
            db.execute(documents.insert(), sorted(insert_documents, key=lambda x:x['id']))
            # db.bulk_update_mappings(Document, update_documents)
            logger.info("Uploading %i Business Reviews to DB...." % revlen)
            db.execute(yelp_reviews.insert(), sorted(insert_reviews, key=lambda x:x['id']))
            # db.bulk_update_mappings(YelpReview, update_reviews)
            #there seem to be duplicate categories for a business
            #so make the associations unique
            logger.info("Uploading Business Category associations to DB....")
            biz_cats = [dict(tupleized) for tupleized in set(tuple(assoc.items()) for assoc in biz_cats)]
            db.execute(business_category_table.insert(), biz_cats)
    
    # if we are initializing the db, we need to reenable the fk constraints
    # because we put in all the data correctly, we are sure the fks are correct
    # this will error if they aren't
    if init_db and 'mssql' in dbconfig['dbbackend']:
        # put back all the constraints
        logger.info("Cheking Constraints...")
        enable_fk = """
        ALTER TABLE dbo.%s CHECK CONSTRAINT ALL;
        ALTER TABLE dbo.%s CHECK CONSTRAINT ALL;
        ALTER TABLE dbo.%s CHECK CONSTRAINT ALL;
        ALTER TABLE dbo.%s CHECK CONSTRAINT ALL;
        """ % ( businesses.name, business_category_table.name,
                business_category_table.name, yelp_reviews.name)
        with db.begin():
            db.execute(enable_fk)
    total_time = float(time.time() - start_time)
    logger.info("Upserted %i businesses and %i total reviews in %d seconds = %.2f minutes" %\
                 (biz_num, review_count, total_time,  total_time/60.))

    # update the download history
    with db.begin():
        ydh.uploaded = True
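
How the Yelp pieces above appear to fit together as a daily run (a sketch based only on the functions shown; the wrapper name and scheduling are assumptions):

def run_daily_yelp_pipeline():
    # download today's feed (a no-op if already downloaded), unzip it, then upsert it
    gz_path = download_latest_yelp_data()
    if not gz_path:
        return
    raw_path = unzip_file(gz_path)
    if not raw_path:
        return
    # geocoding is left to geocodeUnknownLocations() so the upsert stays fast
    upsert_yelpfile_to_db(raw_path, geocode=False)
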
Example #8
def get_db_session(**kwargs):
    return models.get_db_session(config=test_config, **kwargs)
Example #9
    def classify_reviews(self, every=False, unseen=False, since=30, yield_per=1000, verbose=0):
        """Classify some set of `YelpReview`s' `Docuement` in the database

        Args:
            every (bool): Whether to just do them all.
                          Trumps other flags. Likely to __very__ slow.
            unseen (bool): If not `every`, classify all that don't yet have predictions.
                           Trumps `since`.
            since (int): Number of past days to classify reviews.
            yield_per (int): Will work with database in batches of that size.
            verbose (int): Degree of verbosity v.
                           - v = 0 Only specify number of reviews being classified
                           - v >= 1 Log when eaches review has been classified
                           - v >= 2 Echo SQL statements

        Returns:
            None

        """
        echo = True if verbose >= 2 else False
        db = get_db_session(echo=echo, autoflush=False, autocommit=True)
        with db.begin():
            if every:
                logger.info("Classifying all reviews. This could take a very long time")
                query = db.query(YelpReview).order_by(YelpReview.id.asc())
                count = db.query(func.count(YelpReview.id)).scalar()
            elif unseen:
                logger.info("Classifying all unclassified reviews")
                # this requires running some special core level queries because of the dynamic document association
                # this way is actually faster anyways
                unseen_q = select([documents.c.id]).where(documents.c.fp_pred.is_(None))
                query = (db.query(YelpReview)
                           .filter(YelpReview.id.in_(unseen_q))
                           .order_by(YelpReview.id.asc()))
                count = (db.execute(
                           select([func.count(documents.c.id)])
                           .where(documents.c.fp_pred.is_(None)))
                           .scalar()) # instead of tuple
            else:
                logger.info("Classifying all reviews from the past %i days", since)
                backdate = datetime.datetime.now() - datetime.timedelta(since)
                query = (db.query(YelpReview)
                           .filter(YelpReview.created >= backdate)
                           .order_by(YelpReview.created.desc()))
                count = (db.query(func.count(YelpReview.created))
                           .filter(YelpReview.created >= backdate)
                           .scalar()) # instead of tuple

        logger.info("Classifying %i total reviews", count)
        start = time()
        offset = 0
        while True:
            returned = False
            try:
                with db.begin():
                    for i, review in enumerate(query.limit(yield_per).offset(offset)):
                        returned = True
                        if verbose:
                            logger.info("Classified Review #%i/%i", offset+i+1, count)
                        self.score_review(review)
                    logger.info("Commiting Predictions")
            except OperationalError:
                continue # if commit error, try again with same offeset
            offset += yield_per
            if not returned:
                break
        logger.info("Classified %i reviews in %i:%i:%i (h:m:s)", count, *sec_to_hms(time()-start))
Example #10
def get_twitter_sick_reviews(echo, search_params):
    (threshold, sortby, num_results, page, start_date, end_date) = search_params
    threshold = (1.0 / 10) * threshold ## convert to tenths

    # Reformat date params for comparison later
    if len(start_date) > 0:
        start_date = dt.strptime(start_date, "%Y-%m-%d")
    else:
        start_date = None
    if len(end_date) > 0:
        end_date = dt.strptime(end_date, "%Y-%m-%d")
    else:
        end_date = None

    ## Record whether a limit was given
    is_limit = (num_results != -1)

    # If no limit, set num_results to 100 for querying in a loop
    if not is_limit:
        num_results = 100 


    db = get_db_session(echo=echo, autoflush=False, autocommit=True)
    count = 0
    ## Construct query over all tweet documents
    with db.begin():
        all_tweets = select([documents.c.id])
        query = (db.query(Tweet).filter(Tweet.id.in_(all_tweets)).order_by(Tweet.id.asc()))

    offset = num_results * (page - 1)
    tweets = []
    ## Collect all reviews meeting query criteria
    while True:
        returned = False
        try:
            with db.begin():
                for i, review in enumerate(query.limit(num_results).offset(offset)):
                    returned = True

                    ## only proceed if review's date follows start_date, if applicable
                    if (start_date == None or (review.document.created >= start_date)):
                        ## only proceed if review's date precedes end_date, if applicable
                        if (end_date == None or (review.document.created <= end_date)):
                            
                            ## The DB doesn't yet have functionality for Twitter reviews to be
                            ## linked to businesses, but once that happens we should adapt this.
                            business = UIBusiness("Unknown business", "XXX", "XXX", "X", "twitter.com",
                                              "twitter.com")
                            tweets.append(UITwitterReview(business, review.text, review.document.fp_pred,
                                      review.document.created, "Twitter", review.user_id, review.id))
        except OperationalError:
            break

        if is_limit or (not returned):
            break
        # Otherwise, continue until there are no more reviews
        offset += 100 ## when no limit, loop on sets of 100

    ## Sort reviews
    if (sortby == "severity"):
        tweets = sorted(tweets, key=attrgetter('score'), reverse=True)
    else:
        tweets = sorted(tweets, key=attrgetter('created'))

    return tweets
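
A minimal usage sketch showing how the positional search_params tuple unpacks; the values are illustrative:

# (threshold, sortby, num_results, page, start_date, end_date)
search_params = (8, "severity", 50, 1, "2015-10-01", "2015-10-31")
sick_tweets = get_twitter_sick_reviews(echo=False, search_params=search_params)
for tweet in sick_tweets[:5]:
    print tweet.score   # 'score' is the attribute the function sorts on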