Esempio n. 1
0
def parse_tweet(tweet):
    """Build a FullCheckIn from `tweet`, or return None if it is unusable.

    None is returned when the tweet:
      * carries no 'coordinates' (resolving the check-in link to locate it
        would be too costly, so such tweets are dropped, which biases the
        data toward users who share their location openly);
      * does not fall in a city known to `cities.SHORT_KEY`;
      * contains no Foursquare/Swarm link among its URL entities;
      * has a 'created_at' value that cannot be parsed with `UTC_DATE`.

    The returned FullCheckIn stores the first matching expanded URL as its
    `lid`; the first and third fields are left as empty strings (presumably
    filled in once the URL has been resolved -- confirm against callers).
    """
    loc = u.get_nested(tweet, 'coordinates')
    if not loc:
        return None
    # The point is unpacked as (longitude, latitude), whereas find_town
    # expects latitude first.
    lon, lat = loc['coordinates']
    city = find_town(lat, lon, CITIES_TREE)
    if not (city and city in cities.SHORT_KEY):
        return None
    tid = u.get_nested(tweet, 'id_str')
    urls = u.get_nested(tweet, ['entities', 'urls'], [])
    # Short URL of the check-in that needs to be expanded, either using the
    # bitly API or by VenueIdCrawler. Once we get the full URL, we still need
    # to request 4SQ (500 per hour) to get info.
    fsq_urls = []
    for url in urls:
        # 'expanded_url' may be missing or null on some URL entities; treat
        # those as non-matching instead of crashing on the `in` test.
        expanded = url.get('expanded_url') or ''
        if '4sq.com' in expanded or 'swarmapp.com' in expanded:
            fsq_urls.append(expanded)
    if not fsq_urls:
        return None
    lid = str(fsq_urls[0])
    uid = u.get_nested(tweet, ['user', 'id_str'])
    msg = u.get_nested(tweet, 'text')
    try:
        local_time = datetime.strptime(tweet['created_at'], UTC_DATE)
        local_time = cities.utc_to_local(city, local_time)
    except ValueError:
        print('time: {}'.format(tweet['created_at']))
        return None
    return FullCheckIn('', lid, '', city, loc, local_time, tid, uid, msg)
def parse_tweet(tweet):
    """Return a FullCheckIn built from `tweet`, or None if it must be dropped.

    A tweet is dropped (None is returned) when any of the following holds:
      * it has no 'coordinates' -- following the check-in link to find out
        whether it falls within our cities is too costly, so we accept a
        bias toward users who share their location openly;
      * the located city is empty or not a key of `cities.SHORT_KEY`;
      * none of its URL entities expands to a 4sq.com / swarmapp.com link;
      * its 'created_at' string does not match the `UTC_DATE` format.

    On success the check-in holds the first Foursquare URL (as `lid`), the
    city, the raw location, the creation time converted to the city's local
    time, and the tweet id, user id and text. The first and third
    FullCheckIn fields are passed as empty strings.
    """
    loc = u.get_nested(tweet, 'coordinates')
    if not loc:
        return None
    # Coordinates are stored longitude-first; find_town takes latitude first.
    lon, lat = loc['coordinates']
    city = find_town(lat, lon, CITIES_TREE)
    if not (city and city in cities.SHORT_KEY):
        return None
    tid = u.get_nested(tweet, 'id_str')
    urls = u.get_nested(tweet, ['entities', 'urls'], [])
    # Short URL of the check-in that needs to be expanded, either using the
    # bitly API or by VenueIdCrawler. Once we get the full URL, we still need
    # to request 4SQ (500 per hour) to get info.
    # A URL entity whose 'expanded_url' is absent or null is normalised to
    # '' so that it simply fails the substring tests below.
    expanded_urls = [url.get('expanded_url') or '' for url in urls]
    fsq_urls = [
        link for link in expanded_urls
        if '4sq.com' in link or 'swarmapp.com' in link
    ]
    if not fsq_urls:
        return None
    lid = str(fsq_urls[0])
    uid = u.get_nested(tweet, ['user', 'id_str'])
    msg = u.get_nested(tweet, 'text')
    try:
        created = datetime.strptime(tweet['created_at'], UTC_DATE)
        created = cities.utc_to_local(city, created)
    except ValueError:
        print('time: {}'.format(tweet['created_at']))
        return None
    return FullCheckIn('', lid, '', city, loc, created, tid, uid, msg)
Esempio n. 3
0
     continue
 uid, tid, x, y, t, msg, place = data
 # if not id_must_be_process(int(tid)):
 #     continue
 lat, lon = float(x), float(y)
 # city = find_city(lat, lon)
 # assert city == find_town(lat, lon, tree)
 city = th.find_town(lat, lon, tree)
 lid = None
 if city is not None:
     lid = extract_url_from_msg(msg)
     stats[city] += 1
     how_many += 1
     tid, uid = int(tid), int(uid)
     t = datetime.strptime(t, '%Y-%m-%d %H:%M:%S')
     t = cities.utc_to_local(city, t)
     # to have more numerical values (but lid should be a 64bit
     # unsigned integer which seems to be quite complicated in
     # mongo)
     # t = timegm(t.utctimetuple())
     # city = cities.INDEX[city]
     loc = Location('Point', [lon, lat])._asdict()
     # seen.append(CheckIn(tid, lid, uid, city, loc, t, place))
     seen.append((tid, t, place))
     if len(seen) > 2000:
         # save_to_mongo(seen, checkins, venues_getter)
         update_time_and_place(seen, checkins)
         seen = []
     if how_many % 10000 == 0:
         print('1000(0) miles more')
         # save_var('avenues_id_new_triton', venues_getter.results)
Esempio n. 4
0
     continue
 uid, tid, x, y, t, msg, place = data
 # if not id_must_be_process(int(tid)):
 #     continue
 lat, lon = float(x), float(y)
 # city = find_city(lat, lon)
 # assert city == find_town(lat, lon, tree)
 city = th.find_town(lat, lon, tree)
 lid = None
 if city is not None:
     lid = extract_url_from_msg(msg)
     stats[city] += 1
     how_many += 1
     tid, uid = int(tid), int(uid)
     t = datetime.strptime(t, '%Y-%m-%d %H:%M:%S')
     t = cities.utc_to_local(city, t)
     # to have more numerical values (but lid should be a 64bit
     # unsigned integer which seems to be quite complicated in
     # mongo)
     # t = timegm(t.utctimetuple())
     # city = cities.INDEX[city]
     loc = Location('Point', [lon, lat])._asdict()
     # seen.append(CheckIn(tid, lid, uid, city, loc, t, place))
     seen.append((tid, t, place))
     if len(seen) > 2000:
         # save_to_mongo(seen, checkins, venues_getter)
         update_time_and_place(seen, checkins)
         seen = []
     if how_many % 10000 == 0:
         print('1000(0) miles more')
         # save_var('avenues_id_new_triton', venues_getter.results)