def parse_tweet(tweet):
    """Return a FullCheckIn built from `tweet`, or None if it is unusable.

    None is returned when the tweet:
      * carries no 'coordinates' entry,
      * is not located in a city listed in `cities.SHORT_KEY`,
      * contains no URL pointing to 4sq.com or swarmapp.com,
      * has a 'created_at' value that cannot be parsed with `UTC_DATE`.

    Otherwise the result is
    `FullCheckIn('', lid, '', city, loc, time, tid, uid, msg)` where `lid` is
    the first matching expanded URL and `time` is the creation time converted
    by `cities.utc_to_local`.
    """
    loc = u.get_nested(tweet, 'coordinates')
    if not loc:
        # In that case, we would have to follow the link to know whether the
        # checkin falls within our cities but that's too costly so we drop it
        # (and introduce a bias toward open sharing users I guess)
        return None
    # The 'coordinates' pair is unpacked as (longitude, latitude) while
    # find_town takes latitude first.
    lon, lat = loc['coordinates']
    city = find_town(lat, lon, CITIES_TREE)
    if not (city and city in cities.SHORT_KEY):
        return None
    tid = u.get_nested(tweet, 'id_str')
    urls = u.get_nested(tweet, ['entities', 'urls'], [])
    # Short URL of the checkin that needs to be expanded, either using the
    # bitly API or by VenueIdCrawler. Once we get the full URL, we still need
    # to request 4SQ (500 per hour) to get info.
    # A URL entity without a usable 'expanded_url' (missing or None) is
    # skipped instead of aborting the whole tweet with an exception.
    expanded_links = (url.get('expanded_url') for url in urls)
    fsq_urls = [link for link in expanded_links
                if link and ('4sq.com' in link or 'swarmapp.com' in link)]
    if not fsq_urls:
        return None
    lid = str(fsq_urls[0])
    uid = u.get_nested(tweet, ['user', 'id_str'])
    msg = u.get_nested(tweet, 'text')
    try:
        time = datetime.strptime(tweet['created_at'], UTC_DATE)
        time = cities.utc_to_local(city, time)
    except ValueError:
        # Report the offending timestamp and drop the tweet.
        print('time: {}'.format(tweet['created_at']))
        return None
    return FullCheckIn('', lid, '', city, loc, time, tid, uid, msg)
def parse_tweet(tweet):
    """Return a FullCheckIn built from `tweet`, or None if it is unusable.

    The tweet is dropped (None is returned) when it has no 'coordinates'
    entry, when `find_town` does not place it in a city present in
    `cities.SHORT_KEY`, when none of its URL entities points to 4sq.com or
    swarmapp.com, or when its 'created_at' field cannot be parsed with
    `UTC_DATE`.

    On success the result is
    `FullCheckIn('', lid, '', city, loc, time, tid, uid, msg)`, with `lid`
    being the first matching expanded URL and `time` the creation time
    converted by `cities.utc_to_local`.
    """
    loc = u.get_nested(tweet, 'coordinates')
    if not loc:
        # In that case, we would have to follow the link to know whether the
        # checkin falls within our cities but that's too costly so we drop it
        # (and introduce a bias toward open sharing users I guess)
        return None
    # 'coordinates' is unpacked as (longitude, latitude); find_town expects
    # the latitude first.
    lon, lat = loc['coordinates']
    city = find_town(lat, lon, CITIES_TREE)
    if not (city and city in cities.SHORT_KEY):
        return None
    tid = u.get_nested(tweet, 'id_str')
    urls = u.get_nested(tweet, ['entities', 'urls'], [])
    # Short URL of the checkin that needs to be expanded, either using the
    # bitly API or by VenueIdCrawler. Once we get the full URL, we still need
    # to request 4SQ (500 per hour) to get info.
    fsq_urls = []
    for url in urls:
        # Entities whose 'expanded_url' is missing or None are ignored rather
        # than raising and losing the whole tweet.
        link = url.get('expanded_url')
        if link and ('4sq.com' in link or 'swarmapp.com' in link):
            fsq_urls.append(link)
    if not fsq_urls:
        return None
    lid = str(fsq_urls[0])
    uid = u.get_nested(tweet, ['user', 'id_str'])
    msg = u.get_nested(tweet, 'text')
    try:
        time = datetime.strptime(tweet['created_at'], UTC_DATE)
        time = cities.utc_to_local(city, time)
    except ValueError:
        # Report the offending timestamp and drop the tweet.
        print('time: {}'.format(tweet['created_at']))
        return None
    return FullCheckIn('', lid, '', city, loc, time, tid, uid, msg)
continue uid, tid, x, y, t, msg, place = data # if not id_must_be_process(int(tid)): # continue lat, lon = float(x), float(y) # city = find_city(lat, lon) # assert city == find_town(lat, lon, tree) city = th.find_town(lat, lon, tree) lid = None if city is not None: lid = extract_url_from_msg(msg) stats[city] += 1 how_many += 1 tid, uid = int(tid), int(uid) t = datetime.strptime(t, '%Y-%m-%d %H:%M:%S') t = cities.utc_to_local(city, t) # to have more numerical values (but lid should be a 64bit # unsigned integer which seems to be quite complicated in # mongo) # t = timegm(t.utctimetuple()) # city = cities.INDEX[city] loc = Location('Point', [lon, lat])._asdict() # seen.append(CheckIn(tid, lid, uid, city, loc, t, place)) seen.append((tid, t, place)) if len(seen) > 2000: # save_to_mongo(seen, checkins, venues_getter) update_time_and_place(seen, checkins) seen = [] if how_many % 10000 == 0: print('1000(0) miles more') # save_var('avenues_id_new_triton', venues_getter.results)