def describe_venue(venues, city, depth=2, limit=None):
    """Gather some statistics about venues, aggregating categories at `depth` level."""
    query = cm.build_query(city, False, ['cat', 'likes'], limit)
    group = {'_id': '$cat', 'count': {'$sum': 1}, 'like': {'$sum': '$likes'}}
    query.extend([{'$group': group}, {'$sort': {'count': -1}}])
    res = venues.aggregate(query)['result']

    def parenting_cat(place, depth):
        """Return the category of `place`, without going beyond `depth`."""
        _, path = fsc.search_categories(place['_id'])
        if len(path) > depth:
            return fsc.CAT_TO_ID[:path[depth]]
        return fsc.CAT_TO_ID[:path[-1]]

    summary = defaultdict(lambda: (0, 0))
    nb_venues = 0
    for venue in res:
        if venue['_id'] is not None:
            cat = parenting_cat(venue, depth)
            count, like = venue['count'], venue['like']
            nb_venues += count
            summary[cat] = (summary[cat][0] + count, summary[cat][1] + like)
    for cat, stat in summary.iteritems():
        count, like = stat
        summary[cat] = (100.0*count/nb_venues, count, like)
    return OrderedDict(sorted(summary.items(),
                              key=lambda u: u[1][0], reverse=True))

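# Hedged usage sketch (not part of the original module): it assumes the same
# `cm.connect_to_db` helper, 'foursquare' database and 'venue' collection used
# in the other snippets of this file, and a city key such as 'helsinki'.
# db = cm.connect_to_db('foursquare')[0]
# stats = describe_venue(db['venue'], 'helsinki', depth=1)
# for cat, (share, count, likes) in stats.iteritems():
#     print('{}: {:.1f}% ({} venues, {} likes)'.format(cat, share, count, likes))
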
def output_checkins(city, host=cm.HOST, port=cm.PORT):
    """Write a JS array of all checkins in `city` with their hour."""
    checkins = cm.connect_to_db('foursquare', host, port)[0]['checkin']
    query = cm.build_query(city, venue=False, fields=['loc', 'time'])
    res = checkins.aggregate(query)['result']

    def format_checkin(checkin):
        """Extract location (plus jitter) and hour from checkin."""
        lng, lat = checkin['loc']['coordinates']
        hour = checkin['time'].hour
        return [lng + noise(), lat + noise(), hour]

    formatted = [str(format_checkin(c)) for c in res]
    with open(city + '_fs.js', 'w') as output:
        # NOTE: the JS variable name is hardcoded to 'helsinki_fs',
        # regardless of `city`.
        output.write('var helsinki_fs = [\n')
        output.write(',\n'.join(formatted))
        output.write('];')

def all_places_from_venue(checkins, city, converse=False):
    """Associate each venue with a list of twitter place ids (or do the `converse`)."""
    query = cm.build_query(city, fields=['lid', 'place'])
    index, values = '$lid', '$place'
    if converse:
        index, values = values, index
    query.append({'$group': {'_id': index, 'others': {'$push': values}}})
    answer = checkins.aggregate(query)['result']
    return {venue['_id']: venue['others'] for venue in answer if venue['_id']}

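# Hedged usage sketch (assumption: the 'checkin' collection and city keys
# follow the conventions of the other snippets in this file).
# checkins = cm.connect_to_db('foursquare')[0]['checkin']
# venue_to_places = all_places_from_venue(checkins, 'helsinki')
# place_to_venues = all_places_from_venue(checkins, 'helsinki', converse=True)
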
def get_users(args):
    import CommonMongo as cm
    city = args.city
    try:
        return p.load_var(city + '_users.my')
    except IOError:
        pass
    db = cm.connect_to_db('foursquare', args.host, args.port)[0]
    # First get a list of all users so far
    user_query = cm.build_query(city, venue=True, fields=['tuid'])
    group = {'$group': {'_id': '$tuid', 'checkins': {'$sum': 1}}}
    user_query.extend([group, {'$sort': {'checkins': -1}}])
    users = db.checkin.aggregate(user_query)['result']
    # Check how many users there are and the distribution of their
    # check-in counts
    # import utils as u
    # import pandas as pd
    # print(len(users))
    # infos = u.xzip(users, '_id checkins'.split())
    # df_users = pd.DataFrame(index=map(int, infos[0]),
    #                         data=dict(count=infos[1]))
    # ppl.hist(df_users.values, bins=25)
    users = OrderedDict([(_['_id'], _['checkins']) for _ in users])
    return users.keys()

def load_existing_ids(cmd_args):
    """Read checkins ids in city from disk or DB."""
    city = cmd_args.city
    if city == 'whole':
        return []
    import persistent as p
    try:
        return p.load_var(city + '_checkins_ids.my')
    except IOError:
        pass
    import CommonMongo as cm
    db = cm.connect_to_db('foursquare', cmd_args.host, cmd_args.port)[0]
    ids = {str(_['_id'])
           for _ in db.checkin.find({'city': city}, {'_id': 1})
           if not isinstance(_['_id'], long)}
    p.save_var(city + '_checkins_ids.my', ids)
    return ids

def venues_activity(checkins, city, limit=None):
    """Return time patterns of all the venues in `city`, or only the `limit`
    most visited."""
    query = cm.build_query(city, True, ['lid', 'time'], limit)
    group = {'_id': '$lid', 'count': {'$sum': 1}, 'visits': {'$push': '$time'}}
    query.insert(2, {'$group': group})
    if isinstance(limit, int) and limit > 0:
        query.insert(-1, {'$sort': {'count': -1}})
    res = checkins.aggregate(query)['result']
    hourly = []
    weekly = []
    for venue in res:
        hour, day = aggregate_visits(venue['visits'])
        hourly.append(hour)
        weekly.append(day)
    return hourly, weekly

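# Hedged usage sketch (assumption: `aggregate_visits` returns per-hour and
# per-day visit counts for each venue, as the variable names suggest).
# hourly, weekly = venues_activity(checkins, 'helsinki', limit=100)
# busiest_pattern = max(hourly, key=sum)  # hourly pattern with the most visits
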
            return res, int(t)
        if need_answer:
            max_tries -= 1
            logging.info('insisting on page {}'.format(page))
            sleep(5)
        else:
            return None, 0
    logging.warn('Error getting page {}: too many tries'.format(page))
    return None, 0


if __name__ == '__main__':
    START_OF_REQUESTS = time()
    logging.info('initial request')
    args = arguments.city_parser().parse_args()
    photos = cm.connect_to_db('world', args.host, args.port)[0]['photos']
    photos.ensure_index([('loc', cm.pymongo.GEOSPHERE),
                         ('tags', cm.pymongo.ASCENDING),
                         ('uid', cm.pymongo.ASCENDING)])
    city = args.city
    CITY = (cities.US + cities.EU)[cities.INDEX[city]]
    HINT = city
    bbox = (CITY[:2], CITY[2:])
    start_time = datetime.datetime(2014, 7, 19)
    total = higher_request(start_time, bbox, photos)
    logging.info('Saved a total of {} photos.'.format(total))
    logging.info('made {} requests.'.format(TOTAL_REQ))

#! /usr/bin/python2
# vim: set fileencoding=utf-8
import scipy.io as sio
import VenueFeature as vf
import CommonMongo as cm
import persistent as p
import cities as C

DB, CLIENT = cm.connect_to_db('foursquare')
vf.DB = DB
vf.CLIENT = CLIENT
brands = ["mcdonald's", 'starbucks']
starbucks = list(vf.DB.venue.find({'name': {'$in': ['Starbucks Coffee',
                                                    'Starbucks']}},
                                  {'city': 1}))
macdo = list(vf.DB.venue.find({'name': "McDonald's"}, {'city': 1}))
for city in C.SHORT_KEY:
    vindex = set(list(sio.loadmat(city + '_fv')['i']))
    fromdb = set([_['_id'] for _ in macdo if _['city'] == city])
    res = list(fromdb.intersection(vindex))
    p.save_var('{}_{}.my'.format(city, brands[0]), res)
    print('saved {} {} in {}'.format(len(res), brands[0], city))
    fromdb = set([_['_id'] for _ in starbucks if _['city'] == city])
    res = list(fromdb.intersection(vindex))
    p.save_var('{}_{}.my'.format(city, brands[1]), res)
    print('saved {} {} in {}'.format(len(res), brands[1], city))

from time import sleep
import TwitterAPI as twitter
from api_keys import TWITTER_CONSUMER_KEY as consumer_key
from api_keys import TWITTER_CONSUMER_SECRET as consumer_secret
from api_keys import TWITTER_ACCESS_TOKEN as access_token
from api_keys import TWITTER_ACCESS_SECRET as access_secret
import twitter_helper as th
import arguments

ARGS = arguments.tweets_parser().parse_args()
DB = None
SAVE = None
if ARGS.mongodb:
    import CommonMongo as cm
    DB = cm.connect_to_db("foursquare", ARGS.host, ARGS.port)[0]
else:
    json = th.import_json()
import CheckinAPICrawler as cac
CRAWLER = cac.CheckinAPICrawler()
from Queue import Queue
from threading import Thread
# the size of a mongo bulk insert, in multiples of the pool size
INSERT_SIZE = 7
CHECKINS_QUEUE = Queue((INSERT_SIZE + 3) * cac.BITLY_SIZE)
# There is probably a cleaner version, but it is global because if
# read_twitter_stream fails, we still want to keep an accurate count of
# tweets seen so far
NB_TWEETS = 0

"""Find similars venues for 100 location in city, save the result in DB and return matching venues that were already in DB.""" venues = answer_to_dict(venues_db.find({'city': city}, {'loc': 1})) chosen = sample(venues.items(), 500) distances = [] all_match = [] for vid, loc in chosen: similars = af.similar_venues(vid, client=client) if similars is None: continue else: print(vid, similars) venues_db.update({'_id': vid}, {'$set': {'similars': similars}}) matching = answer_to_dict(venues_db.find({'_id': {'$in': similars}}, {'loc': 1})) all_match.append(matching) distances.append([geodesic_distance(loc, sloc) for sloc in matching.itervalues()]) return chosen, distances, all_match if __name__ == '__main__': import doctest doctest.testmod() # pylint: disable=C0103 import arguments args = arguments.city_parser().parse_args() city = args.city db, client = cm.connect_to_db('foursquare', args.host, args.port) checkins = db['checkin']
def mongo_insertion():
    global TO_BE_INSERTED
    if len(TO_BE_INSERTED) == 0:
        return
    try:
        TABLE.insert(TO_BE_INSERTED, continue_on_error=True)
    except cm.pymongo.errors.DuplicateKeyError:
        pass
    except cm.pymongo.errors.OperationFailure as e:
        print(e, e.code)
    del TO_BE_INSERTED[:]


if __name__ == '__main__':
    REQ = getattr(CLIENT, REQ)
    args = arguments.city_parser().parse_args()
    db = cm.connect_to_db('foursquare', args.host, args.port)[0]
    checkins = db['checkin']
    TABLE = db[ENTITY_KIND]
    if ENTITY_KIND == 'venue':
        TABLE.ensure_index([('loc', cm.pymongo.GEOSPHERE),
                            ('city', cm.pymongo.ASCENDING),
                            ('cat', cm.pymongo.ASCENDING)])
    t = Thread(target=entities_getter, name='Query4SQ')
    t.daemon = True
    t.start()
    t = Thread(target=entities_putter, name='InsertDB')
    t.daemon = True
    t.start()
    total_entities = 0
    city = args.city
    chunker = Chunker.Chunker(foursquare.MAX_MULTI_REQUESTS)

    similars = tree.xpath(XPATH_QUERY)
    if len(similars) == 0:
        return []
    return [c.attrib["data-venueid"] for c in similars[0].iterchildren()]


if __name__ == "__main__":
    config = ConfigParser.ConfigParser()
    config.read("api_keys.cfg")
    CLIENT_ID = config.get("foursquare", "FOURSQUARE_ID2")
    CLIENT_SECRET = config.get("foursquare", "FOURSQUARE_SECRET2")
    client = foursquare.Foursquare(CLIENT_ID, CLIENT_SECRET)
    import arguments
    args = arguments.city_parser().parse_args()
    db = cm.connect_to_db("foursquare", args.host, args.port)[0]
    checkins = db["checkin"]
    # print(venue_profile(client, ''))
    # up = user_profile(client, 2355635)
    # vids = ['4a2705e6f964a52048891fe3', '4b4ad9dff964a5200b8f26e3',
    #         '40a55d80f964a52020f31ee3', '4b4ad9dff964c5200']
    # [client.venues(vid, multi=True) for vid in vids]
    # answers = list(client.multi())
    r = gather_all_entities_id(checkins, city="helsinki", limit=50)
    for b in r:
        print(b)
    # svids = ['4c4787646c379521a121cfb5', '43222200f964a5209a271fe3',
    #          '4b218c2ef964a520a83d24e3']
    # gold = [[], ['4bbd0fbb8ec3d13acea01b28', '451d2412f964a5208a3a1fe3'],
    #         ['4d72a2a9ec07548190588cbf', '4a736a23f964a52062dc1fe3',
    #          '4f2d3e99e4b056f83aecdc88', '4aa7b5e4f964a520064d20e3',

    max_values = []
    for idx, val in zip(cells, count):
        max_values = sps.add_maybe([val, [idx, idx], 0, val],
                                   max_values, 500)
    return sorted(max_values, key=lambda x: x[0], reverse=True)


if __name__ == '__main__':
    # pylint: disable=C0103
    import CommonMongo as cm
    import arguments
    import cities
    import sys
    sys.exit()
    args = arguments.city_parser().parse_args()
    city = args.city
    _, client = cm.connect_to_db('foursquare', args.host, args.port)
    # client = None
    photos_in_background = True
    k = 100
    sps.GRID_SIZE = k
    sps.MAX_SUPPORT = 200
    bbox = (cities.US + cities.EU)[cities.INDEX[city]]
    sps.BBOX = bbox
    _, _, sps.index_to_rect = sps.k_split_bbox(bbox, k)
    options = {'city': city, 'photos_background': True,
               'bbox': cities.bbox_to_polygon(bbox), 'only': False}
    top_loc, ratio = do_scan(client, city, k, options['photos_background'])
    options['ratio'] = ratio
    output_json(sps.merge_regions(top_loc), options)
    options['photos_background'] = False
    top_loc, ratio = do_scan(client, city, k, options['photos_background'])

    tags = sorted([k for k, v in entropies.items() if 2.5 <= v <= 3.01])
    save_var('mat_tag', tags)
    u = load_var('user_status')
    user_index = {k: i for i, k in enumerate(u)}

    def format_photo(p):
        user = user_index[p['uid']]
        loc = p['loc']['coordinates']
        taken = [p['taken'].weekday(), p['taken'].hour,
                 calendar.timegm(p['taken'].utctimetuple())]
        indicator = [int(t in p['ntags']) for t in tags]
        return [user] + loc + taken + indicator

    photos_feature = np.mat(tag_time(DB, tags, format_photo))
    sio.savemat('deep', {'A': scipy.sparse.csr_matrix(photos_feature)})


if __name__ == '__main__':
    import arguments
    args = arguments.city_parser().parse_args()
    city = args.city
    DB, client = cm.connect_to_db('world', args.host, args.port)
    s = clock()
    tags = supported_tags(DB, city, photos_threshold=30, users_threshold=5,
                          timespan=60)
    save_var(city + '_tag_support', tags)
    # entropies = {t[0]: period_entropy(DB, t[0]) for t in tags}
    # save_var('Hsupported', entropies)
    # get_data(DB)
    t = clock()
    print(t - s)

#! /usr/bin/python2
# vim: set fileencoding=utf-8
"""Compare tags of photos and venues."""

if __name__ == '__main__':
    # pylint: disable=C0103
    getvenue = lambda i: db.venue.find_one({'_id': i})
    import CommonMongo as cm
    db, cl = cm.connect_to_db('foursquare')
    flickr = cm.connect_to_db('world', cl)[0]
    res = db.venue.find({'city': 'paris', 'tags': {'$ne': []}}, {'tags': 1})
    venues_tags = {v['_id']: len(v['tags']) for v in res}
    fl_venue = flickr.photos.find({'venue': {'$ne': None}}, {'tags': 1})
    fl_ids = set([v['_id'] for v in fl_venue])
    matching_venue = fl_ids.intersection(venues_tags.keys())

#! /usr/bin/python2
# vim: set fileencoding=utf-8
"""Retrieve check-in tweets."""
from timeit import default_timer as clock
from time import sleep
import TwitterAPI as twitter
import twitter_helper as th
import arguments
import ConfigParser

ARGS = arguments.tweets_parser().parse_args()
DB = None
SAVE = None
if ARGS.mongodb:
    import CommonMongo as cm
    DB = cm.connect_to_db('foursquare', ARGS.host, ARGS.port)[0]
else:
    json = th.import_json()
import CheckinAPICrawler as cac
CRAWLER = cac.CheckinAPICrawler()
from Queue import Queue
from threading import Thread
# the size of a mongo bulk insert, in multiples of the pool size
INSERT_SIZE = 7
CHECKINS_QUEUE = Queue((INSERT_SIZE + 3) * cac.BITLY_SIZE)
# There is probably a cleaner version, but it is global because if
# read_twitter_stream fails, we still want to keep an accurate count of
# tweets seen so far
NB_TWEETS = 0
NUM_VALID = 0

        plt.xticks(range(24/chunk), named_ticks('day', offset, chunk))
    else:
        plt.xticks(range(7*3), named_ticks('mix'))


def get_distorsion(ak, kl, sval):
    """Compute the sum of euclidean distances from `sval` to its centroid."""
    return np.sum(np.linalg.norm(ak[kl, :] - sval, axis=1))


if __name__ == '__main__':
    # pylint: disable=C0103
    import arguments
    args = arguments.city_parser().parse_args()
    city = args.city
    DB, CLIENT = cm.connect_to_db('foursquare', args.host, args.port)
    # pylint: disable=E1101
    do_cluster = lambda val, k: cluster.kmeans2(val, k, 20, minit='points')

    def getclass(c, kl, visits):
        """Return {id: time pattern} of the venues in class `c` of `kl`."""
        return {v[0]: v[1] for v, k in zip(visits.iteritems(), kl) if k == c}

    def peek_at_class(c, kl, visits, k=15):
        """Return a table of `k` randomly chosen venues in class `c` of `kl`."""
        sample = r.sample([get_venue(i)
                           for i in getclass(c, kl, visits).keys()], k)
        return pd.DataFrame({'cat': [_[0] for _ in sample],

        for photo in self.db.find(self.query, {"tags": 1, "_id": 0}):
            yield self.dictionary.doc2bow(photo["tags"])

    def save(self):
        corpora.MmCorpus.serialize(city + "_corpus.mm", self)


def find_topics(city, client, num_topics, begin):
    dico = build_dico(city, client, begin)
    bow_photos = PhotoCorpus(city, client, dico, begin)
    bow_photos.save()
    lda = models.LdaModel(bow_photos, id2word=dico, num_topics=num_topics,
                          passes=5, iterations=500, eval_every=5)
    lda_photos = lda[bow_photos]
    print(lda.show_topics(10))
    lda.save(city + ".lda")
    return lda


if __name__ == "__main__":
    # pylint: disable=C0103
    import arguments
    args = arguments.city_parser().parse_args()
    city = args.city
    logging.basicConfig(filename=city + "_lda.log",
                        format="%(asctime)s [%(levelname)s]: %(message)s",
                        level=logging.INFO)
    db, client = cm.connect_to_db("foursquare", args.host, args.port)
    lda = find_topics(city, client, num_topics=80, begin=dt(2008, 1, 1))

    except pycurl.error as e:
        print(str(e))
        return None
    page = buf.getvalue()
    tree = etree.fromstring(page, PARSER)
    similars = tree.xpath(XPATH_QUERY)
    if len(similars) == 0:
        return []
    return [c.attrib['data-venueid'] for c in similars[0].iterchildren()]


if __name__ == '__main__':
    client = foursquare.Foursquare(CLIENT_ID, CLIENT_SECRET)
    import arguments
    args = arguments.city_parser().parse_args()
    db = cm.connect_to_db('foursquare', args.host, args.port)[0]
    checkins = db['checkin']
    # print(venue_profile(client, ''))
    # up = user_profile(client, 2355635)
    # vids = ['4a2705e6f964a52048891fe3', '4b4ad9dff964a5200b8f26e3',
    #         '40a55d80f964a52020f31ee3', '4b4ad9dff964c5200']
    # [client.venues(vid, multi=True) for vid in vids]
    # answers = list(client.multi())
    r = gather_all_entities_id(checkins, city='helsinki', limit=50)
    for b in r:
        print(b)
    # svids = ['4c4787646c379521a121cfb5', '43222200f964a5209a271fe3',
    #          '4b218c2ef964a520a83d24e3']
    # gold = [[], ['4bbd0fbb8ec3d13acea01b28', '451d2412f964a5208a3a1fe3'],
    #         ['4d72a2a9ec07548190588cbf', '4a736a23f964a52062dc1fe3',
    #          '4f2d3e99e4b056f83aecdc88', '4aa7b5e4f964a520064d20e3',
