def disc_latex(N=11):
    line = u'{} & {:.3f} & {} & {:.3f} & {} & {:.3f} \\\\'
    from rank_disc import top_discrepancy
    t = [persistent.load_var('disc/all'),
         persistent.load_var('disc/all_80'),
         persistent.load_var('disc/all_20')]
    supported = [v[0] for v in persistent.load_var('supported')]
    d = zip(*[top_discrepancy(l, supported) for l in t])
    display = lambda v: line.format(v[0][2], v[0][0], v[1][2], v[1][0],
                                    v[2][2], v[2][0])
    for v in d[:N]:
        print(display(v))
    for v in d[-N:]:
        print(display(v))
def brand_awareness(brand, src, dst):
    """For all venues of brand `brand` in `src`, return the position of the
    first matching venue of the same brand in `dst`, along with a score
    between 0 (best) and 1 (worst)."""
    res = []
    src_venues = p.load_var('{}_{}.my'.format(src['city'], brand))
    dst_venues = p.load_var('{}_{}.my'.format(dst['city'], brand))
    among = 0
    for venue in src_venues:
        _, ids, _, _, among = find_closest(venue, src, dst)
        ranks = [pos for pos, res_id in enumerate(ids)
                 if res_id in dst_venues]
        res.append((len(dst_venues), among, ranks))
    return res
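# The triples returned by brand_awareness still have to be collapsed into the
# 0 (best) to 1 (worst) score mentioned in its docstring. The exact
# normalisation is not shown in this section, so the helper below is only a
# hypothetical sketch: rank of the first same-brand venue found in `dst`,
# divided by the number of candidates `among` that were considered.
def awareness_score(triples):
    scores = []
    for nb_dst_venues, among, ranks in triples:
        if not ranks or among == 0:
            scores.append(1.0)  # no same-brand venue found at all: worst case
        else:
            scores.append(ranks[0] / float(among))
    return scores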
def build_dico(city, begin, client):
    # tags describing the photographic technique rather than the content
    # (note: the original list had a missing comma between 'hdr' and
    # 'noflash', which silently concatenated them into 'hdrnoflash')
    technical = ['blackandwhite', 'longexposure', 'lomofi', 'mobile', 'bw',
                 'cameraphone', 'lofi', 'lowlight', 'geotagged', 'xproii',
                 'nikon', 'd200', 'square', 'instagramapp', 'squareformat',
                 'iphoneography', 'iphone', 'colorvibefilter', 'tonemapped',
                 'chameleonfilter', 'hdr', 'noflash', 'photo', 'iphone4',
                 'iphone5', 'fujix100', 'd700', 'f22', 'photoshop',
                 'photography', 'pictures', 'f18', 'canonef24105mmf4lisusm',
                 'nikond90', 'nikond700']
    dictionary = corpora.Dictionary(p['tags']
                                    for p in get_tags(city, client, begin))
    dictionary.filter_extremes(no_below=30, no_above=0.4, keep_n=None)
    dictionary.compactify()
    stop_ids = [dictionary.token2id[stopword] for stopword in technical
                if stopword in dictionary.token2id]
    good_words = [_[0] for _ in p.load_var(city + '_tag_support')]
    good_ids = [dictionary.token2id[goodword] for goodword in good_words
                if goodword in dictionary.token2id]
    dictionary.filter_tokens(bad_ids=stop_ids, good_ids=good_ids)
    # remove gaps in id sequence after words that were removed
    dictionary.compactify()
    print(dictionary)
    dictionary.save(city + '_flickr.dict')
    return dictionary
def read_original_graph(filename, seed=None, balanced=False, directed=False):
    """Read a signed graph from `filename` and compute its degree sequence.
    Optionally shuffle node ids (when `seed` is an int)."""
    global DEGREES, G, EDGE_SIGN, INCONSISTENT
    DEGREES, G, EDGE_SIGN, INCONSISTENT = None, {}, {}, 0
    with open(filename) as source:
        for line in source:
            if line.startswith('#'):
                continue
            i, j, sign = [int(_) for _ in line.split()]
            if i == j:
                continue
            add_signed_edge(i, j, sign > 0, directed)
    # remove isolated vertices
    to_delete = [u for u, adj in G.items() if not adj]
    for u in to_delete:
        del G[u]
    # reindex nodes so they are sequential
    mapping = {v: i for i, v in enumerate(sorted(G.keys()))}
    G, EDGE_SIGN = reindex_nodes(G, EDGE_SIGN, mapping, directed)
    if balanced:
        import persistent
        src = DIRECTED_TO_DELETE if directed else EDGE_TO_DELETE
        to_delete = persistent.load_var(src[filename])
        for edge in to_delete:
            remove_signed_edge(*edge, directed=directed)
    if isinstance(seed, int):
        r.seed(seed)
        rperm = list(G.keys())
        r.shuffle(rperm)
        rperm = {i: v for i, v in enumerate(rperm)}
        G, EDGE_SIGN = reindex_nodes(G, EDGE_SIGN, rperm, directed)
    DEGREES = sorted(((node, len(adj)) for node, adj in G.items()),
                     key=lambda x: x[1])
def load_graph(name, size=None):
    is_triangle = False
    if name.endswith('.my'):
        G, E = persistent.load_var(name)
        if name.find('triangle') < 0:
            return G, E
        else:
            is_triangle = True
            cexp.redensify.G = G
            cexp.redensify.N = len(G)
            cexp.redensify.EDGES_SIGN = E
    if name.endswith('.pbm'):
        return build_graph(*read_img(name))
    assert is_triangle or name in ['PA', 'grid']
    assert is_triangle or size > 10
    if name == 'PA':
        cexp.fast_preferential_attachment(size, 3, .13)
    if name == 'grid':
        G, E_keys = gs.make_grid(size)
        cexp.redensify.G = G
        cexp.redensify.N = len(G)
        cexp.redensify.EDGES_SIGN = {e: True for e in E_keys}
    n = cexp.redensify.N
    nb_cluster = int(2*sqrt(n))
    ci = cexp.turn_into_signed_graph_by_propagation(nb_cluster,
                                                    infected_fraction=.9)
    G = deepcopy(cexp.redensify.G)
    E = deepcopy(cexp.redensify.EDGES_SIGN)
    merge_into_2_clusters(E, ci)
    return G, E
def global_info(city, standalone=False):
    """Gather global statistics about `city`."""
    lvenues = geo_project(city, DB.venue.find({'city': city}, {'loc': 1}))
    lcheckins = geo_project(city, DB.checkin.find({'city': city}, {'loc': 1}))
    lphotos = geo_project(city, CLIENT.world.photos.find({'hint': city},
                                                         {'loc': 1}))
    local_projection = [lvenues, lcheckins, lphotos]
    visits = xp.get_visits(CLIENT, xp.Entity.venue, city)
    visitors = xp.get_visitors(CLIENT, city)
    density = estimate_density(city)
    activity = [visits, visitors, density]
    global TOP_CATS
    TOP_CATS = p.load_var('top_cats.my')
    infos = {'venue': [] if standalone else ['cat', 'cats'],
             'photo': ['taken'] if standalone else ['venue']}
    svenues = s.Surrounding(DB.venue, {'city': city}, infos['venue'], lvenues)
    scheckins = s.Surrounding(DB.checkin, {'city': city}, ['time'], lcheckins)
    sphotos = s.Surrounding(CLIENT.world.photos, {'hint': city},
                            infos['photo'], lphotos)
    surroundings = [svenues, scheckins, sphotos]
    p.save_var('{}_s{}s.my'.format(city, 'venue'), svenues)
    if standalone:
        for name, var in zip(['venue', 'checkin', 'photo'], surroundings):
            p.save_var('{}_s{}s.my'.format(city, name), var)
    return local_projection + activity + surroundings
def add_cc_noise(noise, balanced=False, seed=None):
    global bfstrees
    redensify.G = deepcopy(orig_g)
    redensify.EDGES_SIGN = deepcopy(orig_es)
    if balanced and SYNTHETIC_DATA:
        to_delete = persistent.load_var(BASENAME+'_balance.my')
        for edge in to_delete:
            pot.delete_edge(redensify.G, edge, redensify.EDGES_SIGN)
    cexp.add_noise(noise/100, noise/100)
    if seed is not None:
        random.seed(seed)
    rperm = list(redensify.G.keys())
    random.shuffle(rperm)
    rperm = {i: v for i, v in enumerate(rperm)}
    _ = rw.reindex_nodes(redensify.G, redensify.EDGES_SIGN, rperm)
    redensify.G, redensify.EDGES_SIGN = _
    rw.G = deepcopy(redensify.G)
    rw.EDGE_SIGN = deepcopy(redensify.EDGES_SIGN)
    rw.DEGREES = sorted(((node, len(adj)) for node, adj in rw.G.items()),
                        key=lambda x: x[1])
    if not bfstrees:
        if SYNTHETIC_DATA:
            bfstrees = [t[1] for t in nsi.compute_trees()]
        else:
            bfstrees = []
            for root in (u[0] for u in rw.DEGREES[-50:]):
                bfstrees.append(pot.get_bfs_tree(rw.G, root))
    return get_largest_component()
def increase_coverage(upto=5000):
    """Save `upto` unprocessed San Francisco tags."""
    from more_query import get_top_tags
    sup = persistent.load_var('supported')
    more = get_top_tags(upto, 'nsf_tag.dat')
    already = [v[0] for v in sup]
    addition = set(more).difference(set(already))
    persistent.save_var('addition', addition)
def get_data(DB):
    entropies = load_var('Hsupported')
    tags = sorted([k for k, v in entropies.items() if 2.5 <= v <= 3.01])
    save_var('mat_tag', tags)
    u = load_var('user_status')
    user_index = {k: i for i, k in enumerate(u)}

    def format_photo(p):
        user = user_index[p['uid']]
        loc = p['loc']['coordinates']
        taken = [p['taken'].weekday(), p['taken'].hour,
                 calendar.timegm(p['taken'].utctimetuple())]
        indicator = [int(t in p['ntags']) for t in tags]
        return [user] + loc + taken + indicator

    photos_feature = np.mat(tag_time(DB, tags, format_photo))
    sio.savemat('deep', {'A': scipy.sparse.csr_matrix(photos_feature)})
def photos_to_heat_dataset(city, precision=4, limit=300):
    photos = load_var(city)
    points = Counter([(round(p[0], precision), round(p[1], precision))
                      for p in photos])
    maxi = points.most_common(1)[0][1]
    dataset = [{"lat": p[1], "lon": p[0], "value": c}
               for p, c in points.most_common(limit)]
    json_dataset = json.dumps({"max": maxi, "data": dataset})
    with open(city + ".js", "w") as f:
        f.write("var {} = {}".format(city, json_dataset))
def insert_pending_task(conn, end, pending_file):
    try:
        pending_task = load_var(pending_file)
        pending_task['end'] = end
        pending_task['e_offset'] = date_offset(pending_task['end'])
        insert_task(conn, pending_task)
        os.remove(pending_file)
    except IOError:
        pass
def load_surroundings(city):
    """Load projected coordinates and extra field of all venues, checkins and
    photos within `city`, as well as returning city geographical bounds."""
    import persistent as p
    surroundings = [p.load_var('{}_svenues.my'.format(city)), None, None]
    # surroundings = [p.load_var('{}_s{}s.my'.format(city, kind))
    #                 for kind in ['venue', 'checkin', 'photo']]
    venues_pos = np.vstack(surroundings[0].loc)
    city_extent = list(np.min(venues_pos, 0)) + list(np.max(venues_pos, 0))
    return surroundings, city_extent
def get_best_tags(point):
    tags = persistent.load_var(u'disc/all_{}'.format(GRID_SIZE))
    res = []
    size = point.area
    for tag, polys in tags.items():
        for val, poly in polys:
            if point.intersects(poly) and \
               point.intersection(poly).area > .6*size:
                res.append((tag, val))
                break
    return sorted(res, key=lambda x: x[1], reverse=True)
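# Toy illustration of the "more than 60% of the cell is covered" criterion
# used by get_best_tags. It assumes the stored polygons are shapely
# geometries, which is what the intersects/intersection calls above imply.
from shapely.geometry import box

cell = box(0, 0, 1, 1)                    # the query `point` (a grid cell)
region = box(0.5, 0, 2, 1)                # a candidate tag region
covered = cell.intersection(region).area  # 0.5: only half the cell is covered
print(covered > .6 * cell.area)           # False, so this tag would be skipped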
def load_graph(seed=None):
    if BASENAME.startswith('soc'):
        rw.read_original_graph(BASENAME, seed=seed, balanced=BALANCED)
        redensify.G = deepcopy(rw.G)
        redensify.EDGES_SIGN = deepcopy(rw.EDGE_SIGN)
    elif DATA == 'LP':
        _ = persistent.load_var(BASENAME+'.my')
        redensify.G, redensify.EDGES_SIGN = _
        return
    else:
        G = gt.load_graph(BASENAME+'.gt')
        cexp.to_python_graph(G)
def get_user_status(with_count=False):
    name = 'user_status' + ('_full' if with_count else '')
    fields = {'tourist': 1}
    if with_count:
        fields.update({'count': 1})
    try:
        d = load_var(name)
    except IOError:
        users = list(DB.users.find(fields=fields))
        if with_count:
            d = dict([(u['_id'], (u['count'], u['tourist'])) for u in users])
        else:
            d = dict([(u['_id'], u['tourist']) for u in users])
        save_var(name, d)
    return d
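# get_user_status follows a load-or-compute pattern (try the cached variable,
# fall back to the database, then cache the result) that several functions in
# this section repeat. A hypothetical helper factoring it out could look like
# this, assuming load_var/save_var behave as they are used above:
def cached(name, compute):
    try:
        return load_var(name)
    except IOError:
        value = compute()
        save_var(name, value)
        return value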
def load_real_graph(dataset='citeseer', main_class=None):
    default_main_class = {'citeseer': 1, 'cora': 2, 'pubmed_core': 1,
                          'usps4500': 4, 'rcv1': 2, 'imdb': 0}
    main_class = (main_class if main_class is not None
                  else default_main_class[dataset])
    ew, y = persistent.load_var('{}_lcc.my'.format(dataset))
    adj = {}
    for u, v in ew:
        add_edge(adj, u, v)
    gold = {i: 1 if v == main_class else -1 for i, v in enumerate(y)}
    return adj, ew, gold, compute_phi(ew, gold)
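# `add_edge` is defined elsewhere; given how `adj` is built and returned here,
# it presumably fills an undirected adjacency structure along these lines
# (a minimal sketch, not the original implementation):
def add_edge(adj, u, v):
    adj.setdefault(u, set()).add(v)
    adj.setdefault(v, set()).add(u)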
def describe_city(city):
    """Compute feature vector for selected venue in `city`."""
    CATS2 = p.load_var('cat_depth_2.my')
    # a few venues don't have level 2 categories (TODO add it manually?)
    CATS2.update({cat: int(idx*1e5) for idx, cat in enumerate(CATS)})
    info = global_info(city)
    lvenues, lcheckins, lphotos = info[:3]
    visits, visitors, density = info[3:6]
    nb_visitors = np.unique(np.array([v for place in visitors.itervalues()
                                      for v in place])).size
    svenues, scheckins, sphotos = info[6:]
    categories = categories_repartition(city, svenues, lvenues, RADIUS)
    venues = DB.venue.find({'city': city, 'closed': {'$ne': True},
                            'cat': {'$ne': None}, 'usersCount': {'$gt': 1}},
                           {'cat': 1})
    chosen = [v['_id'] for v in venues
              if len(visits.get(v['_id'], [])) > 4 and
              len(np.unique(visitors.get(v['_id'], []))) > 1 and
              not is_event(v['cat'])]
    print("Chosen {} venues in {}.".format(len(chosen), city))
    info, _ = venues_info(chosen, visits, visitors, density, depth=2,
                          tags_freq=False)
    print("{} of them will be in the matrix.".format(len(info)))
    numeric = np.zeros((len(info), 31), dtype=np.float32)
    numeric[:, :5] = np.array([info['likes'], info['users'],
                               info['checkins'], info['H'], info['Den']]).T
    print('venues with no level 2 category:')
    print([info.index[i] for i, c in enumerate(info['cat'])
           if CATS2[c] % int(1e5) == 0])
    numeric[:, 5] = [CATS2[c] for c in info['cat']]
    numeric[:, 24] = np.array(info['Ht'])
    for idx, vid in enumerate(info.index):
        surrounding = full_surrounding(vid, lvenues, lphotos, lcheckins,
                                       svenues, scheckins, sphotos, city)
        cat, focus, ratio, around_visits = surrounding
        numeric[idx, 6:15] = cat
        numeric[idx, 15] = focus
        numeric[idx, 16] = ratio
        own_visits = visits[vid]
        numeric[idx, 17] = is_week_end_place(own_visits)
        daily_visits = xp.aggregate_visits(own_visits, 1, 4)[0]
        numeric[idx, 18:24] = xp.to_frequency(daily_visits)
        numeric[idx, 25:31] = xp.to_frequency(around_visits)
    weird = np.argwhere(np.logical_or(np.isnan(numeric), np.isinf(numeric)))
    numeric[weird] = 0.0
    sio.savemat(city+'_fv', {'v': numeric, 'c': categories,
                             'i': np.array(list(info.index)),
                             'stat': [nb_visitors]}, do_compression=True)
def get_categories(client=None):
    """Return categories list from disk or from Foursquare website using
    client."""
    if client is None:
        raw_cats = p.load_var('raw_categories')['categories']
    else:
        raw_cats = client.venues.categories()
        p.save_var('raw_categories', raw_cats)
        raw_cats = raw_cats['categories']
    cats = Category('1', 'Venue', 0, parse_categories(raw_cats))
    # pylint: disable=E1101
    id_index = [(id_, idx + 100)
                for idx, id_ in enumerate(sorted(CAT_TO_ID.values()))
                if id_ not in ['0', '1']]
    ID_TO_INDEX.update(id_index)
    return cats
def load_existing_ids(cmd_args):
    """Read checkins ids in city from disk or DB."""
    city = cmd_args.city
    if city == 'whole':
        return []
    import persistent as p
    try:
        return p.load_var(city+'_checkins_ids.my')
    except IOError:
        pass
    import CommonMongo as cm
    db = cm.connect_to_db('foursquare', cmd_args.host, cmd_args.port)[0]
    ids = {str(_['_id'])
           for _ in db.checkin.find({'city': city}, {'_id': 1})
           if not isinstance(_['_id'], long)}
    p.save_var(city+'_checkins_ids.my', ids)
    return ids
def load_data(self, dataset, balanced=False, small_wiki=False):
    timestamp = 'ts' in dataset
    if small_wiki:
        Gfull, E = p.load_var('small_wiki.my')
    elif balanced:
        l.rw.read_original_graph(lp.FILENAMES[dataset], directed=True,
                                 balanced=balanced)
        # conflicting = set()
        # for (u, v), s in l.rw.EDGE_SIGN.items():
        #     opposite_sign = l.rw.EDGE_SIGN.get((v, u))
        #     if opposite_sign is not None and s != opposite_sign:
        #         conflicting.add(tuple(sorted([u, v])))
        # msg = 'Number of conflicting edges in {}: {}'
        # print(msg.format(dataset, 2*len(conflicting)))
        # for u, v in conflicting:
        #     l.rw.remove_signed_edge(u, v, directed=True)
        #     l.rw.remove_signed_edge(v, u, directed=True)
        Gfull, E = l.rw.G, l.rw.EDGE_SIGN
    else:
        pack_name = 'directed_{}.pack'.format(dataset)
        if timestamp:
            order_name = 'directed_{}_order.pack'.format(dataset)
            with open(order_name, 'r+b') as packfile:
                self.time_order = msgpack.unpack(packfile, use_list=False)
        Gfull, E = load_directed_signed_graph(pack_name)
    # root = max(Gfull.items(), key=lambda x: len(x[1]))[0]
    # Gbfs, _, _ = initial_spanning_tree(Gfull, root)
    # self.lcc = set(Gbfs.keys())
    self.order = len(Gfull)
    self.dout, self.din = defaultdict(int), defaultdict(int)
    self.common_nei = {e: Gfull[e[0]].intersection(Gfull[e[1]]) for e in E}
    self.Gout, self.Gin = {}, {}
    self.edge_order, in_lcc = {}, []
    for i, (u, v) in enumerate(sorted(E)):
        self.edge_order[(u, v)] = i
        # in_lcc.append(u in self.lcc and v in self.lcc)
        self.dout[u] += 1
        self.din[v] += 1
        l.add_neighbor(u, v, self.Gout)
        l.add_neighbor(v, u, self.Gin)
    self.reciprocal = {ei: self.edge_order[(e[1], e[0])]
                       for e, ei in self.edge_order.items()
                       if (e[1], e[0]) in E}
    # self.in_lcc = np.array(in_lcc, dtype=bool)
    self.Gfull = Gfull
    self.G = self.Gout
    self.E = E
def load_data(city):
    features = cn.load_matrix(city + '_fv.mat')
    density = features['v'][:, 4]
    weights = density + np.abs(density.min())
    venues_generator = WeightedRandomGenerator(weights)
    vids, _, locs = p.load_var(city + '_svenues.my').all()
    vindex = features['i']
    venues = np.zeros((len(vindex), 2))
    index = dict(itertools.imap(lambda x: (x[1], x[0]), enumerate(vindex)))
    for vid, loc in itertools.izip(vids, locs):
        pos = index.get(vid)
        if pos is not None:
            venues[pos, :] = loc
    kdtree = cKDTree(venues)
    with open('static/ground_truth.json') as infile:
        gold_list = json.load(infile)
    return vindex, venues_generator, venues, kdtree, gold_list
def get_graph(balanced=False):
    """Load the graph from BASENAME and optionally remove positive edges to
    balance the graph.
    NOTE: this only modifies the redensify structure, not graph_tool & its
    distance matrix."""
    if balanced:
        import persistent
    if os.path.isfile(BASENAME+'.gt'):
        g = graph_tool.load_graph(BASENAME+'.gt')
        dst_mat = np.load(BASENAME+'_dst.npy')
        cexp.to_python_graph(g)
        if balanced:
            to_delete = persistent.load_var(BASENAME+'_balance.my')
            for edge in to_delete:
                pot.delete_edge(redensify.G, edge, redensify.EDGES_SIGN)
        return g, dst_mat
    if not PA:
        cexp.random_signed_communities(2, 500, 13, 11.5/500, .0, .0)
        g = cexp.to_graph_tool()
    else:
        cexp.preferential_attachment(1000, gamma=1.4, m=12)
        cexp.turn_into_signed_graph_by_propagation(2)
        DEGREES = sorted(((node, len(adj))
                          for node, adj in cexp.redensify.G.items()),
                         key=lambda x: x[1])
        u, v = DEGREES[-1][0], DEGREES[-2][0]
        # original had `u, v = v, u if u > v else u, v`, which is a 3-tuple
        # and raises ValueError; the intended swap needs parentheses
        u, v = (v, u) if u > v else (u, v)
        del cexp.redensify.EDGES_SIGN[(u, v)]
        cexp.redensify.G[u].remove(v)
        cexp.redensify.G[v].remove(u)
    n = g.num_vertices()
    dst = shortest_distance(g, dense=False)
    dst_mat = np.zeros((n, n), dtype=np.uint8)
    for v in g.vertices():
        dst_mat[int(v), :] = dst[v].a.astype(np.uint8)
    g.save(BASENAME+'.gt')
    np.save(BASENAME+'_dst', dst_mat)
args = parser.parse_args()
mapbox_service = mapbox.Static()
mapbox_service.session.params['access_token'] = os.environ['MAPBOX_ACCESS_TOKEN']
# alternatively, you can uncomment the next line and pass your token directly
# mapbox_service.session.params['access_token'] = 'YOUR_TOKEN'
mainCats = ['Arts & Entertainment', 'College & University', 'Food',
            'Nightlife Spot', 'Outdoors & Recreation', 'Shop & Service',
            'Professional & Other Places', 'Residence', 'Travel & Transport']
main_cats_plot = mainCats[:]
mainCats = {c: i for i, c in enumerate(mainCats)}
mainCats['Event'] = 9
name_to_main_cat = p.load_var('name_to_cat.my')
name_to_main_cat['Event'] = 'Event'
name_to_main_cat['NOPRIMCAT'] = 'Event'
name_to_main_cat['Tennis'] = 'Outdoors & Recreation'
timeOfDay = ['MORNING', 'NOON', 'AFTERNOON', 'EVENING', 'NIGHT', 'LATENIGHT']
timeOfDay_plot = ['MORN', 'NOON', 'AFNN', 'EVEN', 'NGHT', 'LATE']
dayOfWeek = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
             'Saturday', 'Sunday']
dayOfWeek_plot = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
city = args.city
FOLDER = args.folder
prefix = FOLDER + city
# filenames
    if b < 1e-5:
        theta = 0 if a < c else np.pi/2
    else:
        theta = .5*arccot((a-c)/(2*b))
        if a > c:
            theta += np.pi/2
    R = np.array([[np.cos(theta), -np.sin(theta)],
                  [np.sin(theta), np.cos(theta)]])
    S = np.diag([_a, _b])
    T = R.dot(S)
    return (x0, y0), .5*T.dot(T.T)


if __name__ == '__main__':
    city, name = "paris", "test"
    model_prefix_1 = "sandbox/{}".format(city)
    m1 = p.load_var(model_prefix_1 + ".mdl")
    model_parameters_1 = m1.get_params()
    scaler_1 = p.load(model_prefix_1 + ".scaler")
    centers_1 = model_parameters_1.topic_centers
    covars_1 = model_parameters_1.topic_covar
    for i, (center, cov) in enumerate(zip(centers_1, covars_1)):
        json_poly = gaussian_to_poly(center, cov, city, "test", scaler_1,
                                     stddev=1.41, resolution=17)
        points = scaler_1.transform(
            np.array(json_poly['geometry']['coordinates'][0]))
        points += 0.0002*np.random.randn(*points.shape)
        print('{}: original (center & covariance)\n{}\n{}'.format(i, center,
                                                                  cov))
        ncenter, ncov = poly_to_gaussian(points)
        print('recovered (center & covariance)\n{}\n{}'.format(ncenter, ncov))
        print(json.dumps(json_poly))
        print('paste that on http://geojsonlint.com/')
t.daemon = True
t.start()
t = Thread(target=entities_putter, name='InsertDB')
t.daemon = True
t.start()
total_entities = 0
city = args.city
chunker = Chunker.Chunker(foursquare.MAX_MULTI_REQUESTS)
previous = [e['_id'] for e in TABLE.find({'city': city})]
potential = gather_all_entities_id(checkins, DB_FIELD, city=city)
print('but already {} {}s in DB.'.format(len(previous), ENTITY_KIND))
import persistent as p
region = city or 'world'
invalid_filename = 'non_{}_id_{}'.format(ENTITY_KIND, region)
try:
    INVALID_ID = p.load_var(invalid_filename)
except IOError:
    pass
print('and {} {}s are invalid.'.format(len(INVALID_ID), ENTITY_KIND))
new_ones = set(potential).difference(set(previous))
new_ones = new_ones.difference(set(INVALID_ID))
outside = set([e['_id'] for e in TABLE.find({'city': None}, {'_id': 1})])
outside.intersection_update(new_ones)
print('and {} {}s are outside range.'.format(len(outside), ENTITY_KIND))
new_ones = new_ones.difference(outside)
print('So only {} new ones.'.format(len(new_ones)))
for batch in chunker(new_ones):
    IDS_QUEUE.put(batch)
    total_entities += len(batch)
IDS_QUEUE.join()
#! /usr/bin/python2
# vim: set fileencoding=utf-8
import persistent
import numpy as np

u = persistent.load_var('user_status_full')
t = np.array([p[0] for p in u.values() if p[1]])
l = np.array([p[0] for p in u.values() if not p[1]])
photos = sum(t) + sum(l)
print('tourists proportion: {}%'.format(100*len(t)/(len(t) + len(l))))
print("tourists' photos proportion: {}%".format(100*sum(t)/photos))
print('tourist 90 percentile: {}'.format(np.percentile(t, 90)))
print('local 90 percentile: {}'.format(np.percentile(l, 90)))
def photos_to_cluster_dataset(city, limit=300):
    photos = load_var(city)
    points = [[p[0] + noise(), p[1] + noise(), 'Win!']
              for p in photos[:limit]]
    with open(city + '_cluster.js', 'w') as f:
        f.write('var {}_cluster = {}'.format(city, str(points)))
            precision) / (beta * beta * precision + recall)


def point_inside_poly(poly, point):
    """Tell whether `point` is inside convex `poly`, based on a cross product
    with every edge:
    demonstrations.wolfram.com/AnEfficientTestForAPointToBeInAConvexPolygon/
    """
    tpoly = poly - point
    size = tpoly.shape[0] - 1
    angles = tpoly[1:, 0] * tpoly[:size, 1] - tpoly[:size, 0] * tpoly[1:, 1]
    return int(np.abs(np.sign(angles).sum())) == size


# load venues location for all cities
cities_venues_raw = {name: p.load_var(name + '_svenues.my')
                     for name in cities}
cities_venues = {}
cities_index = {}
cities_kdtree = {}
for city in cities:
    vids, _, locs = cities_venues_raw[city].all()
    vindex = cities_desc[city]['index']
    cities_venues[city] = np.zeros((len(vindex), 2))
    cities_index[city] = dict(
        itertools.imap(lambda x: (x[1], x[0]), enumerate(vindex)))
    for vid, loc in itertools.izip(vids, locs):
        pos = cities_index[city].get(vid)
        if pos is not None:
            cities_venues[city][pos, :] = loc
    cities_kdtree[city] = cKDTree(cities_venues[city])
gray = '#bdbdbd'
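# Quick sanity check of point_inside_poly (defined above) on a toy square.
# This assumes the ring is closed (first vertex repeated at the end), as in
# GeoJSON polygon rings; `point` broadcasts against the (n, 2) vertex array.
square = np.array([[0, 0], [1, 0], [1, 1], [0, 1], [0, 0]], dtype=float)
print(point_inside_poly(square, np.array([0.5, 0.5])))  # True: inside
print(point_inside_poly(square, np.array([1.5, 0.5])))  # False: outside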
"""update the checkin in DB""" while True: tid, lid = self.queue.get() self.new_venues.append(lid) try: self.checkinDB.update({'_id': tid}, {'$set': {'lid': lid}}) except (KeyboardInterrupt, SystemExit): raise except: print(sys.exc_info()[1]) self.queue.task_done() def save_new_lid(self): """save these new lid because we need to build their profile""" region = 'world' if self.city is None else self.city id_ = str(hash(self.cids[0]))[:5] output = 'new_venue_id_{}_{}'.format(id_, region) p.save_var(output, set(self.new_venues)) if __name__ == '__main__': city = 'chicago' checkin_ids = [ u for u in p.load_var('non_venue_id_' + city) if len(u) == 24 and u.startswith('4') ] mongo_client = pymongo.MongoClient('localhost', 27017) db = mongo_client['foursquare'] cc = CheckinCorrector(checkin_ids, db['checkin'], 'xaa', city) cc.correct()
#! /usr/bin/python2
# vim: set fileencoding=utf-8
from operator import itemgetter
from datetime import datetime
import CommonMongo as cm
import csv
import persistent
import pyhash

TO_BE_INSERTED = []
HASHER = pyhash.spooky_128()
VENUE_LOC = persistent.load_var('venue_loc.my')
TRAD = {}
with open('trad.dat', 'r') as f:
    for line in f:
        old, new = line.strip().split(';')
        TRAD[old] = new


def reformat(line_dict):
    vid = line_dict['vid']
    if vid in TRAD:
        vid = TRAD[vid]
    if vid not in VENUE_LOC:
        return None
    if line_dict['_id'] == 'ICWSM':
        txt = ''.join(itemgetter('uid', 'vid', 'time')(line_dict))
        line_dict['_id'] = hex(HASHER(txt))[2:-1]
    line_dict['uid'] = int(line_dict['uid'])
    line_dict['loc'], line_dict['city'] = VENUE_LOC[vid]
    line_dict['time'] = datetime.strptime(line_dict['time'],
                                          '%Y-%m-%dT%H:%M:%SZ')
        for k, v in venue_to_place.iteritems()
        if k in bijective_venue})
    return common_map


def update_checkins(checkins, cmap):
    """Use the mapping to update venue id of checkins."""
    missing = checkins.find({'lid': None}, {'_id': 1, 'place': 1})
    total, corrected = 0, 0
    for checkin in missing:
        total += 1
        _id, place = checkin['_id'], checkin.get('place', None)
        if place and place in cmap:
            try:
                checkins.update({'_id': _id},
                                {'$set': {'lid': cmap[place]}})
                corrected += 1
            except cm.pymongo.errors.OperationFailure as err:
                print(err, err.coderr)
    print('correct {} out of {} checkins'.format(corrected, total))


if __name__ == '__main__':
    # pylint: disable=C0103
    import persistent as p
    import arguments
    args = arguments.city_parser().parse_args()
    db = cm.connect_to_db('foursquare', args.host, args.port)[0]
    # cmap = build_map(db['checkin'])
    # p.save_var('place_to_venue', cmap)
    update_checkins(db['checkin'], p.load_var('place_to_venue'))
def consolidate(tags):
    d = {tag: persistent.load_var(u'disc/post_{}_{}'.format(tag, GRID_SIZE))
         for tag in tags}
    persistent.save_var(u'disc/all_{}'.format(GRID_SIZE), d)
                                             self.fields)
        return neighbors_ids, extra, neighbors_locs


if __name__ == '__main__':
    # pylint: disable=C0103
    from timeit import default_timer as clock
    import CommonMongo as cm
    import random as r
    import arguments
    args = arguments.city_parser().parse_args()
    city = args.city
    radius = 350
    DB, CLIENT = cm.connect_to_db('foursquare', args.host, args.port)
    import persistent as p
    lvenues = p.load_var(city + '_lvenues.my')
    svenues = Surrounding(DB.venue, {'city': city}, 'cat cats'.split(),
                          lvenues)
    test_ids = r.sample(lvenues.keys(), 35)
    start = clock()
    for vid in test_ids:
        me = DB.venue.find_one({'_id': vid}, {'loc': 1, 'city': 1})
        ball = {'$geometry': me['loc'], '$maxDistance': radius}
        neighbors = DB.venue.find({'city': city, 'loc': {'$near': ball}},
                                  {'cat': 1, 'cats': 1,
def main():
    import persistent as p
    args, parser = parse_args()
    # Get current time to use it as a filename for output files
    filename_prefix = "data/" + args.description
    # filename_prefix = datetime.today().strftime("%d-%m-%Y-%H.%M.%S")
    if args.city:
        external = args.external or str(args.k_min)
        city = args.city
        filename_prefix = '_'.join([city, external, str(args.n_components)])
        filename_prefix = 'comparisons/' + filename_prefix
        args.query = '{{"bboxCity": "{}"}}'.format(args.city)
    # connect to mongo, load and standardize data
    db = get_mongo_database_with_auth(args.dbhost, args.dbport, args.dbname,
                                      args.username, args.password)
    # TODO: Get this from command line
    venue_extractors = [io.venue_primary_category_extractor]
    checkin_extractors = [io.checkin_time_extractor_hard,
                          io.checkin_user_extractor,
                          io.checkin_day_extractor]
    data, scaler = io.load_data_mongo(db[args.venuecoll],
                                      db[args.checkincoll], args.query,
                                      venue_extractors, checkin_extractors,
                                      filename_prefix, args.n_components,
                                      args.venue_threshold)
    # Split into train and test
    train, test = io.split_train_test_with_common_vocabulary(data,
                                                             test_size=0.2)
    print("Loaded {0} ({1} train, {2} test) data points.".format(
        data["coordinates"].shape[0], train["coordinates"].shape[0],
        test["coordinates"].shape[0]), file=sys.stderr)
    # set centers of topics
    initial_topic_centers = None
    initial_topic_covar = None
    if args.external:
        initial_topic_centers, initial_topic_covar = \
            p.load_var('comparisons/{}_{}.preset'.format(city, args.external))
    # Run EM n times
    best_train_likelihood = -1 * np.inf
    best_test_likelihood = -1 * np.inf
    best_k = None
    best_lambda = None
    best_model = None
    lambda_list = args.lambdas
    k_list = range(args.k_min, 1 + args.k_max, args.k_step)
    train_likelihood_across_k = -np.inf * np.ones((len(lambda_list),
                                                   len(k_list)))
    test_likelihood_across_k = -np.inf * np.ones((len(lambda_list),
                                                  len(k_list)))
    track_params = args.trackparams
    if args.plot:
        likelihood_fig = plt.figure()
    if initial_topic_centers is not None:
        k_list = [len(initial_topic_centers)]
    for lidx, Lambda in enumerate(lambda_list):
        for kidx, num_topics in enumerate(k_list):
            print("\n====== lambda = {0}, k = {1} ======\n\n".format(
                Lambda, num_topics), file=sys.stderr)
            # n_jobs=-2 -> Leave only one logical core unused
            models = Parallel(n_jobs=-2, backend="threading")(
                delayed(run)(train, Lambda, num_topics, i, args,
                             initial_topic_centers, initial_topic_covar,
                             track_params)
                for i in range(args.runs))
            # TODO remove this or add command line option
            # Swap to this for serial processing
            # models = [run(train, Lambda, num_topics, i, args,
            #               initial_topic_centers, initial_topic_covar,
            #               track_params)
            #           for i in range(args.runs)]
            best_model_index_for_parameters = np.argmax(
                [model.latest_statistics.likelihood for model in models])
            best_model_in_k = models[best_model_index_for_parameters]
            train_likelihood_across_k[lidx][kidx] = \
                best_model_in_k.latest_statistics.likelihood
            test_likelihood_for_parameters = \
                best_model_in_k.predict_log_probs(test)
            test_likelihood_across_k[lidx][kidx] = \
                test_likelihood_for_parameters
            if test_likelihood_for_parameters > best_test_likelihood:
                best_train_likelihood = \
                    best_model_in_k.latest_statistics.likelihood
                best_test_likelihood = test_likelihood_for_parameters
                best_k = num_topics
                best_model = best_model_in_k
            gc.collect()
    print("Results of the best model:\n", file=sys.stderr)
    print_stuff(data["unigrams"], best_model.get_params())
    print("Best train likelihood: {0}\n".format(best_train_likelihood),
          file=sys.stderr)
    print("Best test likelihood: {0}\n".format(best_test_likelihood),
          file=sys.stderr)
    print("PROB VS VARIATIONAL")
    print(best_model.predict_log_probs(test))
    print(best_model.predict_log_probs_variational(test))
    if args.save:
        query = "synthetic"
        try:
            if args.query:
                query = args.query
        except:
            pass
        io.save_model(best_model, scaler, query, data["unigrams"],
                      filename_prefix)
    # PLOTS
    if args.plot:
        x_plot_num = 1
        y_plot_num = 1
        if len(k_list) > 1:
            plotting.plot_across_lambda_and_k(
                lambda_list, k_list, train_likelihood_across_k,
                test_likelihood_across_k, train["coordinates"].shape[0],
                data["coordinates"].shape[0], filename_prefix, save=True)
        if track_params:
            best_statistics_history = best_model.get_statistics_history()
            # Plot likelihood graph
            likelihood_plot = plotting.plot_statistics_history(
                likelihood_fig, best_statistics_history, x_plot_num,
                y_plot_num, 0)
            # Put the legend on the last likelihood plot
            likelihood_fig.legend(list(likelihood_plot),
                                  ['Likelihood', 'User likelihood',
                                   'Location likelihood', 'Topic likelihood',
                                   'Sigma likelihood', 'Phi entropy'])
            # TODO add command line option
            # Uncomment to enable animated plots
            # phi_animated_fig, phi_animated_ax = plt.subplots(1, 1)
            # anim = plotting.plot_phi_animated(phi_animated_fig,
            #                                   phi_animated_ax, train,
            #                                   best_statistics_history)
            # anim.save('anim.gif', writer='imagemagick', fps=10, dpi=300)
        plt.show()
# NEIGHBORHOODS = ['triangle', 'latin']
# METRICS = ['jsd', 'emd', 'cluster', 'emd-lmnn', 'leftover']
METRICS = ['emd-itml', 'emd-tsne']
if __name__ == '__main__':
    # pylint: disable=C0103
    import json
    query_city = sys.argv[1]
    assert query_city in CITIES, ', '.join(CITIES)
    CITIES.remove(query_city)
    input_dir = 'www_comparaison_' + query_city
    res = {city: defaultdict(list) for city in CITIES}
    for city in res.keys():
        for neighborhood in NEIGHBORHOODS:
            for metric in METRICS:
                subtop = []
                for output in [name for name in os.listdir(input_dir)
                               if name.startswith(city+'_'+neighborhood) and
                               name.endswith(metric+'.my')]:
                    output = os.path.join(input_dir, output)
                    subtop.extend(p.load_var(output))
                top = get_top_disjoint(subtop, 5)
                if not top:
                    continue
                json_cell = [to_json(city, x[1]+[metric], x[0]+1)
                             for x in enumerate(top)]
                res[city][neighborhood].extend(json_cell)
    out_name = 'static/www_cmp_{}.js'.format(query_city)
    with open(out_name, 'w') as out:
        out.write('var TOPREG =\n' +
                  json.dumps(res, sort_keys=True, indent=2,
                             separators=(',', ': ')) + ';')
random.seed(123)
nrep = 11
timings = []
n = int(1e6)
# ew = {(random.randint(0, n), random.randint(0, n)): 1002*random.random()
#       for _ in range(30000)}
# for _ in range(nrep):
#     start = clock()
#     benchmark(ew)
#     timings.append(clock() - start)
# print('\t'.join(('{:.3g}'.format(t) for t in timings)))
# print(sum(timings[1:])/(nrep-1))
# sys.exit()
import persistent
dataset = 'usps4500'
ew, y = persistent.load_var('{}_lcc.my'.format(dataset))
nrep = 5
timings = []
for _ in range(nrep):
    start = clock()
    mst = kruskal_mst_edges(ew)
    timings.append(clock() - start)
print('\t'.join(('{:.3g}'.format(t) for t in timings)))
print(sum(timings[1:])/(nrep-1))
import networkx as nx
g = nx.Graph()
g.add_weighted_edges_from((u, v, w) for (u, v), w in ew.items())
mst_gold = [(u, v) for (u, v) in nx.minimum_spanning_tree(g).edges()]
print(sorted(mst) == sorted(mst_gold))
               'src': name + '.shp',
               'labeling': {'key': 'tag'}}
    color = '#ffa873'
    style.append(CSS.format(name, color, 'black'))
    with fiona.collection(mkpath('disc', name + '.shp'),
                          "w", "ESRI Shapefile", schema) as f:
        f.writerecords(polys)
    with open(mkpath('disc', 'photos.json'), 'w') as f:
        json.dump(KARTO_CONFIG, f)
    style.append('#top_disc-label {font-family: OpenSans; font-size: 14px}')
    with open(mkpath('disc', 'photos.css'), 'w') as f:
        f.write('\n'.join(style))
    sf = box(SF_BBOX[1], SF_BBOX[0], SF_BBOX[3], SF_BBOX[2])
    print(sf.bounds)
    print(100 * cover.area, sf.area)


if __name__ == '__main__':
    t = persistent.load_var('disc/all')
    # top = get_top_tags(2000, 'nsf_tag.dat')
    supported = [v[0] for v in persistent.load_var('supported')][:600]
    d = top_discrepancy(t, supported)
    N = 11
    print([v[2] for v in d[-N:]], [v[2] for v in d[:N]])
    # plot_some(d, 20)
    # js_some(d, 15)
def start_requests(self):
    self.seen_users = persistent.load_var('seen_users.my')
    urls = [MEMBER_PREFIX + id_
            for id_ in persistent.load_var('next_users.my')]
    for url in urls:
        yield scrapy.Request(url=url, callback=self.parse)
def load_list(name):
    try:
        return p.load_var(name + '.my')
    except IOError:
        return []
import persistent
import codecs
import math
import scipy.io as sio
import numpy

d = persistent.load_var('tag_support')
t = sorted(d.iteritems(), key=lambda x: (x[1][0], x[1][1]), reverse=True)
res = []
template = u'{}: {} photos by {} users over {} days'
numeric = numpy.zeros((len(t), 3), dtype=numpy.int32)
i = 0
for tag, info in t:
    days = int(math.ceil((info[3] - info[2]).total_seconds()/3600))
    res.append(template.format(tag, info[0], info[1], days))
    numeric[i, :] = [info[0], info[1], days]
    i += 1
sio.savemat('tag_support_num', {'t': numeric})
# with codecs.open('tag_support.txt', 'w', 'utf8') as f:
#     f.write('\n'.join(res))
def post_process(tag):
    top_loc = persistent.load_var(u'disc/top_{}_{}'.format(tag, GRID_SIZE))
    merged = merge_regions(top_loc)
    persistent.save_var(u'disc/post_{}_{}'.format(tag, GRID_SIZE), merged)
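# A hypothetical driver tying the two discrepancy steps together, using the
# names defined above: merge each tag's top locations with post_process, then
# gather every merged result into the single 'disc/all_<GRID_SIZE>' variable
# via consolidate (defined earlier in this section).
def process_all_tags(tags):
    for tag in tags:
        post_process(tag)
    consolidate(tags)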