def find_match():
    """Answer a closest-match query posted through the request form.

    The form field ``side`` selects where the query comes from: 0 takes it
    from the module-level ``ORIGIN`` and searches ``DEST``, 1 does the
    opposite. The form field ``_id`` is passed to ``cn.find_closest`` as the
    query identifier.

    Returns the result of ``f.jsonify`` whose ``r`` field holds the query
    description, the answer ids, the distances formatted with five decimals,
    one explanation per answer and the ``among`` value returned by
    ``cn.find_closest``.

    Raises ValueError if ``side`` is not an integer equal to 0 or 1.
    """
    side = int(f.request.form['side'])
    # Explicit check instead of `assert`: assertions are removed when Python
    # runs with -O, and any non-zero value would then silently select DEST.
    if side not in (0, 1):
        raise ValueError('side must be 0 or 1, got {}'.format(side))
    _id = f.request.form['_id']
    first, second = (DEST, ORIGIN) if side else (ORIGIN, DEST)
    query, res_ids, answers, dsts, among = cn.find_closest(_id, first, second)
    # The first answer also yields the feature order that is reused when
    # interpreting all the remaining answers.
    query_info, first_answer, feature_order = cn.interpret(
        first['features'][query, :], second['features'][answers[0], :])
    answers_info = [first_answer]
    answers_info.extend(
        cn.interpret(first['features'][query, :],
                     second['features'][answer, :],
                     feature_order)[1]
        for answer in answers[1:])
    res = {
        'query': query_info,
        'answers_id': list(res_ids),
        'distances': ['{:.5f}'.format(float(d)) for d in dsts],
        'explanations': answers_info,
        'among': among,
    }
    return f.jsonify(r=res)
def compare(origin, dest, knn):
    """Render the page comparing two cities.

    A city name that is not in `c.SHORT_KEY` is replaced by a default
    ('barcelona' for `origin`, 'helsinki' for `dest`). The information
    gathered for both cities with `knn` is stored in the module-level
    `ORIGIN` and `DEST` before the 'cnn.html' template is rendered.
    """
    global ORIGIN, DEST
    if origin not in c.SHORT_KEY:
        origin = 'barcelona'
    if dest not in c.SHORT_KEY:
        dest = 'helsinki'
    ORIGIN = cn.gather_info(origin, knn, raw_features=True)
    DEST = cn.gather_info(dest, knn, raw_features=True)
    context = {
        'origin': origin,
        'dest': dest,
        'knn': knn,
        'lbbox': c.BBOXES[origin],
        'rbbox': c.BBOXES[dest],
    }
    return f.render_template('cnn.html', **context)
def neighborhoods(origin, dest):
    """Render the neighborhood matching page for two cities.

    A city name that is not in `c.SHORT_KEY` is replaced by a default
    ('paris' for `origin`, 'helsinki' for `dest`). Both cities are loaded
    through `cn.gather_info` with a second argument of 1 and kept in the
    module-level `ORIGIN` and `DEST`.
    """
    global ORIGIN, DEST
    if origin not in c.SHORT_KEY:
        origin = 'paris'
    if dest not in c.SHORT_KEY:
        dest = 'helsinki'
    ORIGIN = cn.gather_info(origin, 1, raw_features=True)
    DEST = cn.gather_info(dest, 1, raw_features=True)
    context = {
        'origin': origin,
        'dest': dest,
        'lbbox': c.BBOXES[origin],
        'rbbox': c.BBOXES[dest],
    }
    return f.render_template('nei.html', **context)
def show_gold(city, neighborhood):
    """Render the ground truth page of `neighborhood` in `city`.

    The information gathered for `city` replaces the module-level `ORIGIN`.
    """
    global ORIGIN
    ORIGIN = cn.gather_info(city, 1, raw_features=True)
    context = {
        'district': neighborhood,
        'bbox': c.BBOXES[city],
        'city': city,
    }
    return f.render_template('gold.html', **context)
def find_match():
    """Answer a closest-match query posted through the request form.

    The form field ``side`` selects where the query comes from: 0 takes it
    from the module-level ``ORIGIN`` and searches ``DEST``, 1 does the
    opposite. The form field ``_id`` is passed to ``cn.find_closest`` as the
    query identifier.

    Returns the result of ``f.jsonify`` whose ``r`` field holds the query
    description, the answer ids, the distances formatted with five decimals,
    one explanation per answer and the ``among`` value returned by
    ``cn.find_closest``.

    Raises ValueError if ``side`` is not an integer equal to 0 or 1.
    """
    side = int(f.request.form['side'])
    # Explicit check instead of `assert`: assertions are removed when Python
    # runs with -O, and any non-zero value would then silently select DEST.
    if side not in (0, 1):
        raise ValueError('side must be 0 or 1, got {}'.format(side))
    _id = f.request.form['_id']
    first, second = (DEST, ORIGIN) if side else (ORIGIN, DEST)
    query, res_ids, answers, dsts, among = cn.find_closest(_id, first, second)
    # The first answer also yields the feature order that is reused when
    # interpreting all the remaining answers.
    query_info, first_answer, feature_order = cn.interpret(
        first['features'][query, :], second['features'][answers[0], :])
    answers_info = [first_answer]
    answers_info.extend(
        cn.interpret(first['features'][query, :],
                     second['features'][answer, :],
                     feature_order)[1]
        for answer in answers[1:])
    res = {
        'query': query_info,
        'answers_id': list(res_ids),
        'distances': ['{:.5f}'.format(float(d)) for d in dsts],
        'explanations': answers_info,
        'among': among,
    }
    return f.jsonify(r=res)
def load_data(city):
    """Load what is needed to sample and locate the venues of `city`.

    Reads the feature matrix `<city>_fv.mat`, the venue file
    `<city>_svenues.my` and `static/ground_truth.json`.

    Returns a tuple `(vindex, venues_generator, venues, kdtree, gold_list)`:
    the venue ids of the feature matrix, a `WeightedRandomGenerator` built
    from column 4 of the features, an array with one 2-column row per venue
    of `vindex`, a `cKDTree` over these rows and the decoded JSON content.
    """
    features = cn.load_matrix(city + '_fv.mat')
    density = features['v'][:, 4]
    # Adding |min| makes every weight non-negative.
    weights = density + np.abs(density.min())
    venues_generator = WeightedRandomGenerator(weights)
    vids, _, locs = p.load_var(city + '_svenues.my').all()
    vindex = features['i']
    # Rows of venues absent from `vids` keep their initial (0, 0) value.
    venues = np.zeros((len(vindex), 2))
    # Venue id -> row number. A dict comprehension and the builtin `zip`
    # behave the same on Python 2.7 and 3, unlike itertools.imap/izip which
    # only exist on Python 2.
    index = {vid: pos for pos, vid in enumerate(vindex)}
    for vid, loc in zip(vids, locs):
        pos = index.get(vid)
        if pos is not None:
            venues[pos, :] = loc
    kdtree = cKDTree(venues)
    with open('static/ground_truth.json') as infile:
        gold_list = json.load(infile)
    return vindex, venues_generator, venues, kdtree, gold_list
def load_data(city):
    """Load what is needed to sample and locate the venues of `city`.

    Reads the feature matrix `<city>_fv.mat`, the venue file
    `<city>_svenues.my` and `static/ground_truth.json`.

    Returns a tuple `(vindex, venues_generator, venues, kdtree, gold_list)`:
    the venue ids of the feature matrix, a `WeightedRandomGenerator` built
    from column 4 of the features, an array with one 2-column row per venue
    of `vindex`, a `cKDTree` over these rows and the decoded JSON content.
    """
    features = cn.load_matrix(city + '_fv.mat')
    density = features['v'][:, 4]
    # Adding |min| makes every weight non-negative.
    weights = density + np.abs(density.min())
    venues_generator = WeightedRandomGenerator(weights)
    vids, _, locs = p.load_var(city + '_svenues.my').all()
    vindex = features['i']
    # Rows of venues absent from `vids` keep their initial (0, 0) value.
    venues = np.zeros((len(vindex), 2))
    # Venue id -> row number. A dict comprehension and the builtin `zip`
    # behave the same on Python 2.7 and 3, unlike itertools.imap/izip which
    # only exist on Python 2.
    index = {vid: pos for pos, vid in enumerate(vindex)}
    for vid, loc in zip(vids, locs):
        pos = index.get(vid)
        if pos is not None:
            venues[pos, :] = loc
    kdtree = cKDTree(venues)
    with open('static/ground_truth.json') as infile:
        gold_list = json.load(infile)
    return vindex, venues_generator, venues, kdtree, gold_list
def get_knn_candidates(vids, left_knn, right_knn, at_least, at_most=None):
    """Return the venues of right that are the closest to `vids` of left.

    For every venue of `vids`, `cn.find_closest` provides its neighbors in
    `right_knn`; each right venue is kept only the first time it shows up.
    The number of venues returned is
    `min(at_most, max(len(vids) * knn, at_least))` where `knn` is
    `right_knn['knn']`, further limited by the number of distinct candidates.
    When `at_most` is None or 0, 50000 is used instead.

    Returns a tuple `(mask, r_vids)` of arrays ordered by increasing
    distance: the row numbers of the selected venues and their ids.
    """
    import heapq
    knn = right_knn['knn']
    # int(None) raises TypeError, so the documented default has to be
    # handled before the conversion; 0 keeps meaning "use the default cap".
    at_most = (int(at_most) if at_most is not None else 0) or 50000
    nb_venues = min(at_most, max(len(vids) * knn, at_least))
    candidates = []
    # A set gives constant time membership tests for the ids already seen.
    seen_ids = set()
    for idx, vid in enumerate(vids):
        _, rid, ridx, dst, _ = cn.find_closest(vid, left_knn, right_knn)
        for dst_, rid_, ridx_, idx_ in zip(dst, rid, ridx, range(knn)):
            if rid_ in seen_ids:
                continue
            seen_ids.add(rid_)
            # The second field is unique, which breaks ties between equal
            # distances without ever comparing the (id, row) payload.
            candidates.append((dst_, idx * knn + idx_, (rid_, ridx_)))
    nb_venues = min(len(candidates), int(nb_venues))
    # nsmallest accepts any iterable: no need to maintain a heap beforehand.
    closest = heapq.nsmallest(nb_venues, candidates)
    mask = np.array([v[2][1] for v in closest])
    r_vids = np.array([v[2][0] for v in closest])
    return mask, r_vids
def get_knn_candidates(vids, left_knn, right_knn, at_least, at_most=None):
    """Return the venues of right that are the closest to `vids` of left.

    For every venue of `vids`, `cn.find_closest` provides its neighbors in
    `right_knn`; each right venue is kept only the first time it shows up.
    The number of venues returned is
    `min(at_most, max(len(vids) * knn, at_least))` where `knn` is
    `right_knn['knn']`, further limited by the number of distinct candidates.
    When `at_most` is None or 0, 50000 is used instead.

    Returns a tuple `(mask, r_vids)` of arrays ordered by increasing
    distance: the row numbers of the selected venues and their ids.
    """
    import heapq
    knn = right_knn['knn']
    # int(None) raises TypeError, so the documented default has to be
    # handled before the conversion; 0 keeps meaning "use the default cap".
    at_most = (int(at_most) if at_most is not None else 0) or 50000
    nb_venues = min(at_most, max(len(vids) * knn, at_least))
    candidates = []
    # A set gives constant time membership tests for the ids already seen.
    seen_ids = set()
    for idx, vid in enumerate(vids):
        _, rid, ridx, dst, _ = cn.find_closest(vid, left_knn, right_knn)
        for dst_, rid_, ridx_, idx_ in zip(dst, rid, ridx, range(knn)):
            if rid_ in seen_ids:
                continue
            seen_ids.add(rid_)
            # The second field is unique, which breaks ties between equal
            # distances without ever comparing the (id, row) payload.
            candidates.append((dst_, idx * knn + idx_, (rid_, ridx_)))
    nb_venues = min(len(candidates), int(nb_venues))
    # nsmallest accepts any iterable: no need to maintain a heap beforehand.
    closest = heapq.nsmallest(nb_venues, candidates)
    mask = np.array([v[2][1] for v in closest])
    r_vids = np.array([v[2][0] for v in closest])
    return mask, r_vids
def interpret_query(from_city, to_city, region, metric):
    """Load information about both cities and compute useful quantities.

    `metric` is a string inspected by substring:
    - containing 'emd': distance computed by `emd`, with the ground distance
      imported from `emd_dst` or, when it also contains 'tsne' or 'itml',
      from `specific_emd_dst`;
    - containing 'cluster': `min_cost` over the pairwise `cdist` of weighted
      clusters;
    - containing 'leftover': the problem is only written out through
      `emd_leftover.write_matlab_problem`;
    - anything else: `proba_distance` on densities.
    It also drives the loading of the cities: the '_tsne.mat' suffix is used
    only when `metric` equals 'emd-tsne', raw features are requested unless
    it contains 'lmnn' and categories are hidden unless it equals 'jsd'.

    Returns the list
    `[left, right, right_desc, regions_distance, vids, threshold]` where
    `right_desc` is `[right_city_size, right_support, right, right_infos]`.

    Side effect: sets the module-level `RIGHT_SUPPORT`.
    """
    # Load info of the first city
    suffix = '_tsne.mat' if metric == 'emd-tsne' else ''
    left = cn.gather_info(from_city+suffix, knn=1,
                          raw_features='lmnn' not in metric,
                          hide_category=metric != 'jsd')
    left_infos = load_surroundings(from_city)
    left_support = features_support(left['features'])

    # Compute info about the query region
    center, radius, _, contains = polygon_to_local(from_city, region)
    query = describe_region(center, radius, contains, left_infos[0], left)
    features, times, weights, vids = query
    # print('{} venues in query region.'.format(len(vids)))
    # Fraction of the venues of the first city that are in the query region
    venue_proportion = 1.0*len(vids) / left['features'].shape[0]

    # And use them to define the metric that will be used
    # NOTE(review): this first value of theta is overwritten right below by
    # hard-coded weights, so it never reaches any distance function.
    theta = np.ones((1, left['features'].shape[1]))
    theta = np.array([[0.0396, 0.0396, 0.2932, 0.0396, 0.0396, 0.0396, 0.0396,
                       0.3404, 0.0396, 0.0396, 0.0396, 0.0396, 0.0396, 0.3564,
                       0.0396, 0.3564, 0.0396, 0.3564, 0.3564, 0.3564, 0.0396,
                       0.0396, 0.0396, 0.0396, 0.3564, 0.0396, 0.0396, 0.0396,
                       0.0396, 0.0396, 0.0396]])
    # Plain list of ones with as many entries as theta; it is the one handed
    # to the EMD ground distance, theta itself is only used by the last branch
    ltheta = len(theta.ravel())*[1, ]

    # Branch order matters: any metric containing 'emd' is handled here, even
    # if it also contains 'cluster' or 'leftover'.
    if 'emd' in metric:
        from emd import emd
        from emd_dst import dist_for_emd
        # Later imports rebind dist_for_emd, so 'itml' wins over 'tsne'
        if 'tsne' in metric:
            from specific_emd_dst import dst_tsne as dist_for_emd
        if 'itml' in metric:
            from specific_emd_dst import dst_itml as dist_for_emd
        query_num = features_as_lists(features)

        # NOTE(review): `profile` is not defined in this function; presumably
        # it is injected by a profiler or defined elsewhere in the module --
        # confirm.
        @profile
        def regions_distance(r_features, r_weigths):
            # Regions with too many points are not evaluated and get a huge
            # distance instead
            if len(r_features) >= MAX_EMD_POINTS:
                return 1e20
            # NOTE(review): `map` returns a list on Python 2 only -- confirm
            # that `emd` accepts an iterator before running on Python 3.
            return emd((query_num, map(float, weights)),
                       (r_features, map(float, r_weigths)),
                       lambda a, b: float(dist_for_emd(a, b, ltheta)))
    elif 'cluster' in metric:
        from scipy.spatial.distance import cdist
        query_num = weighted_clusters(features, NB_CLUSTERS, weights)

        def regions_distance(r_features, r_weigths):
            r_cluster = weighted_clusters(r_features, NB_CLUSTERS, r_weigths)
            costs = cdist(query_num, r_cluster).tolist()
            return min_cost(costs)
    elif 'leftover' in metric:
        @profile
        def regions_distance(r_features, second_arg):
            # Nothing is computed here: the problem is written out and the
            # returned value is always -1
            r_weigths, idx = second_arg
            emd_leftover.write_matlab_problem(features, weights, r_features,
                                              r_weigths, idx)
            return -1
    else:
        query_num = features_as_density(features, weights, left_support)

        @profile
        def regions_distance(r_density, r_global):
            """Return distance of a region from `query_num`."""
            return proba_distance(query_num, times, r_density, r_global,
                                  theta)

    # Load info of the target city
    right = cn.gather_info(to_city+suffix, knn=2,
                           raw_features='lmnn' not in metric,
                           hide_category=metric != 'jsd')
    right_infos = load_surroundings(to_city)
    minx, miny, maxx, maxy = right_infos[1]
    right_city_size = (maxx - minx, maxy - miny)
    right_support = features_support(right['features'])
    global RIGHT_SUPPORT
    RIGHT_SUPPORT = right_support

    # given extents, compute threshold of candidate
    # (70% of the number of target venues scaled by the query proportion)
    threshold = 0.7 * venue_proportion * right['features'].shape[0]
    right_desc = [right_city_size, right_support, right, right_infos]
    return [left, right, right_desc, regions_distance, vids, threshold]
if __name__ == '__main__': # pylint: disable=C0103 client = pymongo.MongoClient() db = client.foursquare with open('static/raw_ground_truth.json') as inf: regions = json.load(inf) cities_venues = {} for district, gold in regions.iteritems(): # a not very elegant indirection gold = gold['gold'] for city, areas in gold.iteritems(): print(city, district) if city not in cities_venues: try: city_venues = list(cn.load_matrix(city)['i']) except IOError: city_venues = None cities_venues[city] = city_venues if cities_venues[city]: ground_truth = merge_regions(city, district, db, cities_venues[city]) msg = '{}, {}: merged into {} areas' print(msg.format(city, district, len(ground_truth))) regions[district]['gold'][city] = ground_truth else: msg = '{}, {}: not merged' print(msg.format(city, district)) with open('static/ground_truth.json', 'w') as out: json.dump(regions, out,
import CommonMongo as cm
# Exploratory script: cluster the feature matrix of one city and compare
# silhouette scores for several numbers of clusters.
if __name__ == '__main__':
    # pylint: disable=C0103
    import arguments
    args = arguments.city_parser().parse_args()
    city = args.city
    DB, CLIENT = cm.connect_to_db('foursquare', args.host, args.port)
    # Each assignment below replaces the previous one, so only the last
    # clusterer is still bound afterwards (and it is replaced again in the
    # loop). NOTE(review): presumably kept as a list of alternatives to try.
    clusterer = cl.KMeans(3, n_init=5)
    clusterer = cl.MeanShift(min_bin_freq=3, cluster_all=False)
    clusterer = cl.DBSCAN(eps=5, min_samples=8, metric='cityblock')
    clusterer = cl.AffinityPropagation(damping=.55, affinity='euclidean')
    clusterer = cl.SpectralClustering(3, affinity='cosine', n_init=3)
    hel = cn.load_matrix(city)
    features = hel['v']
    scale = pp.MinMaxScaler(copy=False)
    # The return value is discarded. NOTE(review): this relies on copy=False
    # rescaling the first three columns of `features` in place -- confirm.
    scale.fit_transform(features[:, 0:3])
    scores = []
    # One silhouette score per number of clusters k = 3..15
    for k in range(3, 16):
        clusterer = cl.KMeans(k, n_init=10, tol=1e-5, max_iter=500)
        labels = clusterer.fit_predict(features)
        scores.append(mt.silhouette_score(features, labels))
        print(Counter(labels))
    # Values of k sorted by decreasing score; the result is not stored, so
    # this line only matters in an interactive session.
    np.argsort(scores)[::-1] + 3
    ppl.plot(range(3, 16), scores[0:], '+')
    # Again, only the second assignment remains bound.
    clusterer = cl.MeanShift(min_bin_freq=3, cluster_all=False)
    clusterer = cl.KMeans(6, n_init=20, tol=1e-5, max_iter=500)
    visits = xp.get_visits(CLIENT, xp.Entity.venue, city)
def interpret_query(from_city, to_city, region, metric):
    """Load information about both cities and compute useful quantities.

    `metric` is a string inspected by substring:
    - containing 'emd': distance computed by `emd`, with the ground distance
      imported from `emd_dst` or, when it also contains 'tsne' or 'itml',
      from `specific_emd_dst`;
    - containing 'cluster': `min_cost` over the pairwise `cdist` of weighted
      clusters;
    - containing 'leftover': the problem is only written out through
      `emd_leftover.write_matlab_problem`;
    - anything else: `proba_distance` on densities.
    It also drives the loading of the cities: the '_tsne.mat' suffix is used
    only when `metric` equals 'emd-tsne', raw features are requested unless
    it contains 'lmnn' and categories are hidden unless it equals 'jsd'.

    Returns the list
    `[left, right, right_desc, regions_distance, vids, threshold]` where
    `right_desc` is `[right_city_size, right_support, right, right_infos]`.

    Side effect: sets the module-level `RIGHT_SUPPORT`.
    """
    # Load info of the first city
    suffix = '_tsne.mat' if metric == 'emd-tsne' else ''
    left = cn.gather_info(from_city + suffix,
                          knn=1,
                          raw_features='lmnn' not in metric,
                          hide_category=metric != 'jsd')
    left_infos = load_surroundings(from_city)
    left_support = features_support(left['features'])

    # Compute info about the query region
    center, radius, _, contains = polygon_to_local(from_city, region)
    query = describe_region(center, radius, contains, left_infos[0], left)
    features, times, weights, vids = query
    # print('{} venues in query region.'.format(len(vids)))
    # Fraction of the venues of the first city that are in the query region
    venue_proportion = 1.0 * len(vids) / left['features'].shape[0]

    # And use them to define the metric that will be used
    # NOTE(review): this first value of theta is overwritten right below by
    # hard-coded weights, so it never reaches any distance function.
    theta = np.ones((1, left['features'].shape[1]))
    theta = np.array([[
        0.0396, 0.0396, 0.2932, 0.0396, 0.0396, 0.0396, 0.0396, 0.3404,
        0.0396, 0.0396, 0.0396, 0.0396, 0.0396, 0.3564, 0.0396, 0.3564,
        0.0396, 0.3564, 0.3564, 0.3564, 0.0396, 0.0396, 0.0396, 0.0396,
        0.3564, 0.0396, 0.0396, 0.0396, 0.0396, 0.0396, 0.0396
    ]])
    # Plain list of ones with as many entries as theta; it is the one handed
    # to the EMD ground distance, theta itself is only used by the last branch
    ltheta = len(theta.ravel()) * [
        1,
    ]

    # Branch order matters: any metric containing 'emd' is handled here, even
    # if it also contains 'cluster' or 'leftover'.
    if 'emd' in metric:
        from emd import emd
        from emd_dst import dist_for_emd
        # Later imports rebind dist_for_emd, so 'itml' wins over 'tsne'
        if 'tsne' in metric:
            from specific_emd_dst import dst_tsne as dist_for_emd
        if 'itml' in metric:
            from specific_emd_dst import dst_itml as dist_for_emd
        query_num = features_as_lists(features)

        # NOTE(review): `profile` is not defined in this function; presumably
        # it is injected by a profiler or defined elsewhere in the module --
        # confirm.
        @profile
        def regions_distance(r_features, r_weigths):
            # Regions with too many points are not evaluated and get a huge
            # distance instead
            if len(r_features) >= MAX_EMD_POINTS:
                return 1e20
            # NOTE(review): `map` returns a list on Python 2 only -- confirm
            # that `emd` accepts an iterator before running on Python 3.
            return emd((query_num, map(float, weights)),
                       (r_features, map(float, r_weigths)),
                       lambda a, b: float(dist_for_emd(a, b, ltheta)))
    elif 'cluster' in metric:
        from scipy.spatial.distance import cdist
        query_num = weighted_clusters(features, NB_CLUSTERS, weights)

        def regions_distance(r_features, r_weigths):
            r_cluster = weighted_clusters(r_features, NB_CLUSTERS, r_weigths)
            costs = cdist(query_num, r_cluster).tolist()
            return min_cost(costs)
    elif 'leftover' in metric:

        @profile
        def regions_distance(r_features, second_arg):
            # Nothing is computed here: the problem is written out and the
            # returned value is always -1
            r_weigths, idx = second_arg
            emd_leftover.write_matlab_problem(features, weights, r_features,
                                              r_weigths, idx)
            return -1
    else:
        query_num = features_as_density(features, weights, left_support)

        @profile
        def regions_distance(r_density, r_global):
            """Return distance of a region from `query_num`."""
            return proba_distance(query_num, times, r_density, r_global,
                                  theta)

    # Load info of the target city
    right = cn.gather_info(to_city + suffix,
                           knn=2,
                           raw_features='lmnn' not in metric,
                           hide_category=metric != 'jsd')
    right_infos = load_surroundings(to_city)
    minx, miny, maxx, maxy = right_infos[1]
    right_city_size = (maxx - minx, maxy - miny)
    right_support = features_support(right['features'])
    global RIGHT_SUPPORT
    RIGHT_SUPPORT = right_support

    # given extents, compute threshold of candidate
    # (70% of the number of target venues scaled by the query proportion)
    threshold = 0.7 * venue_proportion * right['features'].shape[0]
    right_desc = [right_city_size, right_support, right, right_infos]
    return [left, right, right_desc, regions_distance, vids, threshold]
# Exploratory script: cluster the feature matrix of one city and compare
# silhouette scores for several numbers of clusters.
if __name__ == '__main__':
    # pylint: disable=C0103
    import arguments
    args = arguments.city_parser().parse_args()
    city = args.city
    DB, CLIENT = cm.connect_to_db('foursquare', args.host, args.port)
    # Each assignment below replaces the previous one, so only the last
    # clusterer is still bound afterwards (and it is replaced again in the
    # loop). NOTE(review): presumably kept as a list of alternatives to try.
    clusterer = cl.KMeans(3, n_init=5)
    clusterer = cl.MeanShift(min_bin_freq=3, cluster_all=False)
    clusterer = cl.DBSCAN(eps=5, min_samples=8, metric='cityblock')
    clusterer = cl.AffinityPropagation(damping=.55, affinity='euclidean')
    clusterer = cl.SpectralClustering(3, affinity='cosine', n_init=3)
    hel = cn.load_matrix(city)
    features = hel['v']
    scale = pp.MinMaxScaler(copy=False)
    # The return value is discarded. NOTE(review): this relies on copy=False
    # rescaling the first three columns of `features` in place -- confirm.
    scale.fit_transform(features[:, 0:3])
    scores = []
    # One silhouette score per number of clusters k = 3..15
    for k in range(3, 16):
        clusterer = cl.KMeans(k, n_init=10, tol=1e-5, max_iter=500)
        labels = clusterer.fit_predict(features)
        scores.append(mt.silhouette_score(features, labels))
        print(Counter(labels))
    # Values of k sorted by decreasing score; the result is not stored, so
    # this line only matters in an interactive session.
    np.argsort(scores)[::-1]+3
    ppl.plot(range(3, 16), scores[0:], '+')
    # Again, only the second assignment remains bound.
    clusterer = cl.MeanShift(min_bin_freq=3, cluster_all=False)
    clusterer = cl.KMeans(6, n_init=20, tol=1e-5, max_iter=500)
    visits = xp.get_visits(CLIENT, xp.Entity.venue, city)
# Merge the raw ground truth regions of every known city and save the result.
if __name__ == '__main__':
    # pylint: disable=C0103
    client = pymongo.MongoClient()
    db = client.foursquare
    with open('static/raw_ground_truth.json') as inf:
        regions = json.load(inf)
    # city -> list of its venue ids, or None when its matrix cannot be read
    cities_venues = {}
    for district, entry in regions.iteritems():
        # the areas of each city are stored one level down, under 'gold'
        per_city = entry['gold']
        for city, areas in per_city.iteritems():
            print(city, district)
            if city not in cities_venues:
                try:
                    cities_venues[city] = list(cn.load_matrix(city)['i'])
                except IOError:
                    cities_venues[city] = None
            known_venues = cities_venues[city]
            if not known_venues:
                print('{}, {}: not merged'.format(city, district))
                continue
            ground_truth = merge_regions(city, district, db, known_venues)
            print('{}, {}: merged into {} areas'.format(city, district,
                                                        len(ground_truth)))
            # replacing the value of an existing key is safe while iterating
            per_city[city] = ground_truth
    with open('static/ground_truth.json', 'w') as out:
        json.dump(regions, out, sort_keys=True, indent=2,
                  separators=(',', ': '))