Ejemplo n.º 1
0
def find_match():
    """Answer a closest-venue query posted through the request form.

    The form must provide `side` (0 or 1) and `_id`. With `side` equal to 0
    the venue `_id` is searched from ORIGIN into DEST, with 1 from DEST into
    ORIGIN. Any other value of `side` aborts the request with a 400 status.

    Returns a JSON response whose `r` field contains the query description,
    the ids of the answers, their distances formatted with 5 decimals, one
    explanation per answer and the `among` value given by
    `cn.find_closest`.
    """
    side = int(f.request.form['side'])
    if side not in (0, 1):
        # Explicit check: an `assert` would be stripped when running with -O
        # and would let arbitrary values of `side` through.
        f.abort(400)
    _id = f.request.form['_id']
    first, second = (DEST, ORIGIN) if side else (ORIGIN, DEST)
    query, res_ids, answers, dsts, among = cn.find_closest(_id, first, second)
    # The first interpretation also yields the feature order, which is then
    # reused for all the remaining answers.
    query_info, first_answer, feature_order = cn.interpret(
        first['features'][query, :], second['features'][answers[0], :])
    answers_info = [first_answer]
    answers_info.extend([
        cn.interpret(first['features'][query, :],
                     second['features'][answer, :], feature_order)[1]
        for answer in answers[1:]
    ])
    res = {
        'query': query_info,
        'answers_id': list(res_ids),
        'distances': ['{:.5f}'.format(float(d)) for d in dsts],
        'explanations': answers_info,
        'among': among
    }
    return f.jsonify(r=res)
Ejemplo n.º 2
0
def compare(origin, dest, knn):
    """Render the page comparing the cities `origin` and `dest`.

    A city that is not in `c.SHORT_KEY` is replaced by 'barcelona' (origin)
    or 'helsinki' (dest). The information gathered for both cities is stored
    in the module-level `ORIGIN` and `DEST` variables.
    """
    global ORIGIN, DEST
    if origin not in c.SHORT_KEY:
        origin = 'barcelona'
    if dest not in c.SHORT_KEY:
        dest = 'helsinki'
    ORIGIN = cn.gather_info(origin, knn, raw_features=True)
    DEST = cn.gather_info(dest, knn, raw_features=True)
    context = {
        'origin': origin,
        'dest': dest,
        'knn': knn,
        'lbbox': c.BBOXES[origin],
        'rbbox': c.BBOXES[dest],
    }
    return f.render_template('cnn.html', **context)
Ejemplo n.º 3
0
def neighborhoods(origin, dest):
    """Render the neighborhoods matching page for `origin` and `dest`.

    A city that is not in `c.SHORT_KEY` is replaced by 'paris' (origin) or
    'helsinki' (dest). The information gathered for both cities is stored in
    the module-level `ORIGIN` and `DEST` variables.
    """
    global ORIGIN, DEST
    if origin not in c.SHORT_KEY:
        origin = 'paris'
    if dest not in c.SHORT_KEY:
        dest = 'helsinki'
    ORIGIN = cn.gather_info(origin, 1, raw_features=True)
    DEST = cn.gather_info(dest, 1, raw_features=True)
    context = {
        'origin': origin,
        'dest': dest,
        'lbbox': c.BBOXES[origin],
        'rbbox': c.BBOXES[dest],
    }
    return f.render_template('nei.html', **context)
Ejemplo n.º 4
0
def neighborhoods(origin, dest):
    """Render the neighborhoods matching page for `origin` and `dest`.

    A city that is not in `c.SHORT_KEY` is replaced by 'paris' (origin) or
    'helsinki' (dest). The information gathered for both cities is stored in
    the module-level `ORIGIN` and `DEST` variables.
    """
    global ORIGIN, DEST
    if origin not in c.SHORT_KEY:
        origin = 'paris'
    if dest not in c.SHORT_KEY:
        dest = 'helsinki'
    ORIGIN = cn.gather_info(origin, 1, raw_features=True)
    DEST = cn.gather_info(dest, 1, raw_features=True)
    context = {
        'origin': origin,
        'dest': dest,
        'lbbox': c.BBOXES[origin],
        'rbbox': c.BBOXES[dest],
    }
    return f.render_template('nei.html', **context)
Ejemplo n.º 5
0
def compare(origin, dest, knn):
    """Render the page comparing the cities `origin` and `dest`.

    A city that is not in `c.SHORT_KEY` is replaced by 'barcelona' (origin)
    or 'helsinki' (dest). The information gathered for both cities is stored
    in the module-level `ORIGIN` and `DEST` variables.
    """
    global ORIGIN, DEST
    if origin not in c.SHORT_KEY:
        origin = 'barcelona'
    if dest not in c.SHORT_KEY:
        dest = 'helsinki'
    ORIGIN = cn.gather_info(origin, knn, raw_features=True)
    DEST = cn.gather_info(dest, knn, raw_features=True)
    context = {
        'origin': origin,
        'dest': dest,
        'knn': knn,
        'lbbox': c.BBOXES[origin],
        'rbbox': c.BBOXES[dest],
    }
    return f.render_template('cnn.html', **context)
Ejemplo n.º 6
0
def show_gold(city, neighborhood):
    """Render the ground truth page of `neighborhood` in `city`.

    The information gathered for `city` is stored in the module-level
    `ORIGIN` variable.
    """
    global ORIGIN
    ORIGIN = cn.gather_info(city, 1, raw_features=True)
    context = {
        'district': neighborhood,
        'bbox': c.BBOXES[city],
        'city': city,
    }
    return f.render_template('gold.html', **context)
Ejemplo n.º 7
0
def find_match():
    """Answer a closest-venue query posted through the request form.

    The form must provide `side` (0 or 1) and `_id`. With `side` equal to 0
    the venue `_id` is searched from ORIGIN into DEST, with 1 from DEST into
    ORIGIN. Any other value of `side` aborts the request with a 400 status.

    Returns a JSON response whose `r` field contains the query description,
    the ids of the answers, their distances formatted with 5 decimals, one
    explanation per answer and the `among` value given by
    `cn.find_closest`.
    """
    side = int(f.request.form['side'])
    if side not in (0, 1):
        # Explicit check: an `assert` would be stripped when running with -O
        # and would let arbitrary values of `side` through.
        f.abort(400)
    _id = f.request.form['_id']
    first, second = (DEST, ORIGIN) if side else (ORIGIN, DEST)
    query, res_ids, answers, dsts, among = cn.find_closest(_id, first, second)
    # The first interpretation also yields the feature order, which is then
    # reused for all the remaining answers.
    query_info, first_answer, feature_order = cn.interpret(
        first['features'][query, :], second['features'][answers[0], :])
    answers_info = [first_answer]
    answers_info.extend([cn.interpret(first['features'][query, :],
                                      second['features'][answer, :],
                                      feature_order)[1]
                         for answer in answers[1:]])
    res = {'query': query_info, 'answers_id': list(res_ids),
           'distances': ['{:.5f}'.format(float(d)) for d in dsts],
           'explanations': answers_info, 'among': among}
    return f.jsonify(r=res)
Ejemplo n.º 8
0
def load_data(city):
    """Load the venues of `city` along with the ground truth regions.

    Returns a tuple `(vindex, venues_generator, venues, kdtree, gold_list)`:
    the venue ids of the feature matrix, a weighted random generator built
    from column 4 of the features, the array of venue locations (one row per
    id in `vindex`), a KD-tree over these locations and the content of
    'static/ground_truth.json'.
    """
    features = cn.load_matrix(city + '_fv.mat')
    density = features['v'][:, 4]
    # add the absolute value of the smallest density to every weight
    weights = density + np.abs(density.min())
    venues_generator = WeightedRandomGenerator(weights)

    vids, _, locs = p.load_var(city + '_svenues.my').all()
    vindex = features['i']
    venues = np.zeros((len(vindex), 2))
    # row number of each venue id in the feature matrix
    row_of = {venue_id: row for row, venue_id in enumerate(vindex)}
    for vid, loc in itertools.izip(vids, locs):
        # venues without features are ignored, their row stays at zero
        if vid in row_of:
            venues[row_of[vid], :] = loc
    kdtree = cKDTree(venues)

    with open('static/ground_truth.json') as infile:
        gold_list = json.load(infile)

    return vindex, venues_generator, venues, kdtree, gold_list
Ejemplo n.º 9
0
def load_data(city):
    """Load the venues of `city` along with the ground truth regions.

    Returns a tuple `(vindex, venues_generator, venues, kdtree, gold_list)`:
    the venue ids of the feature matrix, a weighted random generator built
    from column 4 of the features, the array of venue locations (one row per
    id in `vindex`), a KD-tree over these locations and the content of
    'static/ground_truth.json'.
    """
    features = cn.load_matrix(city + '_fv.mat')
    density = features['v'][:, 4]
    # add the absolute value of the smallest density to every weight
    weights = density + np.abs(density.min())
    venues_generator = WeightedRandomGenerator(weights)

    vids, _, locs = p.load_var(city+'_svenues.my').all()
    vindex = features['i']
    venues = np.zeros((len(vindex), 2))
    # row number of each venue id in the feature matrix
    row_of = {venue_id: row for row, venue_id in enumerate(vindex)}
    for vid, loc in itertools.izip(vids, locs):
        # venues without features are ignored, their row stays at zero
        if vid in row_of:
            venues[row_of[vid], :] = loc
    kdtree = cKDTree(venues)

    with open('static/ground_truth.json') as infile:
        gold_list = json.load(infile)

    return vindex, venues_generator, venues, kdtree, gold_list
Ejemplo n.º 10
0
def get_knn_candidates(vids, left_knn, right_knn, at_least, at_most=None):
    """Return between `at_least` and `at_most` venues in right that are close
    (in the sense of euclidean distance) to the `vids` in left.

    `left_knn` and `right_knn` are passed as is to `cn.find_closest`;
    `right_knn['knn']` is the number of neighbors considered for each venue.
    When `at_most` is None or 0, an upper bound of 50000 is used.

    Return a tuple `(mask, r_vids)` of numpy arrays with the row numbers and
    the ids of the selected venues, ordered by increasing distance. Each
    venue of right appears at most once.
    """
    import heapq
    candidates = []
    # a set gives constant time duplicate detection
    seen_ids = set()
    knn = right_knn['knn']
    # `at_most` defaults to None, which int() does not accept
    at_most = int(at_most or 0) or 50000
    nb_venues = min(at_most, max(len(vids) * knn, at_least))
    for idx, vid in enumerate(vids):
        _, rid, ridx, dst, _ = cn.find_closest(vid, left_knn, right_knn)
        for dst_, rid_, ridx_, idx_ in zip(dst, rid, ridx, range(knn)):
            if rid_ in seen_ids:
                continue
            seen_ids.add(rid_)
            # the second item is unique, so ties on distance never end up
            # comparing the (id, row) pairs
            heapq.heappush(candidates,
                           (dst_, idx * knn + idx_, (rid_, ridx_)))
    nb_venues = min(len(candidates), int(nb_venues))
    closest = heapq.nsmallest(nb_venues, candidates)
    mask = np.array([v[2][1] for v in closest])
    r_vids = np.array([v[2][0] for v in closest])
    return mask, r_vids
Ejemplo n.º 11
0
def get_knn_candidates(vids, left_knn, right_knn, at_least, at_most=None):
    """Return between `at_least` and `at_most` venues in right that are close
    (in the sense of euclidean distance) to the `vids` in left.

    `left_knn` and `right_knn` are passed as is to `cn.find_closest`;
    `right_knn['knn']` is the number of neighbors considered for each venue.
    When `at_most` is None or 0, an upper bound of 50000 is used.

    Return a tuple `(mask, r_vids)` of numpy arrays with the row numbers and
    the ids of the selected venues, ordered by increasing distance. Each
    venue of right appears at most once.
    """
    import heapq
    candidates = []
    # a set gives constant time duplicate detection
    seen_ids = set()
    knn = right_knn['knn']
    # `at_most` defaults to None, which int() does not accept
    at_most = int(at_most or 0) or 50000
    nb_venues = min(at_most, max(len(vids)*knn, at_least))
    for idx, vid in enumerate(vids):
        _, rid, ridx, dst, _ = cn.find_closest(vid, left_knn, right_knn)
        for dst_, rid_, ridx_, idx_ in zip(dst, rid, ridx, range(knn)):
            if rid_ in seen_ids:
                continue
            seen_ids.add(rid_)
            # the second item is unique, so ties on distance never end up
            # comparing the (id, row) pairs
            heapq.heappush(candidates, (dst_, idx*knn+idx_,
                                        (rid_, ridx_)))
    nb_venues = min(len(candidates), int(nb_venues))
    closest = heapq.nsmallest(nb_venues, candidates)
    mask = np.array([v[2][1] for v in closest])
    r_vids = np.array([v[2][0] for v in closest])
    return mask, r_vids
Ejemplo n.º 12
0
def interpret_query(from_city, to_city, region, metric):
    """Load informations about cities and compute useful quantities.

    `region` is a polygon of `from_city` describing the query. `metric` is a
    string selecting how regions are compared: it is tested for the
    substrings 'emd' (optionally with 'tsne' or 'itml'), 'cluster' and
    'leftover', in that order; anything else uses `proba_distance`.

    Return the list `[left, right, right_desc, regions_distance, vids,
    threshold]` where `left` and `right` are the info gathered for both
    cities, `right_desc` is `[right_city_size, right_support, right,
    right_infos]`, `regions_distance` is the function comparing a candidate
    region with the query, `vids` comes from the description of the query
    region and `threshold` is derived from the proportion of venues of
    `from_city` it represents.

    As a side effect, set the global `RIGHT_SUPPORT`.
    """
    # Load info of the first city
    # only the exact 'emd-tsne' metric uses the '_tsne.mat' variant
    suffix = '_tsne.mat' if metric == 'emd-tsne' else ''
    left = cn.gather_info(from_city+suffix, knn=1,
                          raw_features='lmnn' not in metric,
                          hide_category=metric != 'jsd')
    left_infos = load_surroundings(from_city)
    left_support = features_support(left['features'])

    # Compute info about the query region
    center, radius, _, contains = polygon_to_local(from_city, region)
    query = describe_region(center, radius, contains, left_infos[0], left)
    features, times, weights, vids = query
    # print('{} venues in query region.'.format(len(vids)))
    # fraction of the venues of the first city lying in the query
    venue_proportion = 1.0*len(vids) / left['features'].shape[0]

    # And use them to define the metric that will be used
    # NOTE(review): this uniform theta is overwritten right away by the
    # hard-coded 31 values below, whatever the number of features
    theta = np.ones((1, left['features'].shape[1]))
    theta = np.array([[0.0396, 0.0396, 0.2932, 0.0396, 0.0396, 0.0396,
                       0.0396, 0.3404, 0.0396, 0.0396, 0.0396, 0.0396,
                       0.0396, 0.3564, 0.0396, 0.3564, 0.0396, 0.3564,
                       0.3564, 0.3564, 0.0396, 0.0396, 0.0396, 0.0396,
                       0.3564, 0.0396, 0.0396, 0.0396, 0.0396, 0.0396,
                       0.0396]])
    # list of ones, as long as theta, given to the EMD ground distance
    ltheta = len(theta.ravel())*[1, ]

    # NOTE(review): `profile` is not defined in this function; presumably it
    # is provided by a profiler or defined elsewhere in the file -- confirm
    if 'emd' in metric:
        from emd import emd
        from emd_dst import dist_for_emd
        # more specific ground distances replace the default one; 'itml' wins
        # when both substrings are present
        if 'tsne' in metric:
            from specific_emd_dst import dst_tsne as dist_for_emd
        if 'itml' in metric:
            from specific_emd_dst import dst_itml as dist_for_emd
        query_num = features_as_lists(features)

        @profile
        def regions_distance(r_features, r_weigths):
            # regions with too many points get a huge distance instead of
            # being evaluated
            if len(r_features) >= MAX_EMD_POINTS:
                return 1e20
            return emd((query_num, map(float, weights)),
                       (r_features, map(float, r_weigths)),
                       lambda a, b: float(dist_for_emd(a, b, ltheta)))
    elif 'cluster' in metric:
        from scipy.spatial.distance import cdist
        query_num = weighted_clusters(features, NB_CLUSTERS, weights)

        def regions_distance(r_features, r_weigths):
            # pairwise distances between the clusters of the query and those
            # of the candidate, reduced by `min_cost`
            r_cluster = weighted_clusters(r_features, NB_CLUSTERS, r_weigths)
            costs = cdist(query_num, r_cluster).tolist()
            return min_cost(costs)
    elif 'leftover' in metric:

        @profile
        def regions_distance(r_features, second_arg):
            # no distance is computed here: the problem is written out by
            # `emd_leftover` and -1 is always returned
            r_weigths, idx = second_arg
            emd_leftover.write_matlab_problem(features, weights, r_features,
                                              r_weigths, idx)
            return -1
    else:
        query_num = features_as_density(features, weights, left_support)

        @profile
        def regions_distance(r_density, r_global):
            """Return distance of a region from `query_num`."""
            return proba_distance(query_num, times, r_density, r_global,
                                  theta)

    # Load info of the target city
    right = cn.gather_info(to_city+suffix, knn=2,
                           raw_features='lmnn' not in metric,
                           hide_category=metric != 'jsd')
    right_infos = load_surroundings(to_city)
    minx, miny, maxx, maxy = right_infos[1]
    right_city_size = (maxx - minx, maxy - miny)
    right_support = features_support(right['features'])
    # also exposed at module level
    global RIGHT_SUPPORT
    RIGHT_SUPPORT = right_support

    # given extents, compute threshold of candidate
    # i.e. 70% of the number of venues of the target city that matches the
    # proportion of the query in the first city
    threshold = 0.7 * venue_proportion * right['features'].shape[0]
    right_desc = [right_city_size, right_support, right, right_infos]

    return [left, right, right_desc, regions_distance, vids, threshold]
Ejemplo n.º 13
0
if __name__ == '__main__':
    # pylint: disable=C0103
    client = pymongo.MongoClient()
    db = client.foursquare
    with open('static/raw_ground_truth.json') as inf:
        regions = json.load(inf)
    cities_venues = {}
    for district, gold in regions.iteritems():
        # a not very elegant indirection
        gold = gold['gold']
        for city, areas in gold.iteritems():
            print(city, district)
            if city not in cities_venues:
                try:
                    city_venues = list(cn.load_matrix(city)['i'])
                except IOError:
                    city_venues = None
                cities_venues[city] = city_venues
            if cities_venues[city]:
                ground_truth = merge_regions(city, district, db,
                                             cities_venues[city])
                msg = '{}, {}: merged into {} areas'
                print(msg.format(city, district, len(ground_truth)))
                regions[district]['gold'][city] = ground_truth
            else:
                msg = '{}, {}: not merged'
                print(msg.format(city, district))
    with open('static/ground_truth.json', 'w') as out:
        json.dump(regions,
                  out,
Ejemplo n.º 14
0
def show_gold(city, neighborhood):
    """Render the ground truth page of `neighborhood` in `city`.

    The information gathered for `city` is stored in the module-level
    `ORIGIN` variable.
    """
    global ORIGIN
    ORIGIN = cn.gather_info(city, 1, raw_features=True)
    context = {
        'district': neighborhood,
        'bbox': c.BBOXES[city],
        'city': city,
    }
    return f.render_template('gold.html', **context)
Ejemplo n.º 15
0
import CommonMongo as cm

if __name__ == '__main__':
    # Exploratory script: cluster the venues of one city with various
    # algorithms and look at the silhouette score of K-means for several k.
    # pylint: disable=C0103
    import arguments
    args = arguments.city_parser().parse_args()
    city = args.city
    DB, CLIENT = cm.connect_to_db('foursquare', args.host, args.port)

    # Each assignment replaces the previous one, so only the last clusterer
    # stays bound (until the loop below rebinds the name again).
    clusterer = cl.KMeans(3, n_init=5)
    clusterer = cl.MeanShift(min_bin_freq=3, cluster_all=False)
    clusterer = cl.DBSCAN(eps=5, min_samples=8, metric='cityblock')
    clusterer = cl.AffinityPropagation(damping=.55, affinity='euclidean')
    clusterer = cl.SpectralClustering(3, affinity='cosine', n_init=3)

    hel = cn.load_matrix(city)
    features = hel['v']
    scale = pp.MinMaxScaler(copy=False)
    # The result is discarded; presumably the first three columns are meant
    # to be rescaled in place thanks to copy=False -- TODO confirm
    scale.fit_transform(features[:, 0:3])
    scores = []
    # silhouette score of K-means for k from 3 to 15
    for k in range(3, 16):
        clusterer = cl.KMeans(k, n_init=10, tol=1e-5, max_iter=500)
        labels = clusterer.fit_predict(features)
        scores.append(mt.silhouette_score(features, labels))
        print(Counter(labels))
    # The values of k sorted by decreasing score; the result is not stored,
    # so this line has no effect when run as a script.
    np.argsort(scores)[::-1] + 3
    ppl.plot(range(3, 16), scores[0:], '+')
    # again, only the second clusterer stays bound
    clusterer = cl.MeanShift(min_bin_freq=3, cluster_all=False)
    clusterer = cl.KMeans(6, n_init=20, tol=1e-5, max_iter=500)

    visits = xp.get_visits(CLIENT, xp.Entity.venue, city)
Ejemplo n.º 16
0
def interpret_query(from_city, to_city, region, metric):
    """Load informations about cities and compute useful quantities.

    `region` is a polygon of `from_city` describing the query. `metric` is a
    string selecting how regions are compared: it is tested for the
    substrings 'emd' (optionally with 'tsne' or 'itml'), 'cluster' and
    'leftover', in that order; anything else uses `proba_distance`.

    Return the list `[left, right, right_desc, regions_distance, vids,
    threshold]` where `left` and `right` are the info gathered for both
    cities, `right_desc` is `[right_city_size, right_support, right,
    right_infos]`, `regions_distance` is the function comparing a candidate
    region with the query, `vids` comes from the description of the query
    region and `threshold` is derived from the proportion of venues of
    `from_city` it represents.

    As a side effect, set the global `RIGHT_SUPPORT`.
    """
    # Load info of the first city
    # only the exact 'emd-tsne' metric uses the '_tsne.mat' variant
    suffix = '_tsne.mat' if metric == 'emd-tsne' else ''
    left = cn.gather_info(from_city + suffix,
                          knn=1,
                          raw_features='lmnn' not in metric,
                          hide_category=metric != 'jsd')
    left_infos = load_surroundings(from_city)
    left_support = features_support(left['features'])

    # Compute info about the query region
    center, radius, _, contains = polygon_to_local(from_city, region)
    query = describe_region(center, radius, contains, left_infos[0], left)
    features, times, weights, vids = query
    # print('{} venues in query region.'.format(len(vids)))
    # fraction of the venues of the first city lying in the query
    venue_proportion = 1.0 * len(vids) / left['features'].shape[0]

    # And use them to define the metric that will be used
    # NOTE(review): this uniform theta is overwritten right away by the
    # hard-coded 31 values below, whatever the number of features
    theta = np.ones((1, left['features'].shape[1]))
    theta = np.array([[
        0.0396, 0.0396, 0.2932, 0.0396, 0.0396, 0.0396, 0.0396, 0.3404, 0.0396,
        0.0396, 0.0396, 0.0396, 0.0396, 0.3564, 0.0396, 0.3564, 0.0396, 0.3564,
        0.3564, 0.3564, 0.0396, 0.0396, 0.0396, 0.0396, 0.3564, 0.0396, 0.0396,
        0.0396, 0.0396, 0.0396, 0.0396
    ]])
    # list of ones, as long as theta, given to the EMD ground distance
    ltheta = len(theta.ravel()) * [
        1,
    ]

    # NOTE(review): `profile` is not defined in this function; presumably it
    # is provided by a profiler or defined elsewhere in the file -- confirm
    if 'emd' in metric:
        from emd import emd
        from emd_dst import dist_for_emd
        # more specific ground distances replace the default one; 'itml' wins
        # when both substrings are present
        if 'tsne' in metric:
            from specific_emd_dst import dst_tsne as dist_for_emd
        if 'itml' in metric:
            from specific_emd_dst import dst_itml as dist_for_emd
        query_num = features_as_lists(features)

        @profile
        def regions_distance(r_features, r_weigths):
            # regions with too many points get a huge distance instead of
            # being evaluated
            if len(r_features) >= MAX_EMD_POINTS:
                return 1e20
            return emd((query_num, map(float, weights)),
                       (r_features, map(float, r_weigths)),
                       lambda a, b: float(dist_for_emd(a, b, ltheta)))
    elif 'cluster' in metric:
        from scipy.spatial.distance import cdist
        query_num = weighted_clusters(features, NB_CLUSTERS, weights)

        def regions_distance(r_features, r_weigths):
            # pairwise distances between the clusters of the query and those
            # of the candidate, reduced by `min_cost`
            r_cluster = weighted_clusters(r_features, NB_CLUSTERS, r_weigths)
            costs = cdist(query_num, r_cluster).tolist()
            return min_cost(costs)
    elif 'leftover' in metric:

        @profile
        def regions_distance(r_features, second_arg):
            # no distance is computed here: the problem is written out by
            # `emd_leftover` and -1 is always returned
            r_weigths, idx = second_arg
            emd_leftover.write_matlab_problem(features, weights, r_features,
                                              r_weigths, idx)
            return -1
    else:
        query_num = features_as_density(features, weights, left_support)

        @profile
        def regions_distance(r_density, r_global):
            """Return distance of a region from `query_num`."""
            return proba_distance(query_num, times, r_density, r_global, theta)

    # Load info of the target city
    right = cn.gather_info(to_city + suffix,
                           knn=2,
                           raw_features='lmnn' not in metric,
                           hide_category=metric != 'jsd')
    right_infos = load_surroundings(to_city)
    minx, miny, maxx, maxy = right_infos[1]
    right_city_size = (maxx - minx, maxy - miny)
    right_support = features_support(right['features'])
    # also exposed at module level
    global RIGHT_SUPPORT
    RIGHT_SUPPORT = right_support

    # given extents, compute threshold of candidate
    # i.e. 70% of the number of venues of the target city that matches the
    # proportion of the query in the first city
    threshold = 0.7 * venue_proportion * right['features'].shape[0]
    right_desc = [right_city_size, right_support, right, right_infos]

    return [left, right, right_desc, regions_distance, vids, threshold]
Ejemplo n.º 17
0

if __name__ == '__main__':
    # Exploratory script: cluster the venues of one city with various
    # algorithms and look at the silhouette score of K-means for several k.
    # pylint: disable=C0103
    import arguments
    args = arguments.city_parser().parse_args()
    city = args.city
    DB, CLIENT = cm.connect_to_db('foursquare', args.host, args.port)

    # Each assignment replaces the previous one, so only the last clusterer
    # stays bound (until the loop below rebinds the name again).
    clusterer = cl.KMeans(3, n_init=5)
    clusterer = cl.MeanShift(min_bin_freq=3, cluster_all=False)
    clusterer = cl.DBSCAN(eps=5, min_samples=8, metric='cityblock')
    clusterer = cl.AffinityPropagation(damping=.55, affinity='euclidean')
    clusterer = cl.SpectralClustering(3, affinity='cosine', n_init=3)

    hel = cn.load_matrix(city)
    features = hel['v']
    scale = pp.MinMaxScaler(copy=False)
    # The result is discarded; presumably the first three columns are meant
    # to be rescaled in place thanks to copy=False -- TODO confirm
    scale.fit_transform(features[:, 0:3])
    scores = []
    # silhouette score of K-means for k from 3 to 15
    for k in range(3, 16):
        clusterer = cl.KMeans(k, n_init=10, tol=1e-5, max_iter=500)
        labels = clusterer.fit_predict(features)
        scores.append(mt.silhouette_score(features, labels))
        print(Counter(labels))
    # The values of k sorted by decreasing score; the result is not stored,
    # so this line has no effect when run as a script.
    np.argsort(scores)[::-1]+3
    ppl.plot(range(3, 16), scores[0:], '+')
    # again, only the second clusterer stays bound
    clusterer = cl.MeanShift(min_bin_freq=3, cluster_all=False)
    clusterer = cl.KMeans(6, n_init=20, tol=1e-5, max_iter=500)

    visits = xp.get_visits(CLIENT, xp.Entity.venue, city)
Ejemplo n.º 18
0
if __name__ == '__main__':
    # Merge the regions of the raw ground truth, for every city whose venues
    # can be loaded, and save the result in 'static/ground_truth.json'.
    # pylint: disable=C0103
    client = pymongo.MongoClient()
    db = client.foursquare
    with open('static/raw_ground_truth.json') as inf:
        regions = json.load(inf)
    # venue ids of each city, or None when its matrix cannot be read
    cities_venues = {}
    for district, gold in regions.iteritems():
        # the regions of each city are nested under the 'gold' key
        gold = gold['gold']
        for city, areas in gold.iteritems():
            print(city, district)
            if city not in cities_venues:
                try:
                    cities_venues[city] = list(cn.load_matrix(city)['i'])
                except IOError:
                    cities_venues[city] = None
            known_venues = cities_venues[city]
            if not known_venues:
                print('{}, {}: not merged'.format(city, district))
                continue
            ground_truth = merge_regions(city, district, db, known_venues)
            print('{}, {}: merged into {} areas'.format(city, district,
                                                        len(ground_truth)))
            regions[district]['gold'][city] = ground_truth
    with open('static/ground_truth.json', 'w') as out:
        json.dump(regions, out, sort_keys=True, indent=2,
                  separators=(',', ': '))