Exemple #1
0
def global_info(city, standalone=False):
    """Gather global statistics about `city`."""
    lvenues = geo_project(city, DB.venue.find({'city': city}, {'loc': 1}))
    lcheckins = geo_project(city, DB.checkin.find({'city': city}, {'loc': 1}))
    lphotos = geo_project(city, CLIENT.world.photos.find({'hint': city},
                                                         {'loc': 1}))
    local_projection = [lvenues, lcheckins, lphotos]
    visits = xp.get_visits(CLIENT, xp.Entity.venue, city)
    visitors = xp.get_visitors(CLIENT, city)
    density = estimate_density(city)
    activity = [visits, visitors, density]
    global TOP_CATS
    TOP_CATS = p.load_var('top_cats.my')
    infos = {'venue': [] if standalone else ['cat', 'cats'],
             'photo': ['taken'] if standalone else ['venue']}
    svenues = s.Surrounding(DB.venue, {'city': city}, infos['venue'], lvenues)
    scheckins = s.Surrounding(DB.checkin, {'city': city}, ['time'], lcheckins)
    sphotos = s.Surrounding(CLIENT.world.photos, {'hint': city},
                            infos['photo'], lphotos)
    surroundings = [svenues, scheckins, sphotos]
    p.save_var('{}_s{}s.my'.format(city, 'venue'), svenues)
    if standalone:
        for name, var in zip(['venue', 'checkin', 'photo'], surroundings):
            p.save_var('{}_s{}s.my'.format(city, name), var)
    return local_projection + activity + surroundings
Exemple #2
0
def venues_info(vids, visits=None, visitors=None, density=None, depth=10,
                tags_freq=True):
    """Return various info about from the venue ids `vids`."""
    tags = defaultdict(int)
    city = DB.venue.find_one({'_id': vids[0]})['city']
    visits = visits or xp.get_visits(CLIENT, xp.Entity.venue, city)
    visitors = visitors or xp.get_visitors(CLIENT, city)
    density = density or estimate_density(city)
    venues = list(DB.venue.find({'_id': {'$in': vids}},
                                {'cat': 1, 'name': 1, 'loc': 1,
                                 'price': 1, 'rating': 1, 'tags': 1,
                                 'likes': 1, 'usersCount': 1,
                                 'checkinsCount': 1}))

    msg = 'Asked for {} but get only {}'.format(len(vids), len(venues))
    assert len(vids) == len(venues), msg
    res = pd.DataFrame(index=[_['_id'] for _ in venues])

    def add_col(field):
        res[field.replace('Count', '')] = [_[field] for _ in venues]
    for field in ['name', 'price', 'rating', 'likes',
                  'usersCount', 'checkinsCount']:
        add_col(field)
    if tags_freq:
        res['tags'] = [[normalized_tag(t) for t in _['tags']] for _ in venues]
    loc = [_['loc']['coordinates'] for _ in venues]
    get_cat = lambda c, d: top_category(c) if d == 1 else parenting_cat(c, d)
    res['cat'] = [get_cat(_['cat'], depth) for _ in venues]
    res['vis'] = [len(visits[id_]) for id_ in res.index]
    res['H'] = [venue_entropy(visitors[id_]) for id_ in res.index]
    res['Ht'] = [time_entropy(visits[id_]) for id_ in res.index]
    coords = np.fliplr(np.array(loc))
    points = cm.cities.GEO_TO_2D[city](coords)
    res['Den'] = density(points)
    if tags_freq:
        for venue in venues:
            for tag in venue['tags']:
                tags[normalized_tag(tag)] += 1
    return res, OrderedDict(sorted(tags.iteritems(), key=lambda x: x[1],
                                   reverse=True))
Exemple #3
0
    clusterer = cl.DBSCAN(eps=5, min_samples=8, metric='cityblock')
    clusterer = cl.AffinityPropagation(damping=.55, affinity='euclidean')
    clusterer = cl.SpectralClustering(3, affinity='cosine', n_init=3)

    hel = cn.load_matrix(city)
    features = hel['v']
    scale = pp.MinMaxScaler(copy=False)
    scale.fit_transform(features[:, 0:3])
    scores = []
    for k in range(3, 16):
        clusterer = cl.KMeans(k, n_init=10, tol=1e-5, max_iter=500)
        labels = clusterer.fit_predict(features)
        scores.append(mt.silhouette_score(features, labels))
        print(Counter(labels))
    np.argsort(scores)[::-1]+3
    ppl.plot(range(3, 16), scores[0:], '+')
    clusterer = cl.MeanShift(min_bin_freq=3, cluster_all=False)
    clusterer = cl.KMeans(6, n_init=20, tol=1e-5, max_iter=500)

    visits = xp.get_visits(CLIENT, xp.Entity.venue, city)
    visitors = xp.get_visitors(CLIENT, city)
    density = vf.estimate_density(city)
    c0, _ = vf.venues_info([_ for _ in hel['i'][labels == 0].tolist()
                            if _ in visits],
                           visits, visitors, density, depth=2, tags_freq=False)
    c5, v = vf.venues_info([v for v in hel['i'][labels == 5].tolist()
                            if v in visits],
                           visits, visitors, density, depth=2, tags_freq=False)
    c0.describe()
    c5.describe()
Exemple #4
0
    features = hel['v']
    scale = pp.MinMaxScaler(copy=False)
    scale.fit_transform(features[:, 0:3])
    scores = []
    for k in range(3, 16):
        clusterer = cl.KMeans(k, n_init=10, tol=1e-5, max_iter=500)
        labels = clusterer.fit_predict(features)
        scores.append(mt.silhouette_score(features, labels))
        print(Counter(labels))
    np.argsort(scores)[::-1] + 3
    ppl.plot(range(3, 16), scores[0:], '+')
    clusterer = cl.MeanShift(min_bin_freq=3, cluster_all=False)
    clusterer = cl.KMeans(6, n_init=20, tol=1e-5, max_iter=500)

    visits = xp.get_visits(CLIENT, xp.Entity.venue, city)
    visitors = xp.get_visitors(CLIENT, city)
    density = vf.estimate_density(city)
    c0, _ = vf.venues_info(
        [_ for _ in hel['i'][labels == 0].tolist() if _ in visits],
        visits,
        visitors,
        density,
        depth=2,
        tags_freq=False)
    c5, v = vf.venues_info(
        [v for v in hel['i'][labels == 5].tolist() if v in visits],
        visits,
        visitors,
        density,
        depth=2,
        tags_freq=False)