Example #1
def disc_latex(N=11):
    line = u'{} & {:.3f} & {} & {:.3f} & {} & {:.3f} \\\\'
    from rank_disc import top_discrepancy
    t = [persistent.load_var('disc/all'),
         persistent.load_var('disc/all_80'),
         persistent.load_var('disc/all_20')]
    supported = [v[0] for v in persistent.load_var('supported')]
    d = zip(*[top_discrepancy(l, supported) for l in t])
    display = lambda v: line.format(v[0][2], v[0][0], v[1][2], v[1][0],
                                    v[2][2], v[2][0])
    for v in d[:N]:
        print(display(v))
    for v in d[-N:]:
        print(display(v))
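Every example in this listing goes through the same `persistent` helper module, whose implementation is not shown. A minimal sketch of what it presumably does, assuming one pickle file per variable (an assumption, not the original code):

# Hypothetical stand-in for the persistent module used throughout:
# one object pickled per file, loaded back verbatim.
import pickle

def save_var(filename, variable):
    with open(filename, 'wb') as f:
        pickle.dump(variable, f, pickle.HIGHEST_PROTOCOL)

def load_var(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)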
Example #2
def brand_awareness(brand, src, dst):
    """For all venues of brand `brand` in `src`, return the position of the
    first matching venue of the same brand in `dst`, along with a score
    between 0 (best) and 1 (worst)."""
    res = []
    src_venues = p.load_var('{}_{}.my'.format(src['city'], brand))
    dst_venues = p.load_var('{}_{}.my'.format(dst['city'], brand))
    among = 0
    for venue in src_venues:
        _, ids, _, _, among = find_closest(venue, src, dst)
        ranks = [pos for pos, res_id in enumerate(ids)
                 if res_id in dst_venues]
        res.append((len(dst_venues), among, ranks))
    return res
Example #3
def build_dico(city, begin, client):
    technical = [
        'blackandwhite', 'longexposure', 'lomofi', 'mobile', 'bw',
        'cameraphone', 'lofi', 'lowlight', 'geotagged', 'xproii', 'nikon',
        'd200', 'square', 'instagramapp', 'squareformat', 'iphoneography',
        'iphone', 'colorvibefilter', 'tonemapped', 'chameleonfilter', 'hdr',
        'noflash', 'photo', 'iphone4', 'iphone5', 'fujix100', 'd700', 'f22',
        'photoshop', 'photography', 'pictures', 'f18',
        'canonef24105mmf4lisusm', 'nikond90', 'nikond700'
    ]
    dictionary = corpora.Dictionary(p['tags']
                                    for p in get_tags(city, client, begin))
    dictionary.filter_extremes(no_below=30, no_above=0.4, keep_n=None)
    dictionary.compactify()
    stop_ids = [
        dictionary.token2id[stopword] for stopword in technical
        if stopword in dictionary.token2id
    ]
    good_words = [_[0] for _ in p.load_var(city + '_tag_support')]
    good_ids = [
        dictionary.token2id[goodword] for goodword in good_words
        if goodword in dictionary.token2id
    ]
    dictionary.filter_tokens(bad_ids=stop_ids, good_ids=good_ids)
    # remove gaps in id sequence after words that were removed
    dictionary.compactify()
    print(dictionary)
    dictionary.save(city + '_flickr.dict')
    return dictionary
Example #4
def read_original_graph(filename, seed=None, balanced=False, directed=False):
    """Read a signed graph from `filename` and compute its degree sequence.
    Optionally `shuffle` nodes ids"""
    global DEGREES, G, EDGE_SIGN, INCONSISTENT
    DEGREES, G, EDGE_SIGN, INCONSISTENT = None, {}, {}, 0
    with open(filename) as source:
        for line in source:
            if line.startswith('#'):
                continue
            i, j, sign = [int(_) for _ in line.split()]
            if i == j:
                continue
            add_signed_edge(i, j, sign > 0, directed)
    # remove isolated vertices
    to_delete = [u for u, adj in G.items() if not adj]
    for u in to_delete:
        del G[u]
    # reindex nodes so they are sequential
    mapping = {v: i for i, v in enumerate(sorted(G.keys()))}
    G, EDGE_SIGN = reindex_nodes(G, EDGE_SIGN, mapping, directed)
    if balanced:
        import persistent
        src = DIRECTED_TO_DELETE if directed else EDGE_TO_DELETE
        to_delete = persistent.load_var(src[filename])
        for edge in to_delete:
            remove_signed_edge(*edge, directed=directed)
    if isinstance(seed, int):
        r.seed(seed)
        rperm = list(G.keys())
        r.shuffle(rperm)
        rperm = {i: v for i, v in enumerate(rperm)}
        G, EDGE_SIGN = reindex_nodes(G, EDGE_SIGN, rperm, directed)
    DEGREES = sorted(((node, len(adj)) for node, adj in G.items()),
                     key=lambda x: x[1])
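`read_original_graph` relies on an `add_signed_edge` helper that is not part of the listing. A minimal sketch, assuming `G` maps each node to a set of neighbours and `EDGE_SIGN` stores one boolean per edge key (a hypothetical helper, not the author's code):

def add_signed_edge(i, j, positive, directed=False):
    # register the edge in the adjacency dict and record its sign
    G.setdefault(i, set()).add(j)
    if directed:
        G.setdefault(j, set())
        EDGE_SIGN[(i, j)] = positive
    else:
        G.setdefault(j, set()).add(i)
        EDGE_SIGN[(min(i, j), max(i, j))] = positive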
Example #5
def load_graph(name, size=None):
    is_triangle = False
    if name.endswith('.my'):
        G, E = persistent.load_var(name)
        if name.find('triangle') < 0:
            return G, E
        else:
            is_triangle = True
            cexp.redensify.G = G
            cexp.redensify.N = len(G)
            cexp.redensify.EDGES_SIGN = E
    if name.endswith('.pbm'):
        return build_graph(*read_img(name))
    assert is_triangle or name in ['PA', 'grid']
    assert is_triangle or size > 10
    if name == 'PA':
        cexp.fast_preferential_attachment(size, 3, .13)
    if name == 'grid':
        G, E_keys = gs.make_grid(size)
        cexp.redensify.G = G
        cexp.redensify.N = len(G)
        cexp.redensify.EDGES_SIGN = {e: True for e in E_keys}
    n = cexp.redensify.N
    nb_cluster = int(2*sqrt(n))
    ci = cexp.turn_into_signed_graph_by_propagation(nb_cluster,
                                                    infected_fraction=.9)
    G = deepcopy(cexp.redensify.G)
    E = deepcopy(cexp.redensify.EDGES_SIGN)
    merge_into_2_clusters(E, ci)
    return G, E
Example #6
def global_info(city, standalone=False):
    """Gather global statistics about `city`."""
    lvenues = geo_project(city, DB.venue.find({'city': city}, {'loc': 1}))
    lcheckins = geo_project(city, DB.checkin.find({'city': city}, {'loc': 1}))
    lphotos = geo_project(city, CLIENT.world.photos.find({'hint': city},
                                                         {'loc': 1}))
    local_projection = [lvenues, lcheckins, lphotos]
    visits = xp.get_visits(CLIENT, xp.Entity.venue, city)
    visitors = xp.get_visitors(CLIENT, city)
    density = estimate_density(city)
    activity = [visits, visitors, density]
    global TOP_CATS
    TOP_CATS = p.load_var('top_cats.my')
    infos = {'venue': [] if standalone else ['cat', 'cats'],
             'photo': ['taken'] if standalone else ['venue']}
    svenues = s.Surrounding(DB.venue, {'city': city}, infos['venue'], lvenues)
    scheckins = s.Surrounding(DB.checkin, {'city': city}, ['time'], lcheckins)
    sphotos = s.Surrounding(CLIENT.world.photos, {'hint': city},
                            infos['photo'], lphotos)
    surroundings = [svenues, scheckins, sphotos]
    p.save_var('{}_s{}s.my'.format(city, 'venue'), svenues)
    if standalone:
        for name, var in zip(['venue', 'checkin', 'photo'], surroundings):
            p.save_var('{}_s{}s.my'.format(city, name), var)
    return local_projection + activity + surroundings
Example #7
    def add_cc_noise(noise, balanced=False, seed=None):
        global bfstrees
        redensify.G = deepcopy(orig_g)
        redensify.EDGES_SIGN = deepcopy(orig_es)
        if balanced and SYNTHETIC_DATA:
            to_delete = persistent.load_var(BASENAME+'_balance.my')
            for edge in to_delete:
                pot.delete_edge(redensify.G, edge, redensify.EDGES_SIGN)
        cexp.add_noise(noise/100, noise/100)
        if seed is not None:
            random.seed(seed)
            rperm = list(redensify.G.keys())
            random.shuffle(rperm)
            rperm = {i: v for i, v in enumerate(rperm)}
            _ = rw.reindex_nodes(redensify.G, redensify.EDGES_SIGN, rperm)
            redensify.G, redensify.EDGES_SIGN = _
        rw.G = deepcopy(redensify.G)
        rw.EDGE_SIGN = deepcopy(redensify.EDGES_SIGN)
        rw.DEGREES = sorted(((node, len(adj)) for node, adj in rw.G.items()),
                            key=lambda x: x[1])
        if not bfstrees:
            if SYNTHETIC_DATA:
                bfstrees = [t[1] for t in nsi.compute_trees()]
            else:
                bfstrees = []
                for root in (u[0] for u in rw.DEGREES[-50:]):
                    bfstrees.append(pot.get_bfs_tree(rw.G, root))

        return get_largest_component()
Example #8
def photos_to_cluster_dataset(city, limit=300):
    print 'utils.py/photos_to_cluster_dataset'
    photos = load_var(city)
    points = [[p[0] + noise(), p[1] + noise(), 'Win!']
              for p in photos[:limit]]
    with open(city+'_cluster.js', 'w') as f:
        f.write('var {}_cluster = {}'.format(city, str(points)))
Example #9
def increase_coverage(upto=5000):
    """Save `upto` unprocessed San Francisco tags"""
    from more_query import get_top_tags
    sup = persistent.load_var('supported')
    more = get_top_tags(upto, 'nsf_tag.dat')
    already = [v[0] for v in sup]
    addition = set(more).difference(set(already))
    persistent.save_var('addition', addition)
Example #10
def get_data(DB):
    entropies = load_var('Hsupported')
    tags = sorted([k for k, v in entropies.items() if 2.5 <= v <= 3.01])
    save_var('mat_tag', tags)
    u = load_var('user_status')
    user_index = {k: i for i, k in enumerate(u)}

    def format_photo(p):
        user = user_index[p['uid']]
        loc = p['loc']['coordinates']
        taken = [p['taken'].weekday(), p['taken'].hour,
                 calendar.timegm(p['taken'].utctimetuple())]
        indicator = [int(t in p['ntags']) for t in tags]
        return [user] + loc + taken + indicator

    photos_feature = np.mat(tag_time(DB, tags, format_photo))
    sio.savemat('deep', {'A': scipy.sparse.csr_matrix(photos_feature)})
Example #11
def photos_to_heat_dataset(city, precision=4, limit=300):
    photos = load_var(city)
    points = Counter([(round(p[0], precision), round(p[1], precision)) for p in photos])
    maxi = points.most_common(1)[0][1]
    dataset = [{"lat": p[1], "lon": p[0], "value": c} for p, c in points.most_common(limit)]
    json_dataset = json.dumps({"max": maxi, "data": dataset})
    with open(city + ".js", "w") as f:
        f.write("var {} = {}".format(city, json_dataset))
Example #12
def insert_pending_task(conn, end, pending_file):
    try:
        pending_task = load_var(pending_file)
        pending_task['end'] = end
        pending_task['e_offset'] = date_offset(pending_task['end'])
        insert_task(conn, pending_task)
        os.remove(pending_file)
    except IOError:
        pass
Example #13
def load_surroundings(city):
    """Load projected coordinates and extra field of all venues, checkins and
    photos within `city`, as well as returning city geographical bounds."""
    import persistent as p
    surroundings = [p.load_var('{}_svenues.my'.format(city)), None, None]
    # surroundings = [p.load_var('{}_s{}s.my'.format(city, kind))
    #                 for kind in ['venue', 'checkin', 'photo']]
    venues_pos = np.vstack(surroundings[0].loc)
    city_extent = list(np.min(venues_pos, 0)) + list(np.max(venues_pos, 0))
    return surroundings, city_extent
Example #14
def load_surroundings(city):
    """Load projected coordinates and extra field of all venues, checkins and
    photos within `city`, as well as returning city geographical bounds."""
    import persistent as p
    surroundings = [p.load_var('{}_svenues.my'.format(city)), None, None]
    # surroundings = [p.load_var('{}_s{}s.my'.format(city, kind))
    #                 for kind in ['venue', 'checkin', 'photo']]
    venues_pos = np.vstack(surroundings[0].loc)
    city_extent = list(np.min(venues_pos, 0)) + list(np.max(venues_pos, 0))
    return surroundings, city_extent
Example #15
def get_best_tags(point):
    tags = persistent.load_var(u'disc/all_{}'.format(GRID_SIZE))
    res = []
    size = point.area
    for tag, polys in tags.items():
        for val, poly in polys:
            if point.intersects(poly) and \
               point.intersection(poly).area > .6*size:
                res.append((tag, val))
                break
    return sorted(res, key=lambda x: x[1], reverse=True)
Example #16
def photos_to_heat_dataset(city, precision=4, limit=300):
    print 'utils.py/photos_to_heat_dataset'
    photos = load_var(city)
    points = Counter([(round(p[0], precision), round(p[1], precision))
                      for p in photos])
    maxi = points.most_common(1)[0][1]
    dataset = [{'lat': p[1], 'lon': p[0], 'value': c}
               for p, c in points.most_common(limit)]
    json_dataset = json.dumps({'max': maxi, 'data': dataset})
    with open(city+'.js', 'w') as f:
        f.write('var {} = {}'.format(city, json_dataset))
Example #17
def get_best_tags(point):
    tags = persistent.load_var(u'disc/all_{}'.format(GRID_SIZE))
    res = []
    size = point.area
    for tag, polys in tags.items():
        for val, poly in polys:
            if point.intersects(poly) and \
               point.intersection(poly).area > .6*size:
                res.append((tag, val))
                break
    return sorted(res, key=lambda x: x[1], reverse=True)
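A hypothetical call to `get_best_tags`, assuming `GRID_SIZE` is set and the 'disc/all_<GRID_SIZE>' file written by `consolidate` (Example #43) is on disk; despite its name, the `point` argument is a shapely polygon, here one grid cell:

from shapely.geometry import box

cell = box(-122.43, 37.77, -122.42, 37.78)  # one hypothetical cell over SF
for tag, score in get_best_tags(cell)[:5]:
    print(u'{}: {:.3f}'.format(tag, score))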
Example #18
def load_graph(seed=None):
    if BASENAME.startswith('soc'):
        rw.read_original_graph(BASENAME, seed=seed, balanced=BALANCED)
        redensify.G = deepcopy(rw.G)
        redensify.EDGES_SIGN = deepcopy(rw.EDGE_SIGN)
    elif DATA == 'LP':
        _ = persistent.load_var(BASENAME+'.my')
        redensify.G, redensify.EDGES_SIGN = _
        return
    else:
        G = gt.load_graph(BASENAME+'.gt')
        cexp.to_python_graph(G)
Example #19
def photos_to_heat_dataset(city, precision=4, limit=300):
    photos = load_var(city)
    points = Counter([(round(p[0], precision), round(p[1], precision))
                      for p in photos])
    maxi = points.most_common(1)[0][1]
    dataset = [{
        'lat': p[1],
        'lon': p[0],
        'value': c
    } for p, c in points.most_common(limit)]
    json_dataset = json.dumps({'max': maxi, 'data': dataset})
    with open(city + '.js', 'w') as f:
        f.write('var {} = {}'.format(city, json_dataset))
Example #20
def build_dico(city, begin, client):
    technical = [
        "blackandwhite",
        "longexposure",
        "lomofi",
        "mobile",
        "bw",
        "cameraphone",
        "lofi",
        "lowlight",
        "geotagged",
        "xproii",
        "nikon",
        "d200",
        "square",
        "instagramapp",
        "squareformat",
        "iphoneography",
        "iphone",
        "colorvibefilter",
        "tonemapped",
        "chameleonfilter",
        "hdr" "noflash",
        "photo",
        "iphone4",
        "iphone5",
        "fujix100",
        "d700",
        "f22",
        "photoshop",
        "photography",
        "pictures",
        "f18",
        "canonef24105mmf4lisusm",
        "nikond90",
        "nikond700",
    ]
    dictionary = corpora.Dictionary(p["tags"] for p in get_tags(city, client, begin))
    dictionary.filter_extremes(no_below=30, no_above=0.4, keep_n=None)
    dictionary.compactify()
    stop_ids = [dictionary.token2id[stopword] for stopword in technical if stopword in dictionary.token2id]
    good_words = [_[0] for _ in p.load_var(city + "_tag_support")]
    good_ids = [dictionary.token2id[goodword] for goodword in good_words if goodword in dictionary.token2id]
    dictionary.filter_tokens(bad_ids=stop_ids, good_ids=good_ids)
    # remove gaps in id sequence after words that were removed
    dictionary.compactify()
    print(dictionary)
    dictionary.save(city + "_flickr.dict")
    return dictionary
Example #21
def get_user_status(with_count=False):
    name = 'user_status' + ('_full' if with_count else '')
    fields = {'tourist': 1}
    if with_count:
        fields.update({'count': 1})
    try:
        d = load_var(name)
    except IOError:
        users = list(DB.users.find(fields=fields))
        if with_count:
            d = dict([(u['_id'], (u['count'], u['tourist'])) for u in users])
        else:
            d = dict([(u['_id'], u['tourist']) for u in users])
        save_var(name, d)
    return d
Example #22
def get_user_status(with_count=False):
    name = 'user_status' + ('_full' if with_count else '')
    fields = {'tourist': 1}
    if with_count:
        fields.update({'count': 1})
    try:
        d = load_var(name)
    except IOError:
        users = list(DB.users.find(fields=fields))
        if with_count:
            d = dict([(u['_id'], (u['count'], u['tourist'])) for u in users])
        else:
            d = dict([(u['_id'], u['tourist']) for u in users])
        save_var(name, d)
    return d
Example #23
def load_real_graph(dataset='citeseer', main_class=None):
    default_main_class = {'citeseer': 1,
                          'cora': 2,
                          'pubmed_core': 1,
                          'usps4500': 4,
                          'rcv1': 2,
                          'imdb': 0,
                          }
    main_class = main_class if main_class is not None else default_main_class[dataset]
    ew, y = persistent.load_var('{}_lcc.my'.format(dataset))
    adj = {}
    for u, v in ew:
        add_edge(adj, u, v)
    gold = {i: 1 if v == main_class else -1 for i, v in enumerate(y)}
    return adj, ew, gold, compute_phi(ew, gold)
Example #24
def describe_city(city):
    """Compute feature vector for selected venue in `city`."""
    CATS2 = p.load_var('cat_depth_2.my')
    # a few venues don't have level 2 categories (TODO add it manually?)
    CATS2.update({cat: int(idx*1e5) for idx, cat in enumerate(CATS)})
    info = global_info(city)
    lvenues, lcheckins, lphotos = info[:3]
    visits, visitors, density = info[3:6]
    nb_visitors = np.unique(np.array([v for place in visitors.itervalues()
                                      for v in place])).size
    svenues, scheckins, sphotos = info[6:]
    categories = categories_repartition(city, svenues, lvenues, RADIUS)
    venues = DB.venue.find({'city': city, 'closed': {'$ne': True},
                            'cat': {'$ne': None}, 'usersCount': {'$gt': 1}},
                           {'cat': 1})
    chosen = [v['_id'] for v in venues
              if len(visits.get(v['_id'], [])) > 4 and
              len(np.unique(visitors.get(v['_id'], []))) > 1 and
              not is_event(v['cat'])]
    print("Chosen {} venues in {}.".format(len(chosen), city))
    info, _ = venues_info(chosen, visits, visitors, density, depth=2,
                          tags_freq=False)
    print("{} of them will be in the matrix.".format(len(info)))
    numeric = np.zeros((len(info), 31), dtype=np.float32)
    numeric[:, :5] = np.array([info['likes'], info['users'], info['checkins'],
                               info['H'], info['Den']]).T
    print('venues with no level 2 category:')
    print([info.index[i] for i, c in enumerate(info['cat'])
           if CATS2[c] % int(1e5) == 0])
    numeric[:, 5] = [CATS2[c] for c in info['cat']]
    numeric[:, 24] = np.array(info['Ht'])
    for idx, vid in enumerate(info.index):
        surrounding = full_surrounding(vid, lvenues, lphotos, lcheckins,
                                       svenues, scheckins, sphotos, city)
        cat, focus, ratio, around_visits = surrounding
        numeric[idx, 6:15] = cat
        numeric[idx, 15] = focus
        numeric[idx, 16] = ratio
        own_visits = visits[vid]
        numeric[idx, 17] = is_week_end_place(own_visits)
        daily_visits = xp.aggregate_visits(own_visits, 1, 4)[0]
        numeric[idx, 18:24] = xp.to_frequency(daily_visits)
        numeric[idx, 25:31] = xp.to_frequency(around_visits)
    weird = np.argwhere(np.logical_or(np.isnan(numeric), np.isinf(numeric)))
    numeric[weird] = 0.0
    sio.savemat(city+'_fv', {'v': numeric, 'c': categories,
                             'i': np.array(list(info.index)),
                             'stat': [nb_visitors]}, do_compression=True)
Example #25
def get_categories(client=None):
    """Return categories list from disk or from Foursquare website using
    client"""
    if client is None:
        raw_cats = p.load_var('raw_categories')['categories']
    else:
        raw_cats = client.venues.categories()
        p.save_var('raw_categories', raw_cats)
        raw_cats = raw_cats['categories']
    cats = Category('1', 'Venue', 0, parse_categories(raw_cats))
    # pylint: disable=E1101
    id_index = [(id_, idx + 100)
                for idx, id_ in enumerate(sorted(CAT_TO_ID.values()))
                if id_ not in ['0', '1']]
    ID_TO_INDEX.update(id_index)
    return cats
Example #26
def load_existing_ids(cmd_args):
    """Read checkins ids in city from disk or DB."""
    city = cmd_args.city
    if city == 'whole':
        return []
    import persistent as p
    try:
        return p.load_var(city+'_checkins_ids.my')
    except IOError:
        pass
    import CommonMongo as cm
    db = cm.connect_to_db('foursquare', cmd_args.host, cmd_args.port)[0]
    ids = {str(_['_id']) for _ in db.checkin.find({'city': city}, {'_id': 1})
           if not isinstance(_['_id'], long)}
    p.save_var(city+'_checkins_ids.my', ids)
    return ids
Example #27
def get_categories(client=None):
    """Return categories list from disk or from Foursquare website using
    client"""
    if client is None:
        raw_cats = p.load_var('raw_categories')['categories']
    else:
        raw_cats = client.venues.categories()
        p.save_var('raw_categories', raw_cats)
        raw_cats = raw_cats['categories']
    cats = Category('1', 'Venue', 0, parse_categories(raw_cats))
    # pylint: disable=E1101
    id_index = [(id_, idx + 100)
                for idx, id_ in enumerate(sorted(CAT_TO_ID.values()))
                if id_ not in ['0', '1']]
    ID_TO_INDEX.update(id_index)
    return cats
Example #28
 def load_data(self, dataset, balanced=False, small_wiki=False):
     timestamp = 'ts' in dataset
     if small_wiki:
         Gfull, E = p.load_var('small_wiki.my')
     elif balanced:
         l.rw.read_original_graph(lp.FILENAMES[dataset], directed=True,
                                  balanced=balanced)
     # conflicting = set()
     # for (u, v), s in l.rw.EDGE_SIGN.items():
     #     opposite_sign = l.rw.EDGE_SIGN.get((v, u))
     #     if opposite_sign is not None and s != opposite_sign:
     #         conflicting.add(tuple(sorted([u, v])))
     # msg = 'Number of conflicting edges in {}: {}'
     # print(msg.format(dataset, 2*len(conflicting)))
     # for u, v in conflicting:
     #     l.rw.remove_signed_edge(u, v, directed=True)
     #     l.rw.remove_signed_edge(v, u, directed=True)
         Gfull, E = l.rw.G, l.rw.EDGE_SIGN
     else:
         pack_name = 'directed_{}.pack'.format(dataset)
         if timestamp:
             order_name = 'directed_{}_order.pack'.format(dataset)
             with open(order_name, 'r+b') as packfile:
                 self.time_order = msgpack.unpack(packfile, use_list=False)
         Gfull, E = load_directed_signed_graph(pack_name)
     # root = max(Gfull.items(), key=lambda x: len(x[1]))[0]
     # Gbfs, _, _ = initial_spanning_tree(Gfull, root)
     # self.lcc = set(Gbfs.keys())
     self.order = len(Gfull)
     self.dout, self.din = defaultdict(int), defaultdict(int)
     self.common_nei = {e: Gfull[e[0]].intersection(Gfull[e[1]]) for e in E}
     self.Gout, self.Gin = {}, {}
     self.edge_order, in_lcc = {}, []
     for i, (u, v) in enumerate(sorted(E)):
         self.edge_order[(u, v)] = i
         # in_lcc.append(u in self.lcc and v in self.lcc)
         self.dout[u] += 1
         self.din[v] += 1
         l.add_neighbor(u, v, self.Gout)
         l.add_neighbor(v, u, self.Gin)
     self.reciprocal = {ei: self.edge_order[(e[1], e[0])]
                        for e, ei in self.edge_order.items()
                        if (e[1], e[0]) in E}
     # self.in_lcc = np.array(in_lcc, dtype=bool)
     self.Gfull = Gfull
     self.G = self.Gout
     self.E = E
Example #29
def load_existing_ids(cmd_args):
    """Read checkins ids in city from disk or DB."""
    city = cmd_args.city
    if city == 'whole':
        return []
    import persistent as p
    try:
        return p.load_var(city + '_checkins_ids.my')
    except IOError:
        pass
    import CommonMongo as cm
    db = cm.connect_to_db('foursquare', cmd_args.host, cmd_args.port)[0]
    ids = {
        str(_['_id'])
        for _ in db.checkin.find({'city': city}, {'_id': 1})
        if not isinstance(_['_id'], long)
    }
    p.save_var(city + '_checkins_ids.my', ids)
    return ids
Example #30
def load_data(city):
    features = cn.load_matrix(city + '_fv.mat')
    density = features['v'][:, 4]
    weights = density + np.abs(density.min())
    venues_generator = WeightedRandomGenerator(weights)

    vids, _, locs = p.load_var(city + '_svenues.my').all()
    vindex = features['i']
    venues = np.zeros((len(vindex), 2))
    index = dict(itertools.imap(lambda x: (x[1], x[0]), enumerate(vindex)))
    for vid, loc in itertools.izip(vids, locs):
        pos = index.get(vid)
        if pos is not None:
            venues[pos, :] = loc
    kdtree = cKDTree(venues)

    with open('static/ground_truth.json') as infile:
        gold_list = json.load(infile)

    return vindex, venues_generator, venues, kdtree, gold_list
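`load_data` draws venues through a `WeightedRandomGenerator` built from the density weights; that class is not shown here. The usual cumulative-sum construction would be a reasonable stand-in (an assumption, not the original class):

import bisect
import random

class WeightedRandomGenerator(object):
    """Yield indices with probability proportional to the given weights."""

    def __init__(self, weights):
        # running cumulative sum of the (non-negative) weights
        self.totals = []
        running = 0.0
        for w in weights:
            running += w
            self.totals.append(running)

    def next(self):
        # pick a point uniformly along the total mass and locate its bucket
        rnd = random.random() * self.totals[-1]
        return bisect.bisect_right(self.totals, rnd)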
Example #31
def load_data(city):
    features = cn.load_matrix(city + '_fv.mat')
    density = features['v'][:, 4]
    weights = density + np.abs(density.min())
    venues_generator = WeightedRandomGenerator(weights)

    vids, _, locs = p.load_var(city+'_svenues.my').all()
    vindex = features['i']
    venues = np.zeros((len(vindex), 2))
    index = dict(itertools.imap(lambda x: (x[1], x[0]),
                                enumerate(vindex)))
    for vid, loc in itertools.izip(vids, locs):
        pos = index.get(vid)
        if pos is not None:
            venues[pos, :] = loc
    kdtree = cKDTree(venues)

    with open('static/ground_truth.json') as infile:
        gold_list = json.load(infile)

    return vindex, venues_generator, venues, kdtree, gold_list
Example #32
def get_graph(balanced=False):
    """Load the graph from BASENAME and optionally remove positive edges to
    balance the graph. NOTE: this only modify redensify structure and not
    graph_tool & its distance matrix"""
    if balanced:
        import persistent
    if os.path.isfile(BASENAME+'.gt'):
        g = graph_tool.load_graph(BASENAME+'.gt')
        dst_mat = np.load(BASENAME+'_dst.npy')
        cexp.to_python_graph(g)
        if balanced:
            to_delete = persistent.load_var(BASENAME+'_balance.my')
            for edge in to_delete:
                pot.delete_edge(redensify.G, edge, redensify.EDGES_SIGN)
        return g, dst_mat
    if not PA:
        cexp.random_signed_communities(2, 500, 13, 11.5/500, .0, .0)
        g = cexp.to_graph_tool()
    else:
        cexp.preferential_attachment(1000, gamma=1.4, m=12)
        cexp.turn_into_signed_graph_by_propagation(2)
        DEGREES = sorted(((node, len(adj))
                          for node, adj in cexp.redensify.G.items()),
                         key=lambda x: x[1])
        u, v = DEGREES[-1][0], DEGREES[-2][0]
        u, v = (v, u) if u > v else (u, v)  # order the pair before deleting the edge
        del cexp.redensify.EDGES_SIGN[(u, v)]
        cexp.redensify.G[u].remove(v)
        cexp.redensify.G[v].remove(u)
    n = g.num_vertices()
    dst = shortest_distance(g, dense=False)
    dst_mat = np.zeros((n, n), dtype=np.uint8)
    for v in g.vertices():
        dst_mat[int(v), :] = dst[v].a.astype(np.uint8)
    g.save(BASENAME+'.gt')
    np.save(BASENAME+'_dst', dst_mat)
Example #33
    args = parser.parse_args()

    mapbox_service = mapbox.Static()
    mapbox_service.session.params['access_token'] = os.environ['MAPBOX_ACCESS_TOKEN']

    # alternatively, you can uncomment the next line and pass your token directly
    # mapbox_service.session.params['access_token'] = 'YOUR_TOKEN'

    mainCats = ['Arts & Entertainment', 'College & University', 'Food', 'Nightlife Spot',
            'Outdoors & Recreation', 'Shop & Service', 'Professional & Other Places',
            'Residence', 'Travel & Transport']
    main_cats_plot = mainCats[:]
    mainCats = {c: i for i, c in enumerate(mainCats)}
    mainCats['Event'] = 9

    name_to_main_cat = p.load_var('name_to_cat.my')
    name_to_main_cat['Event'] = 'Event'
    name_to_main_cat['NOPRIMCAT'] = 'Event'
    name_to_main_cat['Tennis'] = 'Outdoors & Recreation'

    timeOfDay = ['MORNING', 'NOON', 'AFTERNOON', 'EVENING', 'NIGHT', 'LATENIGHT']
    timeOfDay_plot = ['MORN', 'NOON', 'AFNN', 'EVEN', 'NGHT', 'LATE']
    dayOfWeek = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    dayOfWeek_plot = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']


    city = args.city
    FOLDER = args.folder

    prefix = FOLDER + city
    # filenames
Example #34
    if b < 1e-5:
        theta = 0 if a < c else np.pi/2
    else:
        theta = .5*arccot((a-c)/(2*b))
        if a > c:
            theta += np.pi/2
    R = np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]])
    S = np.diag([_a, _b])
    T = R.dot(S)
    return (x0, y0), .5*T.dot(T.T)


if __name__ == '__main__':
    city, name = "paris", "test"
    model_prefix_1 = "sandbox/{}".format(city)
    m1 = p.load_var(model_prefix_1 + ".mdl")
    model_parameters_1 = m1.get_params()
    scaler_1 = p.load(model_prefix_1 + ".scaler")
    centers_1 = model_parameters_1.topic_centers
    covars_1 = model_parameters_1.topic_covar
    for i, (center, cov) in enumerate(zip(centers_1, covars_1)):
        json_poly = gaussian_to_poly(center, cov, city, "test", scaler_1,
                                     stddev=1.41, resolution=17)
        points = scaler_1.transform(np.array(json_poly['geometry']['coordinates'][0]))
        points += 0.0002*np.random.randn(*points.shape)
        print('{}: original (center & covariance)\n{}\n{}'.format(i, center, cov))
        ncenter, ncov = poly_to_gaussian(points)
        print('recovered (center & covariance)\n{}\n{}'.format(ncenter, ncov))
    print(json.dumps(json_poly))
    print('paste that on on http://geojsonlint.com/')
Example #35
    t.daemon = True
    t.start()
    t = Thread(target=entities_putter, name='InsertDB')
    t.daemon = True
    t.start()
    total_entities = 0
    city = args.city
    chunker = Chunker.Chunker(foursquare.MAX_MULTI_REQUESTS)
    previous = [e['_id'] for e in TABLE.find({'city': city})]
    potential = gather_all_entities_id(checkins, DB_FIELD, city=city)
    print('but already {} {}s in DB.'.format(len(previous), ENTITY_KIND))
    import persistent as p
    region = city or 'world'
    invalid_filename = 'non_{}_id_{}'.format(ENTITY_KIND, region)
    try:
        INVALID_ID = p.load_var(invalid_filename)
    except IOError:
        pass
    print('and {} {}s are invalid.'.format(len(INVALID_ID), ENTITY_KIND))
    new_ones = set(potential).difference(set(previous))
    new_ones = new_ones.difference(set(INVALID_ID))
    outside = set([e['_id'] for e in TABLE.find({'city': None}, {'_id': 1})])
    outside.intersection_update(new_ones)
    print('and {} {}s are outside range.'.format(len(outside), ENTITY_KIND))
    new_ones = new_ones.difference(outside)
    print('So only {} new ones.'.format(len(new_ones)))
    for batch in chunker(new_ones):
        IDS_QUEUE.put(batch)
        total_entities += len(batch)

    IDS_QUEUE.join()
Example #36
#! /usr/bin/python2
# vim: set fileencoding=utf-8
import persistent
import numpy as np
u = persistent.load_var('user_status_full')
t = np.array([p[0] for p in u.values() if p[1]])
l = np.array([p[0] for p in u.values() if not p[1]])
photos = sum(t) + sum(l)
print('tourists proportion: {}%'.format(100*len(t)/(len(t) + len(l))))
print("tourists' photos proportion: {}%".format(100*sum(t)/photos))
print('tourist 90 percentile: {}'.format(np.percentile(t, 90)))
print('local 90 percentile: {}'.format(np.percentile(l, 90)))
Example #37
def photos_to_cluster_dataset(city, limit=300):
    photos = load_var(city)
    points = [[p[0] + noise(), p[1] + noise(), 'Win!'] for p in photos[:limit]]
    with open(city + '_cluster.js', 'w') as f:
        f.write('var {}_cluster = {}'.format(city, str(points)))
Example #38
                                precision) / (beta * beta * precision + recall)


def point_inside_poly(poly, point):
    """Tell whether `point` is inside convex `poly` based on dot product with
    every edges:
    demonstrations.wolfram.com/AnEfficientTestForAPointToBeInAConvexPolygon/
    """
    tpoly = poly - point
    size = tpoly.shape[0] - 1
    angles = tpoly[1:, 0] * tpoly[:size, 1] - tpoly[:size, 0] * tpoly[1:, 1]
    return int(np.abs(np.sign(angles).sum())) == size


# load venues location for all cities
cities_venues_raw = {name: p.load_var(name + '_svenues.my') for name in cities}
cities_venues = {}
cities_index = {}
cities_kdtree = {}
for city in cities:
    vids, _, locs = cities_venues_raw[city].all()
    vindex = cities_desc[city]['index']
    cities_venues[city] = np.zeros((len(vindex), 2))
    cities_index[city] = dict(
        itertools.imap(lambda x: (x[1], x[0]), enumerate(vindex)))
    for vid, loc in itertools.izip(vids, locs):
        pos = cities_index[city].get(vid)
        if pos is not None:
            cities_venues[city][pos, :] = loc
    cities_kdtree[city] = cKDTree(cities_venues[city])
gray = '#bdbdbd'
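A quick sanity check of `point_inside_poly` above, passing the square as a closed ring (the last vertex repeats the first) so that every edge takes part in the sign test:

import numpy as np

square = np.array([[0, 0], [1, 0], [1, 1], [0, 1], [0, 0]], dtype=float)
print(point_inside_poly(square, np.array([0.5, 0.5])))  # inside -> True
print(point_inside_poly(square, np.array([2.0, 0.5])))  # outside -> False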
Example #39
        """update the checkin in DB"""
        while True:
            tid, lid = self.queue.get()
            self.new_venues.append(lid)
            try:
                self.checkinDB.update({'_id': tid}, {'$set': {'lid': lid}})
            except (KeyboardInterrupt, SystemExit):
                raise
            except:
                print(sys.exc_info()[1])
            self.queue.task_done()

    def save_new_lid(self):
        """save these new lid because we need to build their profile"""
        region = 'world' if self.city is None else self.city
        id_ = str(hash(self.cids[0]))[:5]
        output = 'new_venue_id_{}_{}'.format(id_, region)
        p.save_var(output, set(self.new_venues))


if __name__ == '__main__':
    city = 'chicago'
    checkin_ids = [
        u for u in p.load_var('non_venue_id_' + city)
        if len(u) == 24 and u.startswith('4')
    ]
    mongo_client = pymongo.MongoClient('localhost', 27017)
    db = mongo_client['foursquare']
    cc = CheckinCorrector(checkin_ids, db['checkin'], 'xaa', city)
    cc.correct()
Example #40
    query_city = sys.argv[1]
    assert query_city in CITIES, ', '.join(CITIES)
    CITIES.remove(query_city)
    input_dir = 'www_comparaison_' + query_city
    res = {city: defaultdict(list) for city in CITIES}
    for city in res.keys():
        for neighborhood in NEIGHBORHOODS:
            for metric in METRICS:
                subtop = []
                for output in [
                        name for name in os.listdir(input_dir)
                        if name.startswith(city + '_' + neighborhood)
                        and name.endswith(metric + '.my')
                ]:
                    output = os.path.join(input_dir, output)
                    subtop.extend(p.load_var(output))
                top = get_top_disjoint(subtop, 5)
                if not top:
                    continue
                json_cell = [
                    to_json(city, x[1] + [metric], x[0] + 1)
                    for x in enumerate(top)
                ]
                res[city][neighborhood].extend(json_cell)
    out_name = 'static/www_cmp_{}.js'.format(query_city)
    with open(out_name, 'w') as out:
        out.write(
            'var TOPREG =\n' +
            json.dumps(res, sort_keys=True, indent=2, separators=(',', ': ')) +
            ';')
Example #41
#! /usr/bin/python2
# vim: set fileencoding=utf-8
from operator import itemgetter
from datetime import datetime
import CommonMongo as cm
import csv
import persistent
import pyhash
TO_BE_INSERTED = []
HASHER = pyhash.spooky_128()
VENUE_LOC = persistent.load_var('venue_loc.my')
TRAD = {}
with open('trad.dat', 'r') as f:
    for line in f:
        old, new = line.strip().split(';')
        TRAD[old] = new


def reformat(line_dict):
    vid = line_dict['vid']
    if vid in TRAD:
        vid = TRAD[vid]
    if vid not in VENUE_LOC:
        return None
    if line_dict['_id'] == 'ICWSM':
        txt = ''.join(itemgetter('uid', 'vid', 'time')(line_dict))
        line_dict['_id'] = hex(HASHER(txt))[2:-1]
    line_dict['uid'] = int(line_dict['uid'])
    line_dict['loc'], line_dict['city'] = VENUE_LOC[vid]
    line_dict['time'] = datetime.strptime(line_dict['time'],
                                          '%Y-%m-%dT%H:%M:%SZ')
Example #42
            for k, v in venue_to_place.iteritems() if k in bijective_venue
        })
    return common_map


def update_checkins(checkins, cmap):
    """Use the mapping to update venue id of checkins."""
    missing = checkins.find({'lid': None}, {'_id': 1, 'place': 1})
    total, corrected = 0, 0
    for checkin in missing:
        total += 1
        _id, place = checkin['_id'], checkin.get('place', None)
        if place and place in cmap:
            try:
                checkins.update({'_id': _id}, {'$set': {'lid': cmap[place]}})
                corrected += 1
            except cm.pymongo.errors.OperationFailure as err:
                print(err, err.code)
    print('correct {} out of {} checkins'.format(corrected, total))


if __name__ == '__main__':
    #pylint: disable=C0103
    import persistent as p
    import arguments
    args = arguments.city_parser().parse_args()
    db = cm.connect_to_db('foursquare', args.host, args.port)[0]
    # cmap = build_map(db['checkin'])
    # p.save_var('place_to_venue', cmap)
    update_checkins(db['checkin'], p.load_var('place_to_venue'))
Example #43
def consolidate(tags):
    d = {
        tag: persistent.load_var(u'disc/post_{}_{}'.format(tag, GRID_SIZE))
        for tag in tags
    }
    persistent.save_var(u'disc/all_{}'.format(GRID_SIZE), d)
Example #44
    t.start()
    t = Thread(target=entities_putter, name='InsertDB')
    t.daemon = True
    t.start()
    total_entities = 0
    city = args.city
    chunker = Chunker.Chunker(foursquare.MAX_MULTI_REQUESTS)
    previous = [e['_id'] for e in TABLE.find({'city': city})]
    print previous
    potential = gather_all_entities_id(checkins, DB_FIELD, city=city)
    print('but already {} {}s in DB.'.format(len(previous), ENTITY_KIND))
    import persistent as p
    region = city or 'world'
    invalid_filename = 'non_{}_id_{}'.format(ENTITY_KIND, region)
    try:
        INVALID_ID = p.load_var(invalid_filename)
    except IOError:
        pass
    print('and {} {}s are invalid.'.format(len(INVALID_ID), ENTITY_KIND))
    new_ones = set(potential).difference(set(previous))
    new_ones = new_ones.difference(set(INVALID_ID))
    outside = set([e['_id'] for e in TABLE.find({'city': None}, {'_id': 1})])
    outside.intersection_update(new_ones)
    print('and {} {}s are outside range.'.format(len(outside), ENTITY_KIND))
    new_ones = new_ones.difference(outside)
    print('So only {} new ones.'.format(len(new_ones)))
    for batch in chunker(new_ones):
        IDS_QUEUE.put(batch)
        total_entities += len(batch)

    IDS_QUEUE.join()
Example #45
                           self.fields)
        return neighbors_ids, extra, neighbors_locs


if __name__ == '__main__':
    # pylint: disable=C0103
    from timeit import default_timer as clock
    import CommonMongo as cm
    import random as r
    import arguments
    args = arguments.city_parser().parse_args()
    city = args.city
    radius = 350
    DB, CLIENT = cm.connect_to_db('foursquare', args.host, args.port)
    import persistent as p
    lvenues = p.load_var(city + '_lvenues.my')
    svenues = Surrounding(DB.venue, {'city': city}, 'cat cats'.split(),
                          lvenues)
    test_ids = r.sample(lvenues.keys(), 35)
    start = clock()
    for vid in test_ids:
        me = DB.venue.find_one({'_id': vid}, {'loc': 1, 'city': 1})
        ball = {'$geometry': me['loc'], '$maxDistance': radius}
        neighbors = DB.venue.find({
            'city': city,
            'loc': {
                '$near': ball
            }
        }, {
            'cat': 1,
            'cats': 1,
Example #46
def main():
    import persistent as p
    args, parser = parse_args()

    # Get current time to use it as a filename for output files
    filename_prefix = "data/" + args.description
    # filename_prefix = datetime.today().strftime("%d-%m-%Y-%H.%M.%S")
    if args.city:
        external = args.external or str(args.k_min)
        city = args.city
        filename_prefix = '_'.join([city, external, str(args.n_components)])
        filename_prefix = 'comparisons/' + filename_prefix
        args.query = '{{"bboxCity": "{}"}}'.format(args.city)

    # connect to mongo, load and standardize data
    db = get_mongo_database_with_auth(args.dbhost, args.dbport, args.dbname,
                                      args.username, args.password)

    # TODO: Get this from command line
    venue_extractors = [io.venue_primary_category_extractor]
    checkin_extractors = [
        io.checkin_time_extractor_hard, io.checkin_user_extractor,
        io.checkin_day_extractor
    ]

    data, scaler = io.load_data_mongo(db[args.venuecoll], db[args.checkincoll],
                                      args.query, venue_extractors,
                                      checkin_extractors, filename_prefix,
                                      args.n_components, args.venue_threshold)

    # Split into train and test
    train, test = io.split_train_test_with_common_vocabulary(data,
                                                             test_size=0.2)

    print("Loaded {0} ({1} train, {2} test) data points.".format(
        data["coordinates"].shape[0], train["coordinates"].shape[0],
        test["coordinates"].shape[0]),
          file=sys.stderr)

    # set centers of topics
    initial_topic_centers = None
    initial_topic_covar = None
    if args.external:
        initial_topic_centers, initial_topic_covar = \
            p.load_var('comparisons/{}_{}.preset'.format(city, args.external))

    # Run EM n times
    best_train_likelihood = -1 * np.inf
    best_test_likelihood = -1 * np.inf
    best_k = None
    best_lambda = None
    best_model = None

    lambda_list = args.lambdas
    k_list = range(args.k_min, 1 + args.k_max, args.k_step)
    train_likelihood_across_k = -np.inf * np.ones(
        (len(lambda_list), len(k_list)))
    test_likelihood_across_k = -np.inf * np.ones(
        (len(lambda_list), len(k_list)))

    track_params = args.trackparams

    if args.plot:
        likelihood_fig = plt.figure()

    if initial_topic_centers is not None:
        k_list = [len(initial_topic_centers)]

    for lidx, Lambda in enumerate(lambda_list):

        for kidx, num_topics in enumerate(k_list):
            print("\n====== lambda = {0}, k = {1} ======\n\n".format(
                Lambda, num_topics),
                  file=sys.stderr)

            # n_jobs=-2 -> Leave only one logical core unused
            models = Parallel(n_jobs=-2, backend="threading")(delayed(run)(
                train, Lambda, num_topics, i, args, initial_topic_centers,
                initial_topic_covar, track_params) for i in range(args.runs))

            # TODO remove this or add command line option
            # Swap to this for serial processing
            # models = [run(train, Lambda, num_topics, i, args,
            #               initial_topic_centers, initial_topic_covar,
            #               track_params)
            #           for i in range(args.runs)]

            best_model_index_for_parameters = np.argmax(
                [model.latest_statistics.likelihood for model in models])

            best_model_in_k = models[best_model_index_for_parameters]

            train_likelihood_across_k[lidx][kidx] = \
                best_model_in_k.latest_statistics.likelihood
            test_likelihood_for_parameters = \
                best_model_in_k.predict_log_probs(test)
            test_likelihood_across_k[lidx][kidx] = \
                test_likelihood_for_parameters

            if test_likelihood_for_parameters > best_test_likelihood:
                best_train_likelihood = \
                    best_model_in_k.latest_statistics.likelihood
                best_test_likelihood = test_likelihood_for_parameters

                best_k = num_topics
                best_model = best_model_in_k

            gc.collect()

    print("Results of the best model:\n", file=sys.stderr)
    print_stuff(data["unigrams"], best_model.get_params())
    print("Best train likelihood: {0}\n".format(best_train_likelihood),
          file=sys.stderr)
    print("Best test likelihood: {0}\n".format(best_test_likelihood),
          file=sys.stderr)

    print("PROB VS VARIATIONAL")
    print(best_model.predict_log_probs(test))
    print(best_model.predict_log_probs_variational(test))

    if args.save:
        query = "synthetic"
        try:
            if args.query:
                query = args.query
        except:
            pass

        io.save_model(best_model, scaler, query, data["unigrams"],
                      filename_prefix)

    # PLOTS
    if args.plot:
        x_plot_num = 1
        y_plot_num = 1

        if len(k_list) > 1:
            plotting.plot_across_lambda_and_k(lambda_list,
                                              k_list,
                                              train_likelihood_across_k,
                                              test_likelihood_across_k,
                                              train["coordinates"].shape[0],
                                              data["coordinates"].shape[0],
                                              filename_prefix,
                                              save=True)

        if track_params:
            best_statistics_history = best_model.get_statistics_history()

            # Plot likelihood graph
            likelihood_plot = plotting.plot_statistics_history(
                likelihood_fig, best_statistics_history, x_plot_num,
                y_plot_num, 0)

            # Put the legend on the last likelihood plot
            likelihood_fig.legend(list(likelihood_plot), [
                'Likelihood', 'User likelihood', 'Location likelihood',
                'Topic likelihood', 'Sigma likelihood', 'Phi entropy'
            ])

            # TODO add command line option
            #  Uncomment to enable animated plots
            # phi_animated_fig, phi_animated_ax = plt.subplots(1, 1)
            # anim = plotting.plot_phi_animated(phi_animated_fig,
            # phi_animated_ax, train, best_statistics_history)

            # anim.save('anim.gif', writer='imagemagick', fps=10, dpi=300)

        plt.show()
Example #47
# NEIGHBORHOODS = ['triangle', 'latin']
# METRICS = ['jsd', 'emd', 'cluster', 'emd-lmnn', 'leftover']
METRICS = ['emd-itml', 'emd-tsne']
if __name__ == '__main__':
    # pylint: disable=C0103
    import json
    query_city = sys.argv[1]
    assert query_city in CITIES, ', '.join(CITIES)
    CITIES.remove(query_city)
    input_dir = 'www_comparaison_' + query_city
    res = {city: defaultdict(list) for city in CITIES}
    for city in res.keys():
        for neighborhood in NEIGHBORHOODS:
            for metric in METRICS:
                subtop = []
                for output in [name for name in os.listdir(input_dir)
                               if name.startswith(city+'_'+neighborhood) and
                               name.endswith(metric+'.my')]:
                    output = os.path.join(input_dir, output)
                    subtop.extend(p.load_var(output))
                top = get_top_disjoint(subtop, 5)
                if not top:
                    continue
                json_cell = [to_json(city, x[1]+[metric], x[0]+1)
                             for x in enumerate(top)]
                res[city][neighborhood].extend(json_cell)
    out_name = 'static/www_cmp_{}.js'.format(query_city)
    with open(out_name, 'w') as out:
        out.write('var TOPREG =\n' + json.dumps(res, sort_keys=True, indent=2,
                                                separators=(',', ': ')) + ';')
Example #48
    random.seed(123)
    nrep = 11
    timings = []
    n = int(1e6)
    # ew = {(random.randint(0, n), random.randint(0, n)): 1002*random.random()
    #       for _ in range(30000)}
    # for _ in range(nrep):
    #     start = clock()
    #     benchmark(ew)
    #     timings.append(clock() - start)
    # print('\t'.join(('{:.3g}'.format(t) for t in timings)))
    # print(sum(timings[1:])/(nrep-1))
    # sys.exit()

    import persistent
    dataset = 'usps4500'
    ew, y = persistent.load_var('{}_lcc.my'.format(dataset))
    nrep = 5
    timings = []
    for _ in range(nrep):
        start = clock()
        mst = kruskal_mst_edges(ew)
        timings.append(clock() - start)
    print('\t'.join(('{:.3g}'.format(t) for t in timings)))
    print(sum(timings[1:])/(nrep-1))
    import networkx as nx
    g = nx.Graph()
    g.add_weighted_edges_from((u,v,w) for (u,v),w in ew.items())
    mst_gold = [(u,v) for (u,v) in nx.minimum_spanning_tree(g).edges()]
    print(sorted(mst) == sorted(mst_gold))
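The benchmark above calls `kruskal_mst_edges`, which is not included in the listing. A minimal sketch under the same assumed interface (edges as a `{(u, v): weight}` dict, MST returned as a list of `(u, v)` pairs), using plain union-find:

def kruskal_mst_edges(ew):
    parent = {}

    def find(x):
        parent.setdefault(x, x)
        while parent[x] != x:
            parent[x] = parent[parent[x]]  # path compression
            x = parent[x]
        return x

    mst = []
    # scan edges by increasing weight, keeping those that join two components
    for (u, v), _ in sorted(ew.items(), key=lambda e: e[1]):
        ru, rv = find(u), find(v)
        if ru != rv:
            parent[ru] = rv
            mst.append((u, v))
    return mst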
Example #49
        theta = 0 if a < c else np.pi / 2
    else:
        theta = .5 * arccot((a - c) / (2 * b))
        if a > c:
            theta += np.pi / 2
    R = np.array([[np.cos(theta), -np.sin(theta)],
                  [np.sin(theta), np.cos(theta)]])
    S = np.diag([_a, _b])
    T = R.dot(S)
    return (x0, y0), .5 * T.dot(T.T)


if __name__ == '__main__':
    city, name = "paris", "test"
    model_prefix_1 = "sandbox/{}".format(city)
    m1 = p.load_var(model_prefix_1 + ".mdl")
    model_parameters_1 = m1.get_params()
    scaler_1 = p.load(model_prefix_1 + ".scaler")
    centers_1 = model_parameters_1.topic_centers
    covars_1 = model_parameters_1.topic_covar
    for i, (center, cov) in enumerate(zip(centers_1, covars_1)):
        json_poly = gaussian_to_poly(center,
                                     cov,
                                     city,
                                     "test",
                                     scaler_1,
                                     stddev=1.41,
                                     resolution=17)
        points = scaler_1.transform(
            np.array(json_poly['geometry']['coordinates'][0]))
        points += 0.0002 * np.random.randn(*points.shape)
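For reference, the ellipse fragment at the start of this example encodes the usual ellipse-to-Gaussian conversion: writing $a$, $b$ for the code's `_a`, `_b` semi-axis scales and $R$ for the rotation by $\theta$, the covariance returned is

\[
\Sigma \;=\; \tfrac{1}{2}(RS)(RS)^{\top}
       \;=\; \tfrac{1}{2}\, R\, \operatorname{diag}(a^{2}, b^{2})\, R^{\top},
\qquad S = \operatorname{diag}(a, b).
\]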
Example #50
    mapbox_service.session.params['access_token'] = os.environ[
        'MAPBOX_ACCESS_TOKEN']

    # alternatively, you can uncomment the next line and pass your token directly
    # mapbox_service.session.params['access_token'] = 'YOUR_TOKEN'

    mainCats = [
        'Arts & Entertainment', 'College & University', 'Food',
        'Nightlife Spot', 'Outdoors & Recreation', 'Shop & Service',
        'Professional & Other Places', 'Residence', 'Travel & Transport'
    ]
    main_cats_plot = mainCats[:]
    mainCats = {c: i for i, c in enumerate(mainCats)}
    mainCats['Event'] = 9

    name_to_main_cat = p.load_var('name_to_cat.my')
    name_to_main_cat['Event'] = 'Event'
    name_to_main_cat['NOPRIMCAT'] = 'Event'
    name_to_main_cat['Tennis'] = 'Outdoors & Recreation'

    timeOfDay = [
        'MORNING', 'NOON', 'AFTERNOON', 'EVENING', 'NIGHT', 'LATENIGHT'
    ]
    timeOfDay_plot = ['MORN', 'NOON', 'AFNN', 'EVEN', 'NGHT', 'LATE']
    dayOfWeek = [
        'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday',
        'Sunday'
    ]
    dayOfWeek_plot = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

    city = args.city
Example #51
        'src': name + '.shp',
        'labeling': {
            'key': 'tag'
        }
    }
    color = '#ffa873'
    style.append(CSS.format(name, color, 'black'))
    with fiona.collection(mkpath('disc', name + '.shp'), "w", "ESRI Shapefile",
                          schema) as f:
        f.writerecords(polys)

    with open(mkpath('disc', 'photos.json'), 'w') as f:
        json.dump(KARTO_CONFIG, f)
    style.append('#top_disc-label {font-family: OpenSans; font-size: 14px}')
    with open(mkpath('disc', 'photos.css'), 'w') as f:
        f.write('\n'.join(style))
    sf = box(SF_BBOX[1], SF_BBOX[0], SF_BBOX[3], SF_BBOX[2])
    print(sf.bounds)
    print(100 * cover.area, sf.area)


if __name__ == '__main__':
    t = persistent.load_var('disc/all')
    # top = get_top_tags(2000, 'nsf_tag.dat')
    supported = [v[0] for v in persistent.load_var('supported')][:600]
    d = top_discrepancy(t, supported)
    N = 11
    print([v[2] for v in d[-N:]], [v[2] for v in d[:N]])
    # plot_some(d, 20)
    # js_some(d, 15)
Example #52
def start_requests(self):
    self.seen_users = persistent.load_var('seen_users.my')
    urls = [MEMBER_PREFIX+id_ for id_ in persistent.load_var('next_users.my')]
    for url in urls:
        yield scrapy.Request(url=url, callback=self.parse)
Example #53
def load_list(name):
    try:
        return p.load_var(name + '.my')
    except IOError:
        return []
Example #54
import persistent
import codecs
import math
import scipy.io as sio
import numpy
d = persistent.load_var('tag_support')
t = sorted(d.iteritems(), key=lambda x: (x[1][0], x[1][1]), reverse=True)
res = []
template = u'{}: {} photos by {} users over {} days'
numeric = numpy.zeros((len(t), 3), dtype=numpy.int32)
i = 0
for tag, info in t:
    days = int(math.ceil((info[3] - info[2]).total_seconds()/3600))
    res.append(template.format(tag, info[0], info[1], days))
    numeric[i, :] = [info[0], info[1], days]
    i += 1

sio.savemat('tag_support_num', {'t': numeric})
# with codecs.open('tag_support.txt', 'w', 'utf8') as f:
#     f.write('\n'.join(res))
Example #55
def post_process(tag):
    top_loc = persistent.load_var(u'disc/top_{}_{}'.format(tag, GRID_SIZE))
    merged = merge_regions(top_loc)
    persistent.save_var(u'disc/post_{}_{}'.format(tag, GRID_SIZE), merged)