def run_ring_experiment(size, nb_rings, ring_size_ratio=1.0, shared_sign=True,
                        rigged=False, n_rep=100, shared_edges=None,
                        pivot_strategy=densify.PivotStrategy.uniform,
                        triangle_strategy=TriangleStatus.closeable,
                        one_at_a_time=True, pool=None):
    args = repeat({"size": size, "nb_rings": nb_rings,
                   "ring_size_ratio": ring_size_ratio,
                   "shared_sign": shared_sign, "rigged": rigged,
                   "shared_edges": shared_edges,
                   "pivot_strategy": pivot_strategy,
                   "triangle_strategy": triangle_strategy,
                   "one_at_a_time": one_at_a_time}, n_rep)
    if pool:
        runs = list(pool.imap_unordered(process_rings, args,
                                        chunksize=n_rep//NUM_THREADS))
    else:
        runs = list(map(process_rings, args))
    res = {'time': list(map(itemgetter(0), runs)),
           'nb_error': list(map(itemgetter(2), runs))}
    suffix = 'pos' if shared_sign else 'neg'
    suffix += '_rigged' if rigged else ''
    suffix += '_' + str(n_rep)
    heuristic = strategy_to_str(pivot_strategy, triangle_strategy,
                                one_at_a_time)
    exp_name = 'square_{:04d}_{:02d}_{:.1f}_{}_{}_{}.my'
    p.save_var(exp_name.format(size, nb_rings, ring_size_ratio, suffix,
                               heuristic, int(time.time())), res)
def interpolate_distances(values_map, filename):
    """Plot the distance at every circle center and interpolate between."""
    from scipy.interpolate import griddata
    from matplotlib import pyplot as plt
    import persistent as p
    filename = os.path.join('distance_map', filename)
    x, y, z = [np.array(dim) for dim in zip(*[a for a in values_map])]
    x_ext = [x.min(), x.max()]
    y_ext = [y.min(), y.max()]
    xi = np.linspace(x_ext[0], x_ext[1], 100)
    yi = np.linspace(y_ext[0], y_ext[1], 100)
    zi = griddata((x, y), z, (xi[None, :], yi[:, None]), method='cubic')
    fig = plt.figure(figsize=(22, 18))
    plt.contour(xi, yi, zi, 20, linewidths=0.8, colors='#282828')
    plt.contourf(xi, yi, zi, 20, cmap=plt.cm.Greens)
    plt.colorbar()
    plt.scatter(x, y, marker='o', c='#282828', s=5)
    plt.tight_layout(pad=0)
    plt.xlim(*x_ext)
    plt.ylim(*y_ext)
    plt.savefig(filename, dpi=96, transparent=False, frameon=False,
                bbox_inches='tight', pad_inches=0.01)
    p.save_var(filename.replace('.png', '.my'), values_map)
    plt.close(fig)
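# A hedged usage sketch for interpolate_distances. The (x, y, distance)
# triples below are synthetic and 'demo_city.png' is a made-up output name;
# the function itself writes inside the 'distance_map' folder.
import os
import numpy as np
try:
    os.mkdir('distance_map')
except OSError:
    pass
fake_centers = [(float(x), float(y), float(np.hypot(x - 5, y - 5)))
                for x in range(10) for y in range(10)]
interpolate_distances(fake_centers, 'demo_city.png')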
def augmented_ancestor(tree_adj, X):
    tree_root = max(((node, len(adj)) for node, adj in tree_adj.items()),
                    key=lambda x: x[1])[0]
    prt = gs.ancestor_info(tree_adj, tree_root)
    if len(prt) != len(tree_adj):
        persistent.save_var('bug_parent.my', (prt, tree_adj))
    assert len(prt) == len(tree_adj), set(tree_adj.keys()) - set(prt.keys())
    leaves = {u for u, adj in tree_adj.items() if len(adj) == 1}
    infos = {u: (prt[u], 1, int(u in X)) for u in leaves}
    possible_inclusion = defaultdict(int)
    for _ in infos.values():
        if _[0] is not None:
            possible_inclusion[_[0]] += 1

    def ready(u, vote):
        threshold = len(tree_adj[u]) - 1
        if prt[u] is None:
            threshold += 1
        return vote == threshold

    border = {u for u, vote in possible_inclusion.items() if ready(u, vote)}
    while border:
        for u in border:
            children = {v for v in tree_adj[u] if v in infos}
            subtree_size, num_in_x, parent = 1, int(u in X), prt[u]
            for v in children:
                subtree_size += infos[v][1]
                num_in_x += infos[v][2]
            infos[u] = (parent, subtree_size, num_in_x)
            del possible_inclusion[u]
            if parent is not None:
                possible_inclusion[parent] += 1
        border = {u for u, vote in possible_inclusion.items()
                  if ready(u, vote)}
    return infos, {u: v[0] for u, v in infos.items()}
def global_info(city, standalone=False):
    """Gather global statistics about `city`."""
    lvenues = geo_project(city, DB.venue.find({'city': city}, {'loc': 1}))
    lcheckins = geo_project(city, DB.checkin.find({'city': city}, {'loc': 1}))
    lphotos = geo_project(city, CLIENT.world.photos.find({'hint': city},
                                                         {'loc': 1}))
    local_projection = [lvenues, lcheckins, lphotos]
    visits = xp.get_visits(CLIENT, xp.Entity.venue, city)
    visitors = xp.get_visitors(CLIENT, city)
    density = estimate_density(city)
    activity = [visits, visitors, density]
    global TOP_CATS
    TOP_CATS = p.load_var('top_cats.my')
    infos = {'venue': [] if standalone else ['cat', 'cats'],
             'photo': ['taken'] if standalone else ['venue']}
    svenues = s.Surrounding(DB.venue, {'city': city}, infos['venue'], lvenues)
    scheckins = s.Surrounding(DB.checkin, {'city': city}, ['time'], lcheckins)
    sphotos = s.Surrounding(CLIENT.world.photos, {'hint': city},
                            infos['photo'], lphotos)
    surroundings = [svenues, scheckins, sphotos]
    p.save_var('{}_s{}s.my'.format(city, 'venue'), svenues)
    if standalone:
        for name, var in zip(['venue', 'checkin', 'photo'], surroundings):
            p.save_var('{}_s{}s.my'.format(city, name), var)
    return local_projection + activity + surroundings
def users_and_tag(tag):
    r = DB.photos.aggregate([
        {"$match": {"hint": "sf", "ntags": tag}},
        {"$project": {"uid": 1}},
        {"$group": {"_id": "$uid", "count": {"$sum": 1}}},
        {"$sort": SON([("count", -1), ("_id", -1)])}
    ])
    save_var('u14', r['result'])
def increase_coverage(upto=5000):
    """Save `upto` unprocessed San Francisco tags."""
    from more_query import get_top_tags
    sup = persistent.load_var('supported')
    more = get_top_tags(upto, 'nsf_tag.dat')
    already = [v[0] for v in sup]
    addition = set(more).difference(set(already))
    persistent.save_var('addition', addition)
def run_experiment(pool, process_function, savefile, process_args, n_rep):
    if pool:
        runs = list(pool.imap_unordered(process_function, process_args,
                                        chunksize=n_rep//NUM_THREADS))
    else:
        runs = [process_function(_) for _ in process_args]
    res = {'time': list(map(itemgetter(0), runs)),
           'delta': list(map(itemgetter(1), runs)),
           'nb_error': list(map(itemgetter(2), runs))}
    p.save_var(savefile, res)
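# A hedged example of driving run_experiment, assuming a process_communities
# worker that returns (time, delta, nb_error) tuples and that NUM_THREADS is
# defined in this module; the argument dictionary is illustrative only.
from itertools import repeat
from multiprocessing import Pool
n_rep = 100
pool = Pool(NUM_THREADS)
community_args = repeat({"community_size": 50, "nb_communities": 4}, n_rep)
run_experiment(pool, process_communities, 'communities_050_04.my',
               community_args, n_rep)
pool.close()
pool.join()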
def run_circle_experiment(size, one_at_a_time, rigged=False, n_rep=100,
                          pivot=redensify.PivotSelection.Uniform, pool=None):
    args = repeat({"circle_size": size, "rigged": rigged, "pivot": pivot,
                   "one_at_a_time": one_at_a_time}, n_rep)
    if pool:
        runs = list(pool.imap_unordered(process_circle, args,
                                        chunksize=n_rep//NUM_THREADS))
    else:
        runs = list(map(process_circle, args))
    res = {'time': list(map(itemgetter(0), runs)),
           'nb_error': list(map(itemgetter(2), runs))}
    p.save_var(savefile_name('circle', [size, 0], pivot, one_at_a_time), res)
def brute_search(city_desc, hsize, distance_function, threshold, metric='jsd'):
    """Move a sliding circle over the whole city and keep track of the best
    result."""
    global SURROUNDINGS, CITY_FEATURES, THRESHOLD, RADIUS
    global METRIC_NAME, CITY_SUPPORT, DISTANCE_FUNCTION
    import multiprocessing
    RADIUS = hsize
    THRESHOLD = threshold
    METRIC_NAME = metric
    city_size, CITY_SUPPORT, CITY_FEATURES, city_infos = city_desc
    SURROUNDINGS, bounds = city_infos
    DISTANCE_FUNCTION = distance_function
    minx, miny, maxx, maxy = bounds
    nb_x_step = int(3*np.floor(city_size[0]) / hsize + 1)
    nb_y_step = int(3*np.floor(city_size[1]) / hsize + 1)
    best = [1e20, [], [], RADIUS]
    res_map = []
    pool = multiprocessing.Pool(4)
    x_steps = np.linspace(minx+hsize, maxx-hsize, nb_x_step)
    y_steps = np.linspace(miny+hsize, maxy-hsize, nb_y_step)
    x_vals, y_vals = np.meshgrid(x_steps, y_steps)
    to_cell_arg = lambda _: (float(_[1][0]), float(_[1][1]), _[0] % nb_x_step,
                             _[0]/nb_x_step, _[0])
    cells = i.imap(to_cell_arg,
                   enumerate(i.izip(np.nditer(x_vals), np.nditer(y_vals))))
    res = pool.map(one_cell, cells)
    pool.close()
    pool.join()
    res_map = []
    if metric == 'leftover':
        dsts = emd_leftover.collect_matlab_output(len(res))
        for cell, dst in i.izip(res, dsts):
            if cell[0]:
                cell[2] = dst
        clean_tmp_mats()
    for cell in res:
        if cell[0] is None:
            continue
        res_map.append(cell[:3])
        if cell[2] < best[0]:
            best = [cell[2], cell[3], [cell[0], cell[1]], RADIUS]
    if QUERY_NAME:
        import persistent as p
        logging.info('wrote: ' + str(os.path.join(OTMPDIR, QUERY_NAME)))
        p.save_var(os.path.join(OTMPDIR, QUERY_NAME),
                   [[cell[2], cell[3], [cell[0], cell[1]], RADIUS]
                    for cell in res if cell[0]])
    yield best, res_map, 1.0
def get_user_status(with_count=False):
    name = 'user_status' + ('_full' if with_count else '')
    fields = {'tourist': 1}
    if with_count:
        fields.update({'count': 1})
    try:
        d = load_var(name)
    except IOError:
        users = list(DB.users.find(fields=fields))
        if with_count:
            d = dict([(u['_id'], (u['count'], u['tourist'])) for u in users])
        else:
            d = dict([(u['_id'], u['tourist']) for u in users])
        save_var(name, d)
    return d
def run_planted_experiment(ball_size, nb_balls, one_at_a_time=True, n_rep=100,
                           pivot=redensify.PivotSelection.Uniform, pool=None):
    args = repeat({"ball_size": ball_size, "nb_balls": nb_balls,
                   "pivot": pivot, "one_at_a_time": one_at_a_time}, n_rep)
    if pool:
        runs = list(pool.imap_unordered(process_planted, args,
                                        chunksize=n_rep//NUM_THREADS))
    else:
        runs = list(map(process_planted, args))
    res = {'time': list(map(itemgetter(0), runs)),
           'delta': list(map(itemgetter(1), runs)),
           'nb_error': list(map(itemgetter(2), runs))}
    p.save_var(savefile_name('planted', [ball_size, nb_balls], pivot,
                             one_at_a_time), res)
def run_rings_experiment(size, nb_rings, shared_sign, rigged, one_at_a_time,
                         pivot=redensify.PivotSelection.Uniform, n_rep=100,
                         pool=None):
    args = repeat({"size": size, "nb_rings": nb_rings, "rigged": rigged,
                   "shared_sign": shared_sign, "pivot": pivot,
                   "one_at_a_time": one_at_a_time}, n_rep)
    if pool:
        runs = list(pool.imap_unordered(process_rings, args,
                                        chunksize=n_rep//NUM_THREADS))
    else:
        runs = [process_rings(_) for _ in args]
    res = {'time': list(map(itemgetter(0), runs)),
           'nb_error': list(map(itemgetter(2), runs))}
    p.save_var(savefile_name('rings', [size, nb_rings], pivot, one_at_a_time),
               res)
def load_existing_ids(cmd_args):
    """Read checkin ids in city from disk or DB."""
    city = cmd_args.city
    if city == 'whole':
        return []
    import persistent as p
    try:
        return p.load_var(city + '_checkins_ids.my')
    except IOError:
        pass
    import CommonMongo as cm
    db = cm.connect_to_db('foursquare', cmd_args.host, cmd_args.port)[0]
    ids = {str(_['_id']) for _ in db.checkin.find({'city': city}, {'_id': 1})
           if not isinstance(_['_id'], long)}
    p.save_var(city + '_checkins_ids.my', ids)
    return ids
def get_categories(client=None):
    """Return categories list from disk or from Foursquare website using
    client."""
    if client is None:
        raw_cats = p.load_var('raw_categories')['categories']
    else:
        raw_cats = client.venues.categories()
        p.save_var('raw_categories', raw_cats)
        raw_cats = raw_cats['categories']
    cats = Category('1', 'Venue', 0, parse_categories(raw_cats))
    # pylint: disable=E1101
    id_index = [(id_, idx + 100)
                for idx, id_ in enumerate(sorted(CAT_TO_ID.values()))
                if id_ not in ['0', '1']]
    ID_TO_INDEX.update(id_index)
    return cats
def seed_region():
    geo = f.json.loads(f.request.form['geo'])
    fields = ['metric', 'candidate', 'clustering']
    metric, candidate, clustering = [str(f.request.form[field])
                                     for field in fields]
    msg = 'From {}@{} to {} using {}, {}, {}'
    neighborhood = KNOWN_GEO.get(hash(str(geo)), 'custom')
    args = [ORIGIN['city'], neighborhood, DEST['city'], candidate,
            metric if candidate == 'dst' else 'NA', clustering]
    msg = msg.format(*args)
    print(msg)
    logging.warn(msg)
    res, log = nb.one_method_seed_regions(ORIGIN['city'], DEST['city'], geo,
                                          metric, candidate, clustering)
    res = dict(r=res, info=log)
    p.save_var('candidates/{}_{}_{}_{}_{}.my'.format(*args[1:]), res)
    return f.jsonify(res)
def get_data(DB):
    entropies = load_var('Hsupported')
    tags = sorted([k for k, v in entropies.items() if 2.5 <= v <= 3.01])
    save_var('mat_tag', tags)
    u = load_var('user_status')
    user_index = {k: i for i, k in enumerate(u)}

    def format_photo(p):
        user = user_index[p['uid']]
        loc = p['loc']['coordinates']
        taken = [p['taken'].weekday(), p['taken'].hour,
                 calendar.timegm(p['taken'].utctimetuple())]
        indicator = [int(t in p['ntags']) for t in tags]
        return [user] + loc + taken + indicator

    photos_feature = np.mat(tag_time(DB, tags, format_photo))
    sio.savemat('deep', {'A': scipy.sparse.csr_matrix(photos_feature)})
def read_and_insert():
    db = db_filepath('laptop')
    end = datetime.utcnow()
    with sqlite3.connect(db) as conn:
        tasks = {id_: name.lower()
                 for id_, name in get_project_list(conn).items()}
        next_task = parse_task([x.lower() for x in tasks.values()], sys.argv)
        task = make_task(next_task, tasks)
        print(task)
        pending_file = mk_path(get_data_saving_path('tracker'), '_tsk')
        if task is None:
            insert_pending_task(conn, end, pending_file)
            return
        pending = task['end'] is None
        if pending:
            insert_pending_task(conn, end, pending_file)
            save_var(pending_file, task)
        else:
            insert_task(conn, task)
def run_circle_experiment(size, rigged=False,
                          pivot_strategy=densify.PivotStrategy.uniform,
                          triangle_strategy=TriangleStatus.closeable,
                          one_at_a_time=True, n_rep=100, pool=None):
    args = repeat({"circle_size": size, "rigged": rigged,
                   "shared_edges": False, "pivot_strategy": pivot_strategy,
                   "triangle_strategy": triangle_strategy,
                   "one_at_a_time": one_at_a_time}, n_rep)
    if pool:
        runs = list(pool.imap_unordered(process_circle, args,
                                        chunksize=n_rep//NUM_THREADS))
    else:
        runs = list(map(process_circle, args))
    res = {'time': list(map(itemgetter(0), runs)),
           'nb_error': list(map(itemgetter(2), runs))}
    heuristic = strategy_to_str(pivot_strategy, triangle_strategy,
                                one_at_a_time)
    p.save_var('circle_{:04d}_{}_{}.my'.format(size, heuristic,
                                               int(time.time())), res)
def process_graph(G, E, noise, outname, asym=False):
    root = max(G.items(), key=lambda x: len(x[1]))[0]
    if not outname.startswith('belgrade/'):
        outname = 'belgrade/' + outname
    basename = '{}_{}'.format(outname, hostname())
    suffix = '.asymres' if asym else '.myres'
    if os.path.isfile(basename + '_perf' + suffix):
        return
    bfs = gs.perturbed_bfs(G, root)
    gtx, _ = galaxy_maker(G, 50, short=True, output_name=outname)
    stretch = None
    binary_signs = {e: (1 if s else -1) for e, s in E.items()}
    perf = []
    for train_edges in [bfs, gtx]:
        if asym:
            perf.extend(run_asym(G, E, train_edges))
        else:
            tree = {}
            for u, v in train_edges:
                gs.add_edge(tree, u, v)
            tags = pot.dfs_tagging(tree, binary_signs, root)
            gold, pred = pot.make_pred(tree, tags, binary_signs)
            tp, tn, fp, fn = confusion_number(gold, pred)
            perf.extend([accuracy(tp, tn, fp, fn), f1_score(tp, tn, fp, fn),
                         mcc(tp, tn, fp, fn)])
    if asym:
        _, edges = pot.read_tree(outname + '_0.edges')
        perf.extend(run_asym(G, E, edges))
        perf.extend(run_asym(G, E, tree_edges=None))
    else:
        gold, pred, _ = pot.predict_edges(outname + '_0', all_signs=E,
                                          degrees={root: 5})
        tp, tn, fp, fn = confusion_number(gold, pred)
        perf.extend([accuracy(tp, tn, fp, fn), f1_score(tp, tn, fp, fn),
                     mcc(tp, tn, fp, fn)])
    if noise == 0 and not asym:
        print(basename)
        bfsst = average_strech(set(E.keys()), bfs)
        persistent.save_var(basename + '_bfsst' + suffix, bfsst)
        gtxst = average_strech(set(E.keys()), gtx)
        persistent.save_var(basename + '_gtxst' + suffix, gtxst)
        stretch = [bfsst, gtxst]
    persistent.save_var(basename + '_perf' + suffix, perf)
    return perf, stretch
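# A hedged sketch tying process_graph to the Delaunay triangulation script
# elsewhere in this file: it assumes a graph was saved as
# 'belgrade/triangle_1000.my' by that script, and that noise=0 and the output
# name below are acceptable illustrative choices.
G, E = persistent.load_var('belgrade/triangle_1000.my')
res = process_graph(G, E, noise=0, outname='triangle_1000')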
# tmp = load_var('supported')
# tags = [v[0] for v in tmp]
# shuffle(tags)
# tags = [None] + tags
# sf_entropy(None)
# p = Pool(4)
# res = p.map(sf_entropy, tags)
# p.close()
# outplot('nentropies_{}.dat'.format(GRID_SIZE), ['H', 'tag'],
#         [r[0] for r in res], tags)
# outplot('nKentropies_{}.dat'.format(GRID_SIZE), ['D', 'tag'],
#         [r[1] for r in res], tags)
# top_metrics(tags)
# te = [time_entropy(tag) for tag in tags]
# t = prettytable.PrettyTable(['tag'] + PERIOD_NAME, sortby='day')
# t.align['tag'] = 'l'
# t.padding_width = 0
# for row in te:
#     t.add_row(row)
# with codecs.open('time_entropy.txt', 'w', 'utf8') as f:
#     f.write(t.get_string(border=False, left_padding_width=0,
#                          right_padding_width=2))
save_var('helsinki',
         tag_location(photos, None, CITY_BBOX, FIRST_TIME, LAST_TIME,
                      extra_info=['taken']))
t = 1000 * (clock() - start)
print('done in {:.3f}ms'.format(t))
import real_world as rw
import redensify
parser = ae.get_parser('Compute a galaxy tree')
args = parser.parse_args()
a = ae.further_parsing(args)
basename, seeds, synthetic_data, prefix, noise, balanced = a
if synthetic_data:
    try:
        ae.load_raw(basename, redensify, args)
    except IOError:
        import graph_tool as gt
        g = gt.load_graph(basename + '.gt')
        cexp.to_python_graph(g)
else:
    rw.read_original_graph(basename, seed=args.seed, balanced=balanced)
    redensify.G = deepcopy(rw.G)
    redensify.EDGES_SIGN = deepcopy(rw.EDGE_SIGN)
suffixes = ('_bal' if args.balanced else '', '_short' if args.short else '',
            '_safe' if args.safe else '', args.seed)
outname = 'lp10/{}{}{}{}_{}'.format(args.data.lower(), *suffixes)
print(outname)
res = meta_galaxy(redensify.G, redensify.EDGES_SIGN, 10, outname,
                  safe=args.safe, short=args.short)
if args.safe:
    gold, pred, _ = res
    import persistent
    persistent.save_var(outname + '_res.my', (gold, pred))
NTEST = 2000
city, districts = sys.argv[1], []
city_info = load_data(city)
gold_list = city_info[-1]
districts = sorted([nn for nn, gold in gold_list.iteritems()
                    if city in gold['gold']])
try:
    os.mkdir('random')
except OSError:
    pass
for district in districts:
    savename = 'random/{}_{}.my'.format(city, district)
    print(savename)
    if os.path.isfile(savename):
        continue
    distrib, best_score, best_region = [], 0, None
    for i in range(NTEST):
        regions, score = mock_random_list(city, district, city_info)
        if score > best_score:
            best_score, best_region = score, regions
        distrib.append(score)
    p.save_var(savename, distrib)
    outjson = [{'pos': rank + 1, 'metric': 'random', 'dst': -1,
                'venues': r[1],
                'geo': mapping(Polygon(np.fliplr(c.euclidean_to_geo(city,
                                                                    r[0]))))}
               for rank, r in enumerate(best_region)]
    filename = 'static/random_{}_{}.json'.format(city, district)
    with open(filename, 'w') as f:
        json.dump(outjson, f, sort_keys=True, indent=2,
                  separators=(',', ': '))
filename = sys.argv[1]
if len(sys.argv) > 2:
    skip = int(sys.argv[2])
else:
    skip = 0
csv.field_size_limit(sys.maxsize)
allowed_cities = set(SHORT_KEY)
with open(filename, 'rb') as f:
    reader = csv.DictReader(f, delimiter=';', quoting=csv.QUOTE_NONE)
    for i, row in enumerate(reader):
        # if i > 31000:
        #     break
        if i < skip:
            continue
        if row['lon,lat'] is None:
            print(row['vid'])
            continue
        venue = reformat(row)
        # if i == 29950:
        #     print(venue)
        #     break
        # if venue['_id'] == '4ac518c5f964a520c1a420e3':
        #     print(venue, venue['city'] in allowed_cities)
        if venue['city'] in allowed_cities:
            VENUE_LOC[venue['_id']] = (venue['loc'], venue['city'])
            TO_BE_INSERTED.append(venue)
        if len(TO_BE_INSERTED) == 400:
            mongo_insertion(TABLE)
mongo_insertion(TABLE)
persistent.save_var('venue_loc.my', VENUE_LOC)
city = args.city
chunker = Chunker.Chunker(foursquare.MAX_MULTI_REQUESTS)
previous = [e['_id'] for e in TABLE.find({'city': city})]
potential = gather_all_entities_id(checkins, DB_FIELD, city=city)
print('but already {} {}s in DB.'.format(len(previous), ENTITY_KIND))
import persistent as p
region = city or 'world'
invalid_filename = 'non_{}_id_{}'.format(ENTITY_KIND, region)
try:
    INVALID_ID = p.load_var(invalid_filename)
except IOError:
    pass
print('and {} {}s are invalid.'.format(len(INVALID_ID), ENTITY_KIND))
new_ones = set(potential).difference(set(previous))
new_ones = new_ones.difference(set(INVALID_ID))
outside = set([e['_id'] for e in TABLE.find({'city': None}, {'_id': 1})])
outside.intersection_update(new_ones)
print('and {} {}s are outside range.'.format(len(outside), ENTITY_KIND))
new_ones = new_ones.difference(outside)
print('So only {} new ones.'.format(len(new_ones)))
for batch in chunker(new_ones):
    IDS_QUEUE.put(batch)
    total_entities += len(batch)
IDS_QUEUE.join()
ENTITIES_QUEUE.join()
mongo_insertion()
print('{}/{} invalid id'.format(len(INVALID_ID), total_entities))
print('{}/{} requests'.format(CLIENT.rate_remaining, CLIENT.rate_limit))
p.save_var(invalid_filename, INVALID_ID)
    tags = sorted([k for k, v in entropies.items() if 2.5 <= v <= 3.01])
    save_var('mat_tag', tags)
    u = load_var('user_status')
    user_index = {k: i for i, k in enumerate(u)}

    def format_photo(p):
        user = user_index[p['uid']]
        loc = p['loc']['coordinates']
        taken = [p['taken'].weekday(), p['taken'].hour,
                 calendar.timegm(p['taken'].utctimetuple())]
        indicator = [int(t in p['ntags']) for t in tags]
        return [user] + loc + taken + indicator

    photos_feature = np.mat(tag_time(DB, tags, format_photo))
    sio.savemat('deep', {'A': scipy.sparse.csr_matrix(photos_feature)})


if __name__ == '__main__':
    import arguments
    args = arguments.city_parser().parse_args()
    city = args.city
    DB, client = cm.connect_to_db('world', args.host, args.port)
    s = clock()
    tags = supported_tags(DB, city, photos_threshold=30, users_threshold=5,
                          timespan=60)
    save_var(city + '_tag_support', tags)
    # entropies = {t[0]: period_entropy(DB, t[0]) for t in tags}
    # save_var('Hsupported', entropies)
    # get_data(DB)
    t = clock()
    print(t - s)
    if s == -1 == spred:
        path_lengths[0].append(slen)
    if s == -1 != spred:
        path_lengths[1].append(slen)
    if s == 1 != spred:
        path_lengths[2].append(slen)
    if s == 1 == spred:
        path_lengths[3].append(slen)
    # acc = accuracy_score(gold, pred)
    # f1, mcc = f1_score(gold, pred), matthews_corrcoef(gold, pred)
    mcc = compute_mcc(gold, pred)
    return (root_degree, branching_factors, positive_fraction, path_lengths,
            mcc)


if __name__ == '__main__':
    # pylint: disable=C0103
    num_threads, per_thread = 13, 6
    tasks = (num_threads*per_thread)
    rw.read_original_graph('soc-wiki.txt')
    roots = [_[0] for _ in rw.DEGREES[-tasks:]]
    edge_binary = {e: 2*int(s) - 1 for e, s in rw.EDGE_SIGN.items()}
    features = []
    target = []
    pool = Pool(num_threads)
    res = list(pool.imap_unordered(do_it, roots[:tasks],
                                   chunksize=per_thread))
    pool.close()
    pool.join()
    import persistent
    persistent.save_var('wik_feature.my', res)
#! /usr/bin/env python
# vim: set fileencoding=utf-8
"""Create Delaunay triangulation of random points in the plane."""
import sys
from timeit import default_timer as clock
import graph_tool.generation as gen
import numpy as np
import persistent


def to_python_graph(graph):
    """represents `graph` by two dictionaries"""
    G = {int(u): {int(v) for v in u.out_neighbours()}
         for u in graph.vertices()}
    E = {(int(e.source()), int(e.target())): True for e in graph.edges()}
    return G, E

if __name__ == '__main__':
    # pylint: disable=C0103
    n = int(sys.argv[1])
    start = clock()
    points = np.random.random((n, 2))*(n//50 + 1)
    g, _ = gen.triangulation(points, type="delaunay")
    persistent.save_var('belgrade/triangle_{}.my'.format(n),
                        to_python_graph(g))
    print('create {} edges in {:.3f} seconds'.format(g.num_edges(),
                                                     clock() - start))
sys.exit()
start = clock()
shazoo(*make_graph(4000))
print(clock() - start)
adj, _, ew, _, _, gold_sign = make_graph(400)
train_vertices = random.sample(gold_sign.keys(), 70)
gold, pred = offline_shazoo(adj, ew, gold_sign, train_vertices)
print(sum((1 for g, p in zip(gold, pred) if g != p)))
timing = []
for i in range(8):
    del FLEP_CALLS_TIMING[:]
    start = clock()
    shazoo(*make_graph(3250))
    p.save_var('flep_{}.my'.format(i), FLEP_CALLS_TIMING)
    # print('done in {:.3f} sec'.format(clock() - start))
    timing.append(clock() - start)
print('avrg run: {:.3f}'.format(sum(timing)/len(timing)))


def run_once(size):
    cexp.fast_preferential_attachment(size, 1)
    adj = cexp.redensify.G
    ew = {e: 120*random.random() for e in cexp.redensify.EDGES_SIGN}
    ns = {n: random.random() > .5 for n in adj
          if len(adj[n]) == 1 and random.random() < .7}
    root = max(adj.items(), key=lambda x: len(x[1]))[0]
    flep(adj, ns, ew, root)

run_once(1000)
run_once(1000)
start = clock()
        Atms.append(tms)
        Arr.append(rrs)
        with open('static/cmp_{}.json'.format(qcity)) as infile:
            star = ap.ujson.load(infile)
        get_gold = lambda c, d: [_['dst'] for _ in star[c][d]
                                 if _['metric'] == 'emd']
        rq = [star.get(_[0], {}).get(_[1]) is not None and
              np.min(get_gold(*_)) < 1e5 and len(dsts[i]) > 0
              for i, _ in enumerate(ALL_Q)]
        rqs = list(itertools.compress(ALL_Q, rq))
        t_slow = np.hstack([t_slow,
                            np.array([t for t, q in zip(emd, AQ)
                                      if q[0] == qcity and q[1:] in rqs])])
        t_fast = np.hstack([t_fast,
                            np.array(list(itertools.compress(tms, rq)))])
        slow = np.hstack([slow,
                          np.array([np.min(get_gold(*q))
                                    for q in itertools.compress(ALL_Q, rq)])])
        fast = np.hstack([fast,
                          np.array([10 if len(_) == 0 else min(_)
                                    for _ in itertools.compress(dsts, rq)])])
    full_data.append((Adsts, Atms, Arr, t_fast, t_slow, fast, slow))
import persistent as p
p.save_var('approx_brute_relevance.my', full_data)
import sys
sys.exit()
del full_data[:]
full_data = []
n_step = 1
# for n_step in range(5):
for knn in [8, 25, 50, 80, 160]:
    Adsts, Atms, Arr = [], [], []
    t_fast, t_slow, fast, slow = (np.array([]), np.array([]), np.array([]),
                                  np.array([]))
    for qcity in QCITIES:
        ALL_Q = queries(qcity)
        dsts, tms, rrs = ap.test_all_queries(ALL_Q, qcity, n_steps=1, k=knn)
        Adsts.append(dsts)
        Atms.append(tms)
        with open('static/cmp_{}.json'.format(qcity)) as infile:
#! /usr/bin/python2
# vim: set fileencoding=utf-8
import scipy.io as sio
import VenueFeature as vf
import CommonMongo as cm
import persistent as p
DB, CLIENT = cm.connect_to_db('foursquare')
vf.DB = DB
vf.CLIENT = CLIENT
brands = ["mcdonald's", 'starbucks']
import cities as C
starbucks = list(vf.DB.venue.find({'name': {'$in': ['Starbucks Coffee',
                                                    'Starbucks']}},
                                  {'city': 1}))
macdo = list(vf.DB.venue.find({'name': "McDonald's"}, {'city': 1}))
for city in C.SHORT_KEY:
    vindex = set(list(sio.loadmat(city + '_fv')['i']))
    fromdb = set([_['_id'] for _ in macdo if _['city'] == city])
    res = list(fromdb.intersection(vindex))
    p.save_var('{}_{}.my'.format(city, brands[0]), res)
    print('saved {} {} in {}'.format(len(res), brands[0], city))
    fromdb = set([_['_id'] for _ in starbucks if _['city'] == city])
    res = list(fromdb.intersection(vindex))
    p.save_var('{}_{}.my'.format(city, brands[1]), res)
    print('saved {} {} in {}'.format(len(res), brands[1], city))
def consolidate(tags):
    d = {tag: persistent.load_var(u'disc/post_{}_{}'.format(tag, GRID_SIZE))
         for tag in tags}
    persistent.save_var(u'disc/all_{}'.format(GRID_SIZE), d)
def post_process(tag):
    top_loc = persistent.load_var(u'disc/top_{}_{}'.format(tag, GRID_SIZE))
    merged = merge_regions(top_loc)
    persistent.save_var(u'disc/post_{}_{}'.format(tag, GRID_SIZE), merged)
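# A hedged sketch of the surrounding pipeline: post_process merges and saves
# each tag's top locations, then consolidate gathers them into a single
# 'disc/all_<GRID_SIZE>' file. The tag list below is illustrative only.
some_tags = ['museum', 'park', 'beach']
for tag in some_tags:
    post_process(tag)
consolidate(some_tags)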
    s = clock()
    llr.fit(Xa[train_feat], ya[train])
    pred = llr.predict(Xa[test_feat])
    end = clock()
    append_res(lesko, s, end, pred, ya[test], frac)
    s = clock()
    olr.fit(Xa[train, 15:17], ya[train])
    pred = olr.predict(Xa[test, 15:17])
    end = clock()
    append_res(logreg, s, end, pred, ya[test], frac)
    s = clock()
    dt.fit(Xa[train, 15:17], ya[train])
    pred = dt.predict(Xa[test, 15:17])
    end = clock()
    append_res(dectree, s, end, pred, ya[test], frac)
res[0].append(lesko)
res[1].append(fixed)
res[2].append(simple_fixed)
res[3].append(tuned)
res[4].append(cmp_tuned)
res[5].append(logreg)
res[6].append(dectree)
res[7].append(left_fixed)
res[8].append(rev_fixed)
p.save_var('{}_{}_{}.my'.format(pref, start, part + 1), (alphas, res))
def batch_predict(tree_adj, training_signs, edge_weight):
    """Predict the signs of all nodes of a weighted tree that are not given
    as training.

    It works by computing all the border trees, computing the sign of their
    hinge nodes (at most once), extracting all hinge trees within, and
    predicting the signs of non revealed nodes by propagating hinge values.
    """
    # since shazoo uses the revealed signs as-is, it's ok to use the same name
    training_signs, l2_values, rta_signs = training_signs
    all_nodes_to_predict = set(tree_adj) - set(training_signs)
    logging.debug('batch_predict has %d nodes to predict',
                  len(all_nodes_to_predict))
    methods = ['l2cost', 'rta', 'shazoo']
    # fields are current_closest_hinge, current_sign,
    # current_dst_to_closest_hinge
    node_predictions = {m: defaultdict(lambda: (None, None, 2e9))
                        for m in methods}
    hinge_value = {m: {} for m in methods}
    total_iter = 0
    while all_nodes_to_predict:
        some_root_of_a_border_tree = next(iter(all_nodes_to_predict))
        hinge_nodes, border_tree_nodes = find_hinge_nodes(
            tree_adj, edge_weight, training_signs, some_root_of_a_border_tree,
            with_visited=True)
        unmarked = border_tree_nodes - hinge_nodes
        for u in hinge_nodes:
            if u in hinge_value['shazoo']:
                continue
            vals, _, status = flep(tree_adj, (training_signs, rta_signs),
                                   edge_weight, u)
            hinge_value['shazoo'][u] = sgn(vals[0])
            hinge_value['rta'][u] = sgn(vals[1])
            if not USE_SCIPY:
                continue
            border_tree = build_border_tree_from_mincut_run(status,
                                                            edge_weight)
            _, E, El, leaves_sign, _, _ = border_tree
            L = {u: l2_values[u] for u in leaves_sign}
            mapped_E, mapped_El_L, mapping = preprocess_edge_and_leaves(E, El,
                                                                        L)
            val = solve_by_zeroing_derivative(mapped_E, mapped_El_L, mapping,
                                              L, reorder=False)[0][u]
            hinge_value['l2cost'][u] = sgn(val)
        predicted_in_that_border_tree = set()
        inner_iter = 0
        # to avoid the same fork being picked again and again
        unmarked.add(some_root_of_a_border_tree)
        while unmarked:
            one_to_predict = next(iter(unmarked))
            hinge_tree = get_hinge_tree(one_to_predict, tree_adj, hinge_nodes)
            other_predicted = set()
            for h, h_val in iteritems(hinge_value['shazoo']):
                if h not in hinge_tree:
                    continue
                predicted = propagate_hinge(hinge_tree, h, h_val,
                                            node_predictions['shazoo'],
                                            edge_weight)
                for u in predicted:
                    prediction_info = node_predictions['shazoo'][u]
                    used_hinge = prediction_info[0]
                    node_predictions['rta'][u] = (
                        used_hinge, hinge_value['rta'][used_hinge],
                        prediction_info[2])
                    if not USE_SCIPY:
                        continue
                    node_predictions['l2cost'][u] = (
                        used_hinge, hinge_value['l2cost'][used_hinge],
                        prediction_info[2])
                other_predicted.update(predicted)
            predicted_in_that_border_tree.update(other_predicted)
            unmarked -= other_predicted
            inner_iter += 1
            if inner_iter > len(tree_adj):
                import time
                logging.critical('batch predict failed in the inner loop')
                persistent.save_var(
                    '__fail_{}.my'.format(int(time.time())),
                    (tree_adj, (training_signs, l2_values, rta_signs),
                     edge_weight))
                raise RuntimeError('batch predict failed in the inner loop')
        all_nodes_to_predict -= predicted_in_that_border_tree
        total_iter += 1
        if total_iter > len(tree_adj):
            import time
            logging.critical('batch predict failed in the outer loop')
            persistent.save_var(
                '__fail_{}.my'.format(int(time.time())),
                (tree_adj, (training_signs, l2_values, rta_signs),
                 edge_weight))
            raise RuntimeError('batch predict failed in the outer loop')
    logging.debug('batch_predict has actually predicted %d nodes',
                  len(node_predictions) - len(training_signs))
    return {m: {u: v[1] for u, v in iteritems(node_predictions[m])
                if u not in training_signs}
            for m in methods}
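# A hedged usage sketch for batch_predict, assuming it lives in a module
# (here called shazoo) together with its helpers (find_hinge_nodes, flep,
# propagate_hinge, ...). The path graph, the unit edge weights and the sign
# encoding of the revealed nodes below are guesses for illustration only.
import shazoo
tree_adj = {0: {1}, 1: {0, 2}, 2: {1, 3}, 3: {2, 4}, 4: {3}}
edge_weight = {(0, 1): 1.0, (1, 2): 1.0, (2, 3): 1.0, (3, 4): 1.0}
revealed = {0: 1, 4: -1}        # signs used by the shazoo estimator
l2_values = {0: 1.0, 4: -1.0}   # real values used by the l2cost estimator
rta_signs = {0: 1, 4: -1}       # rounded values used by the rta estimator
preds = shazoo.batch_predict(tree_adj, (revealed, l2_values, rta_signs),
                             edge_weight)
print(preds['shazoo'])  # predicted signs of the non revealed nodes 1, 2, 3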
    u, v = int(e.source()), int(e.target())
    if (u, v) in bfs_tree:
        bfsmap[e] = True
    else:
        bfsmap[e] = False
k.set_vertex_filter(None)
k.set_edge_filter(bfsmap)
print_diag('build tree {}, {} test edges'.format(root, len(test_edges)))
bfs_dst = shortest_distance(k, dense=False)
bfs_mat = np.zeros((n, n), dtype=np.uint8)
for v in k.vertices():
    bfs_mat[int(v), :] = bfs_dst[v].a.astype(np.uint8)
print_diag('computed pairwise distance')
bsum = 0
bsize = 0
esum = 0
for i, v in enumerate(lcc_nodes):
    graph_distance = dst_mat[v, lcc_nodes[i+1:]]
    tree_distance = bfs_mat[v, lcc_nodes[i+1:]]
    if v in test_graph:
        esum += bfs_mat[v, sorted(test_graph[v])].sum()
    ratio = (tree_distance/graph_distance)
    bsum += ratio.sum()
    bsize += ratio.shape[0]
path_stretch = bsum/bsize
edge_stretch = (esum/2)/len(test_edges)
print_diag('computed stats')
print(idx, root, path_stretch, edge_stretch)
p.save_var('{}_out_{}.my'.format(prefix, idx),
           (idx, root, path_stretch, edge_stretch))
    def save_new_lid(self):
        """Save these new venue ids because we need to build their profile."""
        region = 'world' if self.city is None else self.city
        id_ = str(hash(self.cids[0]))[:5]
        output = 'new_venue_id_{}_{}'.format(id_, region)
        p.save_var(output, set(self.new_venues))