def near_triads(rfr_triads):
    """
    Graph the distance from 'me' to the mutual friend (series named "our")
    against the distance from 'me' to the non-mutual friend (series named "my").

    Every triad contributes one sample per kind.  The sample is the distance
    between the kind's 'loc' and the 'me' 'loc'.  The series it lands in is
    chosen by the kind and by the number of digits in the whole-number part
    of the distance between the kind's 'loc' and the 'you' 'loc' (capped at
    four digits, i.e. '1000+').

    The collected series are handed to ugly_graph_hist together with the
    name "near_triads.pdf".
    """
    bucket_names = ["", '0-10', '10-100', '100-1000', '1000+']
    # (kind, colour, line style) -- the line style is carried along but never
    # read: every series key below is built with 'solid'.
    kinds = (('my', 'b', 'dashed'), ('our', 'r', 'solid'))
    series = defaultdict(list)

    for triad in rfr_triads:
        for kind, colour, _line_style in kinds:
            to_you = coord_in_miles(triad[kind]['loc'], triad['you']['loc'])
            # digit count of the integer distance doubles as the bucket index
            digits = min(4, len(str(int(to_you))))
            name = '%s %s' % (kind, bucket_names[digits])
            to_me = coord_in_miles(triad[kind]['loc'], triad['me']['loc'])
            # line width grows with the bucket
            thickness = .6*1.8**(digits-1)
            series[name, colour, 'solid', thickness].append(to_me)

    ugly_graph_hist(
        series,
        "near_triads.pdf",
        bins=dist_bins(120),
        xlim=(1, 30000),
        label_len=True,
        kind="cumulog",
        normed=True,
        xlabel="distance between edges in miles",
        ylabel="number of users",
    )
def cheap_locals(nebr_ids, mloc_uids, cutoff=20):
    """
    Estimate a local contact ratio for each neighbour from a random sample
    of at most `cutoff` of its contacts (leafs).

    nebr_ids  -- iterable of user ids; repeated ids are processed once
    mloc_uids -- container of ids that must not be used as leafs
    cutoff    -- maximum number of shuffled contact ids that are looked up

    Yields (user id, ratio) pairs, where ratio is the share of located leafs
    whose distance to the neighbour is under 25.  Neighbours with no usable
    contacts, or with no located leafs, yield nothing.
    """
    visited = set()
    for nebr_id in nebr_ids:
        # There can be duplicates because nebr_ids is created by clumping
        # nebr_split
        if nebr_id in visited:
            continue
        visited.add(nebr_id)

        user = User.get_id(nebr_id)
        home = user.geonames_place.to_d()

        # gather the contact ids from every neighbour list on the user,
        # leaving out the ids in mloc_uids
        contact_ids = []
        for key in User.NEBR_KEYS:
            for cid in (getattr(user, key) or []):
                if cid not in mloc_uids:
                    contact_ids.append(cid)
        if not contact_ids:
            continue

        random.shuffle(contact_ids)
        sample = User.find(User._id.is_in(contact_ids[:cutoff]), fields=['gnp'])
        miles = [
            coord_in_miles(home, leaf.geonames_place.to_d())
            for leaf in sample
            if leaf.has_place()
        ]
        if not miles:
            continue
        yield user._id, sum(1.0 for d in miles if d<25)/len(miles)
def graph_rfrd_mdist(edges_d):
    """
    Graph the CDF of the distance to the recip friend, with one series per
    median location error (MLE) bin.

    Edges without a truthy 'rfrd' entry are ignored.  For the others the
    sample is the distance between the edge's 'mloc' and the friend, and the
    bin is the number of digits in the integer part of the friend's 'mdist'.

    One summary line per series is printed before the data is handed to
    ugly_graph_hist together with the name "rfrd_mdist.png".
    """
    mle_names = ["", '0<=MLE<10', '10<=MLE<100', '100<=MLE<1000']
    series = defaultdict(list)

    for edge in edges_d:
        friend = edge.get('rfrd')
        if friend:
            miles = coord_in_miles(edge['mloc'], friend)
            # NOTE(review): an 'mdist' of 1000 or more has four digits and
            # would index past the end of mle_names -- presumably such edges
            # are filtered out before this point; confirm.
            digits = len(str(int(friend['mdist'])))
            thickness = .3*2.5**digits
            series[mle_names[digits], FL_PURP, 'solid', thickness].append(miles)

    for series_key, miles_list in series.iteritems():
        # one space-separated line: key, two local ratios, sample count
        print('%s %s %s %s' % (
            series_key,
            peek.local_ratio(miles_list),
            peek.local_ratio(miles_list, 1000),
            len(miles_list),
        ))

    ugly_graph_hist(
        series,
        "rfrd_mdist.png",
        xlim=(1, 15000),
        normed=True,
        label_len=True,
        kind="cumulog",
        ylabel="fraction of edges",
        xlabel="length of edge in miles",
        figsize=(12, 6),
        bins=dist_bins(120),
        # order the series by line width, the fourth field of each key
        key_order=sorted(series, key=itemgetter(3)),
    )
def mdists(gnp_gps):
    """
    Find the median location error using geocoded users and yield it as a
    single dict.

    gnp_gps is an iterable of (gnp, mloc) pairs.  Distances are grouped by
    the gnp's 'fid' ('COORD' when there is none).  The result contains:

    * str(fid) -> median, for every feature with more than 2 distances;
    * code -> median, for every feature code that collects more than 5 of
      the remaining features (each contributes its first distance only);
    * 'other' -> median of everything left over.
    """
    min_per_feature = 2
    min_per_code = 5

    by_feature = defaultdict(list)
    last_gnp = {}
    for gnp, mloc in gnp_gps:
        miles = utils.coord_in_miles(gnp, mloc)
        feature = gnp.get('fid', 'COORD')
        by_feature[feature].append(miles)
        # later pairs overwrite earlier ones for the same feature
        last_gnp[feature] = gnp

    medians = {}
    by_code = defaultdict(list)
    for feature, gnp in last_gnp.iteritems():
        samples = by_feature[feature]
        if len(samples) > min_per_feature:
            # the feature has enough samples for a meaningful median
            medians[str(feature)] = numpy.median(samples)
        else:
            by_code[gnp.get('code')].append(samples[0])

    leftovers = []
    for code, samples in by_code.iteritems():
        if len(samples) > min_per_code:
            # the feature code has enough samples for a meaningful median
            medians[code] = numpy.median(samples)
        else:
            leftovers.extend(samples)

    # catch-all for everything else
    medians['other'] = numpy.median(leftovers)
    yield medians
def diff_mloc_mdist(uids):
    """
    For each target user yield the location error and the median location
    error.

    Users are loaded through _paged_users with the 'gnp' and 'mloc' fields.
    Users without a geonames_place are skipped.  Each yielded pair is
    (distance between the user's geonames_place and median_loc,
    geonames_place.mdist).
    """
    for user in _paged_users(uids, fields=['gnp', 'mloc']):
        place = user.geonames_place
        if not place:
            continue
        error = coord_in_miles(place.to_d(), user.median_loc)
        yield error, place.mdist
def nebr_dists(mloc_tile):
    """
    Find the distances from a target user to their contacts.

    mloc_tile is a mapping with the target's 'mloc' and the ids of its
    contacts under 'nebrs'.  Yields one (distance, 1) pair per contact
    returned by the lookup.
    """
    wanted = User._id.is_in(mloc_tile['nebrs'])
    for nebr in User.find(wanted, fields=['gnp']):
        miles = coord_in_miles(mloc_tile['mloc'], nebr.geonames_place.to_d())
        # the trailing 1 keeps the output format identical to stranger_dists
        yield miles, 1
def rfrd_dists(edge_d, dirt_cheap_locals, cheap_locals, aint_cheap_locals):
    """
    Combine the three local contact ratios of the recip friend with the
    actual distance from the target to that friend.

    The edge's 'rfrd' dict is updated in place with 'dist', 'cheap', 'dirt'
    and 'aint' and then yielded.  Each ratio is looked up by the friend's
    '_id' and is None when the corresponding mapping has no entry.  Nothing
    is yielded when the edge has no truthy 'rfrd'.
    """
    amigo = edge_d.get('rfrd')
    if not amigo:
        return

    amigo['dist'] = coord_in_miles(edge_d['mloc'], amigo)
    ratio_sources = (
        ('cheap', cheap_locals),
        ('dirt', dirt_cheap_locals),
        ('aint', aint_cheap_locals),
    )
    for field, ratios in ratio_sources:
        amigo[field] = ratios.get(amigo['_id'])
    yield amigo
def edge_dists(edge_d):
    """
    Distill an edge_d into a smaller amount of information about each of
    the types of edges.

    For every edge type present ('jfol', 'jfrd', 'rfrd', 'jat') yield
    ((type, i_at, u_at, prot), distance), where the distance is measured
    from the edge's 'mloc' to that contact.  A recip friend with a truthy
    'usa' value is additionally reported under the type 'usa'.
    """
    for kind in ('jfol', 'jfrd', 'rfrd', 'jat'):
        amigo = edge_d.get(kind)
        if not amigo:
            continue
        # sanity check on the input; note that it is skipped when Python
        # runs with -O
        assert amigo['mdist']<1000
        miles = coord_in_miles(edge_d['mloc'], amigo)
        traits = (amigo['i_at'], amigo['u_at'], amigo['prot'])
        yield (kind,) + traits, miles
        if kind == 'rfrd' and amigo['usa']:
            yield ('usa',) + traits, miles
def mloc_users(users_and_coords):
    """
    Pick users with good home locations from geotweets.

    A user is kept when all of the following hold:

    * they have more than two spots;
    * followers_count and friends_count are not both 0;
    * the median distance from their spots to the 2d median of those spots
      is not above 50.

    Kept users get the 2d median stored under 'mloc'.  The kept users are
    returned as a list in random order.
    """
    users, locs = _untangle_users_and_coords(users_and_coords)
    keepers = []
    for uid, user in users.iteritems():
        spots = locs[uid]
        enough_spots = len(spots) > 2
        if enough_spots and (user['followers_count'] != 0 or user['friends_count'] != 0):
            home = utils.median_2d(spots)
            spread = numpy.median([utils.coord_in_miles(home, spot) for spot in spots])
            if spread > 50:
                # user moves too much
                continue
            user['mloc'] = home
            keepers.append(user)
    random.shuffle(keepers)
    return keepers
def nebr_vect(user, geo_ated, dirt_cheap_locals):
    """
    Create a vector for each edge from a target to a contact.

    One list is yielded per entry of user['nebrs'], laid out as:

        [ated, at_back, ated and at_back, fols, frds, fols and frds,
         logify(mdist), logify(folc), logify(frdc),
         lorat, prot, mloc_dist]

    ated, fols and frds are bits 0, 1 and 2 of the contact's 'kind'.
    at_back is 1 when the contact's id is among geo_ated[user['_id']].
    lorat comes from dirt_cheap_locals and defaults to .25.  mloc_dist is
    the logified distance from the user's 'mloc' to the contact, or nan
    when the user has no 'mloc'.
    """
    mentioned = geo_ated.get(user['_id'], ())
    for nebr in user['nebrs']:
        # I really don't like the way I did these flags.
        kind = nebr['kind']
        ated = kind & 1
        fols = kind >> 1 & 1
        frds = kind >> 2 & 1
        at_back = int(nebr['_id'] in mentioned)

        vect = [ated, at_back, ated and at_back, fols, frds, fols and frds]
        for field in ('mdist', 'folc', 'frdc'):
            vect.append(logify(nebr[field]))

        if 'mloc' not in user:
            mloc_dist = float('nan')
        else:
            mloc_dist = logify(coord_in_miles(user['mloc'], nebr), fudge=.01)

        # why not nan for missing lorat?
        lorat = dirt_cheap_locals.get(nebr['_id'], .25)
        vect.extend([lorat, int(bool(nebr['prot'])), mloc_dist])
        yield vect
def predictions(self, nebrs_ds, in_paths, geo_ated, dirt_cheap_locals):
    """
    Run every classifier over every target and collect the distance of the
    location each classifier picked.

    The environment is loaded for the clump named by the last element of
    the first input path, and geo_ated / dirt_cheap_locals are stored on
    self (as geo_ated / cheap_locals) before any target is processed.
    Targets with no 'nebrs' are skipped.

    Returns an iterator of (classifier key, list of distances) pairs.
    """
    self.load_env(self.env, in_paths[0][-1])
    self.geo_ated = geo_ated
    self.cheap_locals = dirt_cheap_locals

    by_classifier = defaultdict(list)
    for target in nebrs_ds:
        if not target['nebrs']:
            continue
        self.prep_nebrs(target)
        for name, classifier in self.classifiers.iteritems():
            choice = classifier.predict(target, self.vect_fit)
            vects = target['vects']
            if choice != len(vects):
                # the distance is the last entry of the chosen vector
                miles = unlogify(vects[choice][-1], .01)
            else:
                # the last one is the gnp one
                miles = utils.coord_in_miles(target['gnp'], target['mloc'])
            by_classifier[name].append(miles)
    return by_classifier.iteritems()
def mloc_reject_count(users_and_coords):
    """
    Count the number of users we ignored in mloc_users.  (This job was done
    to calculate a number for the paper, and is almost trash.)

    Every user is counted under exactly one of:

    'spots'  -- two spots or fewer
    'moves'  -- median distance from the spots to their 2d median above 50
    'counts' -- followers_count and friends_count both 0
    'good'   -- none of the above

    Returns an iterator of (reason, count) pairs.
    """
    tally = collections.defaultdict(int)
    users, locs = _untangle_users_and_coords(users_and_coords)
    for uid, user in users.iteritems():
        spots = locs[uid]
        if len(spots) <= 2:
            reason = 'spots'
        else:
            home = utils.median_2d(spots)
            spread = numpy.median([utils.coord_in_miles(home, spot) for spot in spots])
            if spread > 50:
                reason = 'moves'
            elif user['followers_count'] == 0 and user['friends_count'] == 0:
                reason = 'counts'
            else:
                reason = 'good'
        tally[reason] += 1
    return tally.iteritems()
def _calc_lorat(nebrs, twit, gis):
    """
    Set local_ratio on every neighbour, based on its first ten contacts.

    The profiles of those contacts (leafs) are fetched in one call to
    _fetch_profiles.  A neighbour's ratio is the share of its located leafs
    whose distance to the neighbour is under 25, or nan when none of its
    leafs could be located.

    nebrs is walked twice, so it has to be re-iterable.
    """
    wanted = set(
        leaf_id
        for nebr in nebrs
        for leaf_id in nebr.contacts[:10]
    )
    profiles = dict(
        (leaf._id, leaf)
        for leaf in _fetch_profiles(list(wanted), twit, gis)
    )

    for nebr in nebrs:
        # Does this break if the contact does not exist?
        home = nebr.geonames_place.to_d()
        miles = []
        for leaf_id in nebr.contacts[:10]:
            leaf = profiles.get(leaf_id)
            if not leaf or not leaf.has_place():
                continue
            miles.append(utils.coord_in_miles(home, leaf.geonames_place.to_d()))
        nebr.local_ratio = (
            sum(1.0 for d in miles if d<25)/len(miles)
            if miles else float('nan')
        )