def cheap_locals(nebr_ids, mloc_uids, cutoff=20):
    """Yield (contact_id, blur) pairs, where blur is the fraction of a
    random sample of up to `cutoff` leaf contacts that live within 25
    miles of the contact's own location.

    nebr_ids may contain duplicates (it is created by clumping
    nebr_split), so each id is processed only once.
    """
    done = set()
    for uid in nebr_ids:
        if uid in done:
            continue
        done.add(uid)
        contact = User.get_id(uid)
        here = contact.geonames_place.to_d()
        # gather this contact's own contacts, skipping target users
        leaf_ids = []
        for key in User.NEBR_KEYS:
            for cid in (getattr(contact, key) or []):
                if cid not in mloc_uids:
                    leaf_ids.append(cid)
        if not leaf_ids:
            continue
        # sample a random subset of up to `cutoff` leafs
        random.shuffle(leaf_ids)
        sample = User.find(User._id.is_in(leaf_ids[:cutoff]), fields=['gnp'])
        dists = []
        for leaf in sample:
            if leaf.has_place():
                dists.append(coord_in_miles(here, leaf.geonames_place.to_d()))
        if not dists:
            continue
        near = sum(1.0 for d in dists if d < 25)
        yield contact._id, near / len(dists)
def _paged_users(uids, **find_kwargs):
    """Fetch users from the database 100 at a time to save round trips.

    Any extra keyword arguments are forwarded to User.find. Returns a
    single flattened iterable over all the batches.
    """
    batches = utils.grouper(100, uids, dontfill=True)
    return chain.from_iterable(
        User.find(User._id.is_in(list(batch)), **find_kwargs)
        for batch in batches
    )
def pred_users(uids):
    """Fetch target users from the database and yield them as dicts.

    Users are retrieved 100 at a time via _paged_users, which performs
    the same batching this function previously duplicated inline.
    """
    for user in _paged_users(uids):
        yield user.to_d()
def nebr_dists(mloc_tile):
    """Yield (distance, 1) for each contact of a target user.

    The distance is measured in miles from the target user's home
    location to the contact's geonames place. The trailing 1 keeps the
    output format identical to stranger_dists.
    """
    home = mloc_tile['mloc']
    contacts = User.find(User._id.is_in(mloc_tile['nebrs']), fields=['gnp'])
    for contact in contacts:
        yield coord_in_miles(home, contact.geonames_place.to_d()), 1
def mloc_tile(mloc_uids):
    """Split the target users into tiles based on their home location.

    Yields (tile, user_dict) pairs; users without neighbors are
    skipped.
    """
    found = User.find(User._id.is_in(tuple(mloc_uids)), fields=['mloc','nebrs'])
    for user in found:
        if user.neighbors:
            lng, lat = user.median_loc
            yield _tile(lat), user.to_d()
def mloc_uids(user_ds):
    """Pick 2500 target users who have locations and good contacts.

    Reads 2600 candidates so that, after discarding accounts with no
    contacts, there are still (hopefully) 2500 good ones left.
    """
    candidates = [d['id'] for d in itertools.islice(user_ds, 2600)]
    found = User.find(User._id.is_in(candidates))
    with_nebrs = set(
        u._id for u in found
        if any(getattr(u, key) for key in NEBR_KEYS)
    )
    # preserve the original candidate order while filtering
    keep = [uid for uid in candidates if uid in with_nebrs]
    logging.info("found %d of %d", len(keep), len(candidates))
    return keep[:2500]
def _fetch_profiles(uids, twit, gis):
    """Return User profiles for uids, looking missing ones up on Twitter.

    Profiles already in the database are used as-is; the rest are
    fetched 100 at a time via twit, geocoded with gis, saved, and
    appended to the result.
    """
    profiles = list(User.find(User._id.is_in(uids)))
    stored = {u._id for u in profiles}
    absent = [uid for uid in uids if uid not in stored]
    for batch in utils.grouper(100, absent, dontfill=True):
        fetched = twit.user_lookup(user_ids=list(batch))
        for amigo in fetched:
            if not amigo:
                continue
            amigo.geonames_place = gis.twitter_loc(amigo.location)
            amigo.merge()
            profiles.append(amigo)
    return profiles
def nebrs_d(user_d, mloc_blur):
    """Build a dict with lots of information about a target user's
    located contacts.

    Returns a single-element list so the output can be written by the
    usual pipeline machinery.
    """
    blur_model = MlocBlur(*mloc_blur)
    target = User(user_d)
    contacts = User.find(User._id.is_in(user_d['nebrs']))
    tweets = Tweets.get_id(user_d['_id'], fields=['ats'])
    result = make_nebrs_d(target, contacts, tweets.ats)
    result['mloc'] = user_d['mloc']
    result['gnp'] = _blur_gnp(blur_model, user_d)
    return [result]
def _pick_neighbors(user):
    """Pick up to 25 located contacts, interleaved across contact types."""
    located = {}
    for key in NEBR_KEYS:
        cids = getattr(user, key)
        if not cids:
            continue
        # this is slowish
        found = User.find(User._id.is_in(cids), fields=['gnp'])
        located[key] = set(c._id for c in found if c.has_place())
    # round-robin across the contact types, dropping the padding that
    # izip_longest inserts (and any other falsy entries)
    interleaved = itertools.izip_longest(*located.values())
    flat = [cid for cid in itertools.chain.from_iterable(interleaved) if cid]
    picked = flat[:25]
    logging.info('picked %d of %d contacts', len(picked), len(user.contacts))
    return picked
def fix_mloc_mdists(mloc_uids, mdists):
    """Add the median location error to profiles of contacts and target
    users.

    We didn't have mdists at the time the mloc users were saved. This
    function could be avoided by running the mdist calculation before
    running find_contacts. Returns [count] of the users whose geocoded
    place could be re-resolved.
    """
    gis = gisgraphy.GisgraphyResource()
    gis.set_mdists(mdists)
    fixed = 0
    for user in User.find(User._id.is_in(tuple(mloc_uids))):
        user.geonames_place = gis.twitter_loc(user.location)
        user.save()
        if user.geonames_place:
            fixed += 1
    logging.info("fixed %d mdists", fixed)
    return [fixed]