Esempio n. 1
0
def near_triads(rfr_triads):
    """
    Plot the distance from 'me' to a non-mutual friend (series prefixed 'my')
    against the distance from 'me' to a mutual friend (series prefixed 'our').

    Every item of rfr_triads is indexed with the keys 'me', 'you', 'my' and
    'our', and each of those entries is expected to carry a 'loc'.  A series
    is chosen by the number of digits in the friend-to-'you' distance (capped
    at four); the value appended to it is the friend-to-'me' distance.  The
    collected series are handed to ugly_graph_hist, which is asked to write
    "near_triads.pdf".
    """
    bucket_names = ["",'0-10','10-100','100-1000','1000+']
    # NOTE(review): the third element of each tuple (a line style) is unpacked
    # but never used -- every series below is keyed with 'solid'.  Confirm
    # whether 'dashed' was meant to reach the plot.
    series_styles = (('my','b','dashed'),('our','r','solid'))
    series = defaultdict(list)

    for triad in rfr_triads:
        for who,color,_unused_fill in series_styles:
            friend = triad[who]
            to_you = coord_in_miles(friend['loc'],triad['you']['loc'])
            # digit count of the whole-mile distance: 1 -> 0-10, 2 -> 10-100, ...
            digits = min(4,len(str(int(to_you))))
            name = '%s %s'%(who,bucket_names[digits])
            to_me = coord_in_miles(triad[who]['loc'],triad['me']['loc'])
            # line width grows geometrically with the bucket index
            line_width = .6*1.8**(digits-1)
            series[name,color,'solid',line_width].append(to_me)

    ugly_graph_hist(
        series,
        "near_triads.pdf",
        bins=dist_bins(120),
        xlim=(1,30000),
        label_len=True,
        kind="cumulog",
        normed=True,
        xlabel="distance between edges in miles",
        ylabel="number of users",
    )
Esempio n. 2
0
def cheap_locals(nebr_ids,mloc_uids,cutoff=20):
    """
    Yield (user id, local contact ratio) pairs.

    For every distinct id in nebr_ids the user is loaded, its contacts (the
    attributes named by User.NEBR_KEYS, minus anything in mloc_uids) are
    shuffled, and at most `cutoff` of them are fetched as leafs.  The ratio
    is the share of located leafs that lie under 25 miles from the user.
    Users with no eligible contacts, or with no located leafs, produce no
    output.
    """
    visited = set()
    for nebr_id in nebr_ids:
        # nebr_ids is created by clumping nebr_split, so ids may repeat
        if nebr_id in visited:
            continue
        visited.add(nebr_id)

        nebr = User.get_id(nebr_id)
        home = nebr.geonames_place.to_d()

        candidates = []
        for key in User.NEBR_KEYS:
            # a missing/None contact list counts as empty
            for cid in (getattr(nebr,key) or []):
                if cid not in mloc_uids:
                    candidates.append(cid)
        if not candidates:
            continue

        # random sample: shuffle in place, then take the head
        random.shuffle(candidates)
        sample = candidates[:cutoff]
        leafs = User.find(User._id.is_in(sample), fields=['gnp'])

        dists = []
        for leaf in leafs:
            if leaf.has_place():
                dists.append(coord_in_miles(home,leaf.geonames_place.to_d()))
        if not dists:
            continue

        nearby = sum(1.0 for d in dists if d<25)
        yield nebr._id,nearby/len(dists)
Esempio n. 3
0
def graph_rfrd_mdist(edges_d):
    """
    graph CDF of distance to recip friend split into bins based on median
    location error
    """
    data = defaultdict(list)
    labels = ["",'0<=MLE<10','10<=MLE<100','100<=MLE<1000']

    for edge_d in edges_d:
        amigo = edge_d.get('rfrd')
        if not amigo:
            continue
        dist = coord_in_miles(edge_d['mloc'],amigo)
        bin = len(str(int(amigo['mdist'])))
        width = .3*2.5**bin
        data[labels[bin],FL_PURP,'solid',width].append(dist)

    for label, dists in data.iteritems():
        print label,peek.local_ratio(dists),peek.local_ratio(dists,1000),len(dists)

    ugly_graph_hist(data,
            "rfrd_mdist.png",
            xlim= (1,15000),
            normed=True,
            label_len=True,
            kind="cumulog",
            ylabel = "fraction of edges",
            xlabel = "length of edge in miles",
            figsize=(12,6),
            bins = dist_bins(120),
            key_order = sorted(data,key=itemgetter(3)),
            )
Esempio n. 4
0
def mdists(gnp_gps):
    """
    Find median location errors from geocoded users and yield them as one dict.

    gnp_gps is an iterable of (gnp, mloc) pairs.  The yielded dict maps:
      * str(feature id) -> median error, for features seen more than
        item_cutoff times;
      * feature code -> median error, for codes that gathered more than
        kind_cutoff values from the remaining features;
      * 'other' -> median of everything left over.
    """
    item_cutoff=2
    kind_cutoff=5
    medians = {}

    # group the observed errors by feature id ('COORD' when gnp has no 'fid')
    errors_by_fid = defaultdict(list)
    place_by_fid = {}
    for gnp,mloc in gnp_gps:
        fid = gnp.get('fid','COORD')
        errors_by_fid[fid].append(utils.coord_in_miles(gnp,mloc))
        place_by_fid[fid] = gnp

    errors_by_code = defaultdict(list)
    for fid,place in place_by_fid.iteritems():
        errors = errors_by_fid[fid]
        if len(errors)<=item_cutoff:
            # too few samples for this feature: fall back to its feature
            # code.  Only the first recorded error is carried over.
            errors_by_code[place.get('code')].append(errors[0])
            continue
        # an entry for each feature that has a meaningful median
        medians[str(fid)] = numpy.median(errors)

    leftovers = []
    for code,errors in errors_by_code.iteritems():
        if len(errors)<=kind_cutoff:
            leftovers.extend(errors)
            continue
        # an entry for each feature code that has a meaningful median
        medians[code] = numpy.median(errors)

    # catch-all for everything else
    medians['other'] = numpy.median(leftovers)
    yield medians
Esempio n. 5
0
def diff_mloc_mdist(uids):
    """
    Yield (location error, median location error) for each target user.

    Users are loaded through _paged_users; those without a geonames_place are
    skipped.  The location error is the distance in miles between the user's
    geonames place and its median_loc.
    """
    for contact in _paged_users(uids,fields=['gnp','mloc']):
        if not contact.geonames_place:
            continue
        error = coord_in_miles(
            contact.geonames_place.to_d(),
            contact.median_loc,
        )
        yield error,contact.geonames_place.mdist
Esempio n. 6
0
def nebr_dists(mloc_tile):
    """
    Yield (distance, 1) from a target user to each of its contacts.

    mloc_tile supplies the target's location under 'mloc' and the contact ids
    under 'nebrs'.  The constant 1 keeps the output format identical to
    stranger_dists.
    """
    wanted = User._id.is_in(mloc_tile['nebrs'])
    for nebr in User.find(wanted,fields=['gnp']):
        miles = coord_in_miles(mloc_tile['mloc'], nebr.geonames_place.to_d())
        yield miles,1
Esempio n. 7
0
def rfrd_dists(edge_d,dirt_cheap_locals,cheap_locals,aint_cheap_locals):
    """
    Yield the reciprocal friend of an edge, annotated with its distance from
    the target and its three local contact ratios.

    Nothing is yielded when edge_d has no truthy 'rfrd' entry.  Otherwise the
    friend dict is updated in place with 'dist', 'cheap', 'dirt' and 'aint'
    (a ratio is None when the friend's '_id' is absent from that table) and
    then yielded.
    """
    amigo = edge_d.get('rfrd')
    if not amigo:
        return

    amigo['dist'] = coord_in_miles(edge_d['mloc'],amigo)
    ratio_tables = (
        ('cheap',cheap_locals),
        ('dirt',dirt_cheap_locals),
        ('aint',aint_cheap_locals),
    )
    for field,table in ratio_tables:
        amigo[field] = table.get(amigo['_id'])
    yield amigo
Esempio n. 8
0
def edge_dists(edge_d):
    """
    Distill an edge_d into (key, distance) pairs, one per edge type present.

    The edge types looked up are 'jfol', 'jfrd', 'rfrd' and 'jat'; missing or
    falsy entries are skipped.  Each key is the tuple
    (edge type, i_at, u_at, prot).  A reciprocal friend whose 'usa' flag is
    truthy is reported a second time with 'usa' as the edge type.
    """
    for kind in ('jfol','jfrd','rfrd','jat'):
        amigo = edge_d.get(kind)
        if not amigo:
            continue
        # contacts are expected to arrive with a median error under 1000
        assert amigo['mdist']<1000
        miles = coord_in_miles(edge_d['mloc'],amigo)
        yield (kind,amigo['i_at'],amigo['u_at'],amigo['prot']),miles
        if kind!='rfrd' or not amigo['usa']:
            continue
        yield ('usa',amigo['i_at'],amigo['u_at'],amigo['prot']),miles
Esempio n. 9
0
def mloc_users(users_and_coords):
    """
    Pick users with good home locations from geotweets.

    A user is kept when it has more than two spots, has at least one follower
    or friend, and the median distance from its spots to their 2-d median is
    not above 50 miles.  Kept users get that 2-d median stored under 'mloc'.
    The result is a shuffled list of the kept user dicts.
    """
    users, locs = _untangle_users_and_coords(users_and_coords)
    keepers = []
    for uid,user in users.iteritems():
        spots = locs[uid]
        if len(spots)<=2:
            continue
        no_followers = user['followers_count']==0
        if no_followers and user['friends_count']==0:
            continue
        center = utils.median_2d(spots)
        spread = numpy.median(
            [utils.coord_in_miles(center,spot) for spot in spots]
        )
        if spread>50:
            # user moves too much
            continue
        user['mloc'] = center
        keepers.append(user)
    random.shuffle(keepers)
    return keepers
Esempio n. 10
0
def nebr_vect(user,geo_ated,dirt_cheap_locals):
    """
    Yield one feature vector (a list) per edge from a target to a contact.

    Layout of each vector:
      [ated, at_back, ated and at_back, fols, frds, fols and frds,
       logify(mdist), logify(folc), logify(frdc),
       local ratio, protected flag, logified distance to the target's mloc]
    The last element is NaN when the target has no 'mloc'; the local ratio
    defaults to .25 when the contact is missing from dirt_cheap_locals.
    """
    mentioned = geo_ated.get(user['_id'],())
    for nebr in user['nebrs']:
        # the three low bits of 'kind' are, from bit 0 up: ated, fols, frds
        ated = nebr['kind'] >>0 & 1
        fols = nebr['kind'] >>1 & 1
        frds = nebr['kind'] >>2 & 1
        at_back = int(nebr['_id'] in mentioned)
        vect = [ated, at_back, ated and at_back, fols, frds, fols and frds]

        for field in ('mdist','folc','frdc'):
            vect.append(logify(nebr[field]))

        mloc_dist = (
            logify(coord_in_miles(user['mloc'],nebr),fudge=.01)
            if 'mloc' in user
            else float('nan')
        )
        # why not nan for missing lorat?
        lorat = dirt_cheap_locals.get(nebr['_id'],.25)
        vect.extend([lorat, int(bool(nebr['prot'])), mloc_dist])
        yield vect
Esempio n. 11
0
    def predictions(self, nebrs_ds, in_paths, geo_ated, dirt_cheap_locals):
        """
        Run every classifier over every target and collect the error, in
        miles, of the location each one picked.

        The environment is loaded for the clump named by the last element of
        the first input path.  Targets with no 'nebrs' are skipped.  Returns
        an iterator of (classifier key, list of distances) pairs.
        """
        self.load_env(self.env,in_paths[0][-1])
        self.geo_ated = geo_ated
        self.cheap_locals = dirt_cheap_locals

        dists_by_key = defaultdict(list)
        for nebrs_d in nebrs_ds:
            if nebrs_d['nebrs']:
                self.prep_nebrs(nebrs_d)
                for key,classifier in self.classifiers.iteritems():
                    picked = classifier.predict(nebrs_d,self.vect_fit)
                    if picked!=len(nebrs_d['vects']):
                        # last vector element is the logified distance
                        dist = unlogify(nebrs_d['vects'][picked][-1],.01)
                    else:
                        # an index one past the vectors is the gnp one
                        dist = utils.coord_in_miles(nebrs_d['gnp'],nebrs_d['mloc'])
                    dists_by_key[key].append(dist)
        return dists_by_key.iteritems()
Esempio n. 12
0
def mloc_reject_count(users_and_coords):
    """
    Count the users ignored in mloc_users, by reason.  (This job was done to
    calculate a number for the paper, and is almost trash.)

    Returns an iterator of (reason, count) pairs where reason is one of
    'spots' (two or fewer spots), 'moves' (median spread above 50 miles),
    'counts' (no followers and no friends) or 'good'.
    """
    tally = collections.defaultdict(int)
    users, locs = _untangle_users_and_coords(users_and_coords)
    for uid,user in users.iteritems():
        spots = locs[uid]
        if len(spots)<=2:
            tally['spots']+=1
            continue
        center = utils.median_2d(spots)
        spread = numpy.median(
            [utils.coord_in_miles(center,spot) for spot in spots]
        )
        if spread>50:
            verdict = 'moves'
        elif user['followers_count']==0 and user['friends_count']==0:
            verdict = 'counts'
        else:
            verdict = 'good'
        tally[verdict]+=1
    return tally.iteritems()
Esempio n. 13
0
def _calc_lorat(nebrs,twit,gis):
    """
    Set local_ratio on every contact in nebrs.

    The first ten contacts of each nebr are fetched in one batch through
    _fetch_profiles.  A nebr's local_ratio is the share of its located leafs
    that lie under 25 miles from the nebr's own place, or NaN when none of
    its leafs could be located.  nebrs is walked twice, so it must be
    re-iterable.
    """
    # gather every leaf id once so that shared leafs are fetched only once
    leaf_ids = set()
    for nebr in nebrs:
        for uid in nebr.contacts[:10]:
            leaf_ids.add(uid)
    fetched = _fetch_profiles(list(leaf_ids),twit,gis)
    leaf_by_id = dict((leaf._id,leaf) for leaf in fetched)

    for nebr in nebrs:
        # Does this break if the contact does not exist?
        nebr_loc = nebr.geonames_place.to_d()
        candidates = (leaf_by_id.get(leaf_id) for leaf_id in nebr.contacts[:10])
        dists = [
            utils.coord_in_miles(nebr_loc,leaf.geonames_place.to_d())
            for leaf in candidates
            if leaf and leaf.has_place()
        ]
        nebr.local_ratio = (
            sum(1.0 for d in dists if d<25)/len(dists)
            if dists
            else float('nan')
        )