Ejemplo n.º 1
0
def force_lookup(to_db="hou",start_id='',end_id=None):
    """Look up users who were not included in the original crawl.

    User documents with ids from 'U' + start_id up to 'U' + end_id (or
    'V' when end_id is falsy) are read from the current Model.database.
    Model.database is then rebound to connect(to_db) before any of those
    users is processed.  A user is handed to user_lookup() only when it
    passes every skip rule below and its log_score is not under the
    threshold that applies to its utc_offset.
    """
    first_key = 'U' + start_id
    last_key = ('U' + end_id) if end_id else 'V'

    # The view is requested from the database that is current *now*; the
    # generator is lazy, so User objects are only built inside the loop,
    # after Model.database has been switched.
    source_view = Model.database.paged_view(
        '_all_docs',
        include_docs=True,
        startkey=first_key,
        endkey=last_key,
    )
    candidates = (User(row['doc']) for row in source_view)
    Model.database = connect(to_db)

    # Ids present in the "houtx" database over the same key range are
    # skipped further down.
    found_view = connect("houtx").paged_view(
        '_all_docs', startkey=first_key, endkey=last_key)
    already_found = set(row['id'] for row in found_view)

    scores = Scores()
    scores.read(settings.lookup_out)
    region = ("Texas", "United States")

    for user in candidates:
        int_uid = as_int_id(user._id)

        # Skip rules, checked in the same order as a single `or` chain.
        if user.lookup_done or user.protected:
            continue
        if int_uid not in scores:
            continue
        if user.local_prob == 1:
            continue
        if user.local_prob == 0 and user.geonames_place.name not in region:
            continue
        if user._id in already_found:
            continue

        state, rfs, ats = scores.split(int_uid)
        # -21600 is -6 hours if utc_offset is in seconds -- TODO confirm
        # the unit.  Those users are scored with weight .9 against a
        # threshold of 1; everyone else uses the default weight and
        # settings.non_local_cutoff.
        if user.utc_offset == -21600:
            below_threshold = log_score(rfs, ats, .9) < 1
        else:
            below_threshold = log_score(rfs, ats) < settings.non_local_cutoff
        if not below_threshold:
            user_lookup(user)
Ejemplo n.º 2
0
 def pick_users(self, cutoff):
     """Send a job for every NEW user whose log score reaches cutoff.

     Each entry of self.scores is split into (state, rfs, ats).  For
     entries in state scoredict.NEW with log_score(rfs, ats) >= cutoff,
     self._send_job() is called and self.lookups is incremented.
     """
     logging.info("pick_users with score %d", cutoff)
     for uid in self.scores:
         state, rfs, ats = self.scores.split(uid)
         if state == scoredict.NEW:
             # The score is only computed for NEW entries.
             if log_score(rfs, ats) >= cutoff:
                 self._send_job(uid, rfs, ats)
                 self.lookups += 1
Ejemplo n.º 3
0
def _users_from_scores(cutoff=11):
    """Yield the ids of users whose log score is at least cutoff.

    A Scores table is loaded from settings.lookup_out and every id in it
    is split into (state, rfs, ats); the id is yielded when
    log_score(rfs, ats) >= cutoff.

    cutoff -- minimum log score an id needs in order to be yielded.
              Defaults to 11, the threshold that used to be hard-coded,
              so existing zero-argument callers see the same results.
    """
    scores = Scores()
    scores.read(settings.lookup_out)
    for uid in scores:
        # The state component is not used for this selection.
        _state, rfs, ats = scores.split(uid)
        if log_score(rfs, ats) >= cutoff:
            yield uid
Ejemplo n.º 4
0
 def pick_users(self, cutoff):
     """Queue a lookup job for every NEW user scoring at least cutoff.

     For each qualifying id a LookupJobBody is built and put on
     self.stalk, the id's state in self.scores is set to
     scoredict.LOOKUP, and self.lookups is incremented.
     """
     for uid in self.scores:
         state, rfs, ats = self.scores.split(uid)
         eligible = state == scoredict.NEW and log_score(rfs, ats) >= cutoff
         if not eligible:
             continue

         job = LookupJobBody(
             _id=as_local_id('U', uid),
             rfriends_score=rfs,
             mention_score=ats,
         )
         # Enqueue first, then record the state change and count it.
         job.put(self.stalk)
         self.scores.set_state(uid, scoredict.LOOKUP)
         self.lookups += 1
Ejemplo n.º 5
0
 def pick_users(self, cutoff):
     """Queue a lookup job for every NEW user scoring at least cutoff.

     Walks self.scores; for each id in state scoredict.NEW whose
     log_score(rfs, ats) is >= cutoff, a LookupJobBody is put on
     self.stalk, the id is moved to state scoredict.LOOKUP and
     self.lookups is bumped by one.
     """
     for uid in self.scores:
         state, rfs, ats = self.scores.split(uid)
         if state == scoredict.NEW:
             if log_score(rfs, ats) >= cutoff:
                 payload = dict(
                     _id=as_local_id('U', uid),
                     rfriends_score=rfs,
                     mention_score=ats,
                 )
                 LookupJobBody(**payload).put(self.stalk)
                 # Only after the job is queued is the id marked and counted.
                 self.scores.set_state(uid, scoredict.LOOKUP)
                 self.lookups += 1
Ejemplo n.º 6
0
 def calc_cutoff(self):
     """Return the score bucket to use as the lookup cutoff.

     Rebuilds self.stats as a histogram (BUCKETS entries) of
     log_score(rfs, ats) over the entries of self.scores that are in
     state scoredict.NEW, and logs every "score count" pair.  Buckets are
     then accumulated from the highest index downwards; the first index
     at which the running total exceeds
     settings.crawl_ratio * (len(self.scores) - self.lookups) is
     returned.  If no bucket gets there, 0 is returned.
     """
     histogram = [0] * BUCKETS
     self.stats = histogram
     for uid in self.scores:
         state, rfs, ats = self.scores.split(uid)
         if state == scoredict.NEW:
             histogram[log_score(rfs, ats)] += 1

     for score, count in enumerate(self.stats):
         logging.info("%d %d", score, count)

     running = 0
     for bucket in reversed(xrange(BUCKETS)):
         running += self.stats[bucket]
         if running > settings.crawl_ratio*(len(self.scores)-self.lookups):
             return bucket
     return 0
Ejemplo n.º 7
0
 def calc_cutoff(self):
     """Pick the lowest score bucket that still gets crawled.

     self.stats is reset to BUCKETS zeroes and then counts, per
     log_score(rfs, ats) value, the self.scores entries whose state is
     scoredict.NEW.  Each bucket is logged as "score count".  Counting
     down from bucket BUCKETS - 1, the counts are summed until the sum is
     greater than settings.crawl_ratio * (len(self.scores) -
     self.lookups); that bucket index is the result.  Falls back to 0
     when the sum never exceeds the limit.
     """
     self.stats = [0] * BUCKETS
     for uid in self.scores:
         state, rfs, ats = self.scores.split(uid)
         if state == scoredict.NEW:
             bucket = log_score(rfs, ats)
             self.stats[bucket] += 1

     for score in xrange(BUCKETS):
         logging.info("%d %d", score, self.stats[score])

     seen = 0
     index = BUCKETS - 1
     while index >= 0:
         seen += self.stats[index]
         limit = settings.crawl_ratio * (len(self.scores) - self.lookups)
         if seen > limit:
             return index
         index -= 1
     return 0
Ejemplo n.º 8
0
    def force_lookup(self):
        """Look up users who were not included in the original crawl.

        Iterates User.get_all() and ignores users that are already looked
        up, protected, missing from self.scores, or have local_prob == 1.
        For the rest, three boolean reasons are evaluated; when at least
        two of them hold, the decision is logged and a job is sent through
        self._send_job() with True as its last argument.
        """
        for user in User.get_all():
            if user.lookup_done or user.protected:
                continue
            if user._id not in self.scores:
                continue
            if user.local_prob == 1:
                continue

            state, rfs, ats = self.scores.split(user._id)
            # All three reasons are always evaluated, in this order.
            offset_matches = user.utc_offset == settings.utc_offset
            score_is_high = log_score(rfs, ats) >= settings.non_local_cutoff
            prob_is_half = user.local_prob == .5
            reasons = [offset_matches, score_is_high, prob_is_half]

            # Booleans sum as 0/1, so this is "at least two reasons hold".
            if sum(reasons) >= 2:
                logging.info("force %s - %d for %r", user.screen_name, user._id, reasons)
                self._send_job(user._id, rfs, ats, True)
Ejemplo n.º 9
0
def analyze():
    "Find out how the scoring algorithm did."
    scores = Scores()
    scores.read(settings.lookup_out)
    local_db = CouchDB('http://127.0.0.1:5984/hou',True)
    local_view = local_db.paged_view('_all_docs',startkey='U',endkey='V')
    local_users = set(r['id'] for r in local_view)

    locs = (-1,0,.5,1)
    weights =(.1,.3,.5,.7,.9)
    counts = dict(
        (score, dict(
            (loc, dict(
                (weight,0)
                for weight in weights))
            for loc in locs))
        for score in xrange(BUCKETS))
    

    for user in all_users():
        if user['doc'].get('utco')!=-21600:
            continue
        state, rfs, ats = scores.split(as_int_id(user['id']))
        if user['id'] in local_users:
            loc = 1
        else:
            try:
                loc = .5 if user['doc']['prob']==.5 else 0
            except ResourceNotFound:
                loc = -1

        for weight in weights:
            score = log_score(rfs,ats,weight)
            counts[score][loc][weight]+=1

    print "todo\t\t\t\t\tnon\t\t\t\t\tunk\t\t\t\t\tlocal"
    for score in xrange(BUCKETS):
        for loc in locs:
            for weight in weights:
                print "%d\t"%counts[score][loc][weight],
        print
Ejemplo n.º 10
0
def analyze():
    "Find out how the scoring algorithm did."
    scores = Scores()
    scores.read(settings.lookup_out)
    local_db = CouchDB('http://127.0.0.1:5984/hou', True)
    local_view = local_db.paged_view('_all_docs', startkey='U', endkey='V')
    local_users = set(r['id'] for r in local_view)

    locs = (-1, 0, .5, 1)
    weights = (.1, .3, .5, .7, .9)
    counts = dict((score,
                   dict((loc, dict((weight, 0) for weight in weights))
                        for loc in locs)) for score in xrange(BUCKETS))

    for user in all_users():
        if user['doc'].get('utco') != -21600:
            continue
        state, rfs, ats = scores.split(as_int_id(user['id']))
        if user['id'] in local_users:
            loc = 1
        else:
            try:
                loc = .5 if user['doc']['prob'] == .5 else 0
            except ResourceNotFound:
                loc = -1

        for weight in weights:
            score = log_score(rfs, ats, weight)
            counts[score][loc][weight] += 1

    print "todo\t\t\t\t\tnon\t\t\t\t\tunk\t\t\t\t\tlocal"
    for score in xrange(BUCKETS):
        for loc in locs:
            for weight in weights:
                print "%d\t" % counts[score][loc][weight],
        print