def force_lookup(to_db="hou", start_id='', end_id=None):
    """Look up users who were not included in the original crawl.

    User docs are read from the current ``Model.database`` between the
    'U'-prefixed start and end keys. ``Model.database`` is then rebound to
    ``connect(to_db)`` and ``user_lookup`` is called on every user that
    survives the filters below.

    to_db    -- name passed to connect(); the result becomes Model.database.
    start_id -- suffix appended to 'U' to form the first key scanned.
    end_id   -- suffix appended to 'U' to form the last key scanned; a falsy
                value means the scan ends at 'V'.
    """
    first_key = 'U' + start_id
    last_key = ('U' + end_id) if end_id else 'V'

    # The source view has to be taken from the current database *before*
    # Model.database is rebound on the following lines.
    source_rows = Model.database.paged_view(
        '_all_docs',
        include_docs=True,
        startkey=first_key,
        endkey=last_key,
    )
    candidates = (User(row['doc']) for row in source_rows)
    Model.database = connect(to_db)

    # Ids already present in the "houtx" database are never looked up again.
    found_rows = connect("houtx").paged_view(
        '_all_docs', startkey=first_key, endkey=last_key)
    already_found = set(row['id'] for row in found_rows)

    scores = Scores()
    scores.read(settings.lookup_out)
    allowed_places = ("Texas", "United States")

    for user in candidates:
        int_uid = as_int_id(user._id)

        # Guard clauses, checked in the same order as the original condition.
        if user.lookup_done or user.protected:
            continue
        if int_uid not in scores:
            continue
        if user.local_prob == 1:
            continue
        if user.local_prob == 0 and user.geonames_place.name not in allowed_places:
            continue
        if user._id in already_found:
            continue

        _state, rfs, ats = scores.split(int_uid)
        # NOTE(review): -21600 is presumably a UTC offset in seconds (UTC-6);
        # those users are scored with weight .9 against a threshold of 1,
        # everyone else against settings.non_local_cutoff -- confirm intent.
        if user.utc_offset == -21600:
            too_low = log_score(rfs, ats, .9) < 1
        else:
            too_low = log_score(rfs, ats) < settings.non_local_cutoff
        if too_low:
            continue

        user_lookup(user)
def pick_users(self, cutoff):
    """Send a job for every NEW user whose log score is at least cutoff.

    Iterates self.scores; for each uid in state scoredict.NEW with
    log_score(rfs, ats) >= cutoff, calls self._send_job(uid, rfs, ats) and
    increments self.lookups.
    """
    logging.info("pick_users with score %d", cutoff)
    for uid in self.scores:
        state, rfs, ats = self.scores.split(uid)
        is_new = state == scoredict.NEW
        if not (is_new and log_score(rfs, ats) >= cutoff):
            continue
        self._send_job(uid, rfs, ats)
        self.lookups += 1
def _users_from_scores():
    """Yield every uid in the settings.lookup_out scores with log score >= 11."""
    minimum = 11
    table = Scores()
    table.read(settings.lookup_out)
    for uid in table:
        # The state component is not used for this selection.
        _state, rfs, ats = table.split(uid)
        if log_score(rfs, ats) >= minimum:
            yield uid
def pick_users(self, cutoff):
    """Queue a lookup job for every NEW user whose log score is at least cutoff.

    For each qualifying uid a LookupJobBody is put on self.stalk, the uid's
    state in self.scores is set to scoredict.LOOKUP, and self.lookups is
    incremented.
    """
    for uid in self.scores:
        state, rfs, ats = self.scores.split(uid)
        eligible = state == scoredict.NEW and log_score(rfs, ats) >= cutoff
        if not eligible:
            continue

        LookupJobBody(
            _id=as_local_id('U', uid),
            rfriends_score=rfs,
            mention_score=ats,
        ).put(self.stalk)

        # Mark the user so it is not picked again while in state LOOKUP.
        self.scores.set_state(uid, scoredict.LOOKUP)
        self.lookups += 1
def pick_users(self, cutoff):
    """Enqueue lookups for NEW users scoring at or above cutoff.

    Walks self.scores. A uid qualifies when its state is scoredict.NEW and
    log_score(rfs, ats) >= cutoff. Each qualifying uid gets a LookupJobBody
    put on self.stalk, has its state set to scoredict.LOOKUP, and bumps
    self.lookups by one.
    """
    for uid in self.scores:
        state, rfs, ats = self.scores.split(uid)
        if state == scoredict.NEW:
            if log_score(rfs, ats) >= cutoff:
                fields = dict(
                    _id=as_local_id('U', uid),
                    rfriends_score=rfs,
                    mention_score=ats,
                )
                pending = LookupJobBody(**fields)
                pending.put(self.stalk)
                self.scores.set_state(uid, scoredict.LOOKUP)
                self.lookups += 1
def calc_cutoff(self):
    """Compute a score cutoff from the histogram of NEW users' log scores.

    Rebuilds self.stats as a list of BUCKETS counters, one per log score,
    counting only users in state scoredict.NEW, and logs each
    (score, count) pair. Buckets are then accumulated from the highest
    score downwards. The first bucket at which the running total exceeds
    settings.crawl_ratio * (len(self.scores) - self.lookups) is returned;
    if the total never exceeds it, 0 is returned.
    """
    self.stats = [0] * BUCKETS
    for uid in self.scores:
        state, rfs, ats = self.scores.split(uid)
        if state == scoredict.NEW:
            self.stats[log_score(rfs, ats)] += 1

    for score, count in enumerate(self.stats):
        logging.info("%d %d", score, count)

    running = 0
    for bucket in reversed(xrange(BUCKETS)):
        running += self.stats[bucket]
        if running > settings.crawl_ratio * (len(self.scores) - self.lookups):
            return bucket
    return 0
def calc_cutoff(self):
    """Return the score bucket at which the crawl budget is first exceeded.

    self.stats is reset to BUCKETS zeroed counters and filled with the number
    of scoredict.NEW users per log score; every bucket is logged as
    "score count". Starting from bucket BUCKETS - 1 and moving down, counts
    are summed until the sum is greater than
    settings.crawl_ratio * (len(self.scores) - self.lookups); that bucket's
    index is returned. Returns 0 when no bucket crosses the threshold.
    """
    self.stats = [0] * BUCKETS
    for uid in self.scores:
        state, rfs, ats = self.scores.split(uid)
        if state != scoredict.NEW:
            continue
        self.stats[log_score(rfs, ats)] += 1

    for score in xrange(BUCKETS):
        logging.info("%d %d", score, self.stats[score])

    seen = 0
    bucket = BUCKETS - 1
    while bucket >= 0:
        seen += self.stats[bucket]
        if seen > settings.crawl_ratio * (len(self.scores) - self.lookups):
            return bucket
        bucket -= 1
    return 0
def force_lookup(self):
    """Look up users who were not included in the original crawl.

    Walks User.get_all() and skips users that are already looked up, are
    protected, have no entry in self.scores, or have local_prob == 1. For the
    remaining users three checks are made: the UTC offset equals
    settings.utc_offset, the log score reaches settings.non_local_cutoff,
    and local_prob == .5. When at least two of them hold, the user is logged
    and sent via self._send_job(..., True).
    """
    for user in User.get_all():
        if user.lookup_done or user.protected:
            continue
        if user._id not in self.scores or user.local_prob == 1:
            continue

        _state, rfs, ats = self.scores.split(user._id)
        offset_matches = user.utc_offset == settings.utc_offset
        score_passes = log_score(rfs, ats) >= settings.non_local_cutoff
        half_prob = user.local_prob == .5
        reasons = [offset_matches, score_passes, half_prob]

        # Booleans sum as 0/1, so this counts how many checks held.
        if sum(reasons) >= 2:
            logging.info(
                "force %s - %d for %r", user.screen_name, user._id, reasons)
            self._send_job(user._id, rfs, ats, True)
def analyze(): "Find out how the scoring algorithm did." scores = Scores() scores.read(settings.lookup_out) local_db = CouchDB('http://127.0.0.1:5984/hou',True) local_view = local_db.paged_view('_all_docs',startkey='U',endkey='V') local_users = set(r['id'] for r in local_view) locs = (-1,0,.5,1) weights =(.1,.3,.5,.7,.9) counts = dict( (score, dict( (loc, dict( (weight,0) for weight in weights)) for loc in locs)) for score in xrange(BUCKETS)) for user in all_users(): if user['doc'].get('utco')!=-21600: continue state, rfs, ats = scores.split(as_int_id(user['id'])) if user['id'] in local_users: loc = 1 else: try: loc = .5 if user['doc']['prob']==.5 else 0 except ResourceNotFound: loc = -1 for weight in weights: score = log_score(rfs,ats,weight) counts[score][loc][weight]+=1 print "todo\t\t\t\t\tnon\t\t\t\t\tunk\t\t\t\t\tlocal" for score in xrange(BUCKETS): for loc in locs: for weight in weights: print "%d\t"%counts[score][loc][weight], print
def analyze(): "Find out how the scoring algorithm did." scores = Scores() scores.read(settings.lookup_out) local_db = CouchDB('http://127.0.0.1:5984/hou', True) local_view = local_db.paged_view('_all_docs', startkey='U', endkey='V') local_users = set(r['id'] for r in local_view) locs = (-1, 0, .5, 1) weights = (.1, .3, .5, .7, .9) counts = dict((score, dict((loc, dict((weight, 0) for weight in weights)) for loc in locs)) for score in xrange(BUCKETS)) for user in all_users(): if user['doc'].get('utco') != -21600: continue state, rfs, ats = scores.split(as_int_id(user['id'])) if user['id'] in local_users: loc = 1 else: try: loc = .5 if user['doc']['prob'] == .5 else 0 except ResourceNotFound: loc = -1 for weight in weights: score = log_score(rfs, ats, weight) counts[score][loc][weight] += 1 print "todo\t\t\t\t\tnon\t\t\t\t\tunk\t\t\t\t\tlocal" for score in xrange(BUCKETS): for loc in locs: for weight in weights: print "%d\t" % counts[score][loc][weight], print