import logging
from datetime import datetime, timedelta

# Project-local names (settings, Model, User, MongoDB, SplitProcess,
# TwitterResource) are imported from the crawler's own modules.


class CrawlProcess(SplitProcess):
    def __init__(self, db_name, **kwargs):
        SplitProcess.__init__(self, **kwargs)
        self.db_name = db_name
        self.waiting = set()

    def produce(self):
        # master side: stream the users whose next crawl is due
        Model.database = MongoDB(name=self.db_name, host=settings.db_host)
        endtime = datetime.utcnow()
        return User.find(
            User.next_crawl_date < endtime,
            sort=User.next_crawl_date,
            timeout=False)

    def map(self, items):
        # slave side: each worker opens its own API and db connections
        self.twitter = TwitterResource()
        Model.database = MongoDB(name=self.db_name, host=settings.db_host)
        for user in items:
            try:
                self.crawl(user)
                self.twitter.sleep_if_needed()
            except Exception:
                logging.exception("exception for user %s" % user.to_d())
            yield None

    def crawl(self, user):
        logging.info("visiting %s - %s", user._id, user.screen_name)
        tweets = self.twitter.save_timeline(user._id, user.last_tid)
        if tweets:
            user.last_tid = tweets[0]._id
        logging.info("saved %d for %s", len(tweets), user.screen_name)

        # estimate tweets per hour by averaging the stored rate with the
        # rate observed since the last crawl
        now = datetime.utcnow()
        last = (user.last_crawl_date
                if user.last_crawl_date is not None
                else datetime(2010, 11, 12))
        delta = now - last
        seconds = delta.seconds + delta.days * 24 * 3600
        tph = (3600.0 * len(tweets) / seconds + user.tweets_per_hour) / 2
        user.tweets_per_hour = tph

        # schedule the next visit far enough out to accumulate about
        # tweets_per_crawl tweets, capped at max_hours
        hours = min(settings.tweets_per_crawl / tph, settings.max_hours)
        user.next_crawl_date = now + timedelta(hours=hours)
        user.last_crawl_date = now
        user.save()
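# Illustrative sketch, not part of the crawler: the scheduling math in
# CrawlProcess.crawl, pulled out as a pure function. The default values for
# tweets_per_crawl and max_hours below are assumptions for the example, not
# the project's real settings.
def _example_next_crawl_hours(new_tweets, seconds_since_last, stored_tph,
                              tweets_per_crawl=20.0, max_hours=48.0):
    # average the freshly observed tweets/hour with the stored estimate
    tph = (3600.0 * new_tweets / seconds_since_last + stored_tph) / 2
    # wait long enough to collect ~tweets_per_crawl tweets, capped at max_hours
    return min(tweets_per_crawl / tph, max_hours)

# e.g. 6 new tweets in the 2 hours since the last visit, with a stored rate
# of 1 tweet/hour: tph == (3.0 + 1) / 2 == 2.0, so the next crawl is
# min(20 / 2.0, 48) == 10 hours out.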
class LookupSlave(LocalProc):
    def __init__(self, slave_id):
        LocalProc.__init__(self, 'lookup', slave_id)
        self.twitter = TwitterResource()
        self.gisgraphy = GisgraphyResource()

    def run(self):
        while True:
            # pull up to 100 jobs from beanstalk in one batch
            jobs = []
            for x in xrange(100):
                try:
                    # reserve blocks to wait when x is 0, but returns None
                    # for 1-99
                    j = self.stalk.reserve(0 if x else None)
                except beanstalkc.DeadlineSoon:
                    break
                if j is None:
                    break
                jobs.append(j)

            bodies = [LookupJobBody.from_job(j) for j in jobs]
            try:
                users = self.twitter.user_lookup([b._id for b in bodies])
            except ResourceNotFound:
                logging.info("no profile for %r", [b._id for b in bodies])
                continue

            logging.info("looking at %r",
                         [getattr(u, 'screen_name', '') for u in users])
            for job, body, user in zip(jobs, bodies, users):
                if user is None:
                    logging.info("no profile for %d", body._id)
                    job.delete()
                    continue
                try:
                    self.twitter.sleep_if_needed()
                    logging.info("look at %s", user.screen_name)
                    # skip users we already know about, unless forced
                    if (not body.force) and User.in_db(user._id):
                        job.delete()
                        continue
                    self.crawl_user(user, body.force)
                    user.save()
                    job.delete()
                except Exception:
                    logging.exception("exception for job %s" % job.body)
                    job.bury()
            logging.info("api calls remaining: %d", self.twitter.remaining)

    def crawl_user(self, user, force):
        user.local_prob = guess_location(user, self.gisgraphy)
        # only crawl users who look local (or are forced), and skip
        # protected accounts
        if (user.local_prob != 1.0 and not force) or user.protected:
            return

        rels = None
        tweets = None
        if user.followers_count > 0 and user.friends_count > 0:
            rels = self.twitter.get_edges(user._id)
            rels.attempt_save()
        if user.statuses_count > 0:
            tweets = self.twitter.save_timeline(
                user._id, last_tid=settings.min_tweet_id)
            if tweets:
                user.next_crawl_date = datetime.utcnow()
                user.last_crawl_date = datetime.utcnow()
                user.tweets_per_hour = settings.tweets_per_hour
                user.last_tid = tweets[0]._id
        user.lookup_done = True

        if user.local_prob == 1.0 and not force:
            self.score_new_users(user, rels, tweets)

    def score_new_users(self, user, rels, tweets):
        # queue lookup jobs for this user's neighbors, weighted by how
        # strongly they are connected to a known local user
        jobs = defaultdict(LookupJobBody)
        jobs[user._id].done = True

        if rels:
            # split a fixed pool of points evenly among reciprocal friends
            rfriends = rels.rfriends()
            if len(rfriends) < RFRIEND_POINTS:
                for u in rfriends:
                    jobs[u].rfriends_score = RFRIEND_POINTS / len(rfriends)

        if tweets:
            # each @-mention earns the mentioned user a fixed number of points
            ats = defaultdict(int)
            for tweet in tweets:
                for uid in tweet.mentions:
                    ats[uid] += 1
            for u, c in ats.iteritems():
                points = c * MENTION_POINTS
                if points > 0:
                    jobs[u].mention_score = points

        for k, j in jobs.iteritems():
            j._id = k
            j.put(self.stalk)
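# Illustrative sketch, not part of the slave: how score_new_users allocates
# points. The point values below are assumptions for the example, and the
# two score kinds are combined into one dict for brevity; the real job
# bodies keep rfriends_score and mention_score as separate fields.
def _example_scores(rfriend_ids, mention_counts,
                    rfriend_points=1000, mention_points=50):
    scores = {}
    if len(rfriend_ids) < rfriend_points:
        for uid in rfriend_ids:
            # the rfriend point pool is divided evenly (integer division,
            # mirroring the Python 2 code above)
            scores[uid] = rfriend_points // len(rfriend_ids)
    for uid, count in mention_counts.items():
        # each mention is worth a fixed number of points
        scores[uid] = scores.get(uid, 0) + count * mention_points
    return scores

# e.g. four reciprocal friends plus one user mentioned three times:
# _example_scores(['a', 'b', 'c', 'd'], {'e': 3})
# -> {'a': 250, 'b': 250, 'c': 250, 'd': 250, 'e': 150}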