def parse(self, response):
    """Parse one timeline page, score the uid, and maybe queue the next page.

    Reads ``page`` and ``uid`` from ``response.meta`` and decodes the JSON
    body. Every entry of ``resp['statuses']`` is converted with
    ``resp2item_v2`` and the resulting items are returned. The method then
    counts how many statuses have not been seen before:

    * when ``self.bloom`` is truthy, a status is new if its ``mid`` is not
      in the bloom filter; new mids are added with a millisecond timestamp;
    * otherwise a status is new if no document with its ``id`` as ``_id``
      exists in ``self.db.master_timeline_weibo``.

    On page 1 the uid's score in the ``self.uids_priority_set`` hash is
    nudged up (below 10) when something was new, or down (above 0) when
    nothing was. When more than ``AT_LEAST_UPDATE_COUNT`` statuses were new,
    a request for the next page is appended to the results.

    Returns:
        list: the converted items, optionally followed by one ``Request``.

    Raises:
        ShouldNotEmptyError: if the response has no (or empty) ``statuses``.
    """
    page = response.meta['page']
    uid = response.meta['uid']
    resp = json.loads(response.body)

    statuses = resp.get('statuses')
    if not statuses:
        raise ShouldNotEmptyError()

    results = []
    for status in statuses:
        results.extend(resp2item_v2(status))

    # Count the statuses that are new to us, using the bloom filter when one
    # is configured and MongoDB otherwise. The count drives both the score
    # feedback on page 1 and the decision to fetch another page.
    update_count = 0
    if self.bloom:
        for status in statuses:
            if 'mid' in status and not self.bloom.check(status['mid']):
                update_count += 1
                # Remember this mid so it is not counted again.
                self.bloom.add(status['mid'], int(time.time() * 1000))
    else:
        collection = self.db.master_timeline_weibo
        for status in statuses:
            if 'id' not in status:
                continue
            if collection.find({'_id': status['id']}).limit(1).count() == 0:
                update_count += 1

    if page == 1:
        # Assuming self.r is a Redis client: hget hands back the stored value
        # as a string (or None when the field is missing), so it must be
        # converted before being compared with the numeric bounds. A missing
        # score is treated as 0.
        score = int(self.r.hget(self.uids_priority_set, uid) or 0)
        if update_count > 0 and score < 10:
            score = self.r.hincrby(self.uids_priority_set, uid, 1)
        elif update_count == 0 and score > 0:
            score = self.r.hincrby(self.uids_priority_set, uid, -1)
        log.msg(format='Score [uid:%(uid)s] update to %(score)s',
                level=log.INFO,
                uid=uid,
                score=score)

    if update_count > AT_LEAST_UPDATE_COUNT:
        page += 1
        request = Request(BASE_URL.format(uid=uid, page=page), headers=None)
        request.meta['page'] = page
        request.meta['uid'] = uid
        results.append(request)
        log.msg(
            format='One more page [uid:%(uid)s] page:%(page)s update_count:%(update_count)s',
            level=log.INFO,
            uid=uid,
            page=page,
            update_count=update_count)

    return results
def more_reposts(self, response):
    """Collect one page of reposts and link them to the source weibo.

    The source weibo is taken from ``response.meta['source_weibo']``. Each
    entry of ``resp['reposts']`` is converted with ``resp2item_v2``; entries
    that convert to an empty list are skipped. For the rest, the ``id`` of
    the first converted item is appended to ``source_weibo['reposts']`` and
    all converted items are returned, followed by the source weibo itself.

    Raises:
        ShouldNotEmptyError: if ``resp['reposts']`` is an empty list.
    """
    origin = response.meta['source_weibo']
    payload = json.loads(response.body)

    reposts = payload['reposts']
    if reposts == []:
        raise ShouldNotEmptyError()

    collected = []
    for entry in reposts:
        parsed = resp2item_v2(entry)
        if parsed == []:
            continue
        # The first converted item is the repost itself.
        origin['reposts'].append(parsed[0]['id'])
        collected.extend(parsed)

    collected.append(origin)
    return collected
def source_user(self, response):
    """Convert a user response and queue the first followers request.

    The decoded JSON body is converted with ``resp2item_v2``; fewer than two
    resulting items is treated as an empty response. The first item is kept
    as the source user and handed to ``self.more_followers`` through the
    request meta, together with ``uid`` and a starting ``cursor`` of 0.

    Returns:
        list: the converted items followed by the followers ``Request``.

    Raises:
        ShouldNotEmptyError: if the conversion yields fewer than two items.
    """
    uid = response.meta['uid']
    parsed = resp2item_v2(json.loads(response.body))
    if len(parsed) < 2:
        raise ShouldNotEmptyError()

    followers_request = Request(
        FOLLOWERS_URL.format(uid=uid, cursor=0),
        headers=None,
        callback=self.more_followers,
    )
    followers_request.meta.update({
        'uid': uid,
        'cursor': 0,
        'source_user': parsed[0],
    })

    output = []
    output.extend(parsed)
    output.append(followers_request)
    return output
def parse(self, response):
    """Convert one page of statuses and, in 'allpages' mode, queue the next.

    The decoded JSON body is iterated directly and every element is
    converted with ``resp2item_v1``. When ``self.mode`` equals
    ``'allpages'`` a request for ``page + 1`` is appended, carrying the new
    page number and the uid in its meta.

    Returns:
        list: the converted items, optionally followed by one ``Request``.

    Raises:
        ShouldNotEmptyError: if the decoded body is an empty list.
    """
    page = response.meta['page']
    uid = response.meta['uid']
    statuses = json.loads(response.body)

    if statuses == []:
        raise ShouldNotEmptyError()

    output = [item for status in statuses for item in resp2item_v1(status)]

    if self.mode != 'allpages':
        return output

    next_page = page + 1
    follow_up = Request(BASE_URL.format(uid=uid, page=next_page))
    follow_up.meta['page'] = next_page
    follow_up.meta['uid'] = uid
    output.append(follow_up)
    return output
def soucre_weibo(self, response):
    """Convert a source weibo and queue one request per page of its reposts.

    The decoded JSON body is converted with ``resp2item_v2``; fewer than two
    resulting items is treated as an empty response. The first item is the
    weibo whose ``reposts_count`` determines how many pages to request, at
    200 reposts per page, rounded up. Every request is handled by
    ``self.more_reposts`` and carries the page number, the weibo id and the
    weibo item itself in its meta.

    Returns:
        list: the converted items followed by the repost-page ``Request``s.

    Raises:
        ShouldNotEmptyError: if the conversion yields fewer than two items.
    """
    parsed = resp2item_v2(json.loads(response.body))
    if len(parsed) < 2:
        raise ShouldNotEmptyError()

    origin = parsed[0]
    total_reposts = origin['reposts_count']
    wid = origin['id']
    page_total = int(math.ceil(total_reposts / 200.0))

    output = list(parsed)
    for page_no in range(1, page_total + 1):
        page_request = Request(
            BASE_URL.format(id=wid, page=page_no),
            headers=None,
            callback=self.more_reposts,
        )
        page_request.meta.update({
            'page': page_no,
            'wid': wid,
            'source_weibo': origin,
        })
        output.append(page_request)
    return output
def parse(self, response):
    """Convert one page of statuses and always queue the following page.

    Reads ``page`` and ``uid`` from ``response.meta``, converts every entry
    of ``resp['statuses']`` with ``resp2item_v2`` and appends a request for
    ``page + 1`` built from ``BASE_URL`` with ``self.since_id`` and
    ``self.max_id``. This method never stops paging by itself; it ends
    only by raising when a page has no statuses.

    Returns:
        list: the converted items followed by the next-page ``Request``.

    Raises:
        ShouldNotEmptyError: if ``statuses`` is missing, null or empty.
    """
    page = response.meta['page']
    uid = response.meta['uid']
    resp = json.loads(response.body)

    # A missing or null 'statuses' is treated like an empty one, so such a
    # response raises ShouldNotEmptyError rather than a KeyError/TypeError
    # from the loop below.
    statuses = resp.get('statuses')
    if not statuses:
        raise ShouldNotEmptyError()

    results = []
    for status in statuses:
        results.extend(resp2item_v2(status))

    page += 1
    request = Request(
        BASE_URL.format(uid=uid,
                        page=page,
                        since_id=self.since_id,
                        max_id=self.max_id),
        headers=None)
    request.meta['page'] = page
    request.meta['uid'] = uid
    results.append(request)
    return results