def catchUserFollowingThread(self):
    """Worker loop that crawls the following-list pages of queued users.

    Runs until ``self.isExit`` becomes true.  Each iteration asks the
    database for the next user whose following list still has to be
    crawled, fetches the remaining pages one at a time and records the
    progress after every page.

    The start message is only logged once a user was actually returned,
    so an empty queue no longer produces a ``user_id=None`` line every
    three seconds.
    """
    d = DBUtil()
    st = Status.Following()
    while not self.isExit:
        # Take the first user that still needs its following list crawled.
        userId, currentPage = d.getFirstUserToFollowing2()
        if userId is None:
            # Nothing queued right now; poll again shortly.
            time.sleep(3)
            continue
        log('开始抓取用户关注者,user_id={0}, current_page={1}'.format(
            userId, currentPage))
        d.setUserIsFollowing(userId, st.is_catching)

        # Number of following-list pages for this user.
        total = self.getUserFollowingPageNum(userId)
        log('当前用户总的关注者的页数,user_id={0}, total_page={1}'.format(
            userId, total))

        # The user does not follow anyone.
        if total == 0:
            d.setUserIsFollowing(userId, st.user_following_none)
            continue

        # Stays True unless an exit request interrupts the page loop.
        finished = True
        for page in range(currentPage + 1, total + 1):
            # Stop between pages when an exit was requested.
            if self.isExit:
                finished = False
                break
            followers = self.getUserFollowingPageContent(userId, page)
            # NOTE(review): the three calls below are assumed to belong to
            # the "page fetched successfully" branch -- confirm against the
            # original layout.
            if followers:
                d.saveFollowerInfo(userId, followers)
                # Refresh the status.
                d.setUserIsFollowing(userId, st.is_catching)
                # Remember that this page is done.
                d.setUserFollowingPage(userId, page)
            log('抓取完一页用户的关注者,user_id={0}, page={1}, list.size={2}'.format(
                userId, page, len(followers)))
            time.sleep(self.time_duration * 20)

        if finished:
            # Every page was processed: mark the user as completed.
            d.setUserIsFollowing(userId, st.catched)
            log('当前用户关注的人全部抓取完毕,user_id= %s' % userId)
        else:
            # Interrupted by an exit request before the last page.
            log('当前用户关注的人没有抓取完毕,中途退出,user_id = {0}'.format(userId))
    log('获取用户关注者的线程运行结束')
def catchUserInfoThread(self):
    """Worker loop that fetches profile details for queued users.

    While ``self.isExit`` equals ``False`` the loop takes the next user
    from the database, marks it as being crawled, downloads its profile
    through the spider and stores either the resulting status or the
    profile data, depending on the result code.
    """
    spider = ZhiHuSpider()
    db = DBUtil()
    states = Status.Catch()
    # The equality test is kept as written so the loop condition behaves
    # exactly as before.
    while self.isExit == False:
        # Take the first user that still has to be crawled.
        user_id = db.getFirstUserToCatch()
        if user_id is None:
            time.sleep(3)
            continue
        db.setUserIsCatch(user_id, states.is_catching)

        # Download the profile; 'code' tells how the request went.
        info = spider.getUserInfo(user_id)
        code = info['code']

        if code == spider.code_user_not_useful:
            # The user is not worth keeping.
            outcome = states.user_not_useful
        elif code == spider.code_user_not_exist:
            # The user does not exist.
            outcome = states.user_not_exist
        elif code == spider.code_failure:
            # The request failed.
            outcome = states.failed
        else:
            # Success: persist the profile and the achievement data.
            db.updateUserInfo(user_id, info)
            db.saveAchieveInfo(user_id, info)
            continue
        db.setUserIsCatch(user_id, outcome)
if os.path.exists(file): self.isExit = True log('检测到退出文件,退出程序.exit_file = {0}'.format(file)) break else: duration = spider_const.control_exit_duration * 60 log('未检测到退出文件,休眠{0}秒'.format(duration)) time.sleep(duration) def start(self): t1 = threading.Thread(target=self.catchUserInfoThread) t2 = threading.Thread(target=self.catchUserFollowingThread) t3 = threading.Thread(target=self.exitThread) t1.start() t2.start() t3.start() t1.join() t2.join() t3.join() log("全部程序运行完毕") if __name__ == '__main__': # 删除控制退出的文件 if os.path.exists(spider_const.control_exit_file): os.remove(spider_const.control_exit_file) d = DBUtil() d.init('excited-vczh') z = ZhiHuSpider() z.start()
def catchUserFollowingProcess(self, lock):
    """Worker loop that crawls following lists, coordinated by ``lock``.

    Loops forever.  Picking the next user and flagging it with
    ``st.multi_catching`` happens while ``lock`` is held; the page
    downloads run without the lock.

    The lock is taken with a ``with`` statement so it is released even
    when a database call raises; with a bare acquire()/release() pair an
    exception would leave the lock held and block every other worker.

    Args:
        lock: a lock object usable as a context manager, shared by the
            workers.
    """
    z = ZhiHuSpider()
    d = DBUtil()
    st = Status.Following()
    while True:
        with lock:
            # Take the first user and flag it before releasing the lock.
            userId, currentPage = d.getFirstUserToFollowing2()
            if userId is not None:
                d.setUserIsFollowing(userId, st.multi_catching)
        if userId is None:
            # Nothing queued right now; poll again shortly.
            time.sleep(3)
            continue
        print('开始抓取用户关注者,user_id={0}, current_page={1}'.format(userId, currentPage))

        # Number of following-list pages for this user.
        total = z.getUserFollowingPageNum(userId)
        print('当前用户总的关注者的页数,user_id={0}, total_page={1}'.format(userId, total))

        # The user does not follow anyone.
        if total == 0:
            d.setUserIsFollowing(userId, st.user_following_none)
            continue

        for page in range(currentPage + 1, total + 1):
            followers = z.getUserFollowingPageContent(userId, page)
            # NOTE(review): both calls below are assumed to belong to the
            # "page fetched successfully" branch -- confirm against the
            # original layout.
            if followers:
                d.saveFollowerInfo(userId, followers)
                # Remember that this page is done.
                d.setUserFollowingPage(userId, page)
            print('抓取完一页用户的关注者,user_id={0}, page={1}, list.size={2}'.format(userId, page, len(followers)))
            time.sleep(z.time_duration)

        # Every page was processed: mark the user as completed.
        d.setUserIsFollowing(userId, st.catched)
        print('当前用户全部抓取完毕,user_id=', userId)
def catchUserInfoProcess(self, lock):
    """Worker loop that fetches user profiles, coordinated by ``lock``.

    Loops forever.  Picking the next user and marking it as being
    crawled happens while ``lock`` is held; the download itself runs
    without the lock.

    Fixes over the previous version:
      * the messages used ``pid={1}, user_id={0}`` with the arguments
        ``(os.getpid(), userId)``, so the two values were printed under
        each other's label; the placeholders now match the arguments;
      * the lock is taken with a ``with`` statement so it is released
        even when a database call raises.

    Args:
        lock: a lock object usable as a context manager, shared by the
            workers.
    """
    s = ZhiHuSpider()
    db = DBUtil()
    st = Status.Catch()
    while True:
        with lock:
            # Take the first user and flag it before releasing the lock.
            userId = db.getFirstUserToCatch()
            if userId is not None:
                db.setUserIsCatch(userId, st.is_catching)
        if userId is None:
            # Nothing queued right now; poll again shortly.
            time.sleep(3)
            continue

        print('开始爬取用户,pid={0}, user_id={1}'.format(os.getpid(), userId))

        # Download the profile; 'code' tells how the request went.
        info = s.getUserInfo(userId)
        code = info['code']

        if code == s.code_user_not_useful:
            # The user is not worth keeping.
            print('用户没有价值,pid={0}, user_id={1}'.format(os.getpid(), userId))
            db.setUserIsCatch(userId, st.user_not_useful)
        elif code == s.code_user_not_exist:
            # The user does not exist.
            print('用户不存在,是僵尸粉,pid={0}, user_id={1}'.format(os.getpid(), userId))
            db.setUserIsCatch(userId, st.user_not_exist)
        elif code == s.code_failure:
            # The request failed.
            print('用户抓取失败,pid={0}, user_id={1}'.format(os.getpid(), userId))
            db.setUserIsCatch(userId, st.failed)
        else:
            # Success: persist the profile.
            print('用户抓取成功,pid={0}, user_id={1}'.format(os.getpid(), userId))
            db.updateUserInfo(userId, info)
def catchUserInfoThread(self, lock):
    """Worker loop that fetches user profiles, coordinated by ``lock``.

    Runs until ``self.isExit`` becomes true.  Picking the next user and
    marking it as being crawled happens while ``lock`` is held; the
    download itself runs without the lock.

    The lock is taken with a ``with`` statement so it is released even
    when a database call raises; with a bare acquire()/release() pair an
    exception would leave the lock held and block every other worker.

    Args:
        lock: a lock object usable as a context manager, shared by the
            workers.
    """
    s = ZhiHuSpider()
    db = DBUtil()
    st = Status.Catch()
    while not self.isExit:
        with lock:
            # Take the first user and flag it before releasing the lock.
            userId = db.getFirstUserToCatch()
            if userId is not None:
                db.setUserIsCatch(userId, st.is_catching)
        if userId is None:
            # Nothing queued right now; poll again shortly.
            time.sleep(5)
            continue

        log('开始爬取用户,pid={0}, user_id={1}'.format(os.getpid(), userId))

        # Download the profile; 'code' tells how the request went.
        info = s.getUserInfo(userId)
        code = info['code']

        if code == s.code_user_not_useful:
            # The user is not worth keeping.
            log('用户没有价值,pid={0}, user_id={1}'.format(os.getpid(), userId))
            db.setUserIsCatch(userId, st.user_not_useful)
        elif code == s.code_user_not_exist:
            # The user does not exist.
            log('用户不存在,是僵尸粉,pid={0}, user_id={1}'.format(
                os.getpid(), userId))
            db.setUserIsCatch(userId, st.user_not_exist)
        elif code == s.code_failure:
            # The request failed.
            log('用户抓取失败,pid={0}, user_id={1}'.format(os.getpid(), userId))
            db.setUserIsCatch(userId, st.failed)
        else:
            # Success: persist the profile and the achievement data.
            log('用户抓取成功,pid={0}, user_id={1}'.format(os.getpid(), userId))
            db.updateUserInfo(userId, info)
            db.saveAchieveInfo(userId, info)
    log('获取用户详细信息的线程结束,tid = {0}'.format(self.getThreadId()))