コード例 #1
0
 def catchUserFollowingThread(self):
     """Worker loop that crawls the following-list pages of queued users.

     Each iteration asks the database for the next user together with a
     page number, fetches every page after that number up to the user's
     total page count, and stores the page number after each page.  The
     loop stops once ``self.isExit`` is true; a user whose page loop is cut
     short by that flag is not marked as finished.
     """
     d = DBUtil()
     st = Status.Following()
     while not self.isExit:
         # Take the first user that still needs crawling.
         userId, currentPage = d.getFirstUserToFollowing2()
         if userId is None:
             # Nothing is queued: wait before polling the database again.
             time.sleep(3)
             continue
         # Logged only once a real user was obtained, so idle polling does
         # not produce "user_id=None" start messages.
         log('开始抓取用户关注者,user_id={0}, current_page={1}'.format(
             userId, currentPage))
         d.setUserIsFollowing(userId, st.is_catching)
         # Number of pages in this user's following list.
         total = self.getUserFollowingPageNum(userId)
         log('当前用户总的关注者的页数,user_id={0}, total_page={1}'.format(
             userId, total))
         if total == 0:
             # The user follows nobody.
             d.setUserIsFollowing(userId, st.user_following_none)
             continue
         # Turns False when the page loop is interrupted by an exit request.
         isFinished = True
         for i in range(currentPage + 1, total + 1):
             # The exit flag is checked once per page.
             if self.isExit:
                 isFinished = False
                 break
             page_items = self.getUserFollowingPageContent(userId, i)
             item_count = len(page_items)
             if item_count > 0:
                 # The page was fetched successfully: store its entries and
                 # refresh the user's status.
                 d.saveFollowerInfo(userId, page_items)
                 d.setUserIsFollowing(userId, st.is_catching)
             # Record that this page has been processed.
             d.setUserFollowingPage(userId, i)
             log('抓取完一页用户的关注者,user_id={0}, page={1}, list.size={2}'.format(
                 userId, i, item_count))
             time.sleep(self.time_duration * 20)
         if isFinished:
             # Every page was processed: mark the user as done.
             d.setUserIsFollowing(userId, st.catched)
             log('当前用户关注的人全部抓取完毕,user_id= %s' % userId)
         else:
             # Interrupted part-way; the status is left unchanged.
             log('当前用户关注的人没有抓取完毕,中途退出,user_id = {0}'.format(userId))
     log('获取用户关注者的线程运行结束')
コード例 #2
0
 def catchUserInfoThread(self):
     """Worker loop that fetches profile details for queued users.

     While ``self.isExit`` is false, takes the next user from the database,
     marks it as being crawled, requests its details from the spider and
     then either records the outcome status that matches the returned
     ``code`` or, for any other code, stores the fetched data.
     """
     spider = ZhiHuSpider()
     db = DBUtil()
     states = Status.Catch()
     while not self.isExit:
         # Take the first user waiting to be crawled.
         user_id = db.getFirstUserToCatch()
         if user_id is None:
             # Nothing is queued: wait before polling again.
             time.sleep(3)
             continue
         db.setUserIsCatch(user_id, states.is_catching)
         # Fetch the user's details; the result carries a 'code' entry.
         info = spider.getUserInfo(user_id)
         result_code = info['code']
         if result_code == spider.code_user_not_useful:
             # The user is not worth keeping.
             db.setUserIsCatch(user_id, states.user_not_useful)
             continue
         if result_code == spider.code_user_not_exist:
             # The user does not exist.
             db.setUserIsCatch(user_id, states.user_not_exist)
             continue
         if result_code == spider.code_failure:
             # The fetch failed.
             db.setUserIsCatch(user_id, states.failed)
             continue
         # Any other code counts as success: persist the fetched data.
         db.updateUserInfo(user_id, info)
         db.saveAchieveInfo(user_id, info)
コード例 #3
0
            if os.path.exists(file):
                self.isExit = True
                log('检测到退出文件,退出程序.exit_file = {0}'.format(file))
                break
            else:
                duration = spider_const.control_exit_duration * 60
                log('未检测到退出文件,休眠{0}秒'.format(duration))
                time.sleep(duration)

    def start(self):
        """Run the three worker threads and block until all of them finish.

        One thread is created for each of ``catchUserInfoThread``,
        ``catchUserFollowingThread`` and ``exitThread``.  All three are
        started first and then joined in the same order, after which a
        completion message is logged.
        """
        targets = (
            self.catchUserInfoThread,
            self.catchUserFollowingThread,
            self.exitThread,
        )
        workers = [threading.Thread(target=target) for target in targets]
        for worker in workers:
            worker.start()
        # Join only after every thread has been started.
        for worker in workers:
            worker.join()
        log("全部程序运行完毕")


if __name__ == '__main__':
    # Remove the exit-control file if one already exists, so that the
    # exit check does not find it as soon as the program starts.
    exit_file = spider_const.control_exit_file
    if os.path.exists(exit_file):
        os.remove(exit_file)
    # Initialise the database helper with 'excited-vczh'
    # (presumably the user id the crawl starts from - confirm in DBUtil.init).
    d = DBUtil()
    d.init('excited-vczh')
    # Start the crawler; start() returns once its worker threads finish.
    z = ZhiHuSpider()
    z.start()
コード例 #4
0
 def catchUserFollowingProcess(self, lock):
     """Process worker that crawls the following-list pages of queued users.

     Args:
         lock: Lock shared between the workers.  It is held while the next
             user is read from the database and marked as taken
             (presumably so that two workers cannot claim the same user),
             and is released before any page is fetched.

     The loop has no exit condition; it runs until the process is stopped.
     """
     z = ZhiHuSpider()
     d = DBUtil()
     st = Status.Following()
     while True:
         # The context manager releases the lock even when a database call
         # raises, so a failing worker cannot block the others forever.
         with lock:
             # Take the first user that still needs crawling.
             userId, currentPage = d.getFirstUserToFollowing2()
             if userId is not None:
                 d.setUserIsFollowing(userId, st.multi_catching)
         if userId is None:
             # Nothing is queued: wait (without holding the lock) and retry.
             time.sleep(3)
             continue
         print('开始抓取用户关注者,user_id={0}, current_page={1}'.format(userId, currentPage))
         # Number of pages in this user's following list.
         total = z.getUserFollowingPageNum(userId)
         print('当前用户总的关注者的页数,user_id={0}, total_page={1}'.format(userId, total))
         if total == 0:
             # The user follows nobody.
             d.setUserIsFollowing(userId, st.user_following_none)
             continue
         for i in range(currentPage + 1, total + 1):
             page_items = z.getUserFollowingPageContent(userId, i)
             item_count = len(page_items)
             if item_count > 0:
                 # The page was fetched successfully: store its entries.
                 d.saveFollowerInfo(userId, page_items)
             # Record that this page has been processed.
             d.setUserFollowingPage(userId, i)
             print('抓取完一页用户的关注者,user_id={0}, page={1}, list.size={2}'.format(userId, i, item_count))
             time.sleep(z.time_duration)
         # Every page was processed: mark the user as done.
         d.setUserIsFollowing(userId, st.catched)
         print('当前用户全部抓取完毕,user_id=', userId)
コード例 #5
0
 def catchUserInfoProcess(self, lock):
     """Process worker that fetches profile details for queued users.

     Args:
         lock: Lock shared between the workers.  It is held while the next
             user is read from the database and marked as being crawled
             (presumably so that two workers cannot claim the same user),
             and is released before the user's details are requested.

     The loop has no exit condition; it runs until the process is stopped.
     """
     s = ZhiHuSpider()
     db = DBUtil()
     st = Status.Catch()
     while True:
         # The context manager releases the lock even when a database call
         # raises, so a failing worker cannot block the others forever.
         with lock:
             # Take the first user waiting to be crawled and mark it taken.
             userId = db.getFirstUserToCatch()
             if userId is not None:
                 db.setUserIsCatch(userId, st.is_catching)
         if userId is None:
             # Nothing is queued: wait (without holding the lock) and retry.
             time.sleep(3)
             continue
         pid = os.getpid()
         # Placeholders follow the argument order (pid, userId) so each value
         # is printed under its own label.
         print('开始爬取用户,pid={0}, user_id={1}'.format(pid, userId))
         # Fetch the user's details; the result carries a 'code' entry.
         info = s.getUserInfo(userId)
         code = info['code']
         if code == s.code_user_not_useful:
             # The user is not worth keeping.
             print('用户没有价值,pid={0}, user_id={1}'.format(pid, userId))
             db.setUserIsCatch(userId, st.user_not_useful)
         elif code == s.code_user_not_exist:
             # The user does not exist.
             print('用户不存在,是僵尸粉,pid={0}, user_id={1}'.format(pid, userId))
             db.setUserIsCatch(userId, st.user_not_exist)
         elif code == s.code_failure:
             # The fetch failed.
             print('用户抓取失败,pid={0}, user_id={1}'.format(pid, userId))
             db.setUserIsCatch(userId, st.failed)
         else:
             # Any other code counts as success: persist the fetched data.
             print('用户抓取成功,pid={0}, user_id={1}'.format(pid, userId))
             db.updateUserInfo(userId, info)
コード例 #6
0
 def catchUserInfoThread(self, lock):
     """Thread worker that fetches profile details for queued users.

     Args:
         lock: Lock shared between the workers.  It is held while the next
             user is read from the database and marked as being crawled
             (presumably so that two workers cannot claim the same user),
             and is released before the user's details are requested.

     The loop runs until ``self.isExit`` is true, then logs that the
     thread has ended.
     """
     s = ZhiHuSpider()
     db = DBUtil()
     st = Status.Catch()
     while not self.isExit:
         # The context manager releases the lock even when a database call
         # raises, so a failing worker cannot block the others forever.
         with lock:
             # Take the first user waiting to be crawled and mark it taken.
             userId = db.getFirstUserToCatch()
             if userId is not None:
                 db.setUserIsCatch(userId, st.is_catching)
         if userId is None:
             # Nothing is queued: wait (without holding the lock) and retry.
             time.sleep(5)
             continue
         pid = os.getpid()
         log('开始爬取用户,pid={0}, user_id={1}'.format(pid, userId))
         # Fetch the user's details; the result carries a 'code' entry.
         info = s.getUserInfo(userId)
         code = info['code']
         if code == s.code_user_not_useful:
             # The user is not worth keeping.
             log('用户没有价值,pid={0}, user_id={1}'.format(pid, userId))
             db.setUserIsCatch(userId, st.user_not_useful)
         elif code == s.code_user_not_exist:
             # The user does not exist.
             log('用户不存在,是僵尸粉,pid={0}, user_id={1}'.format(
                 pid, userId))
             db.setUserIsCatch(userId, st.user_not_exist)
         elif code == s.code_failure:
             # The fetch failed.
             log('用户抓取失败,pid={0}, user_id={1}'.format(pid, userId))
             db.setUserIsCatch(userId, st.failed)
         else:
             # Any other code counts as success: persist the fetched data.
             log('用户抓取成功,pid={0}, user_id={1}'.format(pid, userId))
             db.updateUserInfo(userId, info)
             db.saveAchieveInfo(userId, info)
     log('获取用户详细信息的线程结束,tid = {0}'.format(self.getThreadId()))