コード例 #1
0
class KeywordsController:
    """ 获取待搜索关键词 """
    def __init__(self):
        self.sleep_time = 0
        self.task_list = [
            ZhiDaoGenerator().crawl_zhidao_words,
            BaiduHotGenerator().crawl_hot_words
        ]
        self.logger = LogHandler('keywords_controller')

    def speed_control(self, kw):
        """ 检查一次新关键词数量,控制在500-1000范围内 """
        nums = redis_cli.scard('zhTemporaryWords')
        if nums > 1000:  # 多一些可以接受,所以缓慢增加睡眠时间
            self.sleep_time += 10
        elif nums < 500:  # 过少时迅速降低睡眠时间防止饥饿
            self.sleep_time = int(self.sleep_time / 2)
        self.logger.info('keywords crawler sleep time:%d' % self.sleep_time)
        time.sleep(self.sleep_time)

    def save_keyword(self, kw):
        """ 新待搜索关键词结果存入zhTemporaryWords表中 """
        if redis_cli.sadd('zhKeyWords', kw):
            # 添加未发生覆盖,将此关键词存至zhTemporaryWords内待被搜索
            self.logger.info(str(kw))
            redis_cli.sadd('zhTemporaryWords', kw)

    def kw_run(self):
        """ 所有新关键词存至zhTemporaryWords内待被搜索 """
        while True:
            # 每5次更新一次速度
            for action in self.task_list * 5 + [self.speed_control]:
                action(self.save_keyword)
コード例 #2
0
ファイル: ProxyCheck.py プロジェクト: prophetss/zhihu-crawler
class ProxyCheck(ProxyManager, Thread):
    def __init__(self, queue, item_dict):
        ProxyManager.__init__(self)
        Thread.__init__(self)
        self.log = LogHandler('proxy_check', file=False)  # 多线程同时写一个日志文件会有问题
        self.queue = queue
        self.item_dict = item_dict

    def run(self):
        self.db.changeTable(self.useful_proxy_queue)
        while self.queue.qsize():
            proxy = self.queue.get()
            count = self.item_dict[proxy]
            if validUsefulProxy(proxy):
                # 验证通过计数器减1
                if count and int(count) > 0:
                    self.db.put(proxy, num=int(count) - 1)
                else:
                    pass
                self.log.info('ProxyCheck: {} validation pass'.format(proxy))
            else:
                self.log.info('ProxyCheck: {} validation fail'.format(proxy))
                if count and int(count) + 1 >= FAIL_COUNT:
                    self.log.info(
                        'ProxyCheck: {} fail too many, delete!'.format(proxy))
                    self.db.delete(proxy)
                else:
                    self.db.put(proxy, num=int(count) + 1)
            self.queue.task_done()
コード例 #3
0
class ProxyRefreshSchedule(ProxyManager):
    """
    代理定时刷新
    """
    def __init__(self):
        ProxyManager.__init__(self)
        self.log = LogHandler('refresh_schedule')

    def validProxy(self):
        """
        验证raw_proxy_queue中的代理, 将可用的代理放入useful_proxy_queue
        :return:
        """
        self.db.changeTable(self.raw_proxy_queue)
        raw_proxy_item = self.db.pop()
        self.log.info('ProxyRefreshSchedule: %s start validProxy' %
                      time.ctime())
        # 计算剩余代理,用来减少重复计算
        remaining_proxies = self.getAll()
        while raw_proxy_item:
            raw_proxy = raw_proxy_item.get('proxy')
            if isinstance(raw_proxy, bytes):
                # 兼容Py3
                raw_proxy = raw_proxy.decode('utf8')

            if (raw_proxy
                    not in remaining_proxies) and validUsefulProxy(raw_proxy):
                self.db.changeTable(self.useful_proxy_queue)
                self.db.put(raw_proxy)
                self.log.info('ProxyRefreshSchedule: %s validation pass' %
                              raw_proxy)
            else:
                self.log.info('ProxyRefreshSchedule: %s validation fail' %
                              raw_proxy)
            self.db.changeTable(self.raw_proxy_queue)
            raw_proxy_item = self.db.pop()
            remaining_proxies = self.getAll()
        self.log.info('ProxyRefreshSchedule: %s validProxy complete' %
                      time.ctime())
コード例 #4
0
class ProxyManager(object):
    """
    ProxyManager
    """
    def __init__(self):
        self.db = ProxyDBClient('proxy')
        self.raw_proxy_queue = 'raw_proxy'
        self.log = LogHandler('proxy_manager')
        self.useful_proxy_queue = 'useful_proxy'

    def refresh(self):
        """
        fetch proxy into Db by ProxyGetter/getFreeProxy.py
        :return:
        """
        self.db.changeTable(self.raw_proxy_queue)
        for proxyGetter in conf.proxy_getter_functions:
            # fetch
            try:
                self.log.info(
                    "{func}: fetch proxy start".format(func=proxyGetter))
                for proxy in getattr(GetFreeProxy, proxyGetter.strip())():
                    # 直接存储代理, 不用在代码中排重, hash 结构本身具有排重功能
                    proxy = proxy.strip()
                    if proxy and verifyProxyFormat(proxy):
                        self.log.info('{func}: fetch proxy {proxy}'.format(
                            func=proxyGetter, proxy=proxy))
                        self.db.put(proxy)
                    else:
                        self.log.error(
                            '{func}: fetch proxy {proxy} error'.format(
                                func=proxyGetter, proxy=proxy))
            except Exception as e:
                self.log.warn(
                    "{func}: fetch proxy fail".format(func=proxyGetter))
                continue

    def get(self):
        """
        return a useful proxy
        :return:
        """
        self.db.changeTable(self.useful_proxy_queue)
        item_dict = self.db.getAll()
        if item_dict:
            if EnvUtil.PY3:
                return random.choice(list(item_dict.keys()))
            else:
                return random.choice(item_dict.keys())
        return None
        # return self.db.pop()

    def delete(self, proxy):
        """
        delete proxy from pool
        :param proxy:
        :return:
        """
        self.db.changeTable(self.useful_proxy_queue)
        self.db.delete(proxy)

    def getAll(self):
        """
        get all proxy from pool as list
        :return:
        """
        self.db.changeTable(self.useful_proxy_queue)
        item_dict = self.db.getAll()
        if EnvUtil.PY3:
            return list(item_dict.keys()) if item_dict else list()
        return item_dict.keys() if item_dict else list()

    def getNumber(self):
        self.db.changeTable(self.raw_proxy_queue)
        total_raw_proxy = self.db.getNumber()
        self.db.changeTable(self.useful_proxy_queue)
        total_useful_queue = self.db.getNumber()
        return {
            'raw_proxy': total_raw_proxy,
            'useful_proxy': total_useful_queue
        }
コード例 #5
0
class ZhihuTopicGenerator:
    """ 分为两个过程:id获取和扩展 """
    def __init__(self):
        # 代理ip
        self.p_receiver = ProxiesReceiver()
        # 建立会话,设置requests重连次数和重连等待时间
        self.session = requests.Session()
        retry = Retry(connect=3, backoff_factor=0.5)
        adapter = HTTPAdapter(max_retries=retry)
        self.session.mount('https://', adapter)
        self.logger = LogHandler('topics_generator')
        logging.getLogger("urllib3").setLevel(logging.ERROR)

    def __get_topic_message(self, tid):
        """
        话题信息获取,最终结果存放:
        zhTopicMessage哈希表内,key-话题id,value-{'name':名称, 'introduction': 简介, 'questions_count':问题数,
        'best_answers_count':精华问题数, 'followers_count':关注人数, 'best_answerers_count':优秀回答者人数}
        """
        try:
            j_rst = self.session.get(url=topic_message_url % tid,
                                     headers=headers,
                                     proxies=self.p_receiver.one_random,
                                     timeout=3).json()
            if redis_cli.hset(
                    'zhTopicMessage', tid,
                    str({
                        "name":
                        j_rst.get("name"),
                        'introduction':
                        j_rst.get("introduction"),
                        "questions_count":
                        j_rst.get("questions_count"),
                        "best_answers_count":
                        j_rst.get("best_answers_count"),
                        'followers_count':
                        j_rst.get("followers_count"),
                        "best_answerers_count":
                        j_rst.get("best_answerers_count")
                    })):
                # 待获取相关信息
                redis_cli.sadd('zhNewTopicID', tid)
                self.logger.info("zhNewTopicID:%d", tid)
                return True
        except RequestException as re:
            self.logger.warn(re)
        except Exception as e:
            raise e
        return False

    def __get_hot_topics(self):
        """ 搜索zhTemporaryWords内关键词,从其结果中得到相关话题id和名称 """
        tw = redis_cli.block_pop('zhTemporaryWords').decode('utf-8')  # pop
        # 不断翻页至最后,最大获取1000条
        for offset in range(0, 1000, 10):
            try:
                url = zh_search_url % (tw, offset)
                j_topics = self.session.get(url=url,
                                            headers=headers,
                                            proxies=self.p_receiver.one_random,
                                            timeout=3).json()
                topics = j_topics.get('data', None) if j_topics else None
                if not topics:  # 已到最后
                    return
                # 每一页获取话题相关详细信息
                for t in topics:
                    if t.get('object') and t.get('object').get('id'):
                        try:
                            tid = int(t['object']['id'])
                        except ValueError as ve:
                            self.logger.warning(ve, t['object']['id'])
                            continue
                        if self.__get_topic_message(tid):
                            yield tid
                    else:
                        break
            except RequestException as re:
                self.logger.warn((re, url))
            except ReadTimeout as rte:
                self.logger.warn((rte, url))
            except KeyError as ke:
                self.logger.warn((ke, url))
            except Exception as e:
                raise e

    @staticmethod
    def __save_to_dag(child_topic_id, parent_topic_id):
        """ 按其结构保存为有向无环图 """
        ids = redis_cli.hget('zhTopicDAG', parent_topic_id)
        if not ids or ids.decode() == "None":
            redis_cli.hset('zhTopicDAG', parent_topic_id,
                           str({child_topic_id}))
        else:
            new_ids = eval(ids)
            new_ids.add(child_topic_id)
            redis_cli.hset('zhTopicDAG', parent_topic_id, str(new_ids))

    def __add_topics(self, url, topic_id, func):
        try:
            req = self.session.get(url=url % int(topic_id),
                                   headers=headers,
                                   proxies=self.p_receiver.one_random,
                                   timeout=3)
            if not req:  # 获取子父话题有可能不存在
                return
            for p in req.json()['data']:
                expand_topic_id = int(p['id'])
                func(topic_id, expand_topic_id)
                self.__get_topic_message(expand_topic_id)
        except RequestException as re:
            self.logger.warn(re)
        except ReadTimeout as rte:
            self.logger.warn(rte)
        except Exception as e:
            raise e

    def __expand_topics(self, tid):
        """ 话题扩展,分别向父子话题不断扩展 """
        self.__add_topics(parent_url, tid,
                          lambda a, b: self.__save_to_dag(a, b))
        self.__add_topics(child_url, tid,
                          lambda a, b: self.__save_to_dag(b, a))

    @timethis
    def process(self):
        for tid in self.__get_hot_topics():
            self.__expand_topics(tid)