Code Example #1
class KeywordsController:
    """ 获取待搜索关键词 """
    def __init__(self):
        self.sleep_time = 0
        self.task_list = [
            ZhiDaoGenerator().crawl_zhidao_words,
            BaiduHotGenerator().crawl_hot_words
        ]
        self.logger = LogHandler('keywords_controller')

    def speed_control(self, kw):
        """ Check the number of pending keywords and keep it in the 500-1000 range """
        # kw (the save callback) is unused; the signature only has to match
        # the other actions in task_list so kw_run can call them uniformly
        nums = redis_cli.scard('zhTemporaryWords')
        if nums > 1000:  # a surplus is acceptable, so back off slowly
            self.sleep_time += 10
        elif nums < 500:  # too few: cut the sleep time quickly to avoid starvation
            self.sleep_time = int(self.sleep_time / 2)
        self.logger.info('keywords crawler sleep time:%d' % self.sleep_time)
        time.sleep(self.sleep_time)

    def save_keyword(self, kw):
        """ Store a new keyword into the zhTemporaryWords set to await searching """
        if redis_cli.sadd('zhKeyWords', kw):
            # sadd returned 1, so the keyword is new; queue it in
            # zhTemporaryWords to be searched
            self.logger.info(str(kw))
            redis_cli.sadd('zhTemporaryWords', kw)

    def kw_run(self):
        """ Store every new keyword into zhTemporaryWords to await searching """
        while True:
            # run each crawler 5 times, then adjust the speed once
            for action in self.task_list * 5 + [self.speed_control]:
                action(self.save_keyword)
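
Note: redis_cli is never defined in these examples. A minimal sketch of the assumed wrapper, inferred purely from the call sites (block_pop in particular is a guess, since Redis sets have no native blocking pop):

# Hypothetical redis_cli wrapper (assumption, not the project's actual
# client): thin pass-throughs to redis-py plus a polling "blocking" set pop.
import time
import redis

class RedisCli:
    def __init__(self, host='localhost', port=6379, db=0):
        self._r = redis.StrictRedis(host=host, port=port, db=db)

    def sadd(self, key, value):
        return self._r.sadd(key, value)  # 1 if the member is new, else 0

    def scard(self, key):
        return self._r.scard(key)

    def hset(self, key, field, value):
        return self._r.hset(key, field, value)

    def hget(self, key, field):
        return self._r.hget(key, field)

    def hkeys(self, key):
        return self._r.hkeys(key)

    def block_pop(self, key, poll_seconds=1):
        # poll spop until a member appears; returns bytes, as redis-py does
        while True:
            member = self._r.spop(key)
            if member is not None:
                return member
            time.sleep(poll_seconds)

redis_cli = RedisCli()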
Code Example #2
class ZhiDaoGenerator:
    """ 百度知道关键词抓取,关键词有两部分,1.问题内容包含的关键词,2.问题所属的关键词 """
    def __init__(self):
        self.logger = LogHandler('zhidao_crawl')
        try:
            # Zhidao request headers: the BAIDUID cookie must be fetched
            # first, otherwise nothing can be crawled
            self.zhidao_headers = {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
                'Cookie':
                'BAIDUID=%s' % requests.get('https://zhidao.baidu.com/browse/',
                                            timeout=3).cookies['BAIDUID']
            }
        except Exception as e:
            raise e

    def __zd_question_keywords(self, url):
        """ 获取知道问题内容的关键词 """
        try:
            req = requests.get(url, timeout=3)
            req.encoding = req.apparent_encoding
            kws = []
            for kw_tag in BeautifulSoup(req.text, "html.parser").find_all(
                    'li',
                    class_=lambda class_: class_ and ('word grid' in class_)):
                kw = kw_tag.find(class_="word-text")
                if kw is not None:
                    kws.append(kw.string)
            return kws
        except RequestException as re:
            self.logger.warning(re)
            return []
        except Exception as e:
            raise e

    @timethis
    def crawl_zhidao_words(self, save_keyword):
        """ 百度知道抓取 """
        try:
            req = requests.get(
                url=
                'https://zhidao.baidu.com/list?_pjax=%23j-question-list-pjax-container',
                headers=self.zhidao_headers,
                timeout=3)
            req.encoding = req.apparent_encoding
            for qs in BeautifulSoup(req.text, "html.parser").find_all(
                    'div', class_='question-title-section'):
                # extract the category keywords the question belongs to
                for qt in map(lambda x: x.string.replace('\n', ''),
                              qs.find_all('a', class_='tag-item')):
                    save_keyword(qt)
                # extract keywords contained in the question content
                for qm in self.__zd_question_keywords(qs.a.get('href')):
                    save_keyword(str(qm))
        except RequestException as re:
            self.logger.warning(re)
        except Exception as e:
            raise e
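
The @timethis decorator used above is also not shown. A plausible minimal implementation (an assumption, not the project's actual helper):

# Hypothetical timethis decorator (assumption): report how long a call took.
import time
from functools import wraps

def timethis(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        start = time.perf_counter()
        try:
            return func(*args, **kwargs)
        finally:
            print('%s took %.3fs' % (func.__name__,
                                     time.perf_counter() - start))
    return wrapper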
Code Example #3
File: ProxyCheck.py  Project: prophetss/zhihu-crawler
class ProxyCheck(ProxyManager, Thread):
    def __init__(self, queue, item_dict):
        ProxyManager.__init__(self)
        Thread.__init__(self)
        self.log = LogHandler('proxy_check', file=False)  # multiple threads writing one log file would clash
        self.queue = queue
        self.item_dict = item_dict

    def run(self):
        self.db.changeTable(self.useful_proxy_queue)
        while self.queue.qsize():
            proxy = self.queue.get()
            count = self.item_dict[proxy]
            if validUsefulProxy(proxy):
                # validation passed: decrement the failure counter
                if count and int(count) > 0:
                    self.db.put(proxy, num=int(count) - 1)
                self.log.info('ProxyCheck: {} validation pass'.format(proxy))
            else:
                self.log.info('ProxyCheck: {} validation fail'.format(proxy))
                if count and int(count) + 1 >= FAIL_COUNT:
                    self.log.info(
                        'ProxyCheck: {} fail too many, delete!'.format(proxy))
                    self.db.delete(proxy)
                else:
                    self.db.put(proxy, num=int(count or 0) + 1)  # tolerate an unset counter
            self.queue.task_done()
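
ProxyCheck is a Thread that drains a shared queue, so a typical driver fans one snapshot of the useful-proxy table out to several checker threads. A sketch (check_all and thread_num are illustrative names, not part of the project):

# Sketch: run several ProxyCheck workers over the useful-proxy table.
from queue import Queue

def check_all(thread_num=5):
    manager = ProxyManager()
    manager.db.changeTable(manager.useful_proxy_queue)
    item_dict = manager.db.getAll()  # assumed {proxy: fail_count} mapping
    queue = Queue()
    for proxy in item_dict:
        queue.put(proxy)
    for _ in range(thread_num):
        checker = ProxyCheck(queue, item_dict)
        checker.daemon = True
        checker.start()
    queue.join()  # run() calls task_done() once per proxy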
Code Example #4
class ProxyRefreshSchedule(ProxyManager):
    """
    Periodically refresh proxies
    """
    def __init__(self):
        ProxyManager.__init__(self)
        self.log = LogHandler('refresh_schedule')

    def validProxy(self):
        """
        Validate the proxies in raw_proxy_queue and move the usable ones into useful_proxy_queue
        :return:
        """
        self.db.changeTable(self.raw_proxy_queue)
        raw_proxy_item = self.db.pop()
        self.log.info('ProxyRefreshSchedule: %s start validProxy' %
                      time.ctime())
        # snapshot the remaining proxies to avoid redundant validation
        remaining_proxies = self.getAll()
        while raw_proxy_item:
            raw_proxy = raw_proxy_item.get('proxy')
            if isinstance(raw_proxy, bytes):
                # Py3 compatibility
                raw_proxy = raw_proxy.decode('utf8')

            if (raw_proxy
                    not in remaining_proxies) and validUsefulProxy(raw_proxy):
                self.db.changeTable(self.useful_proxy_queue)
                self.db.put(raw_proxy)
                self.log.info('ProxyRefreshSchedule: %s validation pass' %
                              raw_proxy)
            else:
                self.log.info('ProxyRefreshSchedule: %s validation fail' %
                              raw_proxy)
            self.db.changeTable(self.raw_proxy_queue)
            raw_proxy_item = self.db.pop()
            remaining_proxies = self.getAll()
        self.log.info('ProxyRefreshSchedule: %s validProxy complete' %
                      time.ctime())
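
Nothing here schedules validProxy itself; one simple way to drive it periodically (a sketch with an invented run_refresh_schedule, not the project's actual scheduler):

# Sketch: naive periodic driver; a real deployment might use a scheduler
# such as APScheduler instead of a bare loop.
import time

def run_refresh_schedule(interval_seconds=600):
    schedule = ProxyRefreshSchedule()
    while True:
        schedule.refresh()     # inherited from ProxyManager: fetch raw proxies
        schedule.validProxy()  # promote the usable ones
        time.sleep(interval_seconds)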
Code Example #5
class ProxyManager(object):
    """
    ProxyManager
    """
    def __init__(self):
        self.db = ProxyDBClient('proxy')
        self.raw_proxy_queue = 'raw_proxy'
        self.log = LogHandler('proxy_manager')
        self.useful_proxy_queue = 'useful_proxy'

    def refresh(self):
        """
        Fetch proxies into the DB via the getters in ProxyGetter/getFreeProxy.py
        :return:
        """
        self.db.changeTable(self.raw_proxy_queue)
        for proxyGetter in conf.proxy_getter_functions:
            # fetch
            try:
                self.log.info(
                    "{func}: fetch proxy start".format(func=proxyGetter))
                for proxy in getattr(GetFreeProxy, proxyGetter.strip())():
                    # store the proxy directly; no in-code dedup is needed, the hash structure dedupes by itself
                    proxy = proxy.strip()
                    if proxy and verifyProxyFormat(proxy):
                        self.log.info('{func}: fetch proxy {proxy}'.format(
                            func=proxyGetter, proxy=proxy))
                        self.db.put(proxy)
                    else:
                        self.log.error(
                            '{func}: fetch proxy {proxy} error'.format(
                                func=proxyGetter, proxy=proxy))
            except Exception:
                self.log.warning(
                    "{func}: fetch proxy fail".format(func=proxyGetter))
                continue

    def get(self):
        """
        return a useful proxy
        :return:
        """
        self.db.changeTable(self.useful_proxy_queue)
        item_dict = self.db.getAll()
        if item_dict:
            if EnvUtil.PY3:
                return random.choice(list(item_dict.keys()))
            else:
                return random.choice(item_dict.keys())
        return None
        # return self.db.pop()

    def delete(self, proxy):
        """
        delete proxy from pool
        :param proxy:
        :return:
        """
        self.db.changeTable(self.useful_proxy_queue)
        self.db.delete(proxy)

    def getAll(self):
        """
        get all proxy from pool as list
        :return:
        """
        self.db.changeTable(self.useful_proxy_queue)
        item_dict = self.db.getAll()
        if EnvUtil.PY3:
            return list(item_dict.keys()) if item_dict else list()
        return item_dict.keys() if item_dict else list()

    def getNumber(self):
        self.db.changeTable(self.raw_proxy_queue)
        total_raw_proxy = self.db.getNumber()
        self.db.changeTable(self.useful_proxy_queue)
        total_useful_queue = self.db.getNumber()
        return {
            'raw_proxy': total_raw_proxy,
            'useful_proxy': total_useful_queue
        }
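
verifyProxyFormat implies proxies are stored as bare "host:port" strings, so a caller has to wrap get()'s result into the requests proxies mapping. A usage sketch (the target URL is illustrative):

# Sketch: fetch one proxy from the pool and use it for a request.
import requests

proxy = ProxyManager().get()
if proxy:
    proxies = {'http': 'http://%s' % proxy, 'https': 'https://%s' % proxy}
    resp = requests.get('https://www.zhihu.com', proxies=proxies, timeout=3)
    print(resp.status_code)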
Code Example #6
class QuestionsGenerator:
    """ 获取话题内的精华和讨论问题"""
    top_activity_url = "https://www.zhihu.com/api/v4/topics/%u/feeds/top_activity?limit=10&offset=%u"  # 讨论问题
    essence_url = "https://www.zhihu.com/api/v4/topics/%u/feeds/essence?limit=10&offset=%u"  # 精华问题

    def __init__(self):
        self.p_receiver = ProxiesReceiver()
        self.essence_dict = dict()
        self.activity_dict = dict()
        self.session = requests.Session()
        # first filter the topic ids whose questions have not been fetched yet and queue them
        QuestionsGenerator.save_new_topic_id()
        self.logger = LogHandler("question_generator")

    @staticmethod
    def badge_parse(badge):
        res_array = []
        for b in badge:
            res_array.append({
                'author_type': b['type'],
                'author_description': b['description']
            })
        return res_array

    def topic_sticky_parse(self, topic_sticky):
        res_array = []
        try:
            for t in topic_sticky:
                if t['target'].get('title'):
                    res_array.append({
                        'id': t['target']['id'],
                        'title': t['target']['title'],
                        'author_name': t['target']['author']['name'],
                        'author_gender': t['target']['author']['gender'],
                        'author_headline': t['target']['author']['headline'],
                        'author_url_token': t['target']['author']['url_token'],
                        'excerpt': t['target']['excerpt'],
                        'updated_time': t['target']['updated'],
                        'contents': t['target']['comment_count'],
                        'likes': t['target']['voteup_count'],
                        'created_time': t['target']['created']
                    })
                else:
                    # hot answers: the title sits on the nested question
                    res_array.append({
                        'id': t['target']['id'],
                        'title': t['target']['question']['title'],
                        'author_name': t['target']['author']['name'],
                        'author_gender': t['target']['author']['gender'],
                        'author_headline': t['target']['author']['headline'],
                        'author_url_token': t['target']['author']['url_token'],
                        'excerpt': t['target']['excerpt'],
                        'updated_time': t['target']['updated_time'],
                        'created': t['target']['question']['created'],
                        'created_time': t['target']['created_time']
                    })
        except KeyError as ke:
            self.logger.error((ke, topic_sticky))
        except Exception as e:
            raise e
        return res_array

    @timethis
    def crawl_topic_message(self, tid, q_url, num, message_dict):
        """
        话题对应问题/文章获取,最终结果存放:
        message_dict: key-类型(文章/问题),value-{key-id, value-{'question/title':问题/文章名称, author':作者,
        'gender':性别,'author_badge':作者标签, 'author_headline':作者签名, 'author_url_token':作者url标识,excerpt':摘录,
        'created_time':创建时间, 'updated_time':最后更新时间, 'comment_count':'评论数', likes':点赞数}}
        :param tid:话题ID
        :param q_url:请求url
        :param num 抓取数量
        :param message_dict:存储字典
        :return:
        """
        session = requests.session()
        sticky_num = 0
        for offset in range(0, num, 10):
            url = q_url % (tid, offset)
            try:
                ques = session.get(url=url,
                                   headers=headers,
                                   proxies=self.p_receiver.one_random,
                                   timeout=3)
            except Exception as e:
                self.logger.warning((e, url))
                continue
            try:
                q_json = ques.json() if ques else {}
            except JSONDecodeError as je:
                self.logger.error((je, url, ques))
                continue
            for q in q_json.get('data', []):
                target = q.get('target', {})
                question_type = str(target.get('type', 'none_type')).lower()
                if question_type == 'none_type':
                    continue
                elif question_type == 'answer':
                    # an answer has a compound id (question id, answer id); stringify the tuple
                    addtwodimdict(
                        message_dict, 'answer',
                        str((target['question']['id'], target['id'])), {
                            'question': target['question']['title'],
                            'author_name': target['author']['name'],
                            'author_gender': target['author']['gender'],
                            'author_badge': QuestionsGenerator.badge_parse(
                                target['author']['badge']),
                            'author_headline': target['author']['headline'],
                            'author_url_token': target['author']['url_token'],
                            'excerpt': target['excerpt'],
                            'created_time': target['created_time'],
                            'updated_time': target['updated_time'],
                            'contents': target['comment_count'],
                            'likes': target['voteup_count']
                        })
                elif question_type == 'article':
                    # an article has a single id
                    addtwodimdict(
                        message_dict, 'article', target['id'], {
                            'title': target['title'],
                            'author_name': target['author']['name'],
                            'author_gender': target['author']['gender'],
                            'author_badge': QuestionsGenerator.badge_parse(
                                target['author']['badge']),
                            'author_headline': target['author']['headline'],
                            'author_url_token': target['author']['url_token'],
                            'excerpt': target['excerpt'],
                            'created_time': target['created'],
                            'updated_time': target['updated'],
                            'contents': target['comment_count'],
                            'likes': target['voteup_count']
                        })
                elif question_type == 'question':
                    pass  # currently these are only unanswered questions; skip them
                elif question_type == 'topic_sticky_module':  # hot sticky items
                    addtwodimdict(
                        message_dict, 'topic_sticky_module', sticky_num, {
                            'title': target['title'],
                            'data': self.topic_sticky_parse(target['data'])
                        })
                    sticky_num += 1
                else:
                    self.logger.error(
                        "There was a new type:{}!\n".format(question_type))
            is_end = q_json.get('paging', {}).get('is_end', 'none')
            if str(is_end).lower() == 'true':
                return

    def process(self):
        """
        zhTopicQuestions内包含精华(点赞比较多)和讨论(最新比较热)的问题和文章
        对应结构essence_dict和activity_dict
        :return:
        """
        tid = int(redis_cli.block_pop('zhNewTopicID'))
        self.crawl_topic_message(tid, QuestionsGenerator.essence_url,
                                 conf.essence_nums, self.essence_dict)
        self.crawl_topic_message(tid, QuestionsGenerator.top_activity_url,
                                 conf.top_activity_nums, self.activity_dict)
        with open(os.path.join(OUTPUT_PATH, "{}.json".format(int(tid))),
                  "w",
                  encoding="GB18030") as f:
            json.dump(
                {
                    'essence': self.essence_dict,
                    'top_activity': self.activity_dict
                },
                f,
                indent=4,
                ensure_ascii=False)

    @staticmethod
    def save_new_topic_id():
        """ output目录下不要手动放其他名称文件 """
        # note: hkeys and os.listdir must yield comparable (same-type) keys
        fetched = {x.split('.')[0] for x in os.listdir(OUTPUT_PATH)}
        for nid in set(redis_cli.hkeys('zhTopicMessage')) - fetched:
            redis_cli.sadd('zhNewTopicID', nid)
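
crawl_topic_message leans on an addtwodimdict helper that is not shown. From the call sites it presumably just inserts into a two-level dict; a minimal sketch of that assumption:

# Hypothetical addtwodimdict (inferred from call sites):
# the_dict[key_a][key_b] = value, creating the inner dict on demand.
def addtwodimdict(the_dict, key_a, key_b, value):
    the_dict.setdefault(key_a, {})[key_b] = value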
Code Example #7
class ZhihuTopicGenerator:
    """ 分为两个过程:id获取和扩展 """
    def __init__(self):
        # proxy IPs
        self.p_receiver = ProxiesReceiver()
        # create a session and set the requests retry count and backoff
        # (Retry comes from urllib3.util.retry, HTTPAdapter from requests.adapters)
        self.session = requests.Session()
        self.session = requests.Session()
        retry = Retry(connect=3, backoff_factor=0.5)
        adapter = HTTPAdapter(max_retries=retry)
        self.session.mount('https://', adapter)
        self.logger = LogHandler('topics_generator')
        logging.getLogger("urllib3").setLevel(logging.ERROR)

    def __get_topic_message(self, tid):
        """
        话题信息获取,最终结果存放:
        zhTopicMessage哈希表内,key-话题id,value-{'name':名称, 'introduction': 简介, 'questions_count':问题数,
        'best_answers_count':精华问题数, 'followers_count':关注人数, 'best_answerers_count':优秀回答者人数}
        """
        try:
            j_rst = self.session.get(url=topic_message_url % tid,
                                     headers=headers,
                                     proxies=self.p_receiver.one_random,
                                     timeout=3).json()
            if redis_cli.hset(
                    'zhTopicMessage', tid,
                    str({
                        "name": j_rst.get("name"),
                        "introduction": j_rst.get("introduction"),
                        "questions_count": j_rst.get("questions_count"),
                        "best_answers_count": j_rst.get("best_answers_count"),
                        "followers_count": j_rst.get("followers_count"),
                        "best_answerers_count": j_rst.get("best_answerers_count")
                    })):
                # queue the new id so its related info gets fetched later
                redis_cli.sadd('zhNewTopicID', tid)
                self.logger.info("zhNewTopicID:%d", tid)
                return True
        except RequestException as re:
            self.logger.warning(re)
        except Exception as e:
            raise e
        return False

    def __get_hot_topics(self):
        """ 搜索zhTemporaryWords内关键词,从其结果中得到相关话题id和名称 """
        tw = redis_cli.block_pop('zhTemporaryWords').decode('utf-8')  # pop
        # 不断翻页至最后,最大获取1000条
        for offset in range(0, 1000, 10):
            try:
                url = zh_search_url % (tw, offset)
                j_topics = self.session.get(url=url,
                                            headers=headers,
                                            proxies=self.p_receiver.one_random,
                                            timeout=3).json()
                topics = j_topics.get('data', None) if j_topics else None
                if not topics:  # 已到最后
                    return
                # fetch detailed topic info for every entry on the page
                for t in topics:
                    if t.get('object') and t.get('object').get('id'):
                        try:
                            tid = int(t['object']['id'])
                        except ValueError as ve:
                            self.logger.warning((ve, t['object']['id']))
                            continue
                        if self.__get_topic_message(tid):
                            yield tid
                    else:
                        break
            except RequestException as re:
                # ReadTimeout subclasses RequestException, so this handler
                # covers timeouts as well
                self.logger.warning((re, url))
            except KeyError as ke:
                self.logger.warning((ke, url))
            except Exception as e:
                raise e

    @staticmethod
    def __save_to_dag(child_topic_id, parent_topic_id):
        """ 按其结构保存为有向无环图 """
        ids = redis_cli.hget('zhTopicDAG', parent_topic_id)
        if not ids or ids.decode() == "None":
            redis_cli.hset('zhTopicDAG', parent_topic_id,
                           str({child_topic_id}))
        else:
            new_ids = eval(ids)
            new_ids.add(child_topic_id)
            redis_cli.hset('zhTopicDAG', parent_topic_id, str(new_ids))

    def __add_topics(self, url, topic_id, func):
        try:
            req = self.session.get(url=url % int(topic_id),
                                   headers=headers,
                                   proxies=self.p_receiver.one_random,
                                   timeout=3)
            if not req:  # the parent/child topics may not exist
                return
            for p in req.json()['data']:
                expand_topic_id = int(p['id'])
                func(topic_id, expand_topic_id)
                self.__get_topic_message(expand_topic_id)
        except RequestException as re:
            # ReadTimeout subclasses RequestException, so one handler suffices
            self.logger.warning(re)
        except Exception as e:
            raise e

    def __expand_topics(self, tid):
        """ 话题扩展,分别向父子话题不断扩展 """
        self.__add_topics(parent_url, tid,
                          lambda a, b: self.__save_to_dag(a, b))
        self.__add_topics(child_url, tid,
                          lambda a, b: self.__save_to_dag(b, a))

    @timethis
    def process(self):
        for tid in self.__get_hot_topics():
            self.__expand_topics(tid)
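
__save_to_dag stores each child set as str(set) and reads it back with eval, so consuming the DAG looks roughly like the sketch below (children_of is an invented name; eval is tolerable only because the process wrote the values itself):

# Sketch: read back one node's children as stored by __save_to_dag.
def children_of(parent_topic_id):
    ids = redis_cli.hget('zhTopicDAG', parent_topic_id)
    if not ids or ids.decode() == "None":
        return set()
    return eval(ids)  # e.g. b'{19551147, 19554091}' -> {19551147, 19554091}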
Code Example #8
"""
-------------------------------------------------
   Description :   used for check getFreeProxy.py
   Author :        JHao
   date:          2018/7/10
-------------------------------------------------
   Change Activity:
                   2018/7/10: CheckProxy
-------------------------------------------------
"""
__author__ = 'JHao'

from proxy.ProxyGetter.getFreeProxy import GetFreeProxy
from proxy.Util.utilFunction import verifyProxyFormat

from util.loghandler import LogHandler

log = LogHandler('check_proxy', file=False)


class CheckProxy(object):
    @staticmethod
    def checkAllGetProxyFunc():
        """
        Check that every proxy getter function in getFreeProxy runs correctly
        Returns:
            None
        """
        import inspect
        member_list = inspect.getmembers(GetFreeProxy,
                                         predicate=inspect.isfunction)
        proxy_count_dict = dict()
        for func_name, func in member_list:
            # the loop body is truncated in the source; minimal completion
            # (assumption): run each getter and count proxies that pass the
            # format check
            count = 0
            try:
                for proxy in func():
                    if proxy and verifyProxyFormat(proxy.strip()):
                        count += 1
            except Exception as e:
                log.error("{func}: run fail - {err}".format(func=func_name,
                                                            err=e))
            proxy_count_dict[func_name] = count
            log.info("{func}: {count} proxies fetched".format(func=func_name,
                                                              count=count))
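
A usage sketch for the checker above:

# Run the self-check from a __main__ guard.
if __name__ == '__main__':
    CheckProxy.checkAllGetProxyFunc()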