Beispiel #1
0
    def make_document(cls, meta, cont):
        """Generate an HTML or Markdown document from ``cont`` and ``meta``.

        The converter is ``cls.item2html`` when the ``running/file_type``
        setting equals ``cls.DEFAULT_TYPE`` and ``cls.item2md`` otherwise.
        When ``running/download_image`` is enabled the converted document is
        handed to ``cls.download_image``; ``cls.show_info(meta)`` runs last.
        """
        wants_default = (
            config.get_setting('running/file_type') == cls.DEFAULT_TYPE)
        convert = cls.item2html if wants_default else cls.item2md
        document = convert(cont, meta)

        should_fetch_images = config.get_setting('running/download_image')
        if should_fetch_images:
            cls.download_image(document)
        cls.show_info(meta)
Beispiel #2
0
    def parse_data(cls, data):
        """Extract the metadata and raw content of one collection item.

        :param data: tag of a single item; it is queried through ``find`` and
            ``get_attrs``.
        :return: tuple ``(meta, content)``. ``meta`` is a ``Meta`` carrying the
            title, original URL, author, author homepage and vote count;
            ``content`` is the string of the ``textarea.content`` node with
            ``&quot;``, ``&lt;`` and ``&gt;`` unescaped.
        :raises AttributeError: if the title node is missing, or if neither
            author node can be read. The offending node is printed first.
        """
        host = config.get_setting('API/host')
        meta = Meta(pattern=Meta.simple)

        title = data.find('h2', _class='zm-item-title')
        try:
            meta.title = title.string
        except AttributeError:
            print(data)
            # Re-raise the original error so its message and traceback survive.
            raise
        original_url = title.find('a').get_attrs('href')

        if data.get_attrs('data-type') == 'Answer':
            head = data.find('div', _class='answer-head')
            # The host goes in front of the href, exactly as for the author
            # homepage below; presumably answer hrefs are site-relative.
            original_url = host + original_url
        else:
            head = data.find('div', _class='post-head')

        meta.original_url = original_url

        try:
            author = head.find('a', _class='author-link')
            meta.author = author.string
            meta.author_homepage = host + author.get_attrs('href')
        except AttributeError:
            # No author link: fall back to the plain name node and use the
            # bare host as the homepage.
            try:
                author = head.find('span', _class='name')
                meta.author = author.string
                meta.author_homepage = host
            except AttributeError:
                print(head)
                raise

        vote_info = head.find('div', _class='zm-item-vote-info')
        meta.voteup = int(vote_info.get_attrs('data-votecount'))

        # Reference notes (not used by this function):
        # <meta itemprop="post-id" content="107121832">
        # <meta itemprop="answer-id" content="107121832">
        # https://www.zhihu.com/node/AnswerVoteInfoV2?params={"answer_id":"203923119"}
        # https://www.zhihu.com/node/ColumnPostVoteInfoV2?params={"post_id":"103306156"}

        entities = {'&quot;': '"', '&lt;': '<', '&gt;': '>'}

        def stg(r):
            # Map the matched entity back to its literal character.
            return entities.get(r.group(0), '')

        raw_content = data.find('textarea', _class='content').string
        return meta, re.sub('(&quot;)|(&lt;)|(&gt;)', stg, raw_content)
Beispiel #3
0
 def item2html(cls, cont, meta):
     """Write ``cont`` as an HTML file and return the ``html.Mushroom`` used.

     The HTML file name is built from ``meta.author`` and ``meta.title``.
     When the ``running/css_output`` setting is truthy, every stylesheet
     returned by ``output_css_code`` is also written to its own ``css`` file.
     """
     page = html.Mushroom(
         cont, meta, css_output=config.get_setting('running/css_output'))

     html_path = format_file_name('html', meta.author, meta.title)
     with open(html_path, 'w', encoding='utf8') as html_file:
         page.write_down(html_file)

     if not config.get_setting('running/css_output'):
         return page

     for sheet in page.output_css_code():
         css_path = format_file_name('css', sheet['file_name'])
         with open(css_path, 'w', encoding='utf8') as css_file:
             css_file.write(sheet['code'])
     return page
Beispiel #4
0
    def make_document(cls, meta, cont):
        """Generate an HTML or Markdown document from ``cont`` and ``meta``.

        Returns immediately (``None``) when ``cont`` is ``None`` or an empty
        string. Otherwise the content is converted with ``cls.item2html`` when
        ``running/file_type`` equals ``cls.DEFAULT_TYPE`` and with
        ``cls.item2md`` in every other case, ``cls.show_info(meta)`` is called,
        images are downloaded if ``running/download_image`` is enabled, and
        ``cls.index`` is incremented.
        """
        nothing_to_render = cont is None or cont == ''
        if nothing_to_render:
            return

        wants_default = (
            config.get_setting('running/file_type') == cls.DEFAULT_TYPE)
        convert = cls.item2html if wants_default else cls.item2md
        document = convert(cont, meta)

        cls.show_info(meta)

        should_fetch_images = config.get_setting('running/download_image')
        if should_fetch_images:
            cls.download_image(document)
            # Separator line printed after the image download step.
            print('-' * 53 + '\n')
        cls.index += 1
Beispiel #5
0
class Crawler(requests.Session, API):
    """HTTP session that fetches API resources and can cache raw responses."""

    UA = config.get_setting('Crawler/user-agent')

    def __init__(self):
        super().__init__()
        # NOTE(review): headers.update() needs a mapping, so the
        # 'Crawler/user-agent' setting is presumably a dict of headers here
        # -- confirm against the configuration.
        self.headers.update(Crawler.UA)

    def get_network_data_package(self, item_name, item_id, **kwargs):
        """Fetch the resource identified by ``item_name``/``item_id``.

        :param item_name: key of the URL template handed to ``get_url``.
        :param item_id: identifier substituted into the URL.
        :param kwargs: extra URL parameters forwarded to ``get_url``.
        :return: the response object.
        :raises ValueError: if the built URL has no scheme.
        :raises VerityError: if the server answers with an HTTP error status.
        """
        url = self.get_url(item_name, item_id, **kwargs)
        # MissingSchema is raised while the request is being prepared, so the
        # request itself has to sit inside the try block for the handler to
        # be reachable.
        try:
            resp = self.get(url, timeout=30)
        except MissingSchema as err:
            raise ValueError(
                'url error: ', item_name, item_id, kwargs) from err
        try:
            resp.raise_for_status()
        except HTTPError as err:
            raise VerityError(
                status_code=resp.status_code, url=resp.url) from err
        if config.get_setting('running/cached'):
            self.cached_network_data(resp, item_name, item_id, **kwargs)
        return resp

    def download(self, url, **kwargs):
        """GET ``url`` with a 30 second timeout and return the response."""
        return self.get(url, timeout=30, **kwargs)

    @classmethod
    def cached_network_data(cls, data, item_name, item_id, **kwargs):
        """Write the raw response text to a JSON file in the cache folder.

        The file name is ``<item_name>-<item_id>-<ofs>.json`` where ``ofs`` is
        the ``offset`` keyword, else the ``page`` keyword, else a timestamp.

        :return: path of the written file.
        """
        # NOTE(review): a falsy offset/page (e.g. 0) falls through to the
        # next alternative -- confirm this is intended.
        ofs = kwargs.get('offset', None) or kwargs.get(
            'page', None) or timer.timestamp_str()
        file = os.path.join(config.cached_warehouse(),
                            '%s-%s-%s.json' % (item_name, item_id, ofs))
        with open(file, 'w', encoding='utf8') as foo:
            foo.write(data.text)
        return file
Beispiel #6
0
 def __init__(self, column_id):
     """Initialise the manager and resolve the column title.

     Fetches the ``column_meta`` resource, extracts the first group of the
     ``ColumnManage/title_reg`` pattern from the response text, decodes its
     unicode escapes into ``self.item_words`` and passes
     ``'~column/<formatted title>'`` to ``config.warehouse``.
     """
     super(ColumnManage, self).__init__(column_id)
     meta_resp = self.get_network_data_package('column_meta', self.item_id)

     title_pattern = config.get_setting('ColumnManage/title_reg')
     escaped_title = re.search(title_pattern, meta_resp.text).group(1)
     self.item_words = codecs.decode(escaped_title, 'unicode_escape')

     folder = '~column/%s' % format_path(self.item_words)
     config.warehouse(folder)
Beispiel #7
0
    def __init__(self, question_id):
        """Initialise the manager and resolve the question title.

        Fetches the ``question_meta`` resource, stores the first group of the
        ``QuestionManage/title_reg`` pattern found in the response text as
        ``self.title`` and passes ``'~question/<formatted title>'`` to
        ``config.warehouse``.
        """
        super(QuestionManage, self).__init__(question_id)
        meta_resp = self.get_network_data_package(
            'question_meta', self.item_id)

        title_pattern = config.get_setting('QuestionManage/title_reg')
        self.title = re.search(title_pattern, meta_resp.text).group(1)

        folder = '~question/%s' % format_path(self.title)
        config.warehouse(folder)
Beispiel #8
0
 def _make_link_card(self, tag):
     """Build a link-card tag from ``tag``.

     The card uses the tag's ``href`` and ``image`` attributes and its string
     as title. When the URL contains ``zhihu`` and no image is set, the
     ``Formatter/link_card_default_image`` setting is used as the image.
     """
     href = tag.get_attrs('href')
     picture = tag.get_attrs('image')

     needs_default_image = re.search('zhihu', href) and picture is None
     if needs_default_image:
         picture = config.get_setting('Formatter/link_card_default_image')

     return self.link_card(url=href, title=tag.string, img=picture)
Beispiel #9
0
 def get_network_data_package(self, item_name, item_id, **kwargs):
     """Fetch the resource identified by ``item_name``/``item_id``.

     :param item_name: key of the URL template handed to ``get_url``.
     :param item_id: identifier substituted into the URL.
     :param kwargs: extra URL parameters forwarded to ``get_url``.
     :return: the response object.
     :raises ValueError: if the built URL has no scheme.
     :raises VerityError: if the server answers with an HTTP error status.
     """
     url = self.get_url(item_name, item_id, **kwargs)
     # MissingSchema is raised while the request is being prepared, so the
     # request itself has to sit inside the try block for the handler to be
     # reachable.
     try:
         resp = self.get(url, timeout=30)
     except MissingSchema as err:
         raise ValueError('url error: ', item_name, item_id, kwargs) from err
     try:
         resp.raise_for_status()
     except HTTPError as err:
         raise VerityError(
             status_code=resp.status_code, url=resp.url) from err
     if config.get_setting('running/cached'):
         self.cached_network_data(resp, item_name, item_id, **kwargs)
     return resp
Beispiel #10
0
def format_file_name(suffix, *part_name):
    """Return the output path for a file named after ``part_name``.

    The parts are joined with ``-``, cleaned by ``format_path`` and placed in
    the folder returned by ``config.wh()``.

    :param suffix: file extension without the dot; ``None`` or ``''`` yields a
        name without extension.
    :param part_name: string fragments of the file name.
    :return: the plain path when the ``running/cover`` setting is falsy;
        otherwise the first of ``name``, ``name-1``, ``name-2``, ... (plus
        extension) that does not exist yet.
    """
    names = format_path('-'.join(part_name))
    # Build the extension once so that numbered candidates stay consistent
    # with the first one when there is no suffix (no trailing '.' or '.None').
    has_suffix = (suffix is not None) and (suffix != '')
    extension = '.%s' % suffix if has_suffix else ''

    file = os.path.join(config.wh(), names + extension)
    if not config.get_setting('running/cover'):
        return file

    repetition = 1
    while os.path.exists(file):
        file = os.path.join(config.wh(),
                            '%s-%d%s' % (names, repetition, extension))
        repetition += 1
    return file
Beispiel #11
0
    def formatter(self, meta, otp: Mushroom):
        """Format the collected tags and fill ``otp`` with the result.

        The tags in ``self.tag_list`` are run through ``self.format``. When
        ``self.reference_list`` is not empty, a heading span and the
        reference table are appended. The article title and text are then
        inserted into ``otp``, the configured ``head/style/<name>`` entry of
        every name in ``self.style_meta`` is appended to ``otp.stylesheets``,
        and ``otp.image_list`` is set to ``self.image_list``.

        :return: the same ``otp`` object.
        """
        body_tags = self.format(self.tag_list)

        if len(self.reference_list) != 0:
            heading = Tag(
                'span', attrs={'style': 'font-size:24px'}, string='参考资料')
            body_tags.append(heading)
            table = self.reference_table(self.reference_list)
            body_tags.append(table)

        title_tag = self.article_tile(meta)
        otp.insert_article_title(title_tag)
        text_tag = self.article_text(*body_tags)
        otp.insert_article_text(text_tag)

        for style_name in self.style_meta:
            style_entry = config.get_setting('head/style/%s' % style_name)
            otp.stylesheets.append(style_entry)
        otp.image_list = self.image_list

        return otp
Beispiel #12
0
class Crawler(API):
    """Fetch API resources through a ``requests`` session.

    Saved login cookies are loaded on construction when they are available.
    """

    UA = config.get_setting('Crawler/user-agent')

    def __init__(self):
        """Create the session and try to load the saved cookies."""
        super().__init__()
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': Crawler.UA})
        try:
            # A non-empty fromlist makes __import__ return the ``login``
            # submodule itself rather than the top-level package.
            mod = __import__('zhihu.spider.login', None, None, ['__all__'])
            ckf = getattr(mod, 'cookies_file')
            self.session.cookies = cookiejar.LWPCookieJar(filename=ckf)
            self.session.cookies.load()
        except (FileNotFoundError, ImportError, AttributeError):
            # Best effort: without the login module or a cookie file the
            # crawler simply runs without saved cookies.
            pass

    def __del__(self):
        # __init__ may have failed before the session was created.
        session = getattr(self, 'session', None)
        if session is not None:
            session.close()

    def get_network_data_package(self, item_name, item_id, **kwargs):
        """Fetch the resource identified by ``item_name``/``item_id``.

        :param item_name: key of the URL template handed to ``get_url``.
        :param item_id: identifier substituted into the URL.
        :param kwargs: extra URL parameters forwarded to ``get_url``.
        :return: the response object.
        :raises ValueError: if the built URL has no scheme.
        :raises VerityError: if the server answers with an HTTP error status.
        """
        url = self.get_url(item_name, item_id, **kwargs)
        # MissingSchema is raised while the request is being prepared, so the
        # request itself has to sit inside the try block for the handler to
        # be reachable.
        try:
            resp = self.session.get(url, timeout=30)
        except MissingSchema as err:
            raise ValueError(
                'url error: ', item_name, item_id, kwargs) from err
        try:
            resp.raise_for_status()
        except HTTPError as err:
            raise VerityError(
                status_code=resp.status_code, url=resp.url) from err
        if config.get_setting('running/cached'):
            self.cached_network_data(resp, item_name, item_id, **kwargs)
        return resp

    def download(self, url, **kwargs):
        """GET ``url`` with a 30 second timeout and return the response."""
        return self.session.get(url, timeout=30, **kwargs)

    @classmethod
    def cached_network_data(cls, data, item_name, item_id, **kwargs):
        """Write the raw response text to a JSON file in the cache folder.

        The file name is ``<item_name>-<item_id>-<ofs>.json`` where ``ofs`` is
        the ``offset`` keyword, else the ``page`` keyword, else a timestamp.

        :return: path of the written file.
        """
        # NOTE(review): a falsy offset/page (e.g. 0) falls through to the
        # next alternative -- confirm this is intended.
        ofs = kwargs.get('offset', None) or kwargs.get(
            'page', None) or timer.timestamp_str()
        file = os.path.join(config.cached_warehouse(),
                            '%s-%s-%s.json' % (item_name, item_id, ofs))
        with open(file, 'w', encoding='utf8') as foo:
            foo.write(data.text)
        return file
Beispiel #13
0
class API:
    """Builds request URLs from the templates stored in the configuration."""

    SORT_BY_DEF = config.get_setting('API/SORT_BY_DEF')
    SORT_BY_VOT = config.get_setting('API/SORT_BY_VOT')
    SORT_BY_DAT = config.get_setting('API/SORT_BY_DAT')
    PLATFORM = config.get_setting('API/PLATFORM')

    # URL templates keyed by item name; each is read from 'API/<item name>'.
    api = {
        entry: config.get_setting('API/%s' % entry)
        for entry in (
            'question',
            'question_meta',
            'answer',
            'article',
            'column',
            'column_meta',
            'answer_link',
            'article_link',
            'author_homepage',
            'user_answers',
            'user_articles',
            'user_meta',
            'collection',
            'collection_meta',
        )
    }

    @classmethod
    def get_url(cls, item_name, item_id, **kwargs):
        """Fill the URL template registered for ``item_name``.

        :param item_name: question, answer, column, ...
        :param item_id: question_id, answer_id, ...
        :param kwargs: offset, limit, sort_by; these override the defaults
            (offset 0, limit 20, sort_by ``SORT_BY_VOT``)
        :return: str, url; an unknown ``item_name`` formats the empty string
        """
        fields = {}
        fields['item_id'] = item_id
        fields['offset'] = 0
        fields['limit'] = 20
        fields['sort_by'] = cls.SORT_BY_VOT
        fields.update(kwargs)

        url_template = cls.api.get(item_name, '')
        return url_template.format(**fields)

    @classmethod
    def format_url(cls, item_name, **kwargs):
        """Same as ``get_url`` with ``item_id`` set to ``None``."""
        return cls.get_url(item_name, None, **kwargs)
Beispiel #14
0
 def get_network_data_package(self, item_name, item_id, **kwargs):
     """Fetch the resource for ``item_name``/``item_id`` and return it.

     The URL comes from ``get_url`` and is requested with a 30 second
     timeout. When the ``running/cached`` setting is truthy the response is
     also passed to ``cached_network_data``.
     """
     target_url = self.get_url(item_name, item_id, **kwargs)
     response = self.get(target_url, timeout=30)

     caching_enabled = config.get_setting('running/cached')
     if caching_enabled:
         self.cached_network_data(response, item_name, item_id, **kwargs)
     return response
Beispiel #15
0
class ZhihuAccount:
    """QR-code based login/logout helper that keeps its cookies on disk.

    Usable as a context manager: entering logs in, leaving logs out.
    """

    UA = config.get_setting('Crawler/user-agent')

    BASE_HEAD = {'Host': 'www.zhihu.com', 'User-Agent': UA}

    LOGIN_UP = 1  # logged in
    LOGIN_IN = 0  # login required

    def __init__(self):
        """Create the session and load previously saved cookies, if any."""
        self.session = requests.Session()
        self.session.cookies = cookiejar.LWPCookieJar(filename=cookies_file)
        try:
            self.session.cookies.load(ignore_discard=True)
        except FileNotFoundError:
            # No cookie file yet: start with an empty jar.
            pass

    def __del__(self):
        # Remove the temporary QR image written by __show_qr_code.
        try:
            os.remove(os.path.abspath('QR.jpg'))
        except FileNotFoundError:
            pass

    def login_up(self):
        """Log in unless already logged in; save the cookies on success."""
        if self.login_status() == ZhihuAccount.LOGIN_UP:
            print('已登录!')
        else:
            print('开始登录...')
            if self.__login():
                if self.login_status() == ZhihuAccount.LOGIN_UP:
                    # NOTE(review): cookies are loaded with
                    # ignore_discard=True but saved without it -- confirm
                    # that dropping session cookies here is intended.
                    self.session.cookies.save()
                    print('登录成功!')
                    return
            print('登录失败!')

    def login_out(self):
        """Request the logout URL and save the resulting cookies."""
        self.session.get('https://www.zhihu.com/logout',
                         headers=ZhihuAccount.BASE_HEAD,
                         allow_redirects=False)
        self.session.cookies.save()
        print('已退出!')

    def login_status(self):
        """Return ``LOGIN_UP`` or ``LOGIN_IN`` for the current session.

        A 302 answer to the sign-up page is treated as "logged in".
        """
        resp = self.session.get('https://www.zhihu.com/signup',
                                headers=ZhihuAccount.BASE_HEAD,
                                allow_redirects=False)

        if resp.status_code == 302:
            return ZhihuAccount.LOGIN_UP
        return ZhihuAccount.LOGIN_IN

    def __login(self):
        """Run the QR-code login flow.

        :return: ``True`` when the flow ran to its end (scan result, error
            payload or timeout), ``False`` when a request failed. The caller
            verifies the actual login state afterwards.
        """
        try:
            self.session.get("https://www.zhihu.com/signup?next=%2F",
                             headers=ZhihuAccount.BASE_HEAD)
            captcha_head = {"Referer": "https://www.zhihu.com/"}
            captcha_head.update(ZhihuAccount.BASE_HEAD)
            self.session.get(
                "https://www.zhihu.com/api/v3/oauth/captcha?lang=en",
                headers=captcha_head)

            resp = self.session.post("https://www.zhihu.com/udid",
                                     headers=ZhihuAccount.BASE_HEAD)
            token_head = {
                'Origin': 'https://www.zhihu.com',
                'Referer': 'https://www.zhihu.com/signup?next=%2F',
                'x-udid': resp.content.decode('utf8')
            }

            token_head.update(ZhihuAccount.BASE_HEAD)
            resp = self.session.post(
                "https://www.zhihu.com/api/v3/account/api/login/qrcode",
                headers=token_head)
            token = resp.json().get('token')

            qr = self.session.get(
                f'https://www.zhihu.com/api/v3/account/api/login/qrcode/{token}/image',
                headers=token_head)

            self.__show_qr_code(qr.content)

            print('操作系统已使用关联程序显示二维码,请使用知乎APP扫描。\n'
                  '小提示:知乎APP扫码特别慢,建议使用微信扫描,按屏幕提示继续操作也可登录。\n')

            # Give the user time to scan, then poll the scan state every two
            # seconds for at most 90 seconds.
            time.sleep(10)
            start = time.time()
            while True:
                rjs = self.session.get(
                    f'https://www.zhihu.com/api/v3/account/api/login/qrcode/{token}/scan_info',
                    headers=captcha_head).json()
                if rjs.get('user_id', None) or rjs.get(
                        'status', None) == 6 or rjs.get('error'):
                    break
                if time.time() - start >= 90:
                    print('登录超时!(<90s)')
                    break
                time.sleep(2)

            return True
        except RequestException:
            return False

    @staticmethod
    def __show_qr_code(image):
        """Save ``image`` as ``QR.jpg`` and open it with the system viewer.

        :param image: raw bytes of the QR-code picture.
        """
        # Imported here so the block does not depend on a file-level import.
        import subprocess

        image_file = os.path.abspath('QR.jpg')

        with open(image_file, 'wb') as foo:
            foo.write(image)

        system = platform.system()
        if system == 'Darwin':
            subprocess.call(['open', image_file])
        elif system == 'Linux':
            subprocess.call(['xdg-open', image_file])
        else:
            os.startfile(image_file)

    def __enter__(self):
        self.login_up()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.login_out()
Beispiel #16
0
 def template(cls, name):
     """Return the ``tag/<name>`` setting.

     A ``KeyError`` from the lookup is replaced by a ``KeyError`` whose
     message names the missing template.
     """
     try:
         found = config.get_setting('tag/%s' % name)
     except KeyError:
         raise KeyError('not find template named %s.' % name)
     else:
         return found