コード例 #1
0
    def __init__(self, question_id):
        """Set up the manager for one question and pick its output folder.

        Requests the ``'question_meta'`` data package for ``self.item_id``,
        extracts the title from the response text using the pattern stored
        under the ``QuestionManage/title_reg`` setting (first capture group),
        and passes ``'~question/<formatted title>'`` to ``config.warehouse``.

        :param question_id: identifier forwarded to the parent constructor.
        :raises AttributeError: if the title pattern does not match the
            response text (``re.search`` then returns ``None``).
        """
        super(QuestionManage, self).__init__(question_id)
        reply = self.get_network_data_package('question_meta', self.item_id)

        # Look the pattern up first, then search the page text with it.
        title_pattern = config.get_setting('QuestionManage/title_reg')
        title_match = re.search(title_pattern, reply.text)
        self.title = title_match.group(1)

        folder_name = format_path(self.title)
        config.warehouse('~question/%s' % folder_name)
コード例 #2
0
 def __init__(self, collection_id):
     """Set up the manager for one collection and pick its output folder.

     Requests the ``'collection_meta'`` data package for ``self.item_id``,
     decodes it as JSON and stores its ``'title'`` and ``'item_count'``
     entries on the instance (``None`` when a key is absent), then passes
     ``'~collection/<formatted title>'`` to ``config.warehouse``.

     :param collection_id: identifier forwarded to the parent constructor.
     """
     super(CollectionManage, self).__init__(collection_id)
     reply = self.get_network_data_package('collection_meta', self.item_id)
     meta = reply.json()

     self.title = meta.get('title')
     self.item_totals = meta.get('item_count')

     folder_name = format_path(self.title)
     config.warehouse('~collection/%s' % folder_name)
コード例 #3
0
 def __init__(self, column_id):
     """Set up the manager for one column and pick its output folder.

     Requests the ``'column_meta'`` data package for ``self.item_id``,
     extracts the first capture group of the ``ColumnManage/title_reg``
     pattern from the response text, decodes it with the
     ``unicode_escape`` codec into ``self.item_words`` and passes
     ``'~column/<formatted title>'`` to ``config.warehouse``.

     :param column_id: identifier forwarded to the parent constructor.
     :raises AttributeError: if the title pattern does not match the
         response text (``re.search`` then returns ``None``).
     """
     super(ColumnManage, self).__init__(column_id)
     reply = self.get_network_data_package('column_meta', self.item_id)

     title_pattern = config.get_setting('ColumnManage/title_reg')
     escaped_title = re.search(title_pattern, reply.text).group(1)
     # The captured text is presumably backslash-escaped (e.g. \uXXXX);
     # confirm against the actual page source.
     self.item_words = codecs.decode(escaped_title, 'unicode_escape')

     folder_name = format_path(self.item_words)
     config.warehouse('~column/%s' % folder_name)
コード例 #4
0
 def __init__(self, user_id):
     """Set up the articles manager for a user.

     Runs the parent constructor and then passes ``'~articles'`` to
     ``config.warehouse``.

     :param user_id: identifier forwarded to the parent constructor.
     """
     super(UserArticlesManage, self).__init__(user_id)
     articles_folder = '~articles'
     config.warehouse(articles_folder)
コード例 #5
0
 def __init__(self, user_id):
     """Set up the answers manager for a user.

     Runs the parent constructor and then passes ``'~answers'`` to
     ``config.warehouse``.

     :param user_id: identifier forwarded to the parent constructor.
     """
     super(UserAnswersManage, self).__init__(user_id)
     answers_folder = '~answers'
     config.warehouse(answers_folder)
コード例 #6
0
 def __init__(self, user_id):
     """Set up the user-meta manager and pick a per-user output folder.

     Requests the data package named by ``UserMetaManage.item_name`` for
     ``self.item_id``, stores the ``'name'`` entry of its JSON body in
     ``self.user_name`` (``None`` when absent), and passes
     ``config.wh() + '/' + <formatted user name>`` to ``config.warehouse``.

     :param user_id: identifier forwarded to the parent constructor.
     """
     super(UserMetaManage, self).__init__(user_id)
     reply = self.get_network_data_package(
         UserMetaManage.item_name, self.item_id)
     profile = reply.json()
     self.user_name = profile.get('name')

     # Build the folder path from the current warehouse value.
     user_folder = config.wh() + '/' + format_path(self.user_name)
     config.warehouse(user_folder)
コード例 #7
0
import zhihu.spider
from zhihu.conf import config

# ### Program settings (be sure to set the storage path) #### #

# The default storage path is the user's Documents folder; in a development
# environment it can be pointed at the desktop or any other path so that the
# results are easy to inspect.
config.warehouse(r'/home/{}/Documents'.format('??'))

# Runtime options, applied one by one in the order listed.
for option, value in (
        ('running/file_type', 0),
        ('running/cached', False),
        ('running/css_output', False),
        ('running/download_image', False),
        ('running/cover', False),
):
    config.setting(option, value)

# ### Start the spider #### #
zhihu.spider.start(r'https://www.zhihu.com/question/371430700')
コード例 #8
0
import zhihu.spider
from zhihu.conf import config

# ### Program settings (be sure to set the storage path) #### #

# The default storage path is the user's Documents folder; in a development
# environment it can be pointed at the desktop or any other path so that the
# results are easy to inspect.
config.warehouse(r'C:\Users\{}\Desktop'.format('86137'))

# Runtime options, applied one by one in the order listed.
for option, value in (
        ('running/file_type', 0),
        ('running/cached', False),
        ('running/css_output', False),
        ('running/download_image', False),
        ('running/cover', False),
):
    config.setting(option, value)

# ### Start the spider #### #
zhihu.spider.start(r'https://www.zhihu.com/question/371430700')
コード例 #9
0
def main():
    """Command-line entry point of the Zhihu spider.

    Parses the command line, collects the URLs given inline with ``-u``
    (separated by ``$`` or whitespace) and/or listed in the text file given
    with ``-r`` (whitespace separated, read as UTF-8 and then GBK), copies
    the selected options into ``config`` and calls ``zhihu.spider.start``
    once for every distinct URL, in the order the URLs were supplied.

    When the program is started without any argument the help text is shown.
    Every path handled here ends the process with ``sys.exit(0)``; argparse
    itself may exit earlier on invalid arguments.
    """
    if len(sys.argv) == 1:
        # No arguments at all: behave as if help had been requested.
        sys.argv.append('-h')

    parser = argparse.ArgumentParser(description='Zhihu Spider')

    parser.add_argument('-u', action='store', help='项目url,多个用"$"分割')
    parser.add_argument('-r', action='store', help='url文本文件,换行分割')
    parser.add_argument('-w',
                        action='store',
                        default=config.wh(),
                        help='文件保存位置')
    parser.add_argument('-f',
                        action='store',
                        default='html',
                        help='文件输出类型(html/markdown)')
    parser.add_argument('-cd', action='store_true', help='缓存原始数据')
    parser.add_argument('-cso', action='store_true', help='输出css文件')
    parser.add_argument('-dg', action='store_true', help='下载图片')
    parser.add_argument('--cover', action='store_true', help='覆盖同名文件')

    parser.add_argument('-v',
                        action='version',
                        version='%(prog)s {}'.format(zhihu.__version__))
    parser.add_argument('-version',
                        action='version',
                        version='%(prog)s {}'.format(zhihu.__version__))

    args = parser.parse_args()

    if args.u is None and args.r is None:
        print('请输入url!')
        sys.exit(0)

    urls = list()

    if args.u is not None:
        urls.extend(re.split(r'[\s$]+', args.u))

    if args.r is not None:
        read_succeed = False
        # Try each supported encoding in turn; the first one that decodes
        # the whole file wins.
        for enc in ('utf8', 'gbk'):
            try:
                with open(args.r, 'r', encoding=enc) as foo:
                    urls.extend(re.split(r'\s+', foo.read()))
                read_succeed = True
                break
            except UnicodeError:
                # Wrong encoding (UnicodeDecodeError is a UnicodeError);
                # fall through to the next candidate.
                pass
            except FileNotFoundError:
                print('url文件不存在(%s),请提供正确路径!' % args.r)
                sys.exit(0)

        if not read_succeed:
            print('无法读取文件,请提供UTF-8或GBK编码的文本文件!')
            sys.exit(0)

    # Drop duplicates and the empty strings re.split can produce, keeping
    # first-seen order so the crawl order is the same on every run (a plain
    # set would iterate in an arbitrary, per-process order).
    urls = [url for url in dict.fromkeys(urls) if url]

    file_type = {'html': 0, 'md': 1, 'markdown': 1}

    config.warehouse(args.w)
    # Unknown output types fall back to 0, the value mapped to 'html'.
    config.setting('running/file_type', file_type.get(args.f, 0))
    config.setting('running/cached', args.cd)
    config.setting('running/css_output', args.cso)
    config.setting('running/download_image', args.dg)
    config.setting('running/cover', args.cover)

    for url in urls:
        zhihu.spider.start(url)
    sys.exit(0)
コード例 #10
0
import zhihu.spider
from zhihu.conf import config

# Output location for this run.
config.warehouse(r'C:\Users\Milloy\Desktop')

# Runtime options, applied one by one in the order listed.
for option, value in (
        ('running/file_type', 0),
        ('running/cached', False),
        ('running/css_output', False),
        ('running/download_image', False),
        ('running/cover', False),
):
    config.setting(option, value)

# Start the spider on a single answer.
zhihu.spider.start(
    r'https://www.zhihu.com/question/311008958/answer/592584375')

# Sample URLs kept for manual testing:
# gif pic https://www.zhihu.com/question/59392068/answer/541759976
# pics https://www.zhihu.com/question/311008958/answer/592584375
コード例 #11
0
def main():
    """Command-line entry point of the Zhihu spider (with login support).

    Parses the command line, collects the URLs given inline with ``-u``
    (separated by ``$`` or whitespace) and/or listed in the text file given
    with ``-r`` (whitespace separated, read as UTF-8 and then GBK), copies
    the selected options into ``config`` and calls ``zhihu.spider.start``
    once for every distinct URL, in the order the URLs were supplied.

    With ``--login`` or ``--login-long`` a ``login.ZhihuAccount`` is created
    and ``login_up()`` is called before crawling; with ``--login`` the
    account's ``login_out()`` is called afterwards, even when a crawl raises.

    Every path handled here ends the process with ``sys.exit(0)`` unless an
    exception propagates; argparse itself may exit earlier on invalid
    arguments.
    """
    # Help is handled manually below, so argparse's built-in -h is disabled.
    parser = argparse.ArgumentParser(description='Zhihu Spider',
                                     add_help=False)

    parser.add_argument('-u', action='store', help='项目url,多个用"$"分割')
    parser.add_argument('-r', action='store', help='url文本文件,换行分割')
    parser.add_argument('-w',
                        action='store',
                        default=config.wh(),
                        help='文件保存位置')
    parser.add_argument('-f',
                        action='store',
                        default='html',
                        help='文件输出类型(html/markdown)')
    parser.add_argument('-cd', action='store_true', help='缓存原始数据')
    parser.add_argument('-cso', action='store_true', help='输出css文件')
    parser.add_argument('-dg', action='store_true', help='下载图片')
    parser.add_argument('-cv', '--cover', action='store_true', help='覆盖同名文件')
    parser.add_argument('-log',
                        '--login',
                        action='store_true',
                        help='模拟登录知乎,可能解决网络问题(当次有效)')
    parser.add_argument('-log2',
                        '--login-long',
                        action='store_true',
                        help='模拟登录知乎,可能解决网络问题(长期有效)')

    parser.add_argument('-v', '--version', action='store_true', help='版本信息')
    parser.add_argument('-h', '--help', action='store_true', help='帮助')

    args = parser.parse_args()

    if args.help:
        parser.print_help()
        sys.exit(0)
    if args.version:
        print('zhihu %s 本地化收藏知乎优质内容' % zhihu.__version__)
        sys.exit(0)

    wants_login = args.login or args.login_long

    # Logging in only (or a temporary login that is logged out again) is a
    # valid invocation without any URL; otherwise at least one URL source is
    # required.
    if not wants_login and args.u is None and args.r is None:
        print('请输入url!')
        sys.exit(0)

    urls = list()

    if args.u is not None:
        urls.extend(re.split(r'[\s$]+', args.u))

    if args.r is not None:
        read_succeed = False
        # Try each supported encoding in turn; the first one that decodes
        # the whole file wins.
        for enc in ('utf8', 'gbk'):
            try:
                with open(args.r, 'r', encoding=enc) as foo:
                    urls.extend(re.split(r'\s+', foo.read()))
                read_succeed = True
                break
            except UnicodeError:
                # Wrong encoding (UnicodeDecodeError is a UnicodeError);
                # fall through to the next candidate.
                pass
            except FileNotFoundError:
                print('url文件不存在(%s),请提供正确路径!' % args.r)
                sys.exit(0)

        if not read_succeed:
            print('无法读取文件,请提供UTF-8或GBK编码的文本文件!')
            sys.exit(0)

    # Drop duplicates and the empty strings re.split can produce, keeping
    # first-seen order so the crawl order is the same on every run (a plain
    # set would iterate in an arbitrary, per-process order).
    urls = [url for url in dict.fromkeys(urls) if url]

    file_type = {'html': 0, 'md': 1, 'markdown': 1}

    config.warehouse(args.w)
    # Unknown output types fall back to 0, the value mapped to 'html'.
    config.setting('running/file_type', file_type.get(args.f, 0))
    config.setting('running/cached', args.cd)
    config.setting('running/css_output', args.cso)
    config.setting('running/download_image', args.dg)
    config.setting('running/cover', args.cover)

    acc = None

    if wants_login:
        acc = login.ZhihuAccount()
        acc.login_up()

    try:
        for url in urls:
            zhihu.spider.start(url)
    finally:
        # --login is documented as valid for this run only, so the logout
        # must happen even if one of the crawls fails part-way through.
        if args.login:
            acc.login_out()

    sys.exit(0)