def __init__(self, question_id):
    """Initialize a question task: fetch question metadata, extract the
    title from the page text, and point output at a '~question/<title>' folder.
    """
    super(QuestionManage, self).__init__(question_id)
    meta_response = self.get_network_data_package('question_meta', self.item_id)
    # The title is scraped out of the raw page with a configured regex.
    title_pattern = config.get_setting('QuestionManage/title_reg')
    self.title = re.search(title_pattern, meta_response.text).group(1)
    config.warehouse('~question/%s' % format_path(self.title))
def __init__(self, collection_id):
    """Initialize a collection task: load collection metadata (title and
    item count) from the JSON API and point output at '~collection/<title>'.
    """
    super(CollectionManage, self).__init__(collection_id)
    meta = self.get_network_data_package('collection_meta', self.item_id).json()
    self.title = meta.get('title')
    self.item_totals = meta.get('item_count')
    config.warehouse('~collection/%s' % format_path(self.title))
def __init__(self, column_id):
    """Initialize a column task: scrape the column title out of the page,
    un-escape it, and point output at '~column/<title>'.
    """
    super(ColumnManage, self).__init__(column_id)
    meta_resp = self.get_network_data_package('column_meta', self.item_id)
    raw_title = re.search(
        config.get_setting('ColumnManage/title_reg'), meta_resp.text).group(1)
    # The page embeds the title as a unicode-escaped string (\uXXXX);
    # decode it back into readable text before using it as a path.
    self.item_words = codecs.decode(raw_title, 'unicode_escape')
    config.warehouse('~column/%s' % format_path(self.item_words))
def __init__(self, user_id):
    """Initialize a user-articles task and point output at the '~articles' folder."""
    super(UserArticlesManage, self).__init__(user_id)
    config.warehouse('~articles')
def __init__(self, user_id):
    """Initialize a user-answers task and point output at the '~answers' folder."""
    super(UserAnswersManage, self).__init__(user_id)
    config.warehouse('~answers')
def __init__(self, user_id):
    """Initialize a user-meta task: fetch the user's profile and nest the
    output folder under the (path-sanitized) user name.
    """
    super(UserMetaManage, self).__init__(user_id)
    profile = self.get_network_data_package(
        UserMetaManage.item_name, self.item_id).json()
    self.user_name = profile.get('name')
    # Append the user name to the current warehouse path.
    config.warehouse(config.wh() + '/' + format_path(self.user_name))
import os

import zhihu.spider
from zhihu.conf import config

# ### Program settings (the storage path MUST be set) ### #
# Default storage is the user's Documents folder. Resolve the home directory
# dynamically instead of hard-coding a username: the original
# '/home/{}/Documents'.format('??') left a '??' placeholder, producing a
# nonexistent path on every machine.
config.warehouse(os.path.join(os.path.expanduser('~'), 'Documents'))
config.setting('running/file_type', 0)          # 0 = html output
config.setting('running/cached', False)         # do not cache raw data
config.setting('running/css_output', False)     # do not emit a css file
config.setting('running/download_image', False) # do not download images
config.setting('running/cover', False)          # do not overwrite same-name files

# ### Start the spider ### #
zhihu.spider.start(r'https://www.zhihu.com/question/371430700')
import os

import zhihu.spider
from zhihu.conf import config

# ### Program settings (the storage path MUST be set) ### #
# Default storage is the user's Desktop. Resolve the home directory
# dynamically instead of hard-coding the Windows username ('86137' in the
# original), so the script works on any machine.
config.warehouse(os.path.join(os.path.expanduser('~'), 'Desktop'))
config.setting('running/file_type', 0)          # 0 = html output
config.setting('running/cached', False)         # do not cache raw data
config.setting('running/css_output', False)     # do not emit a css file
config.setting('running/download_image', False) # do not download images
config.setting('running/cover', False)          # do not overwrite same-name files

# ### Start the spider ### #
zhihu.spider.start(r'https://www.zhihu.com/question/371430700')
def main():
    """Command-line entry point: parse arguments, apply the run settings,
    and start the spider on every url given via -u and/or -r.

    Always terminates the process with ``sys.exit(0)``.
    """
    if len(sys.argv) == 1:
        # No arguments at all: show help instead of doing nothing.
        sys.argv.append('-h')
    parser = argparse.ArgumentParser(description='Zhihu Spider')
    parser.add_argument('-u', action='store', help='项目url,多个用"$"分割')
    parser.add_argument('-r', action='store', help='url文本文件,换行分割')
    parser.add_argument('-w', action='store', default=config.wh(), help='文件保存位置')
    parser.add_argument('-f', action='store', default='html', help='文件输出类型(html/markdown)')
    parser.add_argument('-cd', action='store_true', help='缓存原始数据')
    parser.add_argument('-cso', action='store_true', help='输出css文件')
    parser.add_argument('-dg', action='store_true', help='下载图片')
    parser.add_argument('--cover', action='store_true', help='覆盖同名文件')
    parser.add_argument('-v', action='version',
                        version='%(prog)s {}'.format(zhihu.__version__))
    parser.add_argument('-version', action='version',
                        version='%(prog)s {}'.format(zhihu.__version__))
    args = parser.parse_args()
    if args.u is None and args.r is None:
        print('请输入url!')
        sys.exit(0)
    urls = list()
    if args.u is not None:
        # Urls on the command line are separated by whitespace or '$'.
        urls.extend(re.split(r'[\s$]+', args.u))
    if args.r is not None:
        urls.extend(_read_url_file(args.r))
    # Deduplicate and drop empty tokens produced by splitting.
    urls = set(urls)
    urls.discard('')
    file_type = {'html': 0, 'md': 1, 'markdown': 1}
    config.warehouse(args.w)
    config.setting('running/file_type', file_type.get(args.f, 0))
    config.setting('running/cached', args.cd)
    config.setting('running/css_output', args.cso)
    config.setting('running/download_image', args.dg)
    config.setting('running/cover', args.cover)
    for url in urls:
        zhihu.spider.start(url)
    sys.exit(0)


def _read_url_file(path):
    """Return whitespace-separated urls read from *path*.

    Tries UTF-8 first, then GBK; prints a message and exits the process if
    the file is missing or uses neither encoding.
    """
    for enc in ('utf8', 'gbk'):
        try:
            with open(path, 'r', encoding=enc) as foo:
                return re.split(r'\s+', foo.read())
        except (UnicodeError, UnicodeDecodeError):
            # Wrong encoding guess — try the next one.
            continue
        except FileNotFoundError:
            print('url文件不存在(%s),请提供正确路径!' % path)
            sys.exit(0)
    print('无法读取文件,请提供UTF-8或GBK编码的文本文件!')
    sys.exit(0)
import zhihu.spider
from zhihu.conf import config

# Development driver: fixed output path on the developer's desktop.
config.warehouse(r'C:\Users\Milloy\Desktop')

# Runtime switches: plain html output, all optional extras disabled.
for key, value in (
        ('running/file_type', 0),
        ('running/cached', False),
        ('running/css_output', False),
        ('running/download_image', False),
        ('running/cover', False)):
    config.setting(key, value)

zhihu.spider.start(
    r'https://www.zhihu.com/question/311008958/answer/592584375')
# gif pic https://www.zhihu.com/question/59392068/answer/541759976
# pics https://www.zhihu.com/question/311008958/answer/592584375
def main():
    """Command-line entry point (login-capable variant): parse arguments,
    optionally log in to Zhihu, apply the run settings, and start the spider
    on every url given via -u and/or -r.

    Always terminates the process with ``sys.exit(0)``.
    """
    parser = argparse.ArgumentParser(description='Zhihu Spider', add_help=False)
    parser.add_argument('-u', action='store', help='项目url,多个用"$"分割')
    parser.add_argument('-r', action='store', help='url文本文件,换行分割')
    parser.add_argument('-w', action='store', default=config.wh(), help='文件保存位置')
    parser.add_argument('-f', action='store', default='html', help='文件输出类型(html/markdown)')
    parser.add_argument('-cd', action='store_true', help='缓存原始数据')
    parser.add_argument('-cso', action='store_true', help='输出css文件')
    parser.add_argument('-dg', action='store_true', help='下载图片')
    parser.add_argument('-cv', '--cover', action='store_true', help='覆盖同名文件')
    parser.add_argument('-log', '--login', action='store_true',
                        help='模拟登录知乎,可能解决网络问题(当次有效)')
    parser.add_argument('-log2', '--login-long', action='store_true',
                        help='模拟登录知乎,可能解决网络问题(长期有效)')
    parser.add_argument('-v', '--version', action='store_true', help='版本信息')
    parser.add_argument('-h', '--help', action='store_true', help='帮助')
    args = parser.parse_args()
    if args.help:
        parser.print_help()
        sys.exit(0)
    if args.version:
        print('zhihu %s 本地化收藏知乎优质内容' % zhihu.__version__)
        sys.exit(0)
    if args.login or args.login_long:
        # A login-only run is allowed without any url (e.g. to refresh the
        # session), so do not require -u/-r in that case.
        pass
    elif args.u is None and args.r is None:
        print('请输入url!')
        sys.exit(0)
    urls = list()
    if args.u is not None:
        # Urls on the command line are separated by whitespace or '$'.
        urls.extend(re.split(r'[\s$]+', args.u))
    if args.r is not None:
        urls.extend(_load_urls(args.r))
    # Deduplicate and drop empty tokens produced by splitting.
    urls = set(urls)
    urls.discard('')
    file_type = {'html': 0, 'md': 1, 'markdown': 1}
    config.warehouse(args.w)
    config.setting('running/file_type', file_type.get(args.f, 0))
    config.setting('running/cached', args.cd)
    config.setting('running/css_output', args.cso)
    config.setting('running/download_image', args.dg)
    config.setting('running/cover', args.cover)
    acc = None
    if args.login or args.login_long:
        acc = login.ZhihuAccount()
        acc.login_up()
    for url in urls:
        zhihu.spider.start(url)
    if args.login:
        # -log is a one-shot login: sign out again when the run finishes
        # (-log2 keeps the session for later runs).
        acc.login_out()
    sys.exit(0)


def _load_urls(path):
    """Return whitespace-separated urls read from *path*.

    Tries UTF-8 first, then GBK; prints a message and exits the process if
    the file is missing or uses neither encoding.
    """
    for enc in ('utf8', 'gbk'):
        try:
            with open(path, 'r', encoding=enc) as foo:
                return re.split(r'\s+', foo.read())
        except (UnicodeError, UnicodeDecodeError):
            # Wrong encoding guess — try the next one.
            continue
        except FileNotFoundError:
            print('url文件不存在(%s),请提供正确路径!' % path)
            sys.exit(0)
    print('无法读取文件,请提供UTF-8或GBK编码的文本文件!')
    sys.exit(0)