Example #1
0
def format_file_name(suffix, *part_name):
    """Return the file path to write inside the warehouse directory.

    *part_name* fragments are joined with '-' and sanitised via
    ``format_path``; *suffix* is appended as the extension when given.

    :param suffix: file extension without the dot; may be None or ''.
    :param part_name: one or more name fragments, joined with '-'.
    :return: path under ``config.wh()`` for the file.
    """
    names = format_path('-'.join(part_name))
    has_suffix = (suffix is not None) and (suffix != '')
    if has_suffix:
        file = os.path.join(config.wh(), '%s.%s' % (names, suffix))
    else:
        file = os.path.join(config.wh(), names)
    # NOTE(review): when 'running/cover' is unset the (possibly existing)
    # path is returned as-is, and when it IS set a numeric counter is
    # appended instead of overwriting.  That looks inverted relative to
    # the '--cover 覆盖同名文件' CLI flag — confirm intended semantics.
    if not config.get_setting('running/cover'):
        return file

    repetition = 1
    while os.path.exists(file):
        if has_suffix:
            candidate = '%s-%d.%s' % (names, repetition, suffix)
        else:
            # Bug fix: the old loop always used the '%s-%d.%s' format,
            # yielding names like 'foo-1.None' when no suffix was given.
            candidate = '%s-%d' % (names, repetition)
        file = os.path.join(config.wh(), candidate)
        repetition += 1
    return file
Example #2
0
 def download_image(cls, doc):
     """Download every image referenced by *doc* into <warehouse>/image.

     :param doc: object exposing an ``image_list`` iterable of image URLs.
     """
     cra = Crawler()
     # Hoisted out of the loop: the target directory never changes, and
     # exist_ok avoids the separate exists()/makedirs() race.
     path = os.path.join(config.wh(), 'image')
     os.makedirs(path, exist_ok=True)
     for image_url in doc.image_list:
         file_name = os.path.basename(image_url)
         with open(os.path.join(path, file_name), 'wb') as foo:
             foo.write(cra.download(image_url).content)
         print(file_name)
Example #3
0
 def download_image(cls, doc):
     """Download every image referenced by *doc* into <warehouse>/image,
     printing a '<cls.index>-<n>' counter next to each saved file name.

     :param doc: object exposing an ``image_list`` iterable of image URLs.
     """
     cra = Crawler()
     # Hoisted out of the loop: the target directory never changes, and
     # exist_ok avoids the separate exists()/makedirs() race.
     path = os.path.join(config.wh(), 'image')
     os.makedirs(path, exist_ok=True)
     # enumerate replaces the hand-rolled 1-based counter.
     for index, image_url in enumerate(doc.image_list, start=1):
         file_name = os.path.basename(image_url)
         with open(os.path.join(path, file_name), 'wb') as foo:
             foo.write(cra.download(image_url).content)
         print('{:<8}\t{}'.format(
             str(cls.index) + '-' + str(index), file_name))
Example #4
0
def start_with_id(item_id, item_type):
    """Dispatch *item_id* to the handler registered for *item_type*,
    then report the warehouse directory."""
    handler = load_function(item_type)
    handler(item_id)
    print('保存目录:%s' % config.wh())
Example #5
0
 def __init__(self, user_id):
     """Fetch the user's display name over the network and point the
     warehouse at a per-user sub-directory named after it."""
     super(UserMetaManage, self).__init__(user_id)
     response = self.get_network_data_package(UserMetaManage.item_name,
                                              self.item_id)
     self.user_name = response.json().get('name')
     sub_dir = format_path(self.user_name)
     config.warehouse('%s/%s' % (config.wh(), sub_dir))
Example #6
0
def main():
    """CLI entry point: parse arguments, collect urls, configure the
    spider settings and crawl each url."""
    if len(sys.argv) == 1:
        # No arguments at all: behave as if '-h' had been requested.
        sys.argv.append('-h')

    parser = argparse.ArgumentParser(description='Zhihu Spider')

    parser.add_argument('-u', action='store', help='项目url,多个用"$"分割')
    parser.add_argument('-r', action='store', help='url文本文件,换行分割')
    parser.add_argument('-w',
                        action='store',
                        default=config.wh(),
                        help='文件保存位置')
    parser.add_argument('-f',
                        action='store',
                        default='html',
                        help='文件输出类型(html/markdown)')
    parser.add_argument('-cd', action='store_true', help='缓存原始数据')
    parser.add_argument('-cso', action='store_true', help='输出css文件')
    parser.add_argument('-dg', action='store_true', help='下载图片')
    parser.add_argument('--cover', action='store_true', help='覆盖同名文件')

    parser.add_argument('-v',
                        action='version',
                        version='%(prog)s {}'.format(zhihu.__version__))
    parser.add_argument('-version',
                        action='version',
                        version='%(prog)s {}'.format(zhihu.__version__))

    args = parser.parse_args()

    if args.u is None and args.r is None:
        print('请输入url!')
        sys.exit(0)

    urls = list()

    if args.u is not None:
        # Urls on the command line may be separated by '$' or whitespace.
        urls.extend(re.split(r'[\s$]+', args.u))

    if args.r is not None:
        # Try the common encodings in order; a wrong one raises UnicodeError.
        read_succeed = False
        for enc in ('utf8', 'gbk'):
            try:
                with open(args.r, 'r', encoding=enc) as foo:
                    urls.extend(re.split(r'\s+', foo.read()))
                read_succeed = True
                break
            except (UnicodeError, UnicodeDecodeError):
                pass
            except FileNotFoundError:
                print('url文件不存在(%s),请提供正确路径!' % args.r)
                sys.exit(0)

        if not read_succeed:
            print('无法读取文件,请提供UTF-8或GBK编码的文本文件!')
            sys.exit(0)

    # Deduplicate and drop the empty tokens the splitting may leave behind.
    urls = set(urls)
    urls.discard('')

    file_type = {'html': 0, 'md': 1, 'markdown': 1}

    config.warehouse(args.w)
    # Unknown -f values silently fall back to html (0).
    config.setting('running/file_type', file_type.get(args.f, 0))
    config.setting('running/cached', args.cd)
    config.setting('running/css_output', args.cso)
    config.setting('running/download_image', args.dg)
    config.setting('running/cover', args.cover)

    for url in urls:
        zhihu.spider.start(url)
    sys.exit(0)
Example #7
0
def main():
    """CLI entry point: parse arguments, collect urls, configure the
    spider settings, optionally log in to zhihu, and crawl each url."""
    parser = argparse.ArgumentParser(description='Zhihu Spider',
                                     add_help=False)

    parser.add_argument('-u', action='store', help='项目url,多个用"$"分割')
    parser.add_argument('-r', action='store', help='url文本文件,换行分割')
    parser.add_argument('-w',
                        action='store',
                        default=config.wh(),
                        help='文件保存位置')
    parser.add_argument('-f',
                        action='store',
                        default='html',
                        help='文件输出类型(html/markdown)')
    parser.add_argument('-cd', action='store_true', help='缓存原始数据')
    parser.add_argument('-cso', action='store_true', help='输出css文件')
    parser.add_argument('-dg', action='store_true', help='下载图片')
    parser.add_argument('-cv', '--cover', action='store_true', help='覆盖同名文件')
    parser.add_argument('-log',
                        '--login',
                        action='store_true',
                        help='模拟登录知乎,可能解决网络问题(当次有效)')
    parser.add_argument('-log2',
                        '--login-long',
                        action='store_true',
                        help='模拟登录知乎,可能解决网络问题(长期有效)')

    parser.add_argument('-v', '--version', action='store_true', help='版本信息')
    parser.add_argument('-h', '--help', action='store_true', help='帮助')

    args = parser.parse_args()

    if args.help:
        parser.print_help()
        sys.exit(0)
    if args.version:
        print('zhihu %s 本地化收藏知乎优质内容' % zhihu.__version__)
        sys.exit(0)

    if args.login or args.login_long:
        # 仅登录账号或临时登录以退出账号
        # Login-only invocations are allowed to run without any url.
        pass
    elif args.u is None and args.r is None:
        print('请输入url!')
        sys.exit(0)

    urls = list()

    if args.u is not None:
        # Urls on the command line may be separated by '$' or whitespace.
        urls.extend(re.split(r'[\s$]+', args.u))

    if args.r is not None:
        # Try the common encodings in order; a wrong one raises UnicodeError.
        read_succeed = False
        for enc in ('utf8', 'gbk'):
            try:
                with open(args.r, 'r', encoding=enc) as foo:
                    urls.extend(re.split(r'\s+', foo.read()))
                read_succeed = True
                break
            except (UnicodeError, UnicodeDecodeError):
                pass
            except FileNotFoundError:
                print('url文件不存在(%s),请提供正确路径!' % args.r)
                sys.exit(0)

        if not read_succeed:
            print('无法读取文件,请提供UTF-8或GBK编码的文本文件!')
            sys.exit(0)

    # Deduplicate and drop the empty tokens the splitting may leave behind.
    urls = set(urls)
    urls.discard('')

    file_type = {'html': 0, 'md': 1, 'markdown': 1}

    config.warehouse(args.w)
    # Unknown -f values silently fall back to html (0).
    config.setting('running/file_type', file_type.get(args.f, 0))
    config.setting('running/cached', args.cd)
    config.setting('running/css_output', args.cso)
    config.setting('running/download_image', args.dg)
    config.setting('running/cover', args.cover)

    acc = None

    if args.login or args.login_long:
        acc = login.ZhihuAccount()
        acc.login_up()

    for url in urls:
        zhihu.spider.start(url)

    # Only the temporary '-log' session is logged out afterwards;
    # '-log2' sessions are kept alive on purpose (长期有效).
    if args.login:
        acc.login_out()

    sys.exit(0)