Example #1
def fetch_using_store():
    """
    Crawl using the proxy IPs already in the store.
    """
    proxies = query_proxies()
    bad_proxies = set()

    def job(url, proxy):
        res = safe_http(url,
                        proxies={
                            'https': 'https://{}'.format(proxy),
                            'http': 'http://{}'.format(proxy)
                        },
                        want_obj=True,
                        timeout=15)
        if res is not None:
            check_redirect(res)
            store_movie(url, res.text, res.status_code)
        else:
            bad_proxies.add(proxy)

    missing = query_missing()

    pool = Pool(100)  # pool of up to 100 concurrent workers (gevent-style spawn/join)
    while len(missing) > 0:
        for proxy in proxies:
            if len(missing) == 0:
                break

            # pick a random pending mid and remove it from the list
            index = random.randint(0, len(missing) - 1)
            int_mid = missing.pop(index)

            url = '{:s}/movie/{:s}'.format(home_url, int2mid(int_mid))
            pool.spawn(job, url, proxy)
        pool.join()

        # drop proxies that turned out to be unusable
        proxies -= bad_proxies
        log('[Proxy] Clear {:d} bad proxies. Available {:d}'.format(
            len(bad_proxies), len(proxies)))
        bad_proxies.clear()
        if len(proxies) < 80:
            proxies = query_proxies()  # re-query the proxy store

        # once the pending list is empty, re-query for movies still
        # missing from the database (earlier requests may have failed
        # with 403s, timeouts, etc.)
        if len(missing) == 0:
            missing = query_missing()
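
The snippet leans on a safe_http helper that is not shown on this page. A minimal sketch of what it plausibly looks like, assuming it wraps requests.get and signals failure by returning None (the parameter names come from the call above; the body itself is an assumption):

import requests

def safe_http(url, proxies=None, want_obj=False, timeout=10):
    # Hypothetical sketch: issue a GET through the given proxies and
    # swallow network errors so the caller can mark the proxy as bad.
    try:
        res = requests.get(url, proxies=proxies, timeout=timeout)
        return res if want_obj else res.text
    except requests.RequestException:
        return None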
Example #2
def query_missing():
    """
    Get the list of movies that have not been crawled yet.
    :return: mids as decimal integers
    """
    cursor = collection.find(filter=None, projection={'mid': True, '_id': False})
    movies = [mid2int(document['mid']) for document in cursor]
    cursor.close()

    available = set(range(1, mid2int(max_mid) + 1))
    instore = set(movies)

    missing = list(available - instore)
    log('[Missing] {:d} items'.format(len(missing)))

    return missing
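
mid2int and int2mid are defined elsewhere in the module. The CLI help in Example #4 shows a sample mid of "5f20", which looks like a lowercase hex string; a sketch under that assumption (the site's actual encoding may differ):

def mid2int(mid):
    # assumption: a mid such as '5f20' is a lowercase hex string
    return int(mid, 16)

def int2mid(value):
    # inverse of mid2int under the same assumption
    return format(value, 'x')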
Example #3
def store_movie(url, source, code=None):
    """
    Store the movie information in MongoDB.
    :param url: movie URL
    :param source: page source for that URL
    :param code: HTTP status code
    """

    document = get_movie(url, source, code)

    if document is not None:
        try:
            result = collection.insert_one(document)
            log('[Mongodb] store document {:s}'.format(str(result.inserted_id)))
        except pymongo.errors.DuplicateKeyError:
            log('[Mongodb] Already exists')
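
Catching pymongo.errors.DuplicateKeyError only works if the collection enforces uniqueness, presumably via a unique index. A one-time setup along these lines would do it (keying on 'mid' is an assumption):

collection.create_index('mid', unique=True)  # reject documents with a duplicate 'mid'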
Beispiel #7
0
                        help='Home URL of the site. Like "https://avmo.pw')

    parser.add_argument('--col',
                        dest='collection',
                        action='store',
                        required=True,
                        choices=["avmoo", "avsox", "avmemo"],
                        help='Mongodb collection name.')

    args = parser.parse_args()

    if args.logging:
        enable_logger()

    max_mid = args.mid if args.mid else get_latest()
    home_url = args.site + '/cn'

    if args.collection == 'avmoo':
        collection = db.avmoo
    elif args.collection == 'avmemo':
        collection = db.avmemo
    else:
        collection = db.avsox

    if args.test:
        fetch_when_test()
    else:
        fetch_using_store()

    log('job done')
Example #4
# NOTE: the original snippet begins mid-way through building the parser.
# Everything down to the '--mid' help string is a reconstruction: the
# flag names are inferred from args.mid, args.logging and args.test
# used below, and are an assumption.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--log', dest='logging', action='store_true',
                    help='Enable logging.')
parser.add_argument('--test', dest='test', action='store_true',
                    help='Run in test mode.')
parser.add_argument('--mid', dest='mid', action='store', type=str,
                    help='The max mid among the movies on the site. Like "5f20"')

    parser.add_argument('--site', dest='site', action='store', required=True, type=str,
                        help='Home URL of the site. Like "https://avmo.pw"')

    parser.add_argument('--col', dest='collection', action='store', required=True,
                        choices=["avmoo", "avsox", "avmemo"],
                        help='Mongodb collection name.')

    args = parser.parse_args()

    if args.logging:
        enable_logger()

    max_mid = args.mid if args.mid else get_latest()
    home_url = args.site + '/cn'

    if args.collection == 'avmoo':
        collection = db.avmoo
    elif args.collection == 'avmemo':
        collection = db.avmemo
    else:
        collection = db.avsox

    if args.test:
        fetch_when_test()
    else:
        fetch_using_store()

    log('job done')
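
Given the flags above (plus the reconstructed --log, --test and --mid flags, which are assumptions), a typical invocation might look like the following, with a hypothetical script name:

python crawler.py --site "https://avmo.pw" --col avmoo --log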