import random

from gevent.pool import Pool  # Pool(100).spawn()/.join() matches gevent's pool API


def fetch_using_store():
    """Crawl using the proxy IPs already collected."""
    proxies = query_proxies()
    bad_proxies = set()

    def job(url, proxy):
        res = safe_http(url, proxies={
            'https': 'https://{}'.format(proxy),
            'http': 'http://{}'.format(proxy)
        }, want_obj=True, timeout=15)
        if res is not None:
            check_redirect(res)
            store_movie(url, res.text, res.status_code)
        else:
            bad_proxies.add(proxy)

    missing = query_missing()
    pool = Pool(100)
    while len(missing) > 0:
        for proxy in proxies:
            if len(missing) == 0:
                break
            # Pick a random mid and remove it from the to-crawl list.
            index = random.randint(0, len(missing) - 1)
            int_mid = missing.pop(index)
            url = '{:s}/movie/{:s}'.format(home_url, int2mid(int_mid))
            pool.spawn(job, url, proxy)
        pool.join()

        # Drop the proxies that failed in this round.
        proxies -= bad_proxies
        log('[Proxy] Clear {:d} bad proxies. Available {:d}'.format(
            len(bad_proxies), len(proxies)))
        bad_proxies.clear()
        if len(proxies) < 80:
            proxies = query_proxies()  # refresh the proxy pool

        # Once the to-crawl list is drained, re-query for the movies that
        # still never made it into the database (earlier 403s, timeouts, etc.).
        if len(missing) == 0:
            missing = query_missing()
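safe_http is defined elsewhere in the module and not shown in this excerpt. From the call above it must accept a proxies mapping, a want_obj flag, and a timeout, return the response object when want_obj=True, and return None on any request failure. A minimal sketch under those assumptions (the real helper's retry and header logic is unknown) could look like:

import requests


def safe_http(url, proxies=None, want_obj=False, timeout=10, **kwargs):
    """Hypothetical sketch: wrap requests.get and swallow network errors."""
    try:
        res = requests.get(url, proxies=proxies, timeout=timeout, **kwargs)
    except requests.RequestException:  # timeouts, connection errors, bad proxies
        return None
    return res if want_obj else res.text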
def query_missing():
    """Get the list of movies that have not been crawled yet.

    :return: mids in decimal (integer) form
    """
    cursor = collection.find(filter=None,
                             projection={'mid': True, '_id': False})
    movies = [mid2int(document['mid']) for document in cursor]
    cursor.close()
    available = set(range(1, mid2int(max_mid) + 1))
    instore = set(movies)
    missing = list(available - instore)
    log('[Missing] {:d} items'.format(len(missing)))
    return missing
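mid2int and int2mid are likewise defined elsewhere. If, as the "5f20" example in the CLI help below suggests, mids are lowercase hexadecimal strings, a plausible pair would be (this radix is an assumption, not confirmed by the excerpt):

def mid2int(mid):
    """Hypothetical: parse a hex mid string such as '5f20' into an int."""
    return int(mid, 16)


def int2mid(n):
    """Hypothetical: format an int back into the site's lowercase hex mid."""
    return format(n, 'x')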
import pymongo


def store_movie(url, source, code=None):
    """Store the movie's info in MongoDB.

    :param url: movie URL
    :param source: page source fetched from that URL
    :param code: HTTP status code
    """
    document = get_movie(url, source, code)
    if document is not None:
        try:
            result = collection.insert_one(document)
            log('[Mongodb] store document {:s}'.format(str(result.inserted_id)))
        except pymongo.errors.DuplicateKeyError:
            log('[Mongodb] Already exists')
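Catching DuplicateKeyError only does anything if the collection enforces a unique index, presumably on mid. That one-time setup is not shown in the excerpt; it would look something like:

# Assumed one-time setup: make re-inserting the same movie raise
# DuplicateKeyError instead of silently storing a duplicate document.
collection.create_index('mid', unique=True)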
# The argument-parser setup above this point is truncated in the excerpt;
# the opening of the '--mid' argument is inferred from `args.mid` below.
parser.add_argument('--mid', dest='mid', action='store',
                    help='The max mid among the movies on the site. Like "5f20"')
parser.add_argument('--site', dest='site', action='store', required=True,
                    type=str,
                    help='Home URL of the site. Like "https://avmo.pw"')
parser.add_argument('--col', dest='collection', action='store', required=True,
                    choices=["avmoo", "avsox", "avmemo"],
                    help='Mongodb collection name.')
args = parser.parse_args()

if args.logging:
    enable_logger()
max_mid = args.mid if args.mid else get_latest()
home_url = args.site + '/cn'

if args.collection == 'avmoo':
    collection = db.avmoo
elif args.collection == 'avmemo':
    collection = db.avmemo
else:
    collection = db.avsox

if args.test:
    fetch_when_test()
else:
    fetch_using_store()
log('job done')
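Assuming the module is saved as crawler.py (the filename is hypothetical; --site and --col come straight from the excerpt, --mid from the reconstruction above), a typical invocation would be:

python crawler.py --site https://avmo.pw --col avmoo --mid 5f20

Omitting --mid makes the script fall back to get_latest() to discover the newest mid on the site, per the `max_mid = args.mid if args.mid else get_latest()` line.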