def get_proxy():
    if PROXY:
        if not is_url(PROXY):
            return gen_proxy(PROXY)
    if is_url(PROXY_POOL):
        p = requests.get(PROXY_POOL).text.strip('\r\n')
        return gen_proxy(p)
    if os.path.isfile(PROXY_POOL):
        with open(PROXY_POOL, 'r') as f:
            p_txt = f.readlines()
        return random.choice([gen_proxy(i) for i in p_txt])
    if isinstance(PROXY_POOL, list):
        return random.choice([gen_proxy(i) for i in PROXY_POOL])
    return None
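# Several snippets in this collection call a gen_proxy() helper that is not shown here.
# Below is a minimal sketch of one plausible implementation, assuming it receives a
# "host:port" string and returns a requests-style proxies mapping; this is only an
# illustration, and other variants in this collection pass extra arguments
# (e.g. a download type), so the real helper's signature may differ.
def gen_proxy(proxy_str):
    proxy_str = proxy_str.strip()  # drop trailing newlines left over from readlines()
    return {
        "http": f"http://{proxy_str}",
        "https": f"http://{proxy_str}",
    }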
async def add(self, requests: typing.Union[Request, typing.List[Request]]):
    """
    Add one or more requests to the queue.
    @param requests: a single Request or a list of Requests
    """
    if not isinstance(requests, list):
        requests = [requests]
    # Validate each request URL
    for item in requests:
        if not is_url(item.url):
            raise InvalidUrl(f"Invalid url: {item.url}")
    # Deduplicate
    request_list = []
    for request in requests:
        if request.dont_filter or await self.dupefilter.get(request.fp):
            request_list.append(request)
    # Nothing left after deduplication
    if not request_list:
        return 0
    # Put the remaining requests into the queue
    set_len = await self.scheduler_queue.add(request_list)
    # Record the fingerprints of the enqueued requests in the dupefilter
    for request in requests:
        if not request.dont_filter:
            await self.dupefilter.add(request.fp)
    return set_len
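# The add() coroutine above relies on a dupefilter exposing async get()/add() methods
# that are not part of this snippet. A minimal in-memory sketch, assuming get(fp)
# returns True when the fingerprint has not been seen yet (so the request is kept)
# and add(fp) records it; the real filter may be backed by Redis or another store.
class MemoryDupeFilter:
    def __init__(self):
        self._seen = set()

    async def get(self, fp):
        # True means "not a duplicate, keep this request"
        return fp not in self._seen

    async def add(self, fp):
        self._seen.add(fp)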
def run(self, args, opts):
    if len(args) != 1 or not is_url(args[0]):
        raise UsageError()
    request = Request(
        args[0],
        callback=self._print_response,
        cb_kwargs={"opts": opts},
        dont_filter=True,
    )
    # by default, let the framework handle redirects,
    # i.e. the command handles all codes except 3xx
    if not opts.no_redirect:
        request.meta["handle_httpstatus_list"] = SequenceExclude(range(300, 400))
    else:
        request.meta["handle_httpstatus_all"] = True

    spidercls = DefaultSpider
    spider_loader = self.crawler_process.spider_loader
    if opts.spider:
        spidercls = spider_loader.load(opts.spider)
    else:
        spidercls = spidercls_for_request(spider_loader, request, spidercls)
    self.crawler_process.crawl(spidercls, start_requests=lambda: [request])
    self.crawler_process.start()
def run(self, args, opts):
    # Validate arguments
    if len(args) != 1 or not is_url(args[0]):
        raise UsageError()
    # Callback that prints the response
    cb = lambda x: self._print_response(x, opts)
    # Build the initial Request object
    request = Request(args[0], callback=cb, dont_filter=True)
    # by default, let the framework handle redirects,
    # i.e. the command handles all codes except 3xx
    # If --no-redirect is not set, the list of handled status codes
    # includes everything except 300-399
    if not opts.no_redirect:
        request.meta['handle_httpstatus_list'] = SequenceExclude(range(300, 400))
    else:
        # Otherwise all status codes are handled and redirects are left to the HTTP library
        request.meta['handle_httpstatus_all'] = True
    # Default to the built-in minimal spider
    spidercls = DefaultSpider
    # Get the spider loader
    spider_loader = self.crawler_process.spider_loader
    # If a spider was specified, use it; otherwise pick the spider that matches the request URL
    if opts.spider:
        spidercls = spider_loader.load(opts.spider)
    else:
        spidercls = spidercls_for_request(spider_loader, request, spidercls)
    # By default the built-in DefaultSpider (scrapy.utils.spider.DefaultSpider) fetches
    # the given URL; only start_requests needs to be supplied
    self.crawler_process.crawl(spidercls, start_requests=lambda: [request])
    # Start the crawl
    self.crawler_process.start()
def test_proxy_pool(self, url):
    if not PROXY:
        if is_url(PROXY_POOL):
            return
        if PROXY_POOL_RAW and os.path.isfile(PROXY_POOL_RAW):
            with open(PROXY_POOL_RAW, 'r') as f:
                p = f.readlines()
            fobj = open(PROXY_POOL, 'a')
            print(f'* A text-file proxy pool is configured; checking every proxy IP. Test url: {url}')
            print(f'* The text proxy pool contains {len(p)} proxy IPs')
            ok_count = 0
            lock = threading.Lock()
            while p:
                ps = [p.pop(0) for i in range(min(PROXY_TEST_MAX, len(p)))]
                print(f'* Checking {len(ps)} proxies, {len(p)} left in the text proxy pool')
                ts = []
                for i in ps:
                    a = CrawlThread(fetch, args=(self.session, url, i, fobj, lock))
                    ts.append(a)
                for i in ts:
                    i.start()
                for i in ts:
                    i.join()
                    if i.get_result():
                        ok_count += 1
            fobj.close()
            print(f'>> All proxies checked. {ok_count} usable proxies in total.')
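# test_proxy_pool() above uses a CrawlThread helper with a get_result() method that is
# not shown in this collection. A minimal sketch of one plausible implementation,
# assuming it simply runs the target callable and stores its return value so callers
# can read it after join(); the actual class may differ.
import threading

class CrawlThread(threading.Thread):
    def __init__(self, target, args=()):
        super().__init__()
        self._target_fn = target
        self._args = args
        self._result = None

    def run(self):
        # Capture the target's return value for later retrieval
        self._result = self._target_fn(*self._args)

    def get_result(self):
        return self._result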
def _split_urls_and_names(self, args):
    urls = []
    names = []
    for arg in args:
        if is_url(arg):
            urls.append(arg)
        else:
            names.append(arg)
    return urls, names
def run(self, args, opts):
    # parse arguments
    if not len(args) == 1 or not is_url(args[0]):
        raise UsageError()
    else:
        url = args[0]

    # prepare spidercls
    self.set_spidercls(url, opts)

    if self.spidercls and opts.depth > 0:
        self.start_parsing(url, opts)
        self.print_results(opts)
def run(self, args, opts):
    # parse arguments
    if not len(args) == 1 or not is_url(args[0]):
        raise UsageError()
    else:
        url = args[0]

    # prepare spider
    self.pcrawler = self.crawler_process.create_crawler()
    self.set_spider(url, opts)

    if self.spider and opts.depth > 0:
        self.start_parsing(url, opts)
        self.print_results(opts)
def run(self, args, opts):
    if not len(args) == 1 or not is_url(args[0]):
        raise UsageError()

    response, spider = self.get_response_and_spider(args[0], opts)
    if not response:
        return

    callback = None
    if opts.callback:
        callback = opts.callback
    elif opts.rules:
        callback = self.get_callback_from_rules(spider, response)

    items, requests = self.run_callback(spider, response, callback or 'parse', opts)
    self.print_results(items, requests, callback, opts)
def run(self, args, opts):
    if len(args) != 1 or not is_url(args[0]):
        raise UsageError()
    cb = lambda x: self._print_response(x, opts)
    request = Request(args[0], callback=cb, dont_filter=True)
    request.meta['handle_httpstatus_all'] = True

    spidercls = DefaultSpider
    spiders = self.crawler_process.spiders
    if opts.spider:
        spidercls = spiders.load(opts.spider)
    else:
        spidercls = spidercls_for_request(spiders, request, spidercls)
    self.crawler_process.crawl(spidercls, start_requests=lambda: [request])
    self.crawler_process.start()
def get_proxy(self, req):
    http_proxy = req.spider.settings.HTTP_PROXY
    if http_proxy:
        if is_proxy_valid(http_proxy):
            proxy = gen_proxy(http_proxy, req.down_type)
            return proxy
        elif is_url(http_proxy):
            return http_proxy
        else:
            if not req.spider.settings.HTTP_PROXY_FILL_ENABLE:
                self.logger.debug(f'Invalid proxy format: {http_proxy}')
                return
            _proxy = self.get_proxy_by_api(req)
            proxy = gen_proxy(_proxy, req.down_type)
            return proxy
def get_proxy():
    if PROXY:
        if not is_url(PROXY):
            return gen_proxy(PROXY)
    if is_url(PROXY_POOL):
        # Adapt this to the proxy_pool API's response format
        p = requests.get(PROXY_POOL).json().get('proxy')
        return gen_proxy(p)
    if os.path.isfile(PROXY_POOL):
        with open(PROXY_POOL, 'r') as f:
            p_txt = f.readlines()
        return random.choice([gen_proxy(i) for i in p_txt])
    if isinstance(PROXY_POOL, list):
        return random.choice([gen_proxy(i) for i in PROXY_POOL])
    if PROXY_HTTP_TUNNEL:
        # Proxy server
        proxyHost = "http-dyn.abuyun.com"
        proxyPort = "9020"
        # Proxy tunnel credentials
        proxyUser = "******"
        proxyPass = "******"
        proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
            "host": proxyHost,
            "port": proxyPort,
            "user": proxyUser,
            "pass": proxyPass,
        }
        proxies = {
            "http": proxyMeta,
            "https": proxyMeta,
        }
        return proxies
    return None
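# Example of how the mapping returned by get_proxy() is typically consumed: the
# requests library accepts a proxies dict of the {"http": ..., "https": ...} shape
# shown in the tunnel branch above. This assumes gen_proxy() returns a mapping of the
# same shape; the target URL is only illustrative.
import requests

proxies = get_proxy()
if proxies:
    resp = requests.get("https://httpbin.org/ip", proxies=proxies, timeout=10)
    print(resp.text)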
def run(self, args, opts):
    if len(args) != 1 or not is_url(args[0]):
        raise UsageError()
    cb = lambda x: self._print_response(x, opts)
    request = Request(args[0], callback=cb, dont_filter=True)
    request.meta['handle_httpstatus_all'] = True

    spider = None
    if opts.spider:
        spider = self.crawler.spiders.create(opts.spider)
    else:
        spider = create_spider_for_request(self.crawler.spiders, request,
                                           default_spider=BaseSpider('default'))
    self.crawler.crawl(spider, [request])
    self.crawler.start()
def get_proxy(api=PROXY_API):
    if not is_url(api):
        if os.path.isfile(api):
            with open(api, 'r') as f:
                p_txt = f.readlines()
            return random.choice([gen_proxy(i) for i in p_txt])
        if isinstance(api, list):
            return random.choice([gen_proxy(i) for i in api])
        return gen_proxy(api)
    else:
        while 1:
            p = requests.get(api).text.strip('\r\n').strip()
            if not is_proxy_valid(p):
                continue
            return gen_proxy(p)
def run(self, args, opts):
    if len(args) != 1 or not is_url(args[0]):
        raise UsageError()
    cb = lambda x: self._print_response(x, opts)
    request = Request(args[0], callback=cb, dont_filter=True)

    spider = None
    if opts.spider:
        try:
            spider = self.crawler.spiders.create(opts.spider)
        except KeyError:
            log.msg("Could not find spider: %s" % opts.spider, log.ERROR)

    self.crawler.queue.append_request(request, spider,
                                      default_spider=BaseSpider('default'))
    self.crawler.start()
def run(self, args, opts):
    # parse arguments
    if not len(args) == 1 or not is_url(args[0]):
        raise UsageError()
    else:
        url = args[0]

    # prepare spidercls
    self.set_spidercls(url, opts)

    if self.spidercls and opts.depth > 0:
        start_status = self.start_parsing(url, opts)
        if start_status == FATAL_ERROR:
            return start_status
        self.print_results(opts)
        results = self.get_results(opts)
        print(len(results))
        return results
def run(self, args, opts):
    if len(args) != 1 or not is_url(args[0]):
        raise UsageError()
    cb = lambda x: self._print_response(x, opts)
    request = Request(args[0], callback=cb, dont_filter=True)
    # by default, let the framework handle redirects,
    # i.e. the command handles all codes except 3xx
    if not opts.no_redirect:
        request.meta['handle_httpstatus_list'] = SequenceExclude(range(300, 400))
    else:
        request.meta['handle_httpstatus_all'] = True

    spidercls = DefaultSpider
    spider_loader = self.crawler_process.spider_loader
    if opts.spider:
        spidercls = spider_loader.load(opts.spider)
    else:
        spidercls = spidercls_for_request(spider_loader, request, spidercls)
    self.crawler_process.crawl(spidercls, start_requests=lambda: [request])
    self.crawler_process.start()
def run(self, args, opts):
    ## NLP parser settings
    self.parser_server = self.settings['PARSER_RPC_SERVER']
    # For standalone rpc server (msgpack-rpc)
    self.parser_msgpack_host = self.settings['PARSER_MSGPACK_HOST']
    self.parser_msgpack_port = self.settings['PARSER_MSGPACK_PORT']
    # For distributed rpc server (json-rpc)
    self.parser_jsonrpc_url = self.settings['PARSER_PROXY_URL']

    if len(args) != 1 or not is_url(args[0]):
        raise UsageError()
    cb = lambda x: self._print_response(x, opts)
    request = Request(args[0], callback=cb, dont_filter=True)
    request.meta['handle_httpstatus_all'] = True

    spidercls = DefaultSpider
    spider_loader = self.crawler_process.spider_loader
    if opts.spider:
        spidercls = spider_loader.load(opts.spider)
    else:
        spidercls = spidercls_for_request(spider_loader, request, spidercls)
    self.crawler_process.crawl(spidercls, start_requests=lambda: [request])
    self.crawler_process.start()
def url(self, url):
    # Only URLs whose scheme is file, http, or https are considered valid
    if not urltool.is_url(url):
        raise ValueError('Not a valid url for Request.')
    else:
        self.__url = urltool.safe_download_url(url)
def test_is_url(self): self.assertTrue(is_url("http://www.example.org")) self.assertTrue(is_url("https://www.example.org")) self.assertTrue(is_url("file:///some/path")) self.assertFalse(is_url("foo://bar")) self.assertFalse(is_url("foo--bar"))
def test_is_url(self):
    self.assertTrue(is_url('http://www.example.org'))
    self.assertTrue(is_url('https://www.example.org'))
    self.assertTrue(is_url('file:///some/path'))
    self.assertFalse(is_url('foo://bar'))
    self.assertFalse(is_url('foo--bar'))
def is_valid_url(self, url):
    return is_url(url) and " " not in url
def __init__(self, parser):
    self.parser = parser
    self.args = parser.parse_args()
    if not inside_project():
        self.error("No active Scrapy project")
    self.command = self.args.command
    self.spider = sanitize_module_name(self.args.spider) if self.args.spider else None
    try:
        self.callback = self.args.callback
    except AttributeError:
        self.callback = None
    try:
        self.fixture = self.args.fixture
    except AttributeError:
        self.fixture = None
    if self.command == 'update':
        try:
            self.new = self.args.new
        except AttributeError:
            self.new = None
        try:
            self.dynamic = self.args.dynamic
        except AttributeError:
            self.dynamic = None
    if self.command == 'clear':
        self.fixtures = self.args.fixtures.split(',')
    if self.fixture and not self.callback:
        self.error("Can't specify a fixture without a callback")

    self.project_dir, self.project_name = get_project_dirs()
    sys.path.append(self.project_dir)
    self.settings = get_project_settings()

    if self.command == "parse":
        url_list = [url.strip() for url in self.args.urls.split('|')]
        for url in url_list:
            if not is_url(url):
                self.error("Something went wrong with your urls arg! "
                           "Note that as of version 1.0, the character for separating "
                           "multiple urls is '|', as opposed to ','")
        self.args = process_options(self.args)
        crawler_process = CrawlerProcess(self.settings)
        run_command(crawler_process, url_list, self.args)
    else:
        self.base_path = self.settings.get(
            'TESTMASTER_BASE_PATH',
            default=os.path.join(self.project_dir, 'testmaster'))
        self.tests_dir = os.path.join(self.base_path, 'tests')

        self.spider_dir = os.path.join(self.tests_dir, self.spider)
        if not os.path.isdir(self.spider_dir) and self.command != "establish":
            self.error(
                "No recorded data found "
                "for spider '{}'".format(self.spider))

        self.extra_path = self.settings.get('TESTMASTER_EXTRA_PATH') or ''
        if self.callback:
            self.callback_dir = os.path.join(
                self.spider_dir, self.extra_path, self.callback)
            if self.command == 'establish':
                if os.path.isdir(self.callback_dir):
                    self.error(
                        "Can't use 'establish' with callback arg "
                        "if callback dir for spider '{}' "
                        "exists already".format(self.spider))
            else:
                if self.command == 'inspect':
                    self.error(
                        "No recorded data found for callback "
                        "'{}' from '{}' spider".format(self.callback, self.spider))
            if self.fixture:
                self.fixture_path = os.path.join(self.callback_dir,
                                                 self.parse_fixture_arg())
                if not os.path.isfile(self.fixture_path):
                    self.error("Fixture '{}' not found".format(self.fixture_path))
def _check_uri(self, attribute, value):
    try:
        parse_data_uri(value)
    except ValueError as e:
        if not is_url(value):
            raise ValueError("uri must be a URL or a data URI") from e