Code Example #1
File: proxy.py Project: 2311870923/pythonCrawler
def get_proxy():
    if PROXY:
        if not is_url(PROXY):
            return gen_proxy(PROXY)
    if is_url(PROXY_POOL):
        p = requests.get(PROXY_POOL).text.strip('\r\n')
        return gen_proxy(p)
    if os.path.isfile(PROXY_POOL):
        with open(PROXY_POOL, 'r') as f:
            p_txt = f.readlines()
        return random.choice([gen_proxy(i) for i in p_txt])
    if isinstance(PROXY_POOL, list):
        return random.choice([gen_proxy(i) for i in PROXY_POOL])
    return None
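Note: gen_proxy is not part of this excerpt. A hypothetical sketch, assuming it turns a "host:port" line into the proxies mapping that requests expects:

def gen_proxy(proxy):
    # Hypothetical helper (assumption): strip surrounding whitespace/newlines
    # from a "host:port" line and wrap it into a requests-style proxies dict.
    proxy = proxy.strip()
    return {"http": f"http://{proxy}", "https": f"http://{proxy}"}

# e.g. gen_proxy("127.0.0.1:8080\n") -> {"http": "http://127.0.0.1:8080", ...}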
Code Example #2
File: scheduler.py Project: hyq-python/hoopa
    async def add(self, requests: typing.Union[Request, typing.List[Request]]):
        """
        向队列添加多个request
        @param requests:
        """
        if not isinstance(requests, list):
            requests = [requests]

        # Check that every URL is valid
        for item in requests:
            if not is_url(item.url):
                raise InvalidUrl(f"Invalid url: {item.url} ")

        # Deduplicate
        request_list = []
        for request in requests:
            if request.dont_filter or await self.dupefilter.get(request.fp):
                request_list.append(request)

        # Nothing left after deduplication
        if not request_list:
            return 0

        # Push the remaining requests into the queue
        set_len = await self.scheduler_queue.add(request_list)

        # Record the fingerprints of the enqueued requests in the dupefilter
        for request in requests:
            if not request.dont_filter:
                await self.dupefilter.add(request.fp)

        return set_len
Code Example #3
    def run(self, args, opts):
        if len(args) != 1 or not is_url(args[0]):
            raise UsageError()
        request = Request(
            args[0],
            callback=self._print_response,
            cb_kwargs={"opts": opts},
            dont_filter=True,
        )
        # by default, let the framework handle redirects,
        # i.e. the command handles all status codes except 3xx
        if not opts.no_redirect:
            request.meta["handle_httpstatus_list"] = SequenceExclude(
                range(300, 400))
        else:
            request.meta["handle_httpstatus_all"] = True

        spidercls = DefaultSpider
        spider_loader = self.crawler_process.spider_loader
        if opts.spider:
            spidercls = spider_loader.load(opts.spider)
        else:
            spidercls = spidercls_for_request(spider_loader, request,
                                              spidercls)
        self.crawler_process.crawl(spidercls, start_requests=lambda: [request])
        self.crawler_process.start()
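Several of these fetch-command snippets use SequenceExclude to express "every status code except 3xx". A minimal sketch of that behaviour, modelled on scrapy.utils.datatypes.SequenceExclude (an assumption about its internals, not copied from the source):

class SequenceExclude:
    """Membership test that holds for everything NOT in the wrapped sequence."""

    def __init__(self, seq):
        self.seq = seq

    def __contains__(self, item):
        return item not in self.seq

codes = SequenceExclude(range(300, 400))
print(200 in codes)  # True  -> handled by the command's callback
print(302 in codes)  # False -> left to the redirect middleware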
Code Example #4
File: fetch.py Project: kongshuaifu/debug_scrapy
 def run(self, args, opts):
     # Validate the arguments
     if len(args) != 1 or not is_url(args[0]):
         raise UsageError()
     # Define the callback that prints the response
     cb = lambda x: self._print_response(x, opts)
     # Build the Request object
     request = Request(args[0], callback=cb, dont_filter=True)
     # by default, let the framework handle redirects,
     # i.e. the command handles all status codes except 3xx
     # if --no-redirect is not set, the handled status list excludes everything in the 300-399 range
     if not opts.no_redirect:
         request.meta['handle_httpstatus_list'] = SequenceExclude(
             range(300, 400))
     else:
         # otherwise every status code, including 3xx, is passed to the callback
         request.meta['handle_httpstatus_all'] = True
     # Default to the built-in minimal spider
     spidercls = DefaultSpider
     # Get the spider loader
     spider_loader = self.crawler_process.spider_loader
     # If a spider was specified, load it; otherwise pick the spider that matches the request URL
     if opts.spider:
         spidercls = spider_loader.load(opts.spider)
     else:
         spidercls = spidercls_for_request(spider_loader, request,
                                           spidercls)
     # By default the built-in DefaultSpider (scrapy.utils.spider.DefaultSpider) fetches the given URL; only start_requests needs to be passed
     self.crawler_process.crawl(spidercls, start_requests=lambda: [request])
     # Start the crawl
     self.crawler_process.start()
Code Example #5
File: http.py Project: LXF-DX3906/DPspider
def test_proxy_pool(self, url):
    if not PROXY:
        if is_url(PROXY_POOL):
            return
        if PROXY_POOL_RAW and \
            os.path.isfile(PROXY_POOL_RAW):
            with open(PROXY_POOL_RAW, 'r') as f:
                p = f.readlines()
            fobj = open(PROXY_POOL, 'a')
            print(f'* A text-based proxy IP pool is configured; checking every proxy. Test url: {url}')
            print(f'* Proxies in the text pool: {len(p)}')
            _ = 0
            lock = threading.Lock()
            while p:
                ps = [p.pop(0) for i in range(min(PROXY_TEST_MAX, len(p)))]
                print(f'* Checking {len(ps)} proxies, {len(p)} left in the text pool')
                ts = []
                for i in ps:
                    a = CrawlThread(fetch,
                                    args=(self.session, url, i, fobj, lock))
                    ts.append(a)
                for i in ts:
                    i.start()
                for i in ts:
                    i.join()
                    if i.get_result():
                        _ += 1
            fobj.close()
            print(f'>> Finished checking all proxies. {_} of them passed.')
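CrawlThread and fetch are not shown in this excerpt. A hypothetical sketch of the thread wrapper, assuming it only captures the target's return value so that get_result() can be read after join():

import threading

class CrawlThread(threading.Thread):
    # Hypothetical wrapper (assumption): run the target and keep its return
    # value so the caller can count successful proxy checks after join().
    def __init__(self, target, args=()):
        super().__init__()
        self._target_fn = target
        self._args = args
        self._result = None

    def run(self):
        self._result = self._target_fn(*self._args)

    def get_result(self):
        return self._result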
Code Example #6
File: crawl.py Project: bihicheng/scrapy
 def _split_urls_and_names(self, args):
     urls = []
     names = []
     for arg in args:
         if is_url(arg):
             urls.append(arg)
         else:
             names.append(arg)
     return urls, names
Code Example #7
File: crawl.py Project: ilustreous/scrapy
 def _split_urls_and_names(self, args):
     urls = []
     names = []
     for arg in args:
         if is_url(arg):
             urls.append(arg)
         else:
             names.append(arg)
     return urls, names
Code Example #8
File: parse.py Project: 247DigitalGroup/scrapy
    def run(self, args, opts):
        # parse arguments
        if not len(args) == 1 or not is_url(args[0]):
            raise UsageError()
        else:
            url = args[0]

        # prepare spidercls
        self.set_spidercls(url, opts)

        if self.spidercls and opts.depth > 0:
            self.start_parsing(url, opts)
            self.print_results(opts)
Code Example #9
File: parse.py Project: sbe710/web-crawler
    def run(self, args, opts):
        # parse arguments
        if not len(args) == 1 or not is_url(args[0]):
            raise UsageError()
        else:
            url = args[0]

        # prepare spidercls
        self.set_spidercls(url, opts)

        if self.spidercls and opts.depth > 0:
            self.start_parsing(url, opts)
            self.print_results(opts)
Code Example #10
    def run(self, args, opts):
        # parse arguments
        if not len(args) == 1 or not is_url(args[0]):
            raise UsageError()
        else:
            url = args[0]

        # prepare spider
        self.pcrawler = self.crawler_process.create_crawler()
        self.set_spider(url, opts)

        if self.spider and opts.depth > 0:
            self.start_parsing(url, opts)
            self.print_results(opts)
Code Example #11
File: parse.py Project: Aaron1011/oh-mainline
 def run(self, args, opts):
     if not len(args) == 1 or not is_url(args[0]):
         raise UsageError()
     response, spider = self.get_response_and_spider(args[0], opts)
     if not response:
         return
     callback = None
     if opts.callback:
         callback = opts.callback
     elif opts.rules:
         callback = self.get_callback_from_rules(spider, response)
     items, requests = self.run_callback(spider, response, callback or 'parse', \
         opts)
     self.print_results(items, requests, callback, opts)
Code Example #12
File: parse.py Project: 0xfab/scrapy
    def run(self, args, opts):
        # parse arguments
        if not len(args) == 1 or not is_url(args[0]):
            raise UsageError()
        else:
            url = args[0]

        # prepare spider
        self.pcrawler = self.crawler_process.create_crawler()
        self.set_spider(url, opts)

        if self.spider and opts.depth > 0:
            self.start_parsing(url, opts)
            self.print_results(opts)
Code Example #13
File: parse.py Project: ilustreous/scrapy
 def run(self, args, opts):
     if not len(args) == 1 or not is_url(args[0]):
         raise UsageError()
     response, spider = self.get_response_and_spider(args[0], opts)
     if not response:
         return
     callback = None
     if opts.callback:
         callback = opts.callback
     elif opts.rules:
         callback = self.get_callback_from_rules(spider, response)
     items, requests = self.run_callback(spider, response, callback or 'parse', \
         opts)
     self.print_results(items, requests, callback, opts)
Code Example #14
File: fetch.py Project: jtwaleson/scrapy
    def run(self, args, opts):
        if len(args) != 1 or not is_url(args[0]):
            raise UsageError()
        cb = lambda x: self._print_response(x, opts)
        request = Request(args[0], callback=cb, dont_filter=True)
        request.meta['handle_httpstatus_all'] = True

        spidercls = DefaultSpider
        spiders = self.crawler_process.spiders
        if opts.spider:
            spidercls = spiders.load(opts.spider)
        else:
            spidercls = spidercls_for_request(spiders, request, spidercls)
        self.crawler_process.crawl(spidercls, start_requests=lambda: [request])
        self.crawler_process.start()
Code Example #15
    def run(self, args, opts):
        if len(args) != 1 or not is_url(args[0]):
            raise UsageError()
        cb = lambda x: self._print_response(x, opts)
        request = Request(args[0], callback=cb, dont_filter=True)
        request.meta['handle_httpstatus_all'] = True

        spidercls = DefaultSpider
        spiders = self.crawler_process.spiders
        if opts.spider:
            spidercls = spiders.load(opts.spider)
        else:
            spidercls = spidercls_for_request(spiders, request, spidercls)
        self.crawler_process.crawl(spidercls, start_requests=lambda: [request])
        self.crawler_process.start()
Code Example #16
File: HttpProxy.py Project: zhangzhaolei/Amipy
 def get_proxy(self, req):
     http_proxy = req.spider.settings.HTTP_PROXY
     if http_proxy:
         if is_proxy_valid(http_proxy):
             proxy = gen_proxy(http_proxy, req.down_type)
             return proxy
         elif is_url(http_proxy):
             return http_proxy
         else:
             if not req.spider.settings.HTTP_PROXY_FILL_ENABLE:
                 self.logger.debug(f'Invalid proxy format:{http_proxy}')
                 return
     _proxy = self.get_proxy_by_api(req)
     proxy = gen_proxy(_proxy, req.down_type)
     return proxy
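is_proxy_valid is not shown here. A hypothetical sketch, assuming it accepts a bare "host:port" string (optionally with a scheme prefix) as a directly usable proxy address:

import re

def is_proxy_valid(proxy):
    # Hypothetical check (assumption): "1.2.3.4:8080" or "http://1.2.3.4:8080"
    # counts as a proxy address, anything else does not.
    return bool(re.match(r"^(\w+://)?[\w.\-]+:\d{2,5}$", proxy.strip()))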
Code Example #17
def get_proxy():
    if PROXY:
        if not is_url(PROXY):
            return gen_proxy(PROXY)
    if is_url(PROXY_POOL):
        # Adjust this to match the response format of your proxy_pool API
        p = requests.get(PROXY_POOL).json().get('proxy')
        return gen_proxy(p)
    if os.path.isfile(PROXY_POOL):
        with open(PROXY_POOL,'r') as f:
            p_txt = f.readlines()
        return random.choice([gen_proxy(i) for i in p_txt])
    if isinstance(PROXY_POOL,list):
        return random.choice([gen_proxy(i) for i in PROXY_POOL])
    if PROXY_HTTP_TUNNEL:
        # Proxy server
        proxyHost = "http-dyn.abuyun.com"
        proxyPort = "9020"

        # Proxy tunnel credentials
        proxyUser = "******"
        proxyPass = "******"

        proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
            "host": proxyHost,
            "port": proxyPort,
            "user": proxyUser,
            "pass": proxyPass,
        }

        proxies = {
            "http": proxyMeta,
            "https": proxyMeta,
        }
        return proxies
    return None
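A usage sketch (assumption: every branch of get_proxy returns a requests-compatible {"http": ..., "https": ...} mapping, as the HTTP-tunnel branch does explicitly):

import requests

proxies = get_proxy()
if proxies:
    resp = requests.get("https://httpbin.org/ip", proxies=proxies, timeout=10)
    print(resp.text)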
Code Example #18
File: fetch.py Project: 00gpowe/scrapy
    def run(self, args, opts):
        if len(args) != 1 or not is_url(args[0]):
            raise UsageError()
        cb = lambda x: self._print_response(x, opts)
        request = Request(args[0], callback=cb, dont_filter=True)
        request.meta['handle_httpstatus_all'] = True

        spider = None
        if opts.spider:
            spider = self.crawler.spiders.create(opts.spider)
        else:
            spider = create_spider_for_request(self.crawler.spiders, request, \
                default_spider=BaseSpider('default'))
        self.crawler.crawl(spider, [request])
        self.crawler.start()
Code Example #19
File: fetch.py Project: reenvs/self-summary
    def run(self, args, opts):
        if len(args) != 1 or not is_url(args[0]):
            raise UsageError()
        cb = lambda x: self._print_response(x, opts)
        request = Request(args[0], callback=cb, dont_filter=True)
        request.meta['handle_httpstatus_all'] = True

        spider = None
        if opts.spider:
            spider = self.crawler.spiders.create(opts.spider)
        else:
            spider = create_spider_for_request(self.crawler.spiders, request, \
                default_spider=BaseSpider('default'))
        self.crawler.crawl(spider, [request])
        self.crawler.start()
Code Example #20
File: proxy.py Project: xrr8417403/douyin-1
def get_proxy(api=PROXY_API):
    if not is_url(api):
        if os.path.isfile(api):
            with open(api, 'r') as f:
                p_txt = f.readlines()
            return random.choice([gen_proxy(i) for i in p_txt])
        if isinstance(api, list):
            return random.choice([gen_proxy(i) for i in api])
        return gen_proxy(api)
    else:
        while 1:
            p = requests.get(api).text.strip('\r\n').strip()
            if not is_proxy_valid(p):
                continue
            return gen_proxy(p)
Code Example #21
File: fetch.py Project: bihicheng/scrapy
    def run(self, args, opts):
        if len(args) != 1 or not is_url(args[0]):
            raise UsageError()
        cb = lambda x: self._print_response(x, opts)
        request = Request(args[0], callback=cb, dont_filter=True)

        spider = None
        if opts.spider:
            try:
                spider = self.crawler.spiders.create(opts.spider)
            except KeyError:
                log.msg("Could not find spider: %s" % opts.spider, log.ERROR)

        self.crawler.queue.append_request(request, spider, \
            default_spider=BaseSpider('default'))
        self.crawler.start()
Code Example #22
    def run(self, args, opts):
        if len(args) != 1 or not is_url(args[0]):
            raise UsageError()
        cb = lambda x: self._print_response(x, opts)
        request = Request(args[0], callback=cb, dont_filter=True)

        spider = None
        if opts.spider:
            try:
                spider = self.crawler.spiders.create(opts.spider)
            except KeyError:
                log.msg("Could not find spider: %s" % opts.spider, log.ERROR)

        self.crawler.queue.append_request(request, spider, \
            default_spider=BaseSpider('default'))
        self.crawler.start()
Code Example #23
    def run(self, args, opts):
        # parse arguments
        if not len(args) == 1 or not is_url(args[0]):
            raise UsageError()
        else:
            url = args[0]

        # prepare spidercls
        self.set_spidercls(url, opts)

        if self.spidercls and opts.depth > 0:
            start_status = self.start_parsing(url, opts)
            if start_status == FATAL_ERROR:
                return start_status
            self.print_results(opts)
            results = self.get_results(opts)
            print(len(results))
            return results
Code Example #24
File: fetch.py Project: ArturGaspar/scrapy
    def run(self, args, opts):
        if len(args) != 1 or not is_url(args[0]):
            raise UsageError()
        cb = lambda x: self._print_response(x, opts)
        request = Request(args[0], callback=cb, dont_filter=True)
        # by default, let the framework handle redirects,
        # i.e. the command handles all status codes except 3xx
        if not opts.no_redirect:
            request.meta['handle_httpstatus_list'] = SequenceExclude(range(300, 400))
        else:
            request.meta['handle_httpstatus_all'] = True

        spidercls = DefaultSpider
        spider_loader = self.crawler_process.spider_loader
        if opts.spider:
            spidercls = spider_loader.load(opts.spider)
        else:
            spidercls = spidercls_for_request(spider_loader, request, spidercls)
        self.crawler_process.crawl(spidercls, start_requests=lambda: [request])
        self.crawler_process.start()
Code Example #25
File: srl.py Project: hiropppe/rasc-sample
  def run(self, args, opts):
    ## NLP parser settings
    self.parser_server = self.settings['PARSER_RPC_SERVER']
    # For standalone rpc server (msgpack-rpc)
    self.parser_msgpack_host = self.settings['PARSER_MSGPACK_HOST']
    self.parser_msgpack_port = self.settings['PARSER_MSGPACK_PORT']
    # For distributed rpc server (json-rpc)
    self.parser_jsonrpc_url  = self.settings['PARSER_PROXY_URL'] 
    
    if len(args) != 1 or not is_url(args[0]):
      raise UsageError()
    
    cb = lambda x: self._print_response(x, opts)
    request = Request(args[0], callback=cb, dont_filter=True)
    request.meta['handle_httpstatus_all'] = True

    spidercls = DefaultSpider
    spider_loader = self.crawler_process.spider_loader
    if opts.spider:
      spidercls = spider_loader.load(opts.spider)
    else:
      spidercls = spidercls_for_request(spider_loader, request, spidercls)
    self.crawler_process.crawl(spidercls, start_requests=lambda: [request])
    self.crawler_process.start()
Code Example #26
File: response.py Project: zhangzhaolei/Amipy
 def url(self, url):
     # only URLs with a file, http or https scheme are considered valid
     if not urltool.is_url(url):
         raise ValueError('Not a valid url for Request.')
     else:
         self.__url = urltool.safe_download_url(url)
Code Example #27
File: test_url.py Project: scrapy/w3lib
 def test_is_url(self):
     self.assertTrue(is_url("http://www.example.org"))
     self.assertTrue(is_url("https://www.example.org"))
     self.assertTrue(is_url("file:///some/path"))
     self.assertFalse(is_url("foo://bar"))
     self.assertFalse(is_url("foo--bar"))
Code Example #28
 def test_is_url(self):
     self.assertTrue(is_url('http://www.example.org'))
     self.assertTrue(is_url('https://www.example.org'))
     self.assertTrue(is_url('file:///some/path'))
     self.assertFalse(is_url('foo://bar'))
     self.assertFalse(is_url('foo--bar'))
Code Example #29
File: movie.py Project: fdioguardi/movies_ontology
 def is_valid_url(self, url):
     return is_url(url) and " " not in url
Code Example #30
File: cli.py Project: ThomasAitken/Scrapy-Testmaster
    def __init__(self, parser):
        self.parser = parser
        self.args = parser.parse_args()

        if not inside_project():
            self.error("No active Scrapy project")

        self.command = self.args.command

        self.spider = sanitize_module_name(self.args.spider) if \
            self.args.spider else None
        try:
            self.callback = self.args.callback
        except AttributeError:
            self.callback = None
        try:
            self.fixture = self.args.fixture
        except AttributeError:
            self.fixture = None

        if self.command == 'update':
            try:
                self.new = self.args.new
            except AttributeError:
                self.new = None
            try:
                self.dynamic = self.args.dynamic
            except AttributeError:
                self.dynamic = None

        if self.command == 'clear':
            self.fixtures = self.args.fixtures.split(',')

        if self.fixture and not self.callback:
            self.error("Can't specify a fixture without a callback")

        self.project_dir, self.project_name = get_project_dirs()
        sys.path.append(self.project_dir)

        self.settings = get_project_settings()

        if self.command == "parse":
            url_list = [url.strip() for url in self.args.urls.split('|')]
            for url in url_list:
                if not is_url(url):
                    self.error("Something went wrong with your urls arg! "
                               "Note that as of version 1.0, the character for separating "
                               "multiple urls is '|', as opposed to ','")

            self.args = process_options(self.args)
            crawler_process = CrawlerProcess(self.settings)
            run_command(crawler_process, url_list, self.args)

        else:
            self.base_path = self.settings.get(
                'TESTMASTER_BASE_PATH',
                default=os.path.join(self.project_dir, 'testmaster'))
            self.tests_dir = os.path.join(self.base_path, 'tests')

            self.spider_dir = os.path.join(self.tests_dir, self.spider)

            if not os.path.isdir(self.spider_dir) and self.command != "establish":
                self.error(
                    "No recorded data found "
                    "for spider '{}'".format(self.spider))

            self.extra_path = self.settings.get('TESTMASTER_EXTRA_PATH') or ''
            if self.callback:
                self.callback_dir = os.path.join(
                    self.spider_dir, self.extra_path, self.callback)

                if self.command == 'establish':
                    if os.path.isdir(self.callback_dir):
                        self.error(
                            "Can't use 'establish' with callback arg "
                            "if callback dir for spider '{}' "
                            "exists already".format(self.spider))
            else:
                if self.command == 'inspect':
                    self.error(
                        "No recorded data found for callback "
                        "'{}' from '{}' spider".format(self.callback, self.spider))

            if self.fixture:
                self.fixture_path = os.path.join(self.callback_dir,
                                                 self.parse_fixture_arg())
                if not os.path.isfile(self.fixture_path):
                    self.error("Fixture '{}' not found".format(self.fixture_path))
Code Example #31
File: properties.py Project: mhaberler/czml3
 def _check_uri(self, attribute, value):
     try:
         parse_data_uri(value)
     except ValueError as e:
         if not is_url(value):
             raise ValueError("uri must be a URL or a data URI") from e
Code Example #32
File: test_url.py Project: Preetwinder/w3lib
 def test_is_url(self):
     self.assertTrue(is_url('http://www.example.org'))
     self.assertTrue(is_url('https://www.example.org'))
     self.assertTrue(is_url('file:///some/path'))
     self.assertFalse(is_url('foo://bar'))
     self.assertFalse(is_url('foo--bar'))