Example #1
    def test_iterate_spider_output(self):
        i = BaseItem()
        r = Request('http://scrapytest.org')
        o = object()

        self.assertEqual(list(iterate_spider_output(i)), [i])
        self.assertEqual(list(iterate_spider_output(r)), [r])
        self.assertEqual(list(iterate_spider_output(o)), [o])
        self.assertEqual(list(iterate_spider_output([r, i, o])), [r, i, o])
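
The assertions above pin down the contract: a single item, request, or arbitrary object comes back as a one-element iterable, while a list passes through unchanged. A minimal sketch of an equivalent helper, assuming only the behaviour shown in this test (the real implementation may also treat dicts, deferreds, or async output specially):

def iterate_spider_output_sketch(result):
    # Hypothetical stand-in for scrapy.utils.spider.iterate_spider_output,
    # covering only the cases exercised by the test above.
    if result is None:
        return []
    if hasattr(result, '__iter__') and not isinstance(result, (dict, str, bytes)):
        return result          # lists, generators, etc. pass through
    return [result]            # a single item/request/object gets wrapped

o = object()
assert iterate_spider_output_sketch(o) == [o]
assert iterate_spider_output_sketch([o]) == [o]
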
Example #2
 def parse_nodes(self, response, nodes):
     """
     Inherited from XMLFeedSpider
     Extended to also return requests.
     """
     for selector in nodes:
         ret = iterate_spider_output(self.parse_node(response, selector))
         for result_item in self.process_results(response, ret):
             yield result_item
     seen = set()
     for i, rule in enumerate(self.rules):
         links = [
             l
             for l
             in rule.link_extractor.extract_links(response)
             if l not in seen
         ]
         self.logger.info('links %s', links)
         if links and rule.process_links:
             links = rule.process_links(links)
         for link in links:
             seen.add(link)
             r = Request(url=link.url)
             r.meta.update(rule=i, link_text=link.text)
             yield rule.process_request(r)
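
This hybrid parse_nodes assumes the spider also defines a CrawlSpider-style rules attribute, since it iterates self.rules and relies on each rule's link_extractor, process_links, and process_request. A sketch of what such a spider declaration might look like; the class name, URL, and patterns are illustrative, not taken from the source:

from scrapy.spiders import Rule, XMLFeedSpider
from scrapy.linkextractors import LinkExtractor


class FeedAndFollowSpider(XMLFeedSpider):       # hypothetical spider
    name = 'feed_and_follow'
    start_urls = ['http://example.com/feed.xml']  # placeholder URL
    itertag = 'item'
    rules = (
        # consumed by the parse_nodes shown above, not by CrawlSpider itself
        Rule(LinkExtractor(allow=r'/article/')),
    )

    def parse_node(self, response, node):
        return {'title': node.xpath('title/text()').get()}
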
Example #3
 def cb_wrapper(response):
     try:
         output = cb(response)
         output = list(iterate_spider_output(output))
     except Exception:
         case = _create_testcase(method, 'callback')
         results.addError(case, sys.exc_info())
Example #4
 def parse(self, response):
     b = BeautifulSoup(response.body)
     details = b.findAll(attrs={"class": "detail"})
     
     for detail in details:
         resp = TextResponse(url="..", status=200, body=detail.text.encode("utf8"))
         for requests_or_item in iterate_spider_output(self.parse_item(resp)):
             yield requests_or_item
Example #5
 def wrapper(response):
     try:
         self.pre_process(response)
     except ContractFail as e:
         if fail:
             raise
         else:
             print(e.format(self.method))
     return list(iterate_spider_output(cb(response)))
Example #6
 def _parse_response(self, response, callback, cb_kwargs, follow=True):
   if callback:
      cb_res = callback(response, **cb_kwargs) or ()    # first call goes to parse_start_url; later calls go to the callback set on the Rule
      cb_res = self.process_results(response, cb_res)   # pass-through by default: returns whatever it is given
     for requests_or_item in iterate_spider_output(cb_res):
       yield requests_or_item
    if follow and self._follow_links:     # if follow=True, follow links (default: True)
      for request_or_item in self._requests_to_follow(response):   # apply the configured rules
       yield request_or_item
Example #7
    def run_callback(self, response, cb):
        items, requests = [], []

        for x in iterate_spider_output(cb(response)):
            if isinstance(x, (BaseItem, dict)):
                items.append(x)
            elif isinstance(x, Request):
                requests.append(x)
        return items, requests
Example #8
    def __call__(self, response):
        """Main response entry point.

        This method calls the callback and wraps the returned generator.

        """
        output = iterate_spider_output(self.callback(response=response, **self.kwargs))
        if not isinstance(output, GeneratorType):
            raise ValueError("Callback must return a generator type")
        return self._unwindGenerator(output)
Example #9
    def _parse_response(self, response, callback, cb_kwargs, follow=True):
        if callback:
            cb_res = callback(response, **cb_kwargs) or ()
            cb_res = self.process_results(response, cb_res)
            for requests_or_item in iterate_spider_output(cb_res):
                yield requests_or_item

        if follow and settings['CRAWLSPIDER_FOLLOW_LINKS']:
            for request_or_item in self._requests_to_follow(response):
                yield request_or_item
Example #10
    def _parse_response(self, response, callback, cb_kwargs, follow=True):
        if callback:
            cb_res = callback(response, **cb_kwargs) or ()
            cb_res = self.process_results(response, cb_res)
            for requests_or_item in iterate_spider_output(cb_res):
                yield requests_or_item

        if follow and self._follow_links:
            for request_or_item in self._requests_to_follow(response):
                yield request_or_item
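
This is the stock CrawlSpider plumbing: whatever the Rule callback returns is normalized by iterate_spider_output, so a callback may return a single item, a single request, a list, or a generator. A minimal sketch of a spider that rides on it; names, the URL, and selectors are illustrative:

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class BooksSpider(CrawlSpider):                 # hypothetical spider
    name = 'books'
    start_urls = ['http://example.com/catalog']  # placeholder URL
    rules = (
        Rule(LinkExtractor(allow=r'/item/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # returning a plain dict is fine: iterate_spider_output wraps single values
        return {'url': response.url, 'title': response.css('h1::text').get()}
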
Example #11
    def parse_rows(self, response):
        """Receives a response and a dict (representing each row) with a key for
        each provided (or detected) header of the CSV file.  This spider also
        gives the opportunity to override adapt_response and
        process_results methods for pre and post-processing purposes.
        """

        for row in csviter(response, self.delimiter, self.headers, self.quotechar):
            ret = iterate_spider_output(self.parse_row(response, row))
            for result_item in self.process_results(response, ret):
                yield result_item
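
As the docstring notes, adapt_response and process_results are the intended override hooks around parse_row. A minimal sketch of a CSVFeedSpider subclass using process_results for post-processing; the spider name, URL, and columns are illustrative:

from scrapy.spiders import CSVFeedSpider


class PricesSpider(CSVFeedSpider):              # hypothetical spider
    name = 'prices'
    start_urls = ['http://example.com/feed.csv']  # placeholder URL
    delimiter = ','
    headers = ['sku', 'price']

    def parse_row(self, response, row):
        # row is a dict keyed by the headers above
        return {'sku': row['sku'], 'price': float(row['price'])}

    def process_results(self, response, results):
        # post-process every result produced for this response
        for item in results:
            item['source_url'] = response.url
            yield item
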
Example #12
    def parse_nodes(self, response, nodes):
        """This method is called for the nodes matching the provided tag name
        (itertag). Receives the response and a Selector for each node.
        Overriding this method is mandatory. Otherwise, your spider won't work.
        This method must return either a BaseItem, a Request, or a list
        containing any of them.
        """

        for selector in nodes:
            ret = iterate_spider_output(self.parse_node(response, selector))
            for result_item in self.process_results(response, ret):
                yield result_item
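
Per the docstring, parse_node is the method a subclass must implement, and it may return an item, a request, or a list of either; parse_nodes then feeds each return value through iterate_spider_output. A minimal sketch of such a subclass; the feed URL and fields are illustrative:

from scrapy.spiders import XMLFeedSpider


class NewsFeedSpider(XMLFeedSpider):            # hypothetical spider
    name = 'newsfeed'
    start_urls = ['http://example.com/feed.xml']  # placeholder URL
    itertag = 'item'                            # nodes handed to parse_nodes

    def parse_node(self, response, node):
        # node is a Selector positioned on one <item> element
        return {
            'title': node.xpath('title/text()').get(),
            'link': node.xpath('link/text()').get(),
        }
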
Example #13
    def run_callback(self, spider, response, callback, opts):
        cb = callback if callable(callback) else getattr(spider, callback, None)
        if not cb:
            log.msg('Cannot find callback %r in spider: %s' % (callback, spider.name))
            return (), ()

        items, requests = [], []
        for x in iterate_spider_output(cb(response)):
            if isinstance(x, BaseItem):
                items.append(x)
            elif isinstance(x, Request):
                requests.append(x)
        return items, requests
Example #14
    def parse_nodes(self, response, nodes):  # can be overridden; by default it calls parse_node for each target tag parsed from the feed,
        # and then runs every returned result through process_results
        """This method is called for the nodes matching the provided tag name
        (itertag). Receives the response and a Selector for each node.
        Overriding this method is mandatory. Otherwise, your spider won't work.
        This method must return either an item, a request, or a list
        containing any of them.
        """

        for selector in nodes:
            ret = iterate_spider_output(self.parse_node(response, selector))
            for result_item in self.process_results(response, ret):
                yield result_item
Example #15
    def run_callback(self, spider, response, callback, args, opts):
        if callback:
            callback_fcn = callback if callable(callback) else getattr(spider, callback, None)
            if not callback_fcn:
                log.msg("Cannot find callback %s in %s spider" % (callback, spider.name))
                return (), ()

            result = iterate_spider_output(callback_fcn(response))
            links = [i for i in result if isinstance(i, Request)]
            items = [self.pipeline_process(i, spider, opts) for i in result if isinstance(i, BaseItem)]
            return items, links

        return (), ()
Example #16
 def wrapper(response):
     try:
         results.startTest(self.testcase_pre)
         self.pre_process(response)
         results.stopTest(self.testcase_pre)
     except AssertionError:
         results.addFailure(self.testcase_pre, sys.exc_info())
     except Exception:
         results.addError(self.testcase_pre, sys.exc_info())
     else:
         results.addSuccess(self.testcase_pre)
     finally:
         return list(iterate_spider_output(cb(response)))
Example #17
    def _parse_response(self, response, callback, cb_kwargs, follow=True):
        if callback:
            cb_res = callback(response, **cb_kwargs) or ()
            cb_res = self.process_results(response, cb_res)
            for requests_or_item in iterate_spider_output(cb_res):
                yield requests_or_item

        if follow and self._follow_links:
            for request_or_item in self._requests_to_follow(response):
                if isinstance(request_or_item, Request):
                    request_or_item.meta['start_url'] = response.meta[
                        'start_url']
                yield request_or_item
Example #18
 def wrapper(response):
     try:
         results.startTest(self.testcase_pre)
         self.pre_process(response)
         results.stopTest(self.testcase_pre)
     except AssertionError:
         results.addFailure(self.testcase_pre, sys.exc_info())
     except Exception:
         results.addError(self.testcase_pre, sys.exc_info())
     else:
         results.addSuccess(self.testcase_pre)
     finally:
         return list(iterate_spider_output(cb(response)))
Example #19
 def run_callback(self, response, cb):
     """
     run callback and get items and requests
     :param response:
     :param cb:
     :return:
     """
     items, requests = [], []
     for x in iterate_spider_output(cb(response)):
         if isinstance(x, (BaseItem, dict)):
             items.append(x)
         elif isinstance(x, Request):
             requests.append(x)
     return items, requests
Example #20
 def wrapper(response, **cb_kwargs):
     output = list(iterate_spider_output(cb(response, **cb_kwargs)))
     try:
         results.startTest(self.testcase_post)
         self.post_process(output)
         results.stopTest(self.testcase_post)
     except AssertionError:
         results.addFailure(self.testcase_post, sys.exc_info())
     except Exception:
         results.addError(self.testcase_post, sys.exc_info())
     else:
         results.addSuccess(self.testcase_post)
     finally:
         return output
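
The wrapper materializes the callback output with list(iterate_spider_output(...)) so that post_process can inspect the complete result set. A sketch of the kind of contract whose post_process this wrapper would call, assuming the standard scrapy.contracts.Contract base class; the contract name and check are illustrative:

from scrapy.contracts import Contract
from scrapy.exceptions import ContractFail


class MinResultsContract(Contract):             # hypothetical contract
    """Fail when the callback produced fewer results than expected.

    Used in a callback docstring as: @min_results 3
    """
    name = 'min_results'

    def post_process(self, output):
        # output is the list built by the wrapper above
        expected = int(self.args[0])
        if len(output) < expected:
            raise ContractFail('expected at least %d results, got %d'
                               % (expected, len(output)))
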
Example #21
    def parse_rows(self, response):
        """Receives a response and a dict (representing each row) with a key for
        each provided (or detected) header of the CSV file.  This spider also
        gives the opportunity to override adapt_response and
        process_results methods for pre and post-processing purposes.
        """

        for row in csviter(response,
                           self.delimiter,
                           self.headers,
                           quotechar=self.quotechar):
            ret = iterate_spider_output(self.parse_row(response, row))
            for result_item in self.process_results(response, ret):
                yield result_item
Example #22
    def _response_downloaded(self, response, callback, cb_kwargs, follow):
        """
        This is where any response arrives, and where it is decided whether
        to extract links from it, and whether it will be parsed.
        It returns a list of requests/items.
        """
        if callback:
            cb_res = callback(response, **cb_kwargs) or ()
            cb_res = self.process_results(response, cb_res)
            for requests_or_item in iterate_spider_output(cb_res):
                yield requests_or_item

        if follow and settings.getbool("CRAWLSPIDER_FOLLOW_LINKS", True):
            for request_or_item in self._requests_to_follow(response):
                yield request_or_item
Example #23
    def _parse_response(self,
                        response,
                        callback,
                        cb_kwargs,
                        follow=True):  # _parse_response is the core method of CrawlSpider
        if callback:
            cb_res = callback(response, **cb_kwargs) or ()
            cb_res = self.process_results(
                response, cb_res)  # process_results can be overridden instead of overriding parse
            for requests_or_item in iterate_spider_output(cb_res):
                yield requests_or_item

        if follow and self._follow_links:  # follow the rules we configured
            for request_or_item in self._requests_to_follow(
                    response):  # delegate to _requests_to_follow
                yield request_or_item
Example #24
 def _parse_response_v2(self,
                        response,
                        parser,
                        callback,
                        cb_kwargs,
                        follow=True):
     if parser:
         cb_res = parser(response, **cb_kwargs) or ()
         if callback:
             cb_res = callback(response, cb_res=cb_res, **cb_kwargs) or ()
         cb_res = self.process_results(response, cb_res)
         for requests_or_item in iterate_spider_output(cb_res):
             yield requests_or_item
     if follow and self._follow_links:
         for request_or_item in self._requests_to_follow(response):
             yield request_or_item
Example #25
  def _parse_response(self, response, callback, cb_kwargs, follow=True):

    if 'isFilter' in response.url:     #watch it!!!!!!!
      response = response.replace(body=str(response.body.decode('unicode_escape').replace('\\/', '/')))
      # response.body (like requests' r.content) is bytes; response.text (like r.text) is str

    # super()._parse_response(response, callback, cb_kwargs, follow=True), why doesn't this work???
    if callback:
      cb_res = callback(response, **cb_kwargs) or ()
      cb_res = self.process_results(response, cb_res)
      for requests_or_item in iterate_spider_output(cb_res):
        yield requests_or_item

    if follow and self._follow_links:
      for request_or_item in self._requests_to_follow(response):
        yield request_or_item
Example #26
  def _parse_response(self, response, callback, cb_kwargs, follow=True):

    if 'isFilter' in response.url:     #watch it!!!!!!!
      response = response.replace(body=str(response.body.decode('unicode_escape').replace('\\/', '/')))
      # response.body (like requests' r.content) is bytes; response.text (like r.text) is str

    # super()._parse_response(response, callback, cb_kwargs, follow=True), why doesn't this work???
    if callback:
      cb_res = callback(response, **cb_kwargs) or ()
      cb_res = self.process_results(response, cb_res)
      for requests_or_item in iterate_spider_output(cb_res):
        yield requests_or_item

    if follow and self._follow_links:
      for request_or_item in self._requests_to_follow(response):
        yield request_or_item
Example #27
    def _parse_response(self, response, callback, cb_kwargs, follow=True):
        """

        :param response:
        :param callback:
        :param cb_kwargs:
        :param follow: whether to follow links
        :return:
        """
        if callback:
            cb_res = callback(response, **cb_kwargs) or ()
            cb_res = self.process_results(response, cb_res)
            for requests_or_item in iterate_spider_output(cb_res):
                yield requests_or_item

        if follow and self._follow_links:
            for request_or_item in self._requests_to_follow(response):
                yield request_or_item
Example #28
    def parse(self, response):
        """Dispatch callback and generate requests"""
        # get rule for response
        rule = self._rulesman.get_rule_from_response(response)

        if rule:
            # dispatch callback if set
            if rule.callback:
                output = iterate_spider_output(rule.callback(response))
                for req_or_item in output:
                    yield req_or_item

            if rule.follow:
                for req in self._reqgen.generate_requests(response):
                    # only dispatch request if has matching rule
                    if self._rulesman.get_rule_from_request(req):
                        yield req
        else:
            self.log("No rule for response %s" % response, level=log.WARNING)
Example #29
    def parse(self, response):
        """Dispatch callback and generate requests"""
        # get rule for response
        rule = self._rulesman.get_rule_from_response(response)

        if rule:
            # dispatch callback if set
            if rule.callback:
                output = iterate_spider_output(rule.callback(response))
                for req_or_item in output:
                    yield req_or_item

            if rule.follow:
                for req in self._reqgen.generate_requests(response):
                    # only dispatch request if has matching rule
                    if self._rulesman.get_rule_from_request(req):
                        yield req
        else:
            self.log("No rule for response %s" % response, level=log.WARNING)
Example #30
    def __call__(self, response):
        """Main response entry point.

        This method calls the callback and wraps the returned generator.
        The decorated method must return a generator.

        Args:
            response (scrapy.Response):

        Returns:
            scrapy.Request:

        """
        output = iterate_spider_output(
            self._response_callback(response, **self._kwargs))
        if not isinstance(output, GeneratorType):
            raise TypeError(
                f'{self._response_callback.__name__} must return a generator')

        return self.resume(output)
Example #31
 def parse_nodes(self, response, nodes):
     """
     Inherited from XMLFeedSpider
     Extended to also return requests.
     """
     for selector in nodes:
         ret = iterate_spider_output(self.parse_node(response, selector))
         for result_item in self.process_results(response, ret):
             yield result_item
     seen = set()
     for i, rule in enumerate(self.rules):
         links = [
             l for l in rule.link_extractor.extract_links(response)
             if l not in seen
         ]
         self.logger.info('links %s', links)
         if links and rule.process_links:
             links = rule.process_links(links)
         for link in links:
             seen.add(link)
             r = Request(url=link.url)
             r.meta.update(rule=i, link_text=link.text)
             yield rule.process_request(r)
Example #32
 def parse_rows(self, response):
     for row in exceliter(response, self.header, self.skiprows, self.sheet_name,
                          self.dropna_thresh, self.isEmpty_lineHeader):
         ret = iterate_spider_output(self.parse_row(response, row))
         for result_item in self.process_results(response, ret):
             yield result_item
Example #33
 def parse_nodes(self, response, nodes):
     for selector in nodes:
         ret = iterate_spider_output(self.parse_node(response, selector))
         for result_item in self.process_results(response, ret):
             yield result_item
Example #34
 def parse_rows(self, response):
     for row in pcsviter(response, self.delimiter, self.headers, self.quotechar):
         ret = iterate_spider_output(self.parse_row(response, row))
         for result_item in self.process_results(response, ret):
             yield result_item
Example #35
 def parse_cities(self, response):
     nodes = xmliter(response, "city")
     for selector in nodes:
         ret = iterate_spider_output(self.parse_city(response, selector))
         for result_item in ret:
             yield result_item
Example #36
 def parse_rows(self, response):
     for row in xlsiter(response, self.headers, self.sheet_index):
         ret = iterate_spider_output(self.parse_row(response, row))
         for result_item in self.process_result(response, ret):
             yield result_item
Example #37
 def __call__(self, response):
     output = iterate_spider_output(self.callback(response))
     if isinstance(output, types.GeneratorType):
         return self._unwindGenerator(output)
     else:
         return output
Example #38
 def wrapper(response):
     output = cb(response)
     output = list(iterate_spider_output(output))
Example #39
 def start_requests(self):
     self._postinit_reqs = super(InitSpider, self).start_requests()
     return iterate_spider_output(self.init_request())
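
Here start_requests postpones the regular requests and first runs whatever init_request returns. A sketch of how an InitSpider subclass commonly uses this hook for a login step, assuming the stock InitSpider API (init_request plus self.initialized()); the URLs and form fields are placeholders:

from scrapy import FormRequest
from scrapy.spiders.init import InitSpider


class LoginFirstSpider(InitSpider):             # hypothetical spider
    name = 'login_first'
    start_urls = ['http://example.com/private']  # placeholder URL

    def init_request(self):
        # runs before the postponed start_requests
        return FormRequest('http://example.com/login',  # placeholder URL
                           formdata={'user': 'u', 'pass': 'p'},
                           callback=self.after_login)

    def after_login(self, response):
        # self.initialized() releases the requests postponed in start_requests
        return self.initialized()
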
Example #40
    def parse_nodes(self, response, nodes):
        """This method is called for the nodes matching the provided tag name
        (itertag). Receives the response and a Selector for each node.
        Overriding this method is mandatory. Otherwise, your spider won't work.
        This method must return either a BaseItem, a Request, or a list
        containing any of them.
        """
        spider_config = response.meta.get("spider_config")
        spiders = response.meta.get("spiders")
        context = self.context or {}
        if None in [spider_config]:
            spider_config = self.spider_config
            spiders = self.spiders

        data = {"url": response.url, "domain": get_domain(response.url)}
        for extractor in spider_config.get('extractors', []):
            extracted_items = []
            for selector in nodes:
                ret = iterate_spider_output(
                    self.parse_node(response, selector, extractor))
                for result_item in self.process_results(response, ret):
                    extracted_items.append(result_item)
            data[extractor['extractor_id']] = {}
            data[extractor['extractor_id']]['entries'] = extracted_items
        context["spider_id"] = spider_config.get("spider_id")
        data['context'] = context
        """
        If spider_traversal_id is None, it means this response originated from a
        request generated by the start URLs.

        If it is not None, the request/response was generated by some traversal strategy.
        """
        current_request_traversal_id = response.meta.get(
            'current_request_traversal_id', None)
        """
        In XML crawling, current_request_traversal_page_count starts from 1 because there is no page 0.
        """
        current_request_traversal_page_count = response.meta.get(
            'current_request_traversal_page_count', 1)
        """
        Note on current_request_spider_id:
        This can never be None, including for requests started by start_urls.
        """
        spider_config_id = spider_config.get("spider_id")

        spider_traversals = spider_config.get('traversals', [])
        for traversal in spider_traversals:
            next_spider_id = traversal['next_spider_id']
            iter_param = traversal['iter_param']

            next_spider = get_spider_from_list(spider_id=next_spider_id,
                                               spiders=spiders)

            traversal['allow_domains'] = next_spider.get("allowed_domains", [])
            traversal_id = traversal['traversal_id']
            traversal_max_pages = traversal.get('max_pages', 1)

            traversal_links = []
            is_this_request_from_same_traversal = self.is_this_request_from_same_traversal(
                response, traversal)
            print("is_this_request_from_same_traversal",
                  is_this_request_from_same_traversal)
            print("current_request_traversal_page_count",
                  current_request_traversal_page_count)
            print("traversal_max_pages", traversal_max_pages)
            print(
                " current_request_traversal_page_count <= traversal_max_pages",
                current_request_traversal_page_count <= traversal_max_pages)
            shall_traverse = False

            if current_request_traversal_id is None:
                """
                Start URLs will not have this traversal_id set, so we should allow them to traverse.
                """
                shall_traverse = True

            elif is_this_request_from_same_traversal and current_request_traversal_page_count <= traversal_max_pages:
                """
                This block applies to traversals within the same spider_id, i.e., pagination of a spider.
                """

                shall_traverse = True

            elif is_this_request_from_same_traversal:
                """
                """
                shall_traverse = True

            elif is_this_request_from_same_traversal is False and current_request_traversal_page_count <= \
                    traversal_max_pages:
                """
                This is for spider_a traversing to spider_b; this is not pagination, but traversing
                between spiders.
                """
                shall_traverse = True
            print("shall_traverse: {}".format(traversal_id), shall_traverse)
            if shall_traverse:
                current_url = response.url
                clean_url_without_iter_param = current_url.split(
                    "?")[0] if "?" in current_url else current_url
                # this is already iterating, so ignore.
                print("<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<",
                      clean_url_without_iter_param)
                print("clean_url_without_iter_param",
                      clean_url_without_iter_param)
                traversal_link = "{}?{}={}".format(
                    clean_url_without_iter_param, iter_param,
                    current_request_traversal_page_count + 1)

                print("traversal_link", traversal_link)

                data[traversal_id] = {"traversal_urls": [traversal_link]}
                """
                Then validate the max_pages logic, if this traversal_id's traversal has any.
                This is where further traversal for this traversal_id is decided.
                """
                max_pages = traversal.get("max_pages", 1)

                current_request_traversal_page_count += 1
                """
                The counter has already been incremented, so using <= might yield 6 pages when
                max_pages is 5.
                """
                if current_request_traversal_page_count <= max_pages:
                    print("=======current_request_traversal_page_count",
                          current_request_traversal_page_count)
                    print("-----------------------------------")
                    yield scrapy.Request(
                        traversal_link,
                        callback=self.parse,
                        errback=self.parse_error,
                        meta={
                            "spider_config":
                            next_spider,
                            "spiders":
                            spiders,
                            "current_request_traversal_id":
                            traversal_id,
                            "current_request_traversal_page_count":
                            current_request_traversal_page_count,
                        })

            print("=================================================")
            print("====traversal_links", traversal_id, len(traversal_links),
                  traversal_links)
            print("=================================================")

        yield data

        self.post_parse(response=response)
Example #41
 def wrapper(response):
     output = cb(response)
     output = list(iterate_spider_output(output))
Example #42
 def start_requests(self):
     logging.info("@@@@@ start_requests is called @@@@@")
     self._postinit_reqs = super(PollSpider, self).start_requests()
     return iterate_spider_output(self.login())
Example #43
 def _handle_failure(self, failure, errback):
     if errback:
         results = errback(failure) or ()
         for request_or_item in iterate_spider_output(results):
             yield request_or_item
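
_handle_failure runs a request's errback and pushes whatever it returns back through iterate_spider_output, so errbacks can yield items and requests just like callbacks. A sketch of an errback wired up via the standard errback= argument of scrapy.Request; the spider, URL, and fields are illustrative:

import scrapy


class RetryAwareSpider(scrapy.Spider):          # hypothetical spider
    name = 'retry_aware'

    def start_requests(self):
        yield scrapy.Request('http://example.com/maybe-down',  # placeholder URL
                             callback=self.parse_page,
                             errback=self.on_error)

    def parse_page(self, response):
        yield {'url': response.url, 'ok': True}

    def on_error(self, failure):
        # items/requests returned here are normalized by iterate_spider_output
        yield {'url': failure.request.url, 'ok': False}
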
Example #44
 def __call__(self, response):
     output = iterate_spider_output(self.callback(response=response, **self.kwargs))
     if not isinstance(output, GeneratorType):
         raise ValueError("Callback must return a generator type")
     return self._unwindGenerator(output)
Example #45
 def start_requests(self):
     self._postinit_reqs = super().start_requests()
     return iterate_spider_output(self.init_request())
 def parse_cities(self, response):
     nodes = xmliter(response, "city")
     for selector in nodes:
         ret = iterate_spider_output(self.parse_city(response, selector))
         for result_item in ret:
             yield result_item