def test_iterate_spider_output(self):
    i = BaseItem()
    r = Request('http://scrapytest.org')
    o = object()
    self.assertEqual(list(iterate_spider_output(i)), [i])
    self.assertEqual(list(iterate_spider_output(r)), [r])
    self.assertEqual(list(iterate_spider_output(o)), [o])
    self.assertEqual(list(iterate_spider_output([r, i, o])), [r, i, o])
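For reference, iterate_spider_output (from scrapy.utils.spider) normalizes whatever a callback returns: a single item or request is wrapped in an iterable, while iterables pass through unchanged. A minimal usage sketch of the behaviour the test above asserts, assuming a recent Scrapy installation:

# Single values are wrapped into an iterable; iterables pass through unchanged.
from scrapy import Request
from scrapy.utils.spider import iterate_spider_output

req = Request('http://example.com')
assert list(iterate_spider_output(req)) == [req]              # single value -> [value]
assert list(iterate_spider_output([req, req])) == [req, req]  # iterable -> unchanged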
def parse_nodes(self, response, nodes):
    """
    Inherited from XMLFeedSpider
    Extended to also return requests.
    """
    for selector in nodes:
        ret = iterate_spider_output(self.parse_node(response, selector))
        for result_item in self.process_results(response, ret):
            yield result_item

    seen = set()
    for i, rule in enumerate(self.rules):
        links = [
            l for l in rule.link_extractor.extract_links(response)
            if l not in seen
        ]
        self.logger.info('links %s', links)
        if links and rule.process_links:
            links = rule.process_links(links)
        for link in links:
            seen.add(link)
            r = Request(url=link.url)
            r.meta.update(rule=i, link_text=link.text)
            yield rule.process_request(r)
def cb_wrapper(response):
    try:
        output = cb(response)
        output = list(iterate_spider_output(output))
    except:
        case = _create_testcase(method, 'callback')
        results.addError(case, sys.exc_info())
def parse(self, response):
    b = BeautifulSoup(response.body)
    details = b.findAll(attrs={"class": "detail"})
    for detail in details:
        resp = TextResponse(url="..", status=200, body=detail.text.encode("utf8"))
        for requests_or_item in iterate_spider_output(self.parse_item(resp)):
            yield requests_or_item
def wrapper(response):
    try:
        self.pre_process(response)
    except ContractFail as e:
        if fail:
            raise
        else:
            print(e.format(self.method))
    return list(iterate_spider_output(cb(response)))
def _parse_response(self, response, callback, cb_kwargs, follow=True):
    if callback:
        # first call runs parse_start_url; later calls run the callback named in the Rule
        cb_res = callback(response, **cb_kwargs) or ()
        # process_results passes through whatever the callback returned
        cb_res = self.process_results(response, cb_res)
        for requests_or_item in iterate_spider_output(cb_res):
            yield requests_or_item

    if follow and self._follow_links:  # if follow=True, follow the extracted links (default True)
        for request_or_item in self._requests_to_follow(response):  # apply the rules
            yield request_or_item
def run_callback(self, response, cb):
    items, requests = [], []
    for x in iterate_spider_output(cb(response)):
        if isinstance(x, (BaseItem, dict)):
            items.append(x)
        elif isinstance(x, Request):
            requests.append(x)
    return items, requests
def __call__(self, response):
    """Main response entry point.

    This method calls the callback and wraps the returned generator.
    """
    output = iterate_spider_output(self.callback(response=response, **self.kwargs))
    if not isinstance(output, GeneratorType):
        raise ValueError("Callback must return a generator type")
    return self._unwindGenerator(output)
def _parse_response(self, response, callback, cb_kwargs, follow=True):
    if callback:
        cb_res = callback(response, **cb_kwargs) or ()
        cb_res = self.process_results(response, cb_res)
        for requests_or_item in iterate_spider_output(cb_res):
            yield requests_or_item

    if follow and settings['CRAWLSPIDER_FOLLOW_LINKS']:
        for request_or_item in self._requests_to_follow(response):
            yield request_or_item
def _parse_response(self, response, callback, cb_kwargs, follow=True):
    if callback:
        cb_res = callback(response, **cb_kwargs) or ()
        cb_res = self.process_results(response, cb_res)
        for requests_or_item in iterate_spider_output(cb_res):
            yield requests_or_item

    if follow and self._follow_links:
        for request_or_item in self._requests_to_follow(response):
            yield request_or_item
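The CrawlSpider variants of _parse_response above are driven entirely by the spider's rules attribute. A minimal CrawlSpider sketch showing how callback and follow feed into that method (the URL pattern and selectors are illustrative only):

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class QuotesCrawlSpider(CrawlSpider):
    name = 'quotes_crawl'
    start_urls = ['http://quotes.toscrape.com/']

    rules = (
        # every matching link is downloaded and routed through _parse_response,
        # which runs parse_item and then follows further links
        Rule(LinkExtractor(allow=r'/page/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        for text in response.css('div.quote span.text::text').getall():
            yield {'quote': text}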
def parse_rows(self, response):
    """Receives a response and a dict (representing each row) with a key for
    each provided (or detected) header of the CSV file.

    This spider also gives the opportunity to override adapt_response and
    process_results methods for pre- and post-processing purposes.
    """
    for row in csviter(response, self.delimiter, self.headers, self.quotechar):
        ret = iterate_spider_output(self.parse_row(response, row))
        for result_item in self.process_results(response, ret):
            yield result_item
def parse_nodes(self, response, nodes):
    """This method is called for the nodes matching the provided tag name
    (itertag). Receives the response and a Selector for each node.
    Overriding this method is mandatory; otherwise, your spider won't work.
    This method must return either a BaseItem, a Request, or a list
    containing any of them.
    """
    for selector in nodes:
        ret = iterate_spider_output(self.parse_node(response, selector))
        for result_item in self.process_results(response, ret):
            yield result_item
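parse_nodes above is the XMLFeedSpider driver. A minimal subclass sketch that relies on it (the feed URL, tag name, and XPath are hypothetical):

from scrapy.spiders import XMLFeedSpider

class ExampleFeedSpider(XMLFeedSpider):
    name = 'example_feed'
    start_urls = ['http://www.example.com/feed.xml']
    iterator = 'iternodes'   # default streaming node iterator
    itertag = 'item'         # parse_node is called once per <item> node

    def parse_node(self, response, node):
        # may return a single item, a Request, or a list of them;
        # iterate_spider_output normalizes all three cases
        yield {'title': node.xpath('title/text()').get()}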
def run_callback(self, spider, response, callback, opts):
    cb = callback if callable(callback) else getattr(spider, callback, None)
    if not cb:
        log.msg('Cannot find callback %r in spider: %s' % (callback, spider.name))
        return (), ()

    items, requests = [], []
    for x in iterate_spider_output(cb(response)):
        if isinstance(x, BaseItem):
            items.append(x)
        elif isinstance(x, Request):
            requests.append(x)
    return items, requests
def parse_nodes(self, response, nodes):
    # Can be overridden. The default behaviour is to call parse_node for each tag
    # produced by the iterator, then pass each returned result through process_results.
    """This method is called for the nodes matching the provided tag name
    (itertag). Receives the response and a Selector for each node.
    Overriding this method is mandatory; otherwise, your spider won't work.
    This method must return either an item, a request, or a list containing
    any of them.
    """
    for selector in nodes:
        ret = iterate_spider_output(self.parse_node(response, selector))
        for result_item in self.process_results(response, ret):
            yield result_item
def run_callback(self, spider, response, callback, args, opts):
    if callback:
        callback_fcn = callback if callable(callback) else getattr(spider, callback, None)
        if not callback_fcn:
            log.msg("Cannot find callback %s in %s spider" % (callback, spider.name))
            return (), ()

        # materialize the output so it can be iterated twice below
        # (iterate_spider_output may return a generator)
        result = list(iterate_spider_output(callback_fcn(response)))
        links = [i for i in result if isinstance(i, Request)]
        items = [self.pipeline_process(i, spider, opts) for i in result
                 if isinstance(i, BaseItem)]
        return items, links
    return (), ()
def wrapper(response):
    try:
        results.startTest(self.testcase_pre)
        self.pre_process(response)
        results.stopTest(self.testcase_pre)
    except AssertionError:
        results.addFailure(self.testcase_pre, sys.exc_info())
    except Exception:
        results.addError(self.testcase_pre, sys.exc_info())
    else:
        results.addSuccess(self.testcase_pre)
    finally:
        return list(iterate_spider_output(cb(response)))
def _parse_response(self, response, callback, cb_kwargs, follow=True):
    if callback:
        cb_res = callback(response, **cb_kwargs) or ()
        cb_res = self.process_results(response, cb_res)
        for requests_or_item in iterate_spider_output(cb_res):
            yield requests_or_item

    if follow and self._follow_links:
        for request_or_item in self._requests_to_follow(response):
            if isinstance(request_or_item, Request):
                request_or_item.meta['start_url'] = response.meta['start_url']
            yield request_or_item
def run_callback(self, response, cb):
    """Run the callback and split its output into items and requests.

    :param response: the response to pass to the callback
    :param cb: the callback to run
    :return: (items, requests) tuple
    """
    items, requests = [], []
    for x in iterate_spider_output(cb(response)):
        if isinstance(x, (BaseItem, dict)):
            items.append(x)
        elif isinstance(x, Request):
            requests.append(x)
    return items, requests
def wrapper(response, **cb_kwargs):
    output = list(iterate_spider_output(cb(response, **cb_kwargs)))
    try:
        results.startTest(self.testcase_post)
        self.post_process(output)
        results.stopTest(self.testcase_post)
    except AssertionError:
        results.addFailure(self.testcase_post, sys.exc_info())
    except Exception:
        results.addError(self.testcase_post, sys.exc_info())
    else:
        results.addSuccess(self.testcase_post)
    finally:
        return output
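The pre_process/post_process wrappers above back Scrapy's spider contracts, which are declared as annotations in a callback's docstring. An example callback that these wrappers would exercise (URL and selectors are hypothetical):

def parse_product(self, response):
    """Parse a product page.

    @url http://www.example.com/product/123
    @returns items 1 1
    @returns requests 0 0
    @scrapes name price
    """
    yield {
        'name': response.css('h1::text').get(),
        'price': response.css('.price::text').get(),
    }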
def parse_rows(self, response):
    """Receives a response and a dict (representing each row) with a key for
    each provided (or detected) header of the CSV file.

    This spider also gives the opportunity to override adapt_response and
    process_results methods for pre- and post-processing purposes.
    """
    for row in csviter(response, self.delimiter, self.headers, quotechar=self.quotechar):
        ret = iterate_spider_output(self.parse_row(response, row))
        for result_item in self.process_results(response, ret):
            yield result_item
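parse_rows above is the CSVFeedSpider driver. A minimal subclass sketch (the feed URL and column names are hypothetical):

from scrapy.spiders import CSVFeedSpider

class ExampleCsvSpider(CSVFeedSpider):
    name = 'example_csv'
    start_urls = ['http://www.example.com/feed.csv']
    delimiter = ';'
    quotechar = '"'
    headers = ['id', 'name', 'description']

    def parse_row(self, response, row):
        # row is a dict keyed by the headers; returning a single dict is fine,
        # iterate_spider_output wraps it before process_results runs
        return {'id': row['id'], 'name': row['name']}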
def _response_downloaded(self, response, callback, cb_kwargs, follow):
    """
    This is where every response arrives, and where it is decided whether to
    extract links from it and whether it will be parsed.
    It returns a list of requests/items.
    """
    if callback:
        cb_res = callback(response, **cb_kwargs) or ()
        cb_res = self.process_results(response, cb_res)
        for requests_or_item in iterate_spider_output(cb_res):
            yield requests_or_item

    if follow and settings.getbool("CRAWLSPIDER_FOLLOW_LINKS", True):
        for request_or_item in self._requests_to_follow(response):
            yield request_or_item
def _parse_response(self, response, callback, cb_kwargs, follow=True):
    # _parse_response is the core method of CrawlSpider
    if callback:
        cb_res = callback(response, **cb_kwargs) or ()
        # process_results can be overridden instead of overriding parse
        cb_res = self.process_results(response, cb_res)
        for requests_or_item in iterate_spider_output(cb_res):
            yield requests_or_item

    if follow and self._follow_links:  # apply the configured rules
        for request_or_item in self._requests_to_follow(response):  # call _requests_to_follow
            yield request_or_item
def _parse_response_v2(self, response, parser, callback, cb_kwargs, follow=True):
    if parser:
        cb_res = parser(response, **cb_kwargs) or ()
        if callback:
            cb_res = callback(response, cb_res=cb_res, **cb_kwargs) or ()
        cb_res = self.process_results(response, cb_res)
        for requests_or_item in iterate_spider_output(cb_res):
            yield requests_or_item

    if follow and self._follow_links:
        for request_or_item in self._requests_to_follow(response):
            yield request_or_item
def _parse_response(self, response, callback, cb_kwargs, follow=True):
    if 'isFilter' in response.url:  # watch it!
        # response.body (like requests' r.content) is bytes; response.text (like r.text) is str
        response = response.replace(
            body=str(response.body.decode('unicode_escape').replace('\\/', '/')))
        # super()._parse_response(response, callback, cb_kwargs, follow=True) -- why doesn't this work?
    if callback:
        cb_res = callback(response, **cb_kwargs) or ()
        cb_res = self.process_results(response, cb_res)
        for requests_or_item in iterate_spider_output(cb_res):
            yield requests_or_item

    if follow and self._follow_links:
        for request_or_item in self._requests_to_follow(response):
            yield request_or_item
def _parse_response(self, response, callback, cb_kwargs, follow=True):
    """
    :param response:
    :param callback:
    :param cb_kwargs:
    :param follow: whether to follow links
    :return:
    """
    if callback:
        cb_res = callback(response, **cb_kwargs) or ()
        cb_res = self.process_results(response, cb_res)
        for requests_or_item in iterate_spider_output(cb_res):
            yield requests_or_item

    if follow and self._follow_links:
        for request_or_item in self._requests_to_follow(response):
            yield request_or_item
def parse(self, response):
    """Dispatch callback and generate requests"""
    # get rule for response
    rule = self._rulesman.get_rule_from_response(response)
    if rule:
        # dispatch callback if set
        if rule.callback:
            output = iterate_spider_output(rule.callback(response))
            for req_or_item in output:
                yield req_or_item

        if rule.follow:
            for req in self._reqgen.generate_requests(response):
                # only dispatch request if it has a matching rule
                if self._rulesman.get_rule_from_request(req):
                    yield req
    else:
        self.log("No rule for response %s" % response, level=log.WARNING)
def __call__(self, response):
    """Main response entry point.

    This method calls the callback and wraps the returned generator.
    The decorated method must return a generator.

    Args:
        response (scrapy.Response):

    Returns:
        scrapy.Request:
    """
    output = iterate_spider_output(
        self._response_callback(response, **self._kwargs))
    if not isinstance(output, GeneratorType):
        raise TypeError(
            f'{self._response_callback.__name__} must return a generator')
    return self.resume(output)
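The generator-wrapping __call__ implementations above (see also the earlier __call__ entries) are typical of inline-request decorators. A usage sketch assuming the third-party scrapy-inline-requests package; the URL and selectors are hypothetical:

import scrapy
from inline_requests import inline_requests

class DetailSpider(scrapy.Spider):
    name = 'detail'
    start_urls = ['http://www.example.com/list']

    @inline_requests
    def parse(self, response):
        for href in response.css('a.detail::attr(href)').getall():
            # the decorator resumes the generator with the downloaded response
            detail = yield scrapy.Request(response.urljoin(href))
            yield {'title': detail.css('title::text').get()}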
def parse_rows(self, response):
    for row in exceliter(response, self.header, self.skiprows, self.sheet_name,
                         self.dropna_thresh, self.isEmpty_lineHeader):
        ret = iterate_spider_output(self.parse_row(response, row))
        for result_item in self.process_results(response, ret):
            yield result_item
def parse_nodes(self, response, nodes):
    for selector in nodes:
        ret = iterate_spider_output(self.parse_node(response, selector))
        for result_item in self.process_results(response, ret):
            yield result_item
def parse_rows(self, response):
    for row in pcsviter(response, self.delimiter, self.headers, self.quotechar):
        ret = iterate_spider_output(self.parse_row(response, row))
        for result_item in self.process_results(response, ret):
            yield result_item
def parse_cities(self, response):
    nodes = xmliter(response, "city")
    for selector in nodes:
        ret = iterate_spider_output(self.parse_city(response, selector))
        for result_item in ret:
            yield result_item
def parse_rows(self, response):
    for row in xlsiter(response, self.headers, self.sheet_index):
        ret = iterate_spider_output(self.parse_row(response, row))
        for result_item in self.process_result(response, ret):
            yield result_item
def __call__(self, response):
    output = iterate_spider_output(self.callback(response))
    if isinstance(output, types.GeneratorType):
        return self._unwindGenerator(output)
    else:
        return output
def wrapper(response):
    output = cb(response)
    output = list(iterate_spider_output(output))
def start_requests(self):
    self._postinit_reqs = super(InitSpider, self).start_requests()
    return iterate_spider_output(self.init_request())
def parse_nodes(self, response, nodes):
    """This method is called for the nodes matching the provided tag name
    (itertag). Receives the response and a Selector for each node.
    Overriding this method is mandatory; otherwise, your spider won't work.
    This method must return either a BaseItem, a Request, or a list
    containing any of them.
    """
    spider_config = response.meta.get("spider_config")
    spiders = response.meta.get("spiders")
    context = self.context or {}
    if None in [spider_config]:
        spider_config = self.spider_config
        spiders = self.spiders
    data = {"url": response.url, "domain": get_domain(response.url)}
    for extractor in spider_config.get('extractors', []):
        extracted_items = []
        for selector in nodes:
            ret = iterate_spider_output(
                self.parse_node(response, selector, extractor))
            for result_item in self.process_results(response, ret):
                extracted_items.append(result_item)
        data[extractor['extractor_id']] = {}
        data[extractor['extractor_id']]['entries'] = extracted_items
    context["spider_id"] = spider_config.get("spider_id")
    data['context'] = context
    """
    if spider_traversal_id is None, it means this response originated from a
    request raised by the start urls. If it is not None, the request/response
    was raised by some traversal strategy.
    """
    current_request_traversal_id = response.meta.get(
        'current_request_traversal_id', None)
    """
    In xml crawling, current_request_traversal_page_count starts from 1,
    because there is no page 0.
    """
    current_request_traversal_page_count = response.meta.get(
        'current_request_traversal_page_count', 1)
    """
    Note on current_request_spider_id: this can never be None, including for
    the requests that are started by start_urls.
    """
    spider_config_id = spider_config.get("spider_id")

    spider_traversals = spider_config.get('traversals', [])
    for traversal in spider_traversals:
        next_spider_id = traversal['next_spider_id']
        iter_param = traversal['iter_param']
        next_spider = get_spider_from_list(spider_id=next_spider_id,
                                           spiders=spiders)
        traversal['allow_domains'] = next_spider.get("allowed_domains", [])
        traversal_id = traversal['traversal_id']
        traversal_max_pages = traversal.get('max_pages', 1)
        traversal_links = []
        is_this_request_from_same_traversal = self.is_this_request_from_same_traversal(
            response, traversal)
        print("is_this_request_from_same_traversal",
              is_this_request_from_same_traversal)
        print("current_request_traversal_page_count",
              current_request_traversal_page_count)
        print("traversal_max_pages", traversal_max_pages)
        print(" current_request_traversal_page_count <= traversal_max_pages",
              current_request_traversal_page_count <= traversal_max_pages)
        shall_traverse = False
        if current_request_traversal_id is None:
            """
            start urls will not have this traversal_id set, so we should allow
            them to traverse
            """
            shall_traverse = True
        elif is_this_request_from_same_traversal and \
                current_request_traversal_page_count <= traversal_max_pages:
            """
            This block applies to traversals from the same spider_id,
            i.e. pagination of a spider
            """
            shall_traverse = True
        elif is_this_request_from_same_traversal:
            """ """
            shall_traverse = True
        elif is_this_request_from_same_traversal is False and \
                current_request_traversal_page_count <= traversal_max_pages:
            """
            This is for spider_a traversing to spider_b; this is not pagination,
            but traversing between spiders.
            """
            shall_traverse = True
        print("shall_traverse: {}".format(traversal_id), shall_traverse)
        if shall_traverse:
            current_url = response.url
            # this is already iterating, so ignore the existing iter param
            clean_url_without_iter_param = current_url.split("?")[0] \
                if "?" in current_url else current_url
            print("<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<", clean_url_without_iter_param)
            print("clean_url_without_iter_param", clean_url_without_iter_param)
            traversal_link = "{}?{}={}".format(
                clean_url_without_iter_param, iter_param,
                current_request_traversal_page_count + 1)
            print("traversal_link", traversal_link)
            data[traversal_id] = {"traversal_urls": [traversal_link]}
            """
            Then validate the max_pages logic, if traversal_id's traversal has
            any. This is where further traversal for this traversal_id is decided.
            """
            max_pages = traversal.get("max_pages", 1)
            current_request_traversal_page_count += 1
            """
            we are already incrementing the last number, so using <= might make
            it 6 pages when max_pages is 5
            """
            if current_request_traversal_page_count <= max_pages:
                print("=======current_request_traversal_page_count",
                      current_request_traversal_page_count)
                print("-----------------------------------")
                yield scrapy.Request(
                    traversal_link,
                    callback=self.parse,
                    errback=self.parse_error,
                    meta={
                        "spider_config": next_spider,
                        "spiders": spiders,
                        "current_request_traversal_id": traversal_id,
                        "current_request_traversal_page_count":
                            current_request_traversal_page_count,
                    })
        print("=================================================")
        print("====traversal_links", traversal_id, len(traversal_links),
              traversal_links)
        print("=================================================")
    yield data

    self.post_parse(response=response)
def start_requests(self):
    logging.info("@@@@@ start_requests is called @@@@@")
    self._postinit_reqs = super(PollSpider, self).start_requests()
    return iterate_spider_output(self.login())
def _handle_failure(self, failure, errback):
    if errback:
        results = errback(failure) or ()
        for request_or_item in iterate_spider_output(results):
            yield request_or_item
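_handle_failure above normalizes whatever an errback returns. A sketch of an errback whose single Request return value would be wrapped by iterate_spider_output; the naive retry policy shown is illustrative only:

def errback_retry(self, failure):
    # failure.request is the request that failed; returning a single Request
    # (rather than yielding) is fine because iterate_spider_output wraps it
    request = failure.request
    self.logger.warning('Request failed: %s', request.url)
    # naive retry: re-issue the failed request (no retry budget is tracked here)
    return request.replace(dont_filter=True)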
def __call__(self, response):
    output = iterate_spider_output(self.callback(response=response, **self.kwargs))
    if not isinstance(output, GeneratorType):
        raise ValueError("Callback must return a generator type")
    return self._unwindGenerator(output)
def start_requests(self):
    self._postinit_reqs = super().start_requests()
    return iterate_spider_output(self.init_request())
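The start_requests overrides above postpone the normal start requests until an init request completes. A minimal InitSpider sketch showing where init_request comes from (the login URL and form fields are hypothetical):

from scrapy.http import FormRequest
from scrapy.spiders.init import InitSpider

class LoginFirstSpider(InitSpider):
    name = 'login_first'
    login_page = 'http://www.example.com/login'
    start_urls = ['http://www.example.com/private']

    def init_request(self):
        # the returned Request is normalized by iterate_spider_output in start_requests
        return FormRequest(self.login_page,
                           formdata={'user': 'john', 'pass': 'secret'},
                           callback=self.check_login)

    def check_login(self, response):
        if 'Welcome' in response.text:
            # resume the postponed start_requests
            return self.initialized()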