def get_start_requests(project_path, spider_name):
    """
    Collect the start requests emitted by a spider of a project.

    The working directory is switched to ``project_path`` while the project
    settings and the spider are loaded, and is always restored afterwards,
    even when an error is raised.

    :param project_path: directory to change into before loading settings
    :param spider_name: name handed to the runner's spider loader
    :return: dict with ``finished`` set to True and ``requests`` holding each
        start request after it has been passed through ``process_request``
    """
    original_cwd = os.getcwd()
    try:
        # settings are resolved relative to the current working directory
        os.chdir(project_path)
        project_settings = get_project_settings()
        check_deprecated_settings(project_settings)
        crawler_runner = CrawlerRunner(settings=project_settings)
        # register the spider with the runner, then take the first crawler
        crawler_runner.crawl(crawler_runner.spider_loader.load(spider_name))
        spider = list(crawler_runner.crawlers)[0].spider
        start_requests = list(spider.start_requests())
        # fall back to ``start`` when ``start_requests`` produced nothing
        if not start_requests and hasattr(spider, 'start'):
            start_requests = list(spider.start())
        return {
            'finished': True,
            'requests': [process_request(req) for req in start_requests],
        }
    finally:
        # put the process back where it was, whatever happened above
        os.chdir(original_cwd)
def callback(response):
    """
    Wrap the real callback of a request so that its follow-ups are captured.

    Relies on ``self`` and ``spider`` from the enclosing scope.

    :param response: response handed to the resolved callback
    :return: None; results are stored on ``self.items``, ``self.requests``
        and ``self.response``
    """
    # explicit callback from the arguments wins, otherwise the default one
    handler = self.args.callback or self.default_callback
    # a non-callable value is treated as the name of a spider attribute
    if not callable(handler):
        candidate = getattr(spider, handler, None)
        if callable(candidate):
            handler = candidate
    # execute the real callback and split its output
    items, requests = self.run_callback(response, handler)
    # point every follow-up request at the callback chosen by get_callback
    for followed in requests:
        followed.callback = self.get_callback(followed)
        followed.meta['callback'] = followed.callback
    # accumulate processed items / requests and keep the processed response
    self.items += [process_item(item) for item in items]
    self.requests += [process_request(followed) for followed in requests]
    self.response = process_response(response)
def get_requests(self, lvl=None):
    """
    Return the processed requests stored for one level.

    :param lvl: key into ``self.requests``; when None, the largest key
        present is used (an empty list is returned if there are no keys)
    :return: list with ``process_request`` applied to every selected request
    """
    if lvl is not None:
        # unknown levels simply yield no requests
        selected = self.requests.get(lvl, [])
    else:
        known_levels = list(self.requests.keys())
        selected = self.requests[max(known_levels)] if known_levels else []
    processed = []
    for req in selected:
        # trace each request together with its resolved callback
        print('Request', req, self.get_callback(req))
        processed.append(process_request(req))
    return processed
def callback(response):
    """
    Run the configured callback on a response and record what it produced.

    Relies on ``self`` and ``spider`` from the enclosing scope.

    :param response: response handed to the resolved callback
    :return: None; results are stored on ``self.items``, ``self.requests``
        and ``self.response``
    """
    # use the callback given in the arguments, falling back to 'parse'
    cb = self.args.callback or 'parse'
    # a non-callable value is treated as the name of a spider attribute
    if not callable(cb):
        cb_method = getattr(spider, cb, None)
        if callable(cb_method):
            cb = cb_method
    items, requests = self.run_callback(response, cb)
    # process request callback
    for request in requests:
        request.callback = self.get_callback(request)
        request.meta['callback'] = request.callback
    # process items and requests and response
    self.items += [process_item(item) for item in items]
    self.requests += [process_request(request) for request in requests]
    self.response = process_response(response)