Esempio n. 1
0
def get_start_requests(project_path, spider_name):
    """
    Load a spider from a project and collect its initial requests.

    The working directory is switched to ``project_path`` while the spider
    is loaded, and is always restored afterwards, even when an error occurs.

    :param project_path: directory to switch into before loading settings
    :param spider_name: name handed to the runner's spider loader
    :return: dict with ``finished`` set to True and ``requests`` holding the
        result of ``process_request`` for every collected request
    """
    original_cwd = os.getcwd()
    try:
        # settings are loaded only after entering the project directory
        os.chdir(project_path)
        project_settings = get_project_settings()
        check_deprecated_settings(project_settings)
        runner = CrawlerRunner(settings=project_settings)

        # register a crawler for the spider, then pick the first crawler
        # known to the runner and take its spider instance
        runner.crawl(runner.spider_loader.load(spider_name))
        first_crawler = list(runner.crawlers)[0]
        spider = first_crawler.spider

        collected = list(spider.start_requests())
        if not collected:
            # nothing came out of start_requests: try ``start`` if present
            if hasattr(spider, 'start'):
                collected = list(spider.start())

        return {
            'finished': True,
            'requests': [process_request(req) for req in collected],
        }
    finally:
        # restore the caller's working directory no matter what happened
        os.chdir(original_cwd)
Esempio n. 2
0
        def callback(response):
            """
            Wrap the real callback of a request so that the items and
            follow-up requests it produces can be collected.

            :param response: response passed on to the real callback
            :return: None; results are stored on ``self.items``,
                ``self.requests`` and ``self.response``
            """
            # prefer the callback supplied in the args, else the default one
            handler = self.args.callback or self.default_callback

            # a non-callable value is looked up as an attribute of the spider;
            # it is kept unchanged when no callable attribute is found
            if not callable(handler):
                spider_attr = getattr(spider, handler, None)
                handler = spider_attr if callable(spider_attr) else handler

            # run the real callback to obtain items and follow-up requests
            items, requests = self.run_callback(response, handler)

            # replace each request's callback and mirror it into its meta
            for follow_up in requests:
                follow_up.callback = self.get_callback(follow_up)
                follow_up.meta['callback'] = follow_up.callback

            # store processed items, requests and the processed response
            self.items += [process_item(item) for item in items]
            self.requests += [
                process_request(follow_up) for follow_up in requests
            ]
            self.response = process_response(response)
Esempio n. 3
0
    def get_requests(self, lvl=None):
        """
        Return the processed requests stored for one level.

        :param lvl: key into ``self.requests``; when None, the largest key
            present is used
        :return: list with the ``process_request`` result of each request,
            empty when nothing is stored for the selected level
        """
        if lvl is not None:
            selected = self.requests.get(lvl, [])
        else:
            known_levels = list(self.requests.keys())
            selected = self.requests[max(known_levels)] if known_levels else []

        processed = []
        for req in selected:
            # print each request together with its resolved callback
            print('Request', req, self.get_callback(req))
            processed.append(process_request(req))
        return processed
Esempio n. 4
0
        def callback(response):
            """
            Run the configured callback on a response and collect its output.

            :param response: response passed on to the real callback
            :return: None; results are stored on ``self.items``,
                ``self.requests`` and ``self.response``
            """
            # use the callback supplied in the args, else the name 'parse'
            cb = self.args.callback or 'parse'

            # a non-callable value is looked up as an attribute of the spider;
            # it is kept unchanged when no callable attribute is found
            if not callable(cb):
                cb_method = getattr(spider, cb, None)
                if callable(cb_method):
                    cb = cb_method

            # run the real callback to obtain items and follow-up requests
            items, requests = self.run_callback(response, cb)

            # replace each request's callback and mirror it into its meta
            for follow_up in requests:
                follow_up.callback = self.get_callback(follow_up)
                follow_up.meta['callback'] = follow_up.callback

            # store processed items, requests and the processed response
            self.items += [process_item(item) for item in items]
            self.requests += [
                process_request(follow_up) for follow_up in requests
            ]
            self.response = process_response(response)