Example #1
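Both snippets below are methods of a scrapy-splash spider and assume the following module-level imports (a sketch; crawlers and Screenshot are project-local helper modules whose import paths depend on the project layout):

    import json
    import time

    from scrapy.linkextractors import LinkExtractor
    from scrapy_splash import SplashRequest
    # 'crawlers' and 'Screenshot' are project-local modules of the surrounding
    # crawler framework; import them from wherever the project defines them.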
        def parse(self, response):
            #print(response.headers)
            #print(response.status)
            if response.status == 504:
                # no response
                #print('504 detected')
                pass

            # LUA ERROR # # TODO: print/display errors
            elif 'error' in response.data:
                if response.data['error'] == 'network99':
                    ## splash restart ##
                    error_retry = response.meta.get('error_retry', 0)
                    if error_retry < 3:
                        error_retry += 1
                        url = response.meta['current_url']
                        father = response.meta['father']

                        self.logger.error(
                            'Splash, ResponseNeverReceived for %s, retry in 10s ...',
                            url)
                        time.sleep(10)
                        yield SplashRequest(url,
                                            self.parse,
                                            errback=self.errback_catcher,
                                            endpoint='execute',
                                            cache_args=['lua_source'],
                                            meta={
                                                'father': father,
                                                'current_url': url,
                                                'error_retry': error_retry
                                            },
                                            args=self.build_request_arg(
                                                response.cookiejar))
                    else:
                        print('Connection to proxy refused')
                else:
                    print(response.data['error'])

            elif response.status != 200:
                print('other response: {}'.format(response.status))
                # detect connection to proxy refused
                error_log = json.loads(response.body.decode())
                print(error_log)
            #elif crawlers.is_redirection(self.domains[0], response.data['last_url']):
            #    pass # ignore response
            else:

                item_id = crawlers.create_item_id(self.item_dir,
                                                  self.domains[0])
                self.save_crawled_item(item_id, response.data['html'])
                crawlers.create_item_metadata(item_id, self.domains[0],
                                              response.data['last_url'],
                                              self.port,
                                              response.meta['father'])

                if self.root_key is None:
                    self.root_key = item_id
                    crawlers.add_domain_root_item(item_id, self.domain_type,
                                                  self.domains[0],
                                                  self.date_epoch, self.port)
                    crawlers.create_domain_metadata(self.domain_type,
                                                    self.domains[0], self.port,
                                                    self.full_date,
                                                    self.date_month)

                if 'cookies' in response.data:
                    all_cookies = response.data['cookies']
                else:
                    all_cookies = []

                # SCREENSHOT
                if 'png' in response.data and self.png:
                    sha256_string = Screenshot.save_crawled_screeshot(
                        response.data['png'],
                        5000000,
                        f_save=self.requested_mode)
                    if sha256_string:
                        Screenshot.save_item_relationship(
                            sha256_string, item_id)
                        Screenshot.save_domain_relationship(
                            sha256_string, self.domains[0])
                # HAR
                if 'har' in response.data and self.har:
                    crawlers.save_har(self.har_dir, item_id,
                                      response.data['har'])

                le = LinkExtractor(allow_domains=self.domains, unique=True)
                for link in le.extract_links(response):
                    l_cookies = self.build_request_arg(all_cookies)
                    yield SplashRequest(link.url,
                                        self.parse,
                                        errback=self.errback_catcher,
                                        endpoint='execute',
                                        meta={
                                            'father': item_id,
                                            'current_url': link.url
                                        },
                                        args=l_cookies)
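Both examples build the Splash args through self.build_request_arg(...), which is not shown above. A minimal sketch of such a helper, assuming the spider keeps its Lua script in a self.splash_lua_script attribute and uses fixed timeouts (both assumptions, not taken from the source), could be:

        def build_request_arg(self, cookies):
            # Sketch only: pack the Lua script, the cookies to restore and the
            # timeouts into the args dict passed to the Splash 'execute' endpoint.
            return {'wait': 10,
                    'resource_timeout': 30,
                    'cookies': cookies,
                    'lua_source': self.splash_lua_script}  # assumed attribute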
Example #2
        def parse(self, response):
            #print(response.headers)
            #print(response.status)
            #print(response.meta)
            #print(response.data) # # TODO: handle lua script error
            #{'type': 'ScriptError', 'info': {'error': "'}' expected (to close '{' at line 47) near 'error_retry'",
            #'message': '[string "..."]:53: \'}\' expected (to close \'{\' at line 47) near \'error_retry\'',
            #'type': 'LUA_INIT_ERROR', 'source': '[string "..."]', 'line_number': 53},
            #'error': 400, 'description': 'Error happened while executing Lua script'}
            if response.status == 504:
                # no response
                #print('504 detected')
                pass

            # LUA ERROR # # TODO: log errors
            elif 'error' in response.data:
                if response.data['error'] == 'network99':
                    ## splash restart ##
                    error_retry = response.meta.get('error_retry', 0)
                    if error_retry < 3:
                        error_retry += 1
                        url = response.data['last_url']
                        father = response.meta['father']

                        self.logger.error(
                            'Splash, ResponseNeverReceived for %s, retry in 10s ...',
                            url)
                        time.sleep(10)
                        if 'cookies' in response.data:
                            all_cookies = response.data[
                                'cookies']  # # TODO:  use initial cookie ?????
                        else:
                            all_cookies = []
                        l_cookies = self.build_request_arg(all_cookies)
                        yield SplashRequest(url,
                                            self.parse,
                                            errback=self.errback_catcher,
                                            endpoint='execute',
                                            dont_filter=True,
                                            meta={
                                                'father': father,
                                                'current_url': url,
                                                'error_retry': error_retry
                                            },
                                            args=l_cookies)
                    else:
                        if self.requested_mode == 'test':
                            crawlers.save_test_ail_crawlers_result(
                                False, 'Connection to proxy refused')
                        print('Connection to proxy refused')
                elif response.data['error'] == 'network3':
                    if self.requested_mode == 'test':
                        crawlers.save_test_ail_crawlers_result(
                            False,
                            'HostNotFoundError: the remote host name was not found (invalid hostname)'
                        )
                    print(
                        'HostNotFoundError: the remote host name was not found (invalid hostname)'
                    )
                else:
                    if self.requested_mode == 'test':
                        crawlers.save_test_ail_crawlers_result(
                            False, response.data['error'])
                    print(response.data['error'])

            elif response.status != 200:
                print('other response: {}'.format(response.status))
                # detect connection to proxy refused
                error_log = json.loads(response.body.decode())
                print(error_log)
            #elif crawlers.is_redirection(self.domains[0], response.data['last_url']):
            #    pass # ignore response
            else:
                ## TEST MODE ##
                if self.requested_mode == 'test':
                    if 'It works!' in response.data['html']:
                        crawlers.save_test_ail_crawlers_result(
                            True, 'It works!')
                    else:
                        print('TEST ERROR')
                        crawlers.save_test_ail_crawlers_result(
                            False, 'TEST ERROR')
                    return
                ## -- ##

                item_id = crawlers.create_item_id(self.item_dir,
                                                  self.domains[0])
                self.save_crawled_item(item_id, response.data['html'])
                crawlers.create_item_metadata(item_id, self.domains[0],
                                              response.data['last_url'],
                                              self.port,
                                              response.meta['father'])

                if self.root_key is None:
                    self.root_key = item_id
                    crawlers.add_domain_root_item(item_id, self.domain_type,
                                                  self.domains[0],
                                                  self.date_epoch, self.port)
                    crawlers.create_domain_metadata(self.domain_type,
                                                    self.domains[0], self.port,
                                                    self.full_date,
                                                    self.date_month)

                if 'cookies' in response.data:
                    all_cookies = response.data['cookies']
                else:
                    all_cookies = []

                # SCREENSHOT
                if 'png' in response.data and self.png:
                    sha256_string = Screenshot.save_crawled_screeshot(
                        response.data['png'],
                        5000000,
                        f_save=self.requested_mode)
                    if sha256_string:
                        Screenshot.save_item_relationship(
                            sha256_string, item_id)
                        Screenshot.save_domain_relationship(
                            sha256_string, self.domains[0])
                # HAR
                if 'har' in response.data and self.har:
                    crawlers.save_har(self.har_dir, item_id,
                                      response.data['har'])

                le = LinkExtractor(allow_domains=self.domains, unique=True)
                for link in le.extract_links(response):
                    l_cookies = self.build_request_arg(all_cookies)
                    yield SplashRequest(link.url,
                                        self.parse,
                                        errback=self.errback_catcher,
                                        endpoint='execute',
                                        meta={
                                            'father': item_id,
                                            'current_url': link.url
                                        },
                                        args=l_cookies)
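For context, a parse callback like the ones above is normally seeded from the spider's start_requests. A minimal sketch, assuming the start URL, the initial 'father' item id and the initial cookies are stored on the spider as self.start_url, self.original_item and self.cookies (assumed names), could look like this:

        def start_requests(self):
            # Sketch only: the first request carries the same meta keys
            # ('father', 'current_url') that parse() reads later on.
            yield SplashRequest(self.start_url,
                                self.parse,
                                errback=self.errback_catcher,
                                endpoint='execute',
                                meta={
                                    'father': self.original_item,
                                    'current_url': self.start_url
                                },
                                args=self.build_request_arg(self.cookies))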