Example #1
    def run(self):
        """Starts the crawling process the listed websites. The results queue
        will start filling up with image URLs.
        """
        self.__running = True

        # Open up all browser windows
        for i in range(self.__browser_instance_cnt):
            if not self.__running:
                break  # End prematurely

            browser = Driver(executable_path=paths.driver)

            # Set up the browser to be closable
            self.__browser_close_methods.append(browser.quit)

            # Set the page timeout
            browser.set_page_load_timeout(self.__load_timeout)
            self.__browser_pool.put(browser)

        crawl_threads = []

        # Starts crawling the page and returns the given browser to the pool
        # when finished
        def crawl_and_return_to_pool(url, browser):
            progress_weight = (1 / len(self.__website_list)) * 100
            self._crawl_page(url, browser, progress_weight)
            self.__browser_pool.put(browser)

        # Start crawling each URL
        for url in self.__website_list:
            if not self.__running:
                break  # End prematurely

            # Wait for an unused browser instance
            browser = self.__browser_pool.get()
            # Start crawling
            thread = Thread(target=crawl_and_return_to_pool,
                            args=(url, browser))
            thread.start()
            crawl_threads.append(thread)

        # Wait for crawling to finish
        for thread in crawl_threads:
            thread.join()

        self._close_browsers()

        self.__running = False
        self.__is_finished = True
Example #2
    def get_driver(self, ua, proxy=None):
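        # Build the PhantomJS service args; if a proxy URL is supplied, pass its
        # host and scheme along as --proxy / --proxy-type flags.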
        if not proxy:
            service_args = self.service_args
        else:
            service_args = self.service_args.copy()
            p = urlsplit(proxy)
            service_args += [
                "--proxy=" + (p.netloc or p.path),
                "--proxy-type=" + (p.scheme or "http")
            ]

        dcap = self.dcap.copy()
        dcap["phantomjs.page.settings.userAgent"] = ua

        driver = PhantomJS(executable_path=self.executable_path,
                           service_args=service_args,
                           desired_capabilities=dcap)
        driver.set_page_load_timeout(self.page_load_timeout)
        driver.set_script_timeout(self.page_load_timeout)
        return driver
Example #3
    def get_driver(self, request):
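        # Pick up an optional per-request proxy from request.meta and translate
        # it into PhantomJS --proxy / --proxy-type service arguments.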
        proxy = request.meta.get("proxy", None)
        if not proxy:
            service_args = self.service_args
        else:
            service_args = self.service_args.copy()
            p = urlparse(proxy)
            service_args += [
                "--proxy=" + (p.netloc or p.path),
                "--proxy-type=" + (p.scheme or "http")
            ]

        ua = request.headers.get(
            'User-Agent',
            b'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:46.0) Gecko/20100101 Firefox/59.0'
        )
        self.dcap["phantomjs.page.settings.userAgent"] = ua.decode()

        driver = PhantomJS(executable_path=self.executable_path,
                           service_args=service_args,
                           desired_capabilities=self.dcap)
        driver.set_page_load_timeout(self.page_load_timeout)
        driver.set_script_timeout(self.page_load_timeout)
        return driver
Example #4
class RequestUtil:
    __browserAgent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0'

    def __init__(self):
        self.cookies = ''
        self._lock = threading.RLock()

    def http_get_request(self, url, referer, timeout=''):
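        # Perform a GET with a fresh cookie jar; on the first call the cookies
        # are cached in self.cookies for reuse by later POST requests.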
        self._lock.acquire()
        cookie = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie),
                                      SmartRedirectHandler())
        urllib2.install_opener(opener)
        headers = {
            'User-Agent': self.__browserAgent,
            'Referer': referer,
            'Cache-Control': 'max-age=0',
            'Accept': '*/*',
            'Connection': 'Keep-Alive',
            'Accept-encoding': 'gzip'
        }
        req = urllib2.Request(url=url, headers=headers)
        if timeout == '':
            open = urllib2.urlopen(req)
        else:
            open = urllib2.urlopen(req, timeout=timeout)
        if self.cookies == '':
            for item in cookie:
                self.cookies = self.cookies + item.name + '=' + item.value + ';'
            self.cookies = self.cookies[:-1]
        if url != open.url:
            req = urllib2.Request(url=open.url, headers=headers)
        self._lock.release()
        return (open, req)

    def http_post_request(self, url, datas, referer, timeout=''):
        self._lock.acquire()
        postdata = urllib.urlencode(datas)
        headers = {
            'User-Agent': self.__browserAgent,
            'Referer': referer,
            'Content-Type': 'application/x-www-form-urlencoded',
            'Cache-Control': 'no-cache',
            'Accept': '*/*',
            'Connection': 'Keep-Alive',
            'Accept-encoding': 'gzip',
            'Cookie': self.cookies
        }
        req = urllib2.Request(url=url, data=postdata, headers=headers)
        req.get_host()
        if timeout == '':
            open = urllib2.urlopen(req)
        else:
            open = urllib2.urlopen(req, timeout=timeout)
        if url != open.url:
            req = urllib2.Request(url=open.url, headers=headers)
        self._lock.release()
        return (open, req)

    def http_get(self, url, refer='https://www.baidu.com'):
        return self.http_get_request(url, refer, 60)

    def http_post(self, url, datas, refer='https://www.baidu.com'):
        return self.http_post_request(url, datas, refer, 60)

    def http_post_request2(self, url, datas, timeout=''):
        if timeout == '':
            open = urllib2.urlopen(url, datas)
        else:
            open = urllib2.urlopen(url, datas, timeout=timeout)
        data = open.read()
        return data

    def http_post2(self, url, datas):
        return self.http_post_request2(url, datas, 300)

    def create_phandomjs(self, service_args, caps, timeout=30):
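        # Start a PhantomJS driver with the given capabilities and service args,
        # applying the same timeout to page loads, scripts and implicit waits.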
        self.driver = PhantomJS(desired_capabilities=caps,
                                service_args=service_args)
        self.driver.set_page_load_timeout(timeout)
        self.driver.set_script_timeout(timeout)
        self.driver.implicitly_wait(timeout)

    def close_phandomjs(self):
        try:
            self.driver.quit()
        except:
            pass

    def http_get_phandomjs(self,
                           url,
                           refer='https://www.baidu.com',
                           timeout=1000):
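        # Render the page headlessly with images disabled and disk cache on,
        # then return the resulting DOM via driver.page_source.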
        caps = dict(DesiredCapabilities.PHANTOMJS)
        caps['browserName'] = 'chrome'
        caps["phantomjs.page.settings.resourceTimeout"] = timeout
        caps["phantomjs.page.settings.loadImages"] = False
        caps["phantomjs.page.settings.userAgent"] = (self.__browserAgent)
        caps["phantomjs.page.customHeaders.Referer"] = (refer)

        service_args = []
        service_args.append('--load-images=no')
        service_args.append('--disk-cache=yes')
        service_args.append('--cookies-file=')

        self.create_phandomjs(timeout=timeout,
                              service_args=service_args,
                              caps=caps)
        self.driver.get(url)
        return self.driver.page_source
Example #5
class PagesCrawler(BaseSpider):

    name = 'pages'
    link_extractor = RegexpLinkExtractor(canonicalize=False, deny_extensions=[])
    ignored_exts = set(['.' + e for e in IGNORED_EXTENSIONS])

    def __init__(self, **kw):
        args = DEFAULT_INPUT.copy()
        args.update(kw)
        self.args = args
        self.start_urls = to_list(args['start_urls'])
        self.maxdepth = int(args['maxdepth'])
        self.follow_prefixes = to_list(args['follow_prefixes'])
        self.nofollow_prefixes = to_list(args['nofollow_prefixes'])
        self.discover_prefixes = [url_to_lru_clean("http%s://%s" % (https, u.replace('http://', '').replace('https://', ''))) for u in to_list(args['discover_prefixes']) for https in ['', 's']]
        self.resolved_links = {}
        self.user_agent = args['user_agent']
        self.phantom = 'phantom' in args and args['phantom'] and args['phantom'].lower() != "false"
        if self.phantom:
            self.ph_timeout = int(args.get('phantom_timeout', PHANTOM['TIMEOUT']))
            self.ph_idle_timeout = int(args.get('phantom_idle_timeout', PHANTOM['IDLE_TIMEOUT']))
            self.ph_ajax_timeout = int(args.get('phantom_ajax_timeout', PHANTOM['AJAX_TIMEOUT']))
        self.errors = 0
        dispatcher.connect(self.closed, spider_closed)
        dispatcher.connect(self.crashed, spider_error)

    def start_requests(self):
        self.log("Starting crawl task - jobid: %s" % self.crawler.settings['JOBID'], log.INFO)
        self.log("ARGUMENTS : "+str(self.args), log.INFO)
        if self.phantom:
            self.init_phantom()
        for url in self.start_urls:
            yield self._request(url)

    def init_phantom(self):
        self.prefixfiles = os.path.join(
            scrapyd_config().get('logs_dir'),
            HYPHE_PROJECT,
            self.name,
            self.crawler.settings['JOBID']
        )
        self.log("Using path %s for PhantomJS crawl" % self.prefixfiles, log.INFO)
        phantom_args = []
        if PROXY and not PROXY.startswith(':'):
            phantom_args.append('--proxy=%s' % PROXY)
        phantom_args.append('--cookies-file=%s-phantomjs-cookie.txt' % self.prefixfiles)
        phantom_args.append('--ignore-ssl-errors=true')
        phantom_args.append('--load-images=false')
        self.capabilities = dict(DesiredCapabilities.PHANTOMJS)
        self.capabilities['phantomjs.page.settings.userAgent'] = self.user_agent
        self.capabilities['takesScreenshot'] = False
        self.capabilities['phantomjs.page.settings.javascriptCanCloseWindows'] = False
        self.capabilities['phantomjs.page.settings.javascriptCanOpenWindows'] = False
        self.phantom = PhantomJS(
            executable_path=PHANTOM['PATH'],
            service_args=phantom_args,
            desired_capabilities=self.capabilities,
            service_log_path="%s-phantomjs.log" % self.prefixfiles
        )
        self.phantom.implicitly_wait(10)
        self.phantom.set_page_load_timeout(60)
        self.phantom.set_script_timeout(self.ph_timeout + 15)

    def crashed(self, spider):
        self.errors += 1
        self.closed("CRASH")

    def closed(self, reason):
        if self.errors:
            self.log("%s error%s encountered during the crawl." %
                (self.errors, 's' if self.errors > 1 else ''), log.ERROR)
        if self.phantom:
            self.phantom.quit()
            if not self.errors:
                for f in ["phantomjs-cookie.txt", "phantomjs.log"]:
                    fi = "%s-%s" % (self.prefixfiles, f)
                    if os.path.exists(fi) and not self.errors:
                        os.remove(fi)

    def handle_response(self, response):
        lru = url_to_lru_clean(response.url)

        if self.phantom:
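            # When PhantomJS is enabled, re-render the page in the headless
            # browser and replace the response body with the fully loaded DOM.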
            self.phantom.get(response.url)

            # Collect whole DOM of the webpage including embedded iframes
            with open(os.path.join(PHANTOM["JS_PATH"], "get_iframes_content.js")) as js:
                get_bod_w_iframes = js.read()
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

            # Try to scroll and unfold page
            self.log("Start PhantomJS scrolling and unfolding", log.INFO)
            with open(os.path.join(PHANTOM["JS_PATH"], "scrolldown_and_unfold.js")) as js:
                try:
                    signal.signal(signal.SIGALRM, timeout_alarm)
                    signal.alarm(self.ph_timeout + 30)
                    timedout = self.phantom.execute_async_script(
                        js.read(), self.ph_timeout,
                        self.ph_idle_timeout, self.ph_ajax_timeout)
                    signal.alarm(0)
                    if timedout:
                        raise SeleniumTimeout
                    self.log("Scrolling/Unfolding finished", log.INFO)
                except SeleniumTimeout:
                    self.log("Scrolling/Unfolding timed-out (%ss)" % self.ph_timeout, log.WARNING)
                    self.errors += 1
                except WebDriverException as e:
                    err = json.loads(e.msg)['errorMessage']
                    self.log("Scrolling/Unfolding crashed: %s" % err, log.ERROR)
                    self.errors += 1
                except Exception as e:
                    self.log("Scrolling/Unfolding crashed: %s %s" % (type(e), e), log.ERROR)
                    self.errors += 1
                    return self._make_raw_page(response, lru)
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

        # Clean up pages with embedded base64 images that make Scrapy treat them as non-HTML responses
        if response.status == 200 and not isinstance(response, HtmlResponse):
            try:
                flags = response.flags
                if "partial" in flags:
                    flags.remove("partial")
                flags.append("cleaned")
                response = HtmlResponse(response.url, headers=response.headers, body=cleanupbase64images(response.body), flags=flags, request=response.request)
                self.log("WARNING: page with base64 embedded images was cleaned-up for links extraction")
            except:
                pass

        if 300 < response.status < 400 or isinstance(response, HtmlResponse):
            return self.parse_html(response, lru)
        else:
            return self._make_raw_page(response, lru)

    def handle_error(self, failure, response=None):
        if response:
            p = self._make_raw_page(response, failure.request.url)
            p['error'] = error_name(failure.value)
            return p
        elif not "://www" in failure.request.url:
            return self._request(failure.request.url.replace('://', '://www.'))
        self.log("ERROR : %s" % failure.getErrorMessage(), log.ERROR)
        self.errors += 1
        return

    def parse_html(self, response, lru):
        lrulinks = []
        # handle redirects
        realdepth = response.meta['depth']
        if 300 < response.status < 400:
            redir_url = response.headers['Location']
            if redir_url.startswith('/'):
                redir_url = "%s%s" % (lru_get_host_url(lru).strip('/'), redir_url)
            elif redir_url.startswith('./') or not redir_url.startswith('http'):
                redir_url = "%s%s" % (lru_get_path_url(lru).strip('/'), redir_url[1:])
            links = [{'url': redir_url}]
            response.meta['depth'] -= 1
        else:
            try:
                links = self.link_extractor.extract_links(response)
            except Exception as e:
                self.log("ERROR: links extractor crashed on %s: %s %s" % (response, type(e), e), log.ERROR)
                links = []
                self.errors += 1
        for link in links:
            try:
                url = link.url
            except AttributeError:
                url = link['url']
            try:
                lrulink = url_to_lru_clean(url)
            except ValueError as e:
                self.log("Error converting URL %s to LRU: %s" % (url, e), log.ERROR)
                continue
            lrulinks.append((url, lrulink))
            if self._should_follow(response.meta['depth'], lru, lrulink) and \
                    not url_has_any_extension(url, self.ignored_exts):
                yield self._request(url)
        response.meta['depth'] = realdepth
        yield self._make_html_page(response, lru, lrulinks)
Example #6
class PagesCrawler(BaseSpider):

    name = 'pages'
    link_extractor = RegexpLinkExtractor(canonicalize=False,
                                         deny_extensions=[])
    ignored_exts = set(['.' + e for e in IGNORED_EXTENSIONS])

    def __init__(self, **kw):
        args = DEFAULT_INPUT.copy()
        args.update(kw)
        self.args = args
        self.start_urls = to_list(args['start_urls'])
        self.maxdepth = int(args['maxdepth'])
        self.follow_prefixes = to_list(args['follow_prefixes'])
        self.nofollow_prefixes = to_list(args['nofollow_prefixes'])
        self.discover_prefixes = [
            url_to_lru_clean(
                "http%s://%s" %
                (https, u.replace('http://', '').replace('https://', '')))
            for u in to_list(args['discover_prefixes']) for https in ['', 's']
        ]
        self.resolved_links = {}
        self.user_agent = args['user_agent']
        self.phantom = 'phantom' in args and args[
            'phantom'] and args['phantom'].lower() != "false"
        if self.phantom:
            self.ph_timeout = int(
                args.get('phantom_timeout', PHANTOM['TIMEOUT']))
            self.ph_idle_timeout = int(
                args.get('phantom_idle_timeout', PHANTOM['IDLE_TIMEOUT']))
            self.ph_ajax_timeout = int(
                args.get('phantom_ajax_timeout', PHANTOM['AJAX_TIMEOUT']))
        self.errors = 0
        dispatcher.connect(self.closed, spider_closed)
        dispatcher.connect(self.crashed, spider_error)

    def start_requests(self):
        self.log(
            "Starting crawl task - jobid: %s" % self.crawler.settings['JOBID'],
            log.INFO)
        self.log("ARGUMENTS : " + str(self.args), log.INFO)
        if self.phantom:
            self.init_phantom()
        for url in self.start_urls:
            yield self._request(url)

    def init_phantom(self):
        self.prefixfiles = os.path.join(scrapyd_config().get('logs_dir'),
                                        HYPHE_PROJECT, self.name,
                                        self.crawler.settings['JOBID'])
        self.log("Using path %s for PhantomJS crawl" % self.prefixfiles,
                 log.INFO)
        phantom_args = []
        if PROXY and not PROXY.startswith(':'):
            phantom_args.append('--proxy=%s' % PROXY)
        phantom_args.append('--cookies-file=%s-phantomjs-cookie.txt' %
                            self.prefixfiles)
        phantom_args.append('--ignore-ssl-errors=true')
        phantom_args.append('--load-images=false')
        self.capabilities = dict(DesiredCapabilities.PHANTOMJS)
        self.capabilities[
            'phantomjs.page.settings.userAgent'] = self.user_agent
        self.capabilities['takesScreenshot'] = False
        self.capabilities[
            'phantomjs.page.settings.javascriptCanCloseWindows'] = False
        self.capabilities[
            'phantomjs.page.settings.javascriptCanOpenWindows'] = False
        self.phantom = PhantomJS(executable_path=PHANTOM['PATH'],
                                 service_args=phantom_args,
                                 desired_capabilities=self.capabilities,
                                 service_log_path="%s-phantomjs.log" %
                                 self.prefixfiles)
        self.phantom.implicitly_wait(10)
        self.phantom.set_page_load_timeout(60)
        self.phantom.set_script_timeout(self.ph_timeout + 15)

    def crashed(self, spider):
        self.errors += 1
        self.closed("CRASH")

    def closed(self, reason):
        if self.errors:
            self.log(
                "%s error%s encountered during the crawl." %
                (self.errors, 's' if self.errors > 1 else ''), log.ERROR)
        if self.phantom:
            self.phantom.quit()
            if not self.errors:
                for f in ["phantomjs-cookie.txt", "phantomjs.log"]:
                    fi = "%s-%s" % (self.prefixfiles, f)
                    if os.path.exists(fi) and not self.errors:
                        os.remove(fi)

    def handle_response(self, response):
        lru = url_to_lru_clean(response.url)

        if self.phantom:
            self.phantom.get(response.url)

            # Collect whole DOM of the webpage including embedded iframes
            with open(
                    os.path.join(PHANTOM["JS_PATH"],
                                 "get_iframes_content.js")) as js:
                get_bod_w_iframes = js.read()
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

            # Try to scroll and unfold page
            self.log("Start PhantomJS scrolling and unfolding", log.INFO)
            with open(
                    os.path.join(PHANTOM["JS_PATH"],
                                 "scrolldown_and_unfold.js")) as js:
                try:
                    signal.signal(signal.SIGALRM, timeout_alarm)
                    signal.alarm(self.ph_timeout + 30)
                    timedout = self.phantom.execute_async_script(
                        js.read(), self.ph_timeout, self.ph_idle_timeout,
                        self.ph_ajax_timeout)
                    signal.alarm(0)
                    if timedout:
                        raise SeleniumTimeout
                    self.log("Scrolling/Unfolding finished", log.INFO)
                except SeleniumTimeout:
                    self.log(
                        "Scrolling/Unfolding timed-out (%ss)" %
                        self.ph_timeout, log.WARNING)
                    self.errors += 1
                except WebDriverException as e:
                    err = json.loads(e.msg)['errorMessage']
                    self.log("Scrolling/Unfolding crashed: %s" % err,
                             log.ERROR)
                    self.errors += 1
                except Exception as e:
                    self.log(
                        "Scrolling/Unfolding crashed: %s %s" % (type(e), e),
                        log.ERROR)
                    self.errors += 1
                    return self._make_raw_page(response, lru)
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

        # Clean up pages with embedded base64 images that make Scrapy treat them as non-HTML responses
        if response.status == 200 and not isinstance(response, HtmlResponse):
            try:
                flags = response.flags
                if "partial" in flags:
                    flags.remove("partial")
                flags.append("cleaned")
                response = HtmlResponse(response.url,
                                        headers=response.headers,
                                        body=cleanupbase64images(
                                            response.body),
                                        flags=flags,
                                        request=response.request)
                self.log(
                    "WARNING: page with base64 embedded images was cleaned-up for links extraction"
                )
            except:
                pass

        if 300 < response.status < 400 or isinstance(response, HtmlResponse):
            return self.parse_html(response, lru)
        else:
            return self._make_raw_page(response, lru)

    def handle_error(self, failure, response=None):
        if response:
            p = self._make_raw_page(response, failure.request.url)
            p['error'] = error_name(failure.value)
            return p
        elif not "://www" in failure.request.url:
            return self._request(failure.request.url.replace('://', '://www.'))
        error = failure.getErrorMessage()
        self.log("ERROR : %s" % error, log.ERROR)
        if PROXY and not PROXY.startswith(
                ':') and "OpenSSL.SSL.Error" in error:
            return self._request(failure.request.url, noproxy=True)
        self.errors += 1
        return

    def parse_html(self, response, lru):
        lrulinks = []
        # handle redirects
        realdepth = response.meta['depth']
        if 300 < response.status < 400:
            redir_url = response.headers['Location']
            if redir_url.startswith('/'):
                redir_url = "%s%s" % (lru_get_host_url(lru).strip('/'),
                                      redir_url)
            elif redir_url.startswith(
                    './') or not redir_url.startswith('http'):
                redir_url = "%s%s" % (lru_get_path_url(lru).strip('/'),
                                      redir_url[1:])
            links = [{'url': redir_url}]
            response.meta['depth'] -= 1
        else:
            try:
                links = self.link_extractor.extract_links(response)
            except Exception as e:
                self.log(
                    "ERROR: links extractor crashed on %s: %s %s" %
                    (response, type(e), e), log.ERROR)
                links = []
                self.errors += 1
        for link in links:
            try:
                url = link.url
            except AttributeError:
                url = link['url']
            try:
                lrulink = url_to_lru_clean(url)
            except ValueError as e:
                self.log("Error converting URL %s to LRU: %s" % (url, e),
                         log.ERROR)
                continue
            lrulinks.append((url, lrulink))
            if self._should_follow(response.meta['depth'], lru, lrulink) and \
                    not url_has_any_extension(url, self.ignored_exts):
                yield self._request(url)
        response.meta['depth'] = realdepth
        yield self._make_html_page(response, lru, lrulinks)
Example #7
class CNStock(SentimentCrawler):
    def __init__(self):
        super().__init__(init=False)
        self.driver = PhantomJS()
        self.driver.maximize_window()
        self.wait = WebDriverWait(self.driver, 15)
        self.url = 'http://www.cnstock.com/'
        self.name = '中国证券网'

    def crawl_main_page(self, keyword):
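        # Open the site's home page, type the keyword into the search box and
        # submit it with ENTER, then crawl the resulting search pages.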
        self.driver.set_page_load_timeout(10)
        try:
            self.driver.get(self.url)
        except TimeoutException:
            self.driver.execute_script('window.stop();')

        try:
            self.wait.until(
                ec.presence_of_element_located((By.ID, 'nav_keywords')))
        except:
            CustomLogging.log_to_file('中国证券网打开失败', LogType.ERROR)

        self.driver.find_element_by_id('nav_keywords').clear()
        self.driver.find_element_by_id('nav_keywords').send_keys(keyword +
                                                                 Keys.ENTER)

        return self.crawl_search_results()

    def crawl_search_results(self):
        search_results = []
        self.driver.switch_to.window(self.driver.window_handles[-1])
        self.driver.maximize_window()

        exit_flag = 0
        while True:
            try:
                self.wait.until(
                    ec.presence_of_element_located(
                        (By.CLASS_NAME, 'result-cont')))
            except TimeoutException:
                CustomLogging.log_to_file('中国证券网搜索结果页错误', LogType.ERROR)
                break

            try:
                result_articles = self.driver.find_elements_by_class_name(
                    'result-article')

                for each_article in result_articles:
                    item = Entity()

                    publish_date = each_article.find_element_by_class_name(
                        'g').text
                    item.publish_date = re.search(
                        re.compile(
                            '[1-9]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[1-2][0-9]|3[0-1])\s+(20|21|22|23|[0-1]\d):[0-5]\d'
                        ), publish_date).group()

                    if not in_date_range(
                            conv_pub_date(item.publish_date, 'cnstock'),
                            self.year_range):
                        exit_flag = 1
                        # break out of the for loop
                        break
                    item.short_description = each_article.find_element_by_class_name(
                        'des').text
                    item.title = each_article.find_element_by_tag_name(
                        'a').text
                    if self.keyword not in item.short_description and self.keyword not in item.title:
                        continue

                    if item.title in self.titles:
                        continue
                    else:
                        self.titles.append(item.title)

                    item.url = each_article.find_element_by_tag_name(
                        'a').get_attribute('href')
                    threading.Thread(target=super().download_and_save_item,
                                     args=(item, )).start()

                if exit_flag == 1:
                    break
            except NoSuchElementException:
                CustomLogging.log_to_file('没有搜索结果', LogType.INFO)
                break

            try:
                next_page = self.driver.find_element_by_xpath(
                    '//div[@class="pagination pagination-centered"]//a[contains(text(), "下一页")]'
                )
                self.driver.get(next_page.get_attribute('href'))
                # next_page.click()
            except NoSuchElementException:
                break

        return search_results

    def parse_html(self, url, html):
        bs = BeautifulSoup(html, 'lxml')
        try:
            full_content = bs.find('div', attrs={'id': 'qmt_content_div'}).text
            return full_content
        except Exception:
            CustomLogging.log_to_file('页面解析错误: {0}|{1}'.format(self.name, url),
                                      LogType.ERROR)
            pass
Example #8
class InspectAddress(object):
    def __init__(self):
        dcap = dict(DesiredCapabilities.PHANTOMJS)  # set the userAgent
        dcap["phantomjs.page.settings.userAgent"] = (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:25.0) Gecko/20100101 Firefox/25.0 "
        )
        self.driver = PhantomJS(
            executable_path=r'phantomjs-2.1.1-windows\bin\phantomjs.exe',
            desired_capabilities=dcap)

    def get_dev_cookie(self):
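        # Log in to the developer platform API and wrap the returned
        # laravel_session value in a Selenium-compatible cookie dict.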
        logurl = 'https://www.bidinghuo.cn/api/backend/login.json'
        # jsondata_url = 'https://www.bidinghuo.cn/api/backend/platform/query.json'
        headers = {
            'Content-Type':
            'application/json;charset=UTF-8',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'
        }
        data = {
            u'username': config.developers_account[0],
            u'password': config.developers_account[1]
        }
        value = ''
        try:
            res = requests.post(logurl, data=data)
            if res.status_code == 200:
                print u'开发平台账户登录-成功'
                value = res.cookies['laravel_session']
            else:
                print u'开发平台账户登录-失败'

        except:
            print u'开发平台账户登录-失败'

        cookies = {
            u'domain': u'.bidinghuo.cn',
            u'secure': False,
            u'value': value,
            u'expiry': None,
            u'path': u'/',
            u'httpOnly': True,
            u'name': u'laravel_session'
        }
        return cookies

    def get_brand_cookie(self):
        logurl = 'https://pyf123.bidinghuo.cn/api/admin/login.json'
        headers = {
            'Content-Type':
            'application/json;charset=UTF-8',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'
        }
        data = {
            u'username': config.brand_user[0],
            u'password': config.brand_user[1]
        }
        value = ''
        try:
            res = requests.post(logurl, data=data)
            if res.status_code == 200:
                print u'品牌商账户登录-成功'
                value = res.cookies['laravel_session']
            else:
                print u'品牌商账户登录-失败'
        except:
            print u'品牌商账户登录-失败'
        cookies = {
            u'domain': u'.bidinghuo.cn',
            u'secure': False,
            u'value': value,
            u'expiry': None,
            u'path': u'/',
            u'httpOnly': True,
            u'name': u'laravel_session'
        }
        return cookies

    def developer_platform(self):
        '''Access the brand developer management platform.'''
        url = config.developers_platform
        try:
            self.driver.add_cookie(self.get_dev_cookie())
            self.driver.get(url)
            self.driver.set_page_load_timeout(30)
        except:
            print u'访问品牌商管理开发平台-异常'

        try:
            page = self.driver.page_source
            page_soup = BeautifulSoup(page)
            username = page_soup.find_all(class_='user-name')[0]
            assert username.string == config.developers_account[0]
            print u'品牌商管理开发平台-访问正常'
        except:
            print u'品牌商管理开发平台-访问异常'

    def brand_platform(self):
        '''Access the brand admin backend.'''
        url = config.brand_platform
        try:
            self.driver.add_cookie(self.get_brand_cookie())
            self.driver.get(url)
            self.driver.set_page_load_timeout(30)
            bdh_title = BeautifulSoup(
                self.driver.page_source).find_all(class_='ovh')[0].h2.string
            nsgj_title = BeautifulSoup(
                self.driver.page_source).find_all(class_='ovh')[1].h2.string
            assert bdh_title == u'必订火'
            assert nsgj_title == u'内审管家'
            print u'访问品牌商后台-正常'
        except:
            print u'访问品牌商后台-异常'
        try:
            page = self.driver.page_source
            nsgj_href = self.driver.find_element_by_xpath(
                '//*[@id="app"]/div[2]/div/div[2]/div/div[2]/a').get_attribute(
                    'href')
            bdh_href = self.driver.find_element_by_xpath(
                '//*[@id="app"]/div[2]/div/div[1]/div/div[2]/a').get_attribute(
                    'href')
            assert requests.get(bdh_href).status_code == 200
            self.driver.get(bdh_href)
            self.driver.set_page_load_timeout(30)
            dhh_title = BeautifulSoup(self.driver.page_source).find_all(
                class_='meeting-name text-overflow')[0].string
            assert dhh_title == u'测试订货会'
            print u'访问品牌商订货会-正常'
        except:
            print u'访问品牌商订货会-异常'
        try:
            assert requests.get(bdh_href).status_code == 200
            self.driver.get(nsgj_href)
            self.driver.set_page_load_timeout(30)
            nsh_title = BeautifulSoup(self.driver.page_source).find_all(
                class_='meeting-name text-overflow')[0].string
            assert nsh_title == u'认同与人体'
            print u'访问品牌商内审管家-正常'
        except:
            print u'访问品牌商内审管家-异常'
Example #9
class WeixinPhantomjs(Base):
    all_uids = {docs['uid'] for docs in in_collection.find({}, {'uid': 1}) if 'uid' in docs}

    def __init__(self):
        self.start_page = START_PAGE
        self.end_page = END_PAGE
        self.weixin_url = REFER_FIRST

        # self.driver = Firefox()
        if hasattr(config, 'PHANTOMJS_PATH'):
            self.driver = PhantomJS(executable_path=getattr(config, 'PHANTOMJS_PATH'))
        else:
            self.driver = PhantomJS()

    def open_weixin_browser(self, word):
        try:
            self.driver.get(self.weixin_url)
            self.driver.set_page_load_timeout(3)

            self.driver.find_element_by_id('upquery').send_keys(word)
            self.driver.find_element_by_class_name('swz').click()
            time.sleep(3)

            urls_uids = self.extract_urls_uids(word=word)
            Article(urls_uids=urls_uids, word=word).extract()
        except Exception as e:
            storage_word.append([word, 0])
            self.logger.info('Open weixin error: type <{}>, msg <{}>'.format(e.__class__, e))
            self.close_browser()
            return True
        return False

    def get_total_pages_to_word(self):
        pages = []
        page_id_css = 'pagebar_container'

        try:
            e = self.driver.find_element_by_id(page_id_css)
            for _p in e.text.split():
                _p = _p.strip()

                if not _p.isdigit():
                    return DEFAULT_PAGES if DEFAULT_PAGES <= pages[-1] else pages[-1]
                else:
                    pages.append(int(_p))
            return 1
        except (NoSuchElementException, NoSuchWindowException, TypeError, IndexError):
            pass
        return 1

    def extract_urls_uids(self, word):
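        # Pair every result link with its timestamp, derive a uid from
        # md5(timestamp + title + word) and keep only links not seen before.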
        urls_uids = []
        timestamp = [_t.get_attribute('t') for _t in self.driver.find_elements_by_css_selector('div.s-p')]
        urls_tits = [(t.get_attribute('href'), self.trim(t.text))
                     for t in self.driver.find_elements_by_css_selector('h4 a')]

        if len(urls_tits) != len(timestamp):
            return urls_uids

        for index, url_tit in enumerate(urls_tits):
            try:
                uid = self.md5(timestamp[index] + url_tit[1] + word)

                if uid not in self.__class__.all_uids:
                    self.__class__.all_uids.add(uid)
                    urls_uids.append({'url': url_tit[0], 'uid': uid})
            except (TypeError, IndexError):
                pass
        return urls_uids

    @property
    def is_forbidden(self):
        css_id = 'seccodeForm'

        try:
            if self.driver.find_element_by_id(css_id):
                return True
        except NoSuchElementException:
            pass
        return False

    def appear_element(self, by):
        try:
            # Wait for the element with the given id to appear, then click it
            tag = WebDriverWait(self.driver, 20).until(lambda driver: driver.find_element_by_id(by))
            tag.click()
            return True
        except (TimeoutException, NoSuchWindowException, NoSuchElementException):
            pass
        return False

    def crawl_single(self, word=None, go=0):
        is_go = True
        go_page = int(go)
        next_page_css = 'sogou_page_%s'

        is_break = self.open_weixin_browser(word)
        pages = self.get_total_pages_to_word()

        for page in range(self.start_page + 1, (pages or self.end_page) + 1):
            if is_go and page < go_page:
                continue
            else:
                is_go = False

            if not self.appear_element(by=next_page_css % page):
                is_break = True
                msg = '\tNext page element did not appear, will break'
            elif self.is_forbidden:
                is_break = True
                msg = '\tSpider was forbidden, crawling again after sleeping a moment!'

            if is_break:
                storage_word.append([word, page])
                self.logger.info(msg)
                break

            urls_uids = self.extract_urls_uids(word=word)
            Article(urls_uids=urls_uids, word=word).extract()

            # self.driver.find_element_by_id(next_page_css % page).click()
            # wt = randint(10, 40) if page % 5 == 0 else randint(5, 18)
            wt = randint(1, 5)
            self.logger.info('Word <{}>, Page <{}> Done, sleeping {}s!'.format(word, page, wt))
            # self.driver.implicitly_wait(wt)
            time.sleep(wt)

        self.close_browser()

    @classmethod
    def crawl_with_threads(cls):
        pool = ThreadPool(4)
        total_words = QueryWords().get_query_words()

        for bulk_words in total_words:
            try:
                pool.map(lambda w: cls().crawl_single(w), bulk_words)
            except Exception as e:
                cls.logger.info('Threads crawl error: type <{}>, msg <{}>'.format(e.__class__, e))

        pool.close()
        pool.join()
        in_client.close()

    def close_browser(self):
        try:
            self.driver.close()
        except (NoSuchWindowException,):
            pass
Example #10
class WeixinPhantomjs(Base):
    def __init__(self):
        self.start_page = START_PAGE
        self.end_page = END_PAGE
        self.weixin_url = REFER_FIRST

        # self.driver = Firefox()
        if hasattr(config, 'PHANTOMJS_PATH'):
            self.driver = PhantomJS(executable_path=getattr(config, 'PHANTOMJS_PATH'))
        else:
            self.driver = PhantomJS()

        self.client = MongoClient(HOST, PORT)
        self.collection = self.client[DB][COLLECTION]
        self.all_uids = self.uids

    def open_weixin_browser(self, word):
        try:
            self.driver.get(self.weixin_url)
            self.driver.set_page_load_timeout(3)

            self.driver.find_element_by_id('upquery').send_keys(word)
            self.driver.find_element_by_class_name('swz').click()
            time.sleep(3)

            urls_uids = self.extract_urls_uids(word=word)
            Article(urls_uids=urls_uids, word=word).extract()
        except Exception as e:
            storage_word.append([word, 0])
            self.logger.info('Open weixin error: type <{}>, msg <{}>'.format(e.__class__, e))
            self.close_browser()
            return True
        return False

    def get_total_pages_to_word(self):
        pages = []
        page_id_css = 'pagebar_container'

        try:
            e = self.driver.find_element_by_id(page_id_css)
            for _p in e.text.split():
                _p = _p.strip()

                if not _p.isdigit():
                    return DEFAULT_PAGES if DEFAULT_PAGES <= pages[-1] else pages[-1]
                else:
                    pages.append(int(_p))
            return 1
        except (NoSuchElementException, NoSuchWindowException, TypeError, IndexError):
            pass
        return 1

    def get_query_words(self, word):
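        # Collect candidate query words ('conp' plus related 'rel' terms) from
        # MongoDB, then let query_index() position the crawl at the given word.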
        query_words = []

        for docs in self.collection.find({}, {'rel': 1, 'conp': 1}).sort([('_id', 1)]):
            w = docs['conp']

            if w not in query_words:
                query_words.append(w)

            for item in docs['rel']:
                if item not in query_words:
                    query_words.append(item)

        self.client.close()

        return self.query_index(query_words, word)

    @property
    def uids(self):
        return {docs['uid'] for docs in in_collection.find({}, {'uid': 1}) if 'uid' in docs}

    def extract_urls_uids(self, word):
        urls_uids = []
        timestamp = [_t.get_attribute('t') for _t in self.driver.find_elements_by_css_selector('div.s-p')]
        urls_tits = [(t.get_attribute('href'), self.trim(t.text))
                     for t in self.driver.find_elements_by_css_selector('h4 a')]

        if len(urls_tits) != len(timestamp):
            return urls_uids

        for index, url_tit in enumerate(urls_tits):
            try:
                uid = self.md5(timestamp[index] + url_tit[1] + word)

                if uid not in self.all_uids:
                    self.all_uids.add(uid)
                    urls_uids.append({'url': url_tit[0], 'uid': uid})
            except (TypeError, IndexError):
                pass
        return urls_uids

    @staticmethod
    def query_index(words, cut_word):
        temp_words = words[START_INDEX:END_INDEX]

        try:
            index = temp_words.index(cut_word)
            return temp_words[index:], index + START_INDEX
        except ValueError:
            pass
        return temp_words, START_INDEX

    @property
    def is_forbidden(self):
        css_id = 'seccodeForm'

        try:
            if self.driver.find_element_by_id(css_id):
                return True
        except NoSuchElementException:
            pass
        return False

    def appear_element(self, by):
        try:
            # Wait for the element with the given id to appear, then click it
            tag = WebDriverWait(self.driver, 20).until(lambda driver: driver.find_element_by_id(by))
            tag.click()
            return True
        except (TimeoutException, NoSuchWindowException, NoSuchElementException):
            pass
        return False

    def crawl(self, word=None, go=0):
        is_go = True
        is_break = False
        go_page = int(go)
        next_page_css = 'sogou_page_%s'
        query_words, ind = self.get_query_words(word)

        for index, word in enumerate(query_words, 1):
            next_ind = ind + index
            is_break = self.open_weixin_browser(word)
            pages = self.get_total_pages_to_word()

            for page in range(self.start_page + 1, (pages or self.end_page) + 1):
                if is_go and page < go_page:
                    continue
                else:
                    is_go = False

                if not self.appear_element(by=next_page_css % page):
                    is_break = True
                    msg = '\tNext page element did not appear, will break'
                elif self.is_forbidden:
                    is_break = True
                    msg = '\tSpider was forbidden, crawling again after sleeping a moment!'

                if is_break:
                    storage_word.append([word, page])
                    self.logger.info(msg)
                    break

                urls_uids = self.extract_urls_uids(word=word)
                Article(urls_uids=urls_uids, word=word).extract()

                # self.driver.find_element_by_id(next_page_css % page).click()
                wt = randint(10, 40) if page % 3 == 0 else randint(5, 18)
                self.logger.info('Index <{}>, Word <{}>, Page <{}> Done, sleeping {}s!'.format(next_ind, word, page, wt))
                # self.driver.implicitly_wait(wt)
                time.sleep(wt)

            if is_break:
                break

        in_client.close()
        self.close_browser()

    def close_browser(self):
        try:
            self.driver.close()
        except (NoSuchWindowException,):
            pass
Example #11
class PagesCrawler(Spider):

    name = 'pages'
    link_extractor = RegexpLinkExtractor(canonicalize=False,
                                         deny_extensions=[])
    ignored_exts = set(['.' + e for e in IGNORED_EXTENSIONS])

    def __init__(self, **kwargs):
        mongo = MongoClient(MONGO_HOST, MONGO_PORT)[MONGO_DB][MONGO_JOBS_COL]
        job = mongo.find_one({"_id": kwargs["job_id"]})
        args = job["crawl_arguments"]
        self.args = args
        self.start_urls = to_list(args['start_urls'])
        self.maxdepth = int(args['max_depth'])
        self.follow_prefixes = to_list(args['follow_prefixes'])
        self.nofollow_prefixes = to_list(args['nofollow_prefixes'])
        self.prefixes_trie = LRUTrie()
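        # Mark follow/nofollow prefixes in an LRU trie consulted by _should_follow()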
        for p in self.follow_prefixes:
            self.prefixes_trie.set_lru(p, True)
        for p in self.nofollow_prefixes:
            self.prefixes_trie.set_lru(p, False)
        self.discover_prefixes = [
            url_to_lru_clean(
                "http%s://%s" %
                (https, u.replace('http://', '').replace('https://', '')),
                TLDS_TREE) for u in to_list(args['discover_prefixes'])
            for https in ['', 's']
        ]
        self.resolved_links = {}
        self.user_agent = args['user_agent']
        self.phantom = 'phantom' in args and args[
            'phantom'] and args['phantom'].lower() != "false"
        self.cookies = None
        if 'cookies' in args and args["cookies"]:
            self.cookies = dict(
                cookie.split('=', 1)
                for cookie in re.split(r'\s*;\s*', args['cookies'])
                if '=' in cookie)
        if self.phantom:
            self.ph_timeout = int(
                args.get('phantom_timeout', PHANTOM['TIMEOUT']))
            self.ph_idle_timeout = int(
                args.get('phantom_idle_timeout', PHANTOM['IDLE_TIMEOUT']))
            self.ph_ajax_timeout = int(
                args.get('phantom_ajax_timeout', PHANTOM['AJAX_TIMEOUT']))
        self.errors = 0

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(PagesCrawler, cls).from_crawler(crawler, *args,
                                                       **kwargs)
        crawler.signals.connect(spider.spider_closed, signal=spider_closed)
        crawler.signals.connect(spider.spider_crashed, signal=spider_error)
        return spider

    def start_requests(self):
        self.log(
            "Starting crawl task - jobid: %s" % self.crawler.settings['JOBID'],
            logging.INFO)
        self.log("ARGUMENTS : " + str(self.args), logging.INFO)
        if self.phantom:
            self.init_phantom()
        for url in self.start_urls:
            yield self._request(url)

    def init_phantom(self):
        self.prefixfiles = os.path.join(scrapyd_config().get('logs_dir'),
                                        HYPHE_PROJECT, self.name,
                                        self.crawler.settings['JOBID'])
        self.log("Using path %s for PhantomJS crawl" % self.prefixfiles,
                 logging.INFO)
        phantom_args = []
        if PROXY and not PROXY.startswith(':'):
            phantom_args.append('--proxy=%s' % PROXY)
        phantom_args.append('--cookies-file=%s-phantomjs-cookie.txt' %
                            self.prefixfiles)
        phantom_args.append('--ignore-ssl-errors=true')
        phantom_args.append('--load-images=false')
        self.capabilities = dict(DesiredCapabilities.PHANTOMJS)
        self.capabilities[
            'phantomjs.page.settings.userAgent'] = self.user_agent
        self.capabilities['takesScreenshot'] = False
        self.capabilities[
            'phantomjs.page.settings.javascriptCanCloseWindows'] = False
        self.capabilities[
            'phantomjs.page.settings.javascriptCanOpenWindows'] = False
        self.phantom = PhantomJS(executable_path=PHANTOM['PATH'],
                                 service_args=phantom_args,
                                 desired_capabilities=self.capabilities,
                                 service_log_path="%s-phantomjs.log" %
                                 self.prefixfiles)
        self.phantom.implicitly_wait(10)
        self.phantom.set_page_load_timeout(60)
        self.phantom.set_script_timeout(self.ph_timeout + 15)

    def spider_crashed(self, spider):
        self.errors += 1
        self.spider_closed(spider, reason="CRASH")

    def spider_closed(self, spider, reason=""):
        if self.errors:
            self.log(
                "%s error%s encountered during the crawl (%s)." %
                (self.errors, 's' if self.errors > 1 else '', reason),
                logging.ERROR)
        if self.phantom:
            self.phantom.quit()
            if not self.errors:
                for f in ["phantomjs-cookie.txt", "phantomjs.log"]:
                    fi = "%s-%s" % (self.prefixfiles, f)
                    if os.path.exists(fi) and not self.errors:
                        os.remove(fi)

    def handle_response(self, response):
        lru = url_to_lru_clean(response.url, TLDS_TREE)

        if self.phantom:
            self.phantom.get(response.url)

            # Collect whole DOM of the webpage including embedded iframes
            with open(
                    os.path.join(PHANTOM["JS_PATH"],
                                 "get_iframes_content.js")) as js:
                get_bod_w_iframes = js.read()
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

            # Try to scroll and unfold page
            self.log("Start PhantomJS scrolling and unfolding", logging.INFO)
            with open(
                    os.path.join(PHANTOM["JS_PATH"],
                                 "scrolldown_and_unfold.js")) as js:
                try:
                    signal.signal(signal.SIGALRM, timeout_alarm)
                    signal.alarm(self.ph_timeout + 30)
                    timedout = self.phantom.execute_async_script(
                        js.read(), self.ph_timeout, self.ph_idle_timeout,
                        self.ph_ajax_timeout)
                    signal.alarm(0)
                    if timedout:
                        raise SeleniumTimeout
                    self.log("Scrolling/Unfolding finished", logging.INFO)
                except SeleniumTimeout:
                    self.log(
                        "Scrolling/Unfolding timed-out (%ss)" %
                        self.ph_timeout, logging.WARNING)
                    self.errors += 1
                except WebDriverException as e:
                    err = json.loads(e.msg)['errorMessage']
                    self.log("Scrolling/Unfolding crashed: %s" % err,
                             logging.ERROR)
                    self.errors += 1
                except Exception as e:
                    self.log(
                        "Scrolling/Unfolding crashed: %s %s" % (type(e), e),
                        logging.ERROR)
                    self.errors += 1
                    return self._make_raw_page(response, lru)
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

        # Clean up pages with embedded base64 images that make Scrapy treat them as non-HTML responses
        if response.status == 200 and not isinstance(response, HtmlResponse):
            try:
                flags = response.flags
                if "partial" in flags:
                    flags.remove("partial")
                flags.append("cleaned")
                response = HtmlResponse(response.url,
                                        headers=response.headers,
                                        body=cleanupbase64images(
                                            response.body),
                                        flags=flags,
                                        request=response.request)
                self.log(
                    "WARNING: page with base64 embedded images was cleaned-up for links extraction"
                )
            except:
                pass

        if 300 < response.status < 400 or isinstance(response, HtmlResponse):
            return self.parse_html(response, lru)
        else:
            return self._make_raw_page(response, lru)

    def handle_error(self, failure, response=None):
        if response:
            p = self._make_raw_page(response, failure.request.url)
            p['error'] = error_name(failure.value)
            return p
        elif not "://www" in failure.request.url:
            return self._request(failure.request.url.replace('://', '://www.'))
        error = failure.getErrorMessage()
        self.log("ERROR : %s" % error, logging.ERROR)
        if PROXY and not PROXY.startswith(
                ':') and "OpenSSL.SSL.Error" in error:
            return self._request(failure.request.url, noproxy=True)
        self.errors += 1
        return

    def parse_html(self, response, lru):
        lrulinks = []
        # handle redirects
        realdepth = response.meta['depth']
        if 300 < response.status < 400:
            redir_url = response.headers['Location']
            if redir_url.startswith('/'):
                redir_url = "%s%s" % (lru_get_host_url(lru).strip('/'),
                                      redir_url)
            elif redir_url.startswith('../'):
                lrustart = lru[:lru.rfind('|p:')]
                while redir_url.startswith('../'):
                    lrustart = lrustart[:lrustart.rfind('|p:')]
                    redir_url = redir_url[3:]
                redir_url = "%s/%s" % (lru_to_url(lrustart + '|'), redir_url)
            elif redir_url.startswith(
                    './') or not redir_url.startswith('http'):
                redir_url = "%s%s" % (lru_get_path_url(lru).strip('/'),
                                      redir_url[1:])
            links = [{'url': redir_url}]
            response.meta['depth'] -= 1
        else:
            try:
                links = self.link_extractor.extract_links(response)
            except Exception as e:
                self.log(
                    "ERROR: links extractor crashed on %s: %s %s" %
                    (response, type(e), e), logging.ERROR)
                links = []
                self.errors += 1
        for link in links:
            try:
                url = link.url
            except AttributeError:
                url = link['url']
            try:
                lrulink = url_to_lru_clean(url, TLDS_TREE)
            except (ValueError, IndexError) as e:
                self.log("Error converting URL %s to LRU: %s" % (url, e),
                         logging.ERROR)
                continue
            lrulinks.append((url, lrulink))
            if self._should_follow(response.meta['depth'], lrulink) and \
                    not url_has_any_extension(url, self.ignored_exts):
                yield self._request(url)
        response.meta['depth'] = realdepth
        yield self._make_html_page(response, lru, lrulinks)

    def _make_html_page(self, response, lru, lrulinks):
        p = self._make_raw_page(response, lru)
        if STORE_HTML:
            p['body'] = Binary(response.body.encode('zip'))
        p['lrulinks'] = lrulinks
        return p

    def _make_raw_page(self, response, lru):
        p = self._new_page(response.url, lru)
        p['status'] = response.status
        p['size'] = len(response.body)
        if isinstance(response, HtmlResponse):
            p['encoding'] = response.encoding
        if response.meta.get('depth'):
            p['depth'] = response.meta['depth']
        if response.headers.get('content-type'):
            p['content_type'] = response.headers.get('content-type').partition(
                ';')[0]
        p['error'] = None
        return p

    def _new_page(self, url, lru=None):
        if lru is None:
            lru = url_to_lru_clean(url, TLDS_TREE)
        p = Page()
        p['url'] = url
        p['lru'] = lru
        p['depth'] = 0
        p['timestamp'] = int(time.time() * 1000)
        return p

    def _should_follow(self, depth, tolru):
        c1 = depth < self.maxdepth
        c2 = self.prefixes_trie.match_lru(tolru)
        return c1 and c2

    def _request(self, url, noproxy=False, **kw):
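        # Build a request that always routes to handle_response/handle_error,
        # attaching cookies and using HEAD when PhantomJS will fetch the body.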
        kw['meta'] = {'handle_httpstatus_all': True, 'noproxy': noproxy}
        kw['callback'] = self.handle_response
        kw['errback'] = self.handle_error
        if self.cookies:
            kw['cookies'] = self.cookies
        if self.phantom:
            kw['method'] = 'HEAD'
        return Request(url, **kw)