Example #1
    def send_crawl_request(self, crawl_request,
                           timeout: datetime.datetime = None):
        """Persist the crawl request and forward it to the matching host queue."""

        host = self.get_host(extract_hostname(crawl_request['url']))
        if host is None:
            self.log.warning("Got a job with host I cannot find. Check debug")
            self.log.debug("Got a jbo with host I cannot find {}".format(
                repr(crawl_request)))
            return

        upsert_crawl_document(
            url=crawl_request['url'],
            url_hash=hash_url(crawl_request['url']),
            latest_request=crawl_request,
            host=extract_hostname(crawl_request['url']),
            latest_request_date=datetime.datetime.now(
                datetime.timezone.utc
            )
        )

        self.send_crawl_request_to_host(crawl_request, host.host)
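
Both examples rely on a couple of URL helpers that are not shown (`extract_hostname`, `hash_url`). The following is a minimal sketch of what they might look like, inferred purely from how the examples use them; the real project may implement them differently:

import hashlib
from urllib import parse


def extract_hostname(url):
    # Assumed helper: pull the hostname out of an absolute URL.
    return parse.urlsplit(url).hostname or ""


def hash_url(url):
    # Assumed helper: stable key used to look up a CrawlDocument.
    return hashlib.sha256(url.encode("utf8")).hexdigest()
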
Example #2
    def process_task(self, msg):
        """
        Handle a single crawl result message.

        :param msg: crawl result payload (dict-like)
        :return: True when the message was processed successfully
        """
        crawl_result = Dict(msg)
        self.log.info("Processing {}".format(crawl_result.url))
        host = self.get_host_by_result(crawl_result)
        if host is None:
            self.log.warning("Got job for host I cannot find")
            self.log.debug("Got job for host I cannot find {}".format(
                repr(crawl_result)))
            return
        try:
            if "status_code" not in crawl_result:
                raise KeyError(
                    "`status_code` not found in crawl_result {} json:b64:{}".format(
                        crawl_result.url,
                        base64.b64encode(
                            ujson.dumps(crawl_result).encode("utf8")
                        ).decode("utf8")
                    )
                )

            upsert_crawl_document(
                url=crawl_result.url,
                url_hash=hash_url(crawl_result.url),
                latest_result=crawl_result.to_dict(),
                latest_result_date=now(),
                latest_status_code=crawl_result.status_code
            )

            # 'robots' action: this result is the host's fetched robots.txt
            if 'robots' in crawl_result.actions:
                self.process_robots_task(crawl_result)
            else:
                def try_again_tomorrow():
                    cd = CrawlDocument.objects(
                        url_hash=hash_url(crawl_result.url)).first()

                    # re-send the original request, deferred by one day
                    self.log.info("Try again tomorrow: {}".format(
                        crawl_result.url))
                    if cd is not None and 'url' in cd.latest_request:
                        self.send_crawl_request(cd.latest_request,
                                                timeout=now()+timedelta(days=1)
                                                )
                    else:
                        self.send_crawl_request(
                            crawl_result.crawl_task.to_dict(),
                            timeout=now()+timedelta(days=1)
                        )

                # 200, normal processing
                if crawl_result.status_code == 200:
                    if ('follow' in crawl_result.actions
                            or "nofollow" not in crawl_result.actions):
                        self.extract_and_send_crawl_requests(crawl_result)
                    if ('index' in crawl_result.actions
                            or "noindex" not in crawl_result.actions):
                        self.send_crawl_result_to_analysis(crawl_result)

                elif 400 <= crawl_result.status_code <= 499:
                    self.send_remove_request(crawl_result)
                elif 300 <= crawl_result.status_code <= 399:
                    try_again_tomorrow()
                elif 500 <= crawl_result.status_code <= 599:
                    try_again_tomorrow()

        except NoRobotsForHostError:
            # no robots.txt or it's expired, so we create request
            # for processing
            robots_request = {
                "url": parse.urljoin(crawl_result.url, "/robots.txt"),
                "cookies": crawl_result.cookies,
                "method": "GET",
                "actions": ["robots"],
                "timeout": datetime.datetime.now(
                    datetime.timezone.utc).isoformat()
            }

            host.robots_txt = RobotsTxt(status="waiting")
            host.save()

            self.send_crawl_request_to_host(robots_request, host.host)
            self.log.warning("No robots for {}".format(host.host))
            raise RequeueMessage
        except RequeueMessage as e:
            self.log.exception(e)
            raise
        except Exception as e:
            self.log.exception(e)
            raise RequeueMessage

        return True
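
`process_task` communicates retry decisions by raising `RequeueMessage` (and expects a `NoRobotsForHostError` from the robots.txt lookup). As a rough illustration, a consumer loop around it could translate that exception into ack/requeue calls; the `queue` API below (`get`/`ack`/`requeue`) and the exception definitions are assumptions for the sketch, not the project's actual transport layer:

class RequeueMessage(Exception):
    """Assumed marker exception: put the message back on the queue."""


class NoRobotsForHostError(Exception):
    """Assumed marker exception: robots.txt for the host is missing or expired."""


def consume(queue, worker):
    # Hypothetical consumer loop; a `queue` exposing get()/ack()/requeue()
    # is invented here for illustration.
    while True:
        msg = queue.get()
        try:
            worker.process_task(msg)
        except RequeueMessage:
            # The handler asked for redelivery (e.g. robots.txt still pending).
            queue.requeue(msg)
        else:
            queue.ack(msg)
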