def send_crawl_request(self, crawl_request, timeout: datetime.datetime=None):
    """
    Record a crawl request in the document store and forward it to the
    queue of the host that owns the request's URL.

    :param crawl_request: dict with at least a 'url' key; stored verbatim
        as ``latest_request`` on the crawl document.
    :param timeout: NOTE(review): accepted but currently unused in this
        body — callers (see process_task) pass ``now()+timedelta(days=1)``;
        confirm whether it should be attached to the outgoing request.
    :return: None. Returns early (dropping the job) when the host is unknown.
    """
    # Hoist: the original computed extract_hostname(...) twice.
    hostname = extract_hostname(crawl_request['url'])
    host = self.get_host(hostname)
    if host is None:
        # Unknown host: log and drop the job rather than raising.
        self.log.warning("Got a job with host I cannot find. Check debug")
        # Fixed typo in original message ("jbo" -> "job").
        self.log.debug("Got a job with host I cannot find {}".format(
            repr(crawl_request)))
        return
    # Persist the request before dispatching so the document store always
    # reflects the latest attempt, even if the send below fails.
    upsert_crawl_document(
        url=crawl_request['url'],
        url_hash=hash_url(crawl_request['url']),
        latest_request=crawl_request,
        host=hostname,
        latest_request_date=datetime.datetime.now(
            datetime.timezone.utc
        )
    )
    self.send_crawl_request_to_host(crawl_request, host.host)
def process_task(self, msg):
    """
    Handling messages

    Process one crawl-result message: persist the result, then dispatch on
    its actions/status code (robots handling, link extraction, analysis,
    removal, or retry tomorrow).

    :param msg: raw message payload; wrapped in ``Dict`` for attribute
        access (assumes keys like url, status_code, actions — TODO confirm
        schema against the producer).
    :return: True on success; returns None early when the host is unknown.
    :raises RequeueMessage: to signal the message should be re-queued
        (also raised after scheduling a robots.txt fetch, and for any
        unexpected exception).
    """
    crawl_result = Dict(msg)
    self.log.info("Processing {}".format(crawl_result.url))
    host = self.get_host_by_result(crawl_result)
    if host is None:
        # Unknown host: log and drop the message (no re-queue).
        self.log.warning("Got job for host I cannot find")
        self.log.debug("Got job for host I cannot find {}".format(
            repr(crawl_result)))
        return
    try:
        if "status_code" not in crawl_result:
            # Embed the full payload base64-encoded so the malformed
            # message can be reconstructed from the log/exception text.
            raise KeyError("`status_code` not found in crawl_result " +
                           "%s json:b64:" % crawl_result.url +
                           base64.b64encode(
                               ujson.dumps(crawl_result).encode('utf8')
                           ).decode("utf8")
                           )
        # Persist the result before any dispatching below.
        upsert_crawl_document(
            url=crawl_result.url,
            url_hash=hash_url(crawl_result.url),
            latest_result=crawl_result.to_dict(),
            latest_result_date=now(),
            latest_status_code=crawl_result.status_code
        )
        # robot - we retrieved robots.txt
        if 'robots' in crawl_result.actions:
            self.process_robots_task(crawl_result)
        else:
            def try_again_tomorrow():
                # Closure over crawl_result: re-send the original request
                # (preferring the stored one) with a +1 day timeout.
                cd = CrawlDocument.objects(
                    url_hash=hash_url(crawl_result.url)).first()
                # try again tomorrow
                self.log.info("Try again tomorrow: {}".format(
                    crawl_result.url))
                if cd is not None and 'url' in cd.latest_request:
                    self.send_crawl_request(cd.latest_request,
                                            timeout=now()+timedelta(days=1)
                                            )
                else:
                    # No stored request; fall back to the task embedded
                    # in the result itself.
                    self.send_crawl_request(
                        crawl_result.crawl_task.to_dict(),
                        timeout=now()+timedelta(days=1)
                    )
            # 200, normal processing
            if crawl_result.status_code == 200:
                # Follow links unless explicitly marked "nofollow".
                if ('follow' in crawl_result.actions or
                        "nofollow" not in crawl_result.actions):
                    self.extract_and_send_crawl_requests(crawl_result)
                # Index content unless explicitly marked "noindex".
                if ('index' in crawl_result.actions or
                        "noindex" not in crawl_result.actions):
                    self.send_crawl_result_to_analysis(crawl_result)
            elif 400 <= crawl_result.status_code <= 499:
                # Client errors: the document is gone/invalid — remove it.
                self.send_remove_request(crawl_result)
            elif 300 <= crawl_result.status_code <= 399:
                # Redirects are retried later rather than followed here.
                try_again_tomorrow()
            elif 500 <= crawl_result.status_code <= 599:
                # Server errors: transient — retry later.
                try_again_tomorrow()
    except NoRobotsForHostError:
        # no robots.txt or it's expired, so we create request
        # for processing
        robots_request = {
            "url": parse.urljoin(crawl_result.url,
                                 "/robots.txt"),
            "cookies": crawl_result.cookies,
            "method": "GET",
            "actions": ["robots"],
            "timeout": datetime.datetime.now(
                datetime.timezone.utc).isoformat()
        }
        # Mark the host as awaiting robots.txt so other workers don't
        # issue duplicate fetches, then re-queue the current message.
        host.robots_txt = RobotsTxt(status="waiting")
        host.save()
        self.send_crawl_request_to_host(robots_request, host.host)
        self.log.warning("No robots for {}".format(host.host))
        raise RequeueMessage
    except RequeueMessage as e:
        # Explicit re-queue request from deeper code: log and propagate.
        self.log.exception(e)
        raise e
    except Exception as e:
        # Any unexpected failure: log it and convert to a re-queue so
        # the message is not silently lost.
        self.log.exception(e)
        raise RequeueMessage
    return True