def upsert_document(self, index_task):
    # Upsert the index document keyed by URL, using MongoEngine's
    # atomic `set__` update operators.
    update_dict = {
        "set__url": index_task['url'],
        "set__url_hash": hash_url(index_task['url']),
        "set__host": crawlmanager.extract_hostname(index_task['url']),
        "set__meta_data": index_task['document']['meta_data'],
        "set__features": index_task['document']['features']
    }
    IndexDocument.objects(url=index_task['url']).update_one(
        upsert=True, **update_dict
    )
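# A minimal usage sketch, assuming `indexer` is an instance of the class that
# defines upsert_document, and that an index task carries a 'document' payload
# with 'meta_data' and 'features' (shape inferred from the update dict above;
# the field values are illustrative, not from the codebase).
example_index_task = {
    "url": "http://example.com/home.html",
    "document": {
        "meta_data": {"title": "Example", "content-type": "text/html"},
        "features": {"word_count": 42},
    },
}
# indexer.upsert_document(example_index_task)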
def test_status_codes_gt_200(self, objects, declare_queue, sync, cdobjects):
    inst = objects.return_value
    inst.first.return_value = self.host
    declare_queue.side_effect = declare_queue_side_effect
    crawl_response = {
        "url": "http://example.com/home.html",
        "body": "<html></html>",
        "actions": ["index"],
        "status_code": 500,
        "headers": {
            "content-type": "text/html"
        },
        "crawl_time": datetime.now(timezone.utc).isoformat()
    }
    now = datetime.now(timezone.utc)
    cd = CrawlDocument()
    cd.host = "example.com"
    cd.url = crawl_response['url']
    cd.url_hash = hash_url(crawl_response['url'])
    cd.latest_request = {
        "url": crawl_response['url'],
        "cookies": {},
        "method": "GET",
        "actions": ["follow", "index"]
    }
    cd.save()
    cinst = cdobjects.return_value
    cinst.first.return_value = cd
    # Process the same response twice: first with a 5xx status, then a 3xx.
    self.crawl_manager.process_task(
        ujson.dumps(crawl_response).encode("utf8")
    )
    crawl_response['status_code'] = 302
    self.crawl_manager.process_task(
        ujson.dumps(crawl_response).encode("utf8")
    )
    self.assertEqual(sync.call_count, 2)
def send_crawl_request(self, crawl_request, timeout: datetime.datetime = None):
    host = self.get_host(extract_hostname(crawl_request['url']))
    if host is None:
        self.log.warning("Got a job with a host I cannot find. Check debug")
        self.log.debug("Got a job with a host I cannot find {}".format(
            repr(crawl_request)))
        return
    upsert_crawl_document(
        url=crawl_request['url'],
        url_hash=hash_url(crawl_request['url']),
        latest_request=crawl_request,
        host=extract_hostname(crawl_request['url']),
        latest_request_date=datetime.datetime.now(
            datetime.timezone.utc
        )
    )
    self.send_crawl_request_to_host(crawl_request, host.host)
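# Hedged sketch of the crawl-request shape this method expects; the keys mirror
# the `latest_request` documents used in the tests ('url', 'cookies', 'method',
# 'actions'). `manager` is a hypothetical instance of the crawl-manager class.
example_crawl_request = {
    "url": "http://example.com/home.html",
    "cookies": {},
    "method": "GET",
    "actions": ["follow", "index"],
}
# manager.send_crawl_request(
#     example_crawl_request,
#     timeout=datetime.datetime.now(datetime.timezone.utc)
#             + datetime.timedelta(hours=1),
# )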
def test_index_document(self):
    cd = IndexDocument(url="http://example.com", host="example.com")
    cd.save()
    self.assertEqual(cd.url_hash, hash_url(cd.url))
def process_task(self, msg):
    """
    Handle a single crawl-result message.

    :param msg: JSON-encoded crawl result (bytes)
    :return: True on success
    """
    crawl_result = Dict(msg)
    self.log.info("Processing {}".format(crawl_result.url))
    host = self.get_host_by_result(crawl_result)
    if host is None:
        self.log.warning("Got job for host I cannot find")
        self.log.debug("Got job for host I cannot find {}".format(
            repr(crawl_result)))
        return
    try:
        if "status_code" not in crawl_result:
            raise KeyError("`status_code` not found in crawl_result " +
                           "%s json:b64:" % crawl_result.url +
                           base64.b64encode(
                               ujson.dumps(crawl_result).encode('utf8')
                           ).decode("utf8")
                           )
        upsert_crawl_document(
            url=crawl_result.url,
            url_hash=hash_url(crawl_result.url),
            latest_result=crawl_result.to_dict(),
            latest_result_date=now(),
            latest_status_code=crawl_result.status_code
        )
        # robots - we retrieved robots.txt
        if 'robots' in crawl_result.actions:
            self.process_robots_task(crawl_result)
        else:
            def try_again_tomorrow():
                cd = CrawlDocument.objects(
                    url_hash=hash_url(crawl_result.url)).first()
                # try again tomorrow
                self.log.info("Try again tomorrow: {}".format(
                    crawl_result.url))
                if cd is not None and 'url' in cd.latest_request:
                    self.send_crawl_request(
                        cd.latest_request,
                        timeout=now() + timedelta(days=1)
                    )
                else:
                    self.send_crawl_request(
                        crawl_result.crawl_task.to_dict(),
                        timeout=now() + timedelta(days=1)
                    )

            # 200, normal processing
            if crawl_result.status_code == 200:
                if ('follow' in crawl_result.actions or
                        "nofollow" not in crawl_result.actions):
                    self.extract_and_send_crawl_requests(crawl_result)
                if ('index' in crawl_result.actions or
                        "noindex" not in crawl_result.actions):
                    self.send_crawl_result_to_analysis(crawl_result)
            elif 400 <= crawl_result.status_code <= 499:
                self.send_remove_request(crawl_result)
            elif 300 <= crawl_result.status_code <= 399:
                try_again_tomorrow()
            elif 500 <= crawl_result.status_code <= 599:
                try_again_tomorrow()
    except NoRobotsForHostError:
        # no robots.txt or it's expired, so we create a request
        # for fetching and processing robots.txt
        robots_request = {
            "url": parse.urljoin(crawl_result.url, "/robots.txt"),
            "cookies": crawl_result.cookies,
            "method": "GET",
            "actions": ["robots"],
            "timeout": datetime.datetime.now(
                datetime.timezone.utc).isoformat()
        }
        host.robots_txt = RobotsTxt(status="waiting")
        host.save()
        self.send_crawl_request_to_host(robots_request, host.host)
        self.log.warning("No robots for {}".format(host.host))
        raise RequeueMessage
    except RequeueMessage as e:
        self.log.exception(e)
        raise e
    except Exception as e:
        self.log.exception(e)
        raise RequeueMessage
    return True
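# Hedged sketch of the wire format process_task consumes: a JSON-encoded crawl
# result, mirroring the payload built in test_status_codes_gt_200 above. The
# `manager` instance and the exact field set are assumptions, not a spec.
import ujson
from datetime import datetime, timezone

example_crawl_result = {
    "url": "http://example.com/home.html",
    "body": "<html></html>",
    "actions": ["index"],
    "status_code": 200,
    "headers": {"content-type": "text/html"},
    "crawl_time": datetime.now(timezone.utc).isoformat(),
}
# manager.process_task(ujson.dumps(example_crawl_result).encode("utf8"))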