Code Example #1
File: indexmanager.py Project: ifrpl/toddler
    def upsert_document(self, index_task):

        # mongoengine atomic update: each "set__<field>" key sets that field on the stored document.
        update_dict = {
            "set__url": index_task['url'],
            "set__url_hash": hash_url(index_task['url']),
            "set__host": crawlmanager.extract_hostname(index_task['url']),
            "set__meta_data": index_task['document']['meta_data'],
            "set__features": index_task['document']['features']
        }

        # upsert=True inserts the document if no entry for this URL exists yet.
        IndexDocument.objects(url=index_task['url']).update_one(
            upsert=True,
            **update_dict
        )
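
The hash_url helper is not listed among these snippets. A minimal sketch, assuming it simply derives a stable digest from the URL string to use as the url_hash lookup key, might look like this (the project's actual implementation may differ):

import hashlib

def hash_url(url: str) -> str:
    # Hypothetical sketch: a stable hex digest of the URL,
    # used as the url_hash lookup key in the documents above.
    return hashlib.sha1(url.encode("utf-8")).hexdigest()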
Code Example #2
File: crawlmanager.py Project: ifrpl/toddler
                def try_again_tomorrow():
                    cd = CrawlDocument.objects(
                        url_hash=hash_url(crawl_result.url)).first()

                    # try again tomorrow
                    self.log.info("Try again tomorrow: {}".format(
                        crawl_result.url))
                    if cd is not None and 'url' in cd.latest_request:
                        self.send_crawl_request(cd.latest_request,
                                                timeout=now()+timedelta(days=1)
                                                )
                    else:
                        self.send_crawl_request(
                            crawl_result.crawl_task.to_dict(),
                            timeout=now()+timedelta(days=1)
                        )
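
The now() helper used for the retry timeout is not shown either; it is assumed to be a timezone-aware clock matching the datetime.now(timezone.utc) calls elsewhere in the project. A minimal sketch:

from datetime import datetime, timedelta, timezone

def now() -> datetime:
    # Hypothetical helper: timezone-aware "current time", consistent with the
    # timestamps used in the tests below.
    return datetime.now(timezone.utc)

# Example: a retry scheduled one day from now, as in try_again_tomorrow().
retry_at = now() + timedelta(days=1)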
Code Example #3
File: test_crawlmanager.py Project: ifrpl/toddler
    def test_status_codes_gt_200(self, objects, declare_queue, sync,
                                 cdobjects):


        inst = objects.return_value
        inst.first.return_value = self.host

        declare_queue.side_effect = declare_queue_side_effect

        crawl_response = {
            "url": "http://example.com/home.html",
            "body": "<html></html>",
            "actions": ["index"],
            "status_code": 500,
            "headers": {
                "content-type": "text/html"
            },
            "crawl_time": datetime.now(timezone.utc).isoformat()
        }

        now = datetime.now(timezone.utc)

        cd = CrawlDocument()
        cd.host = "example.com"
        cd.url = crawl_response['url']
        cd.url_hash = hash_url(crawl_response['url'])
        cd.latest_request = {
            "url": crawl_response['url'],
            "cookies": {},
            "method": "GET",
            "actions": ["follow", "index"]
        }
        cd.save()
        cinst = cdobjects.return_value
        cinst.first.return_value = cd

        self.crawl_manager.process_task(
            ujson.dumps(crawl_response).encode("utf8")
        )

        crawl_response['status_code'] = 302

        self.crawl_manager.process_task(
            ujson.dumps(crawl_response).encode("utf8")
        )

        self.assertEqual(sync.call_count, 2)
Code Example #4
File: crawlmanager.py Project: ifrpl/toddler
    def send_crawl_request(self, crawl_request,
                           timeout: datetime.datetime=None):

        host = self.get_host(extract_hostname(crawl_request['url']))
        if host is None:
            self.log.warning("Got a job with host I cannot find. Check debug")
            self.log.debug("Got a jbo with host I cannot find {}".format(
                repr(crawl_request)))
            return

        upsert_crawl_document(
            url=crawl_request['url'],
            url_hash=hash_url(crawl_request['url']),
            latest_request=crawl_request,
            host=extract_hostname(crawl_request['url']),
            latest_request_date=datetime.datetime.now(
                datetime.timezone.utc
            )
        )

        self.send_crawl_request_to_host(crawl_request, host.host)
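
upsert_crawl_document itself is not listed here. A rough sketch, assuming it mirrors the IndexDocument upsert pattern from code example #1 (the real helper in toddler may build its update differently), could be:

def upsert_crawl_document(url, **fields):
    # Hypothetical sketch modelled on upsert_document in code example #1;
    # CrawlDocument is the mongoengine model used in code examples #2 and #3.
    update_dict = {"set__{}".format(name): value for name, value in fields.items()}
    update_dict["set__url"] = url
    CrawlDocument.objects(url=url).update_one(upsert=True, **update_dict)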
Code Example #5
File: test_models.py Project: ifrpl/toddler
    def test_index_document(self):

        cd = IndexDocument(url="http://example.com", host="example.com")
        cd.save()
        self.assertEqual(cd.url_hash, hash_url(cd.url))
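
This test implies that IndexDocument fills in url_hash when it is saved. A minimal mongoengine model sketch with that behaviour (field types and the save() override are assumptions, not the project's actual definition):

from mongoengine import Document, StringField

class IndexDocument(Document):
    # Hypothetical model sketch consistent with the test above.
    url = StringField(required=True)
    host = StringField()
    url_hash = StringField()

    def save(self, *args, **kwargs):
        # Assumed behaviour: derive url_hash from url before persisting,
        # so url_hash == hash_url(url) holds after save().
        self.url_hash = hash_url(self.url)
        return super().save(*args, **kwargs)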
Code Example #6
File: crawlmanager.py Project: ifrpl/toddler
    def process_task(self, msg):
        """
        Handling messages
        :param msg:
        :return:
        """
        crawl_result = Dict(msg)
        self.log.info("Processing {}".format(crawl_result.url))
        host = self.get_host_by_result(crawl_result)
        if host is None:
            self.log.warning("Got job for host I cannot find")
            self.log.debug("Got job for host I cannot find {}".format(
                repr(crawl_result)))
            return
        try:

            if "status_code" not in crawl_result:
                raise KeyError("`status_code` not found in crawl_result "
                               + "%s json:b64:" % crawl_result.url
                               + base64.b64encode(
                                    ujson.dumps(crawl_result).encode('utf8')
                                ).decode("utf8")
                               )

            upsert_crawl_document(
                url=crawl_result.url,
                url_hash=hash_url(crawl_result.url),
                latest_result=crawl_result.to_dict(),
                latest_result_date=now(),
                latest_status_code=crawl_result.status_code
            )

            # robot - we retrieved robots.txt
            if 'robots' in crawl_result.actions:
                self.process_robots_task(crawl_result)
            else:
                def try_again_tomorrow():
                    cd = CrawlDocument.objects(
                        url_hash=hash_url(crawl_result.url)).first()

                    # try again tomorrow
                    self.log.info("Try again tomorrow: {}".format(
                        crawl_result.url))
                    if cd is not None and 'url' in cd.latest_request:
                        self.send_crawl_request(cd.latest_request,
                                                timeout=now()+timedelta(days=1)
                                                )
                    else:
                        self.send_crawl_request(
                            crawl_result.crawl_task.to_dict(),
                            timeout=now()+timedelta(days=1)
                        )

                # 200, normal processing
                if crawl_result.status_code == 200:
                    if ('follow' in crawl_result.actions
                            or "nofollow" not in crawl_result.actions):
                        self.extract_and_send_crawl_requests(crawl_result)
                    if ('index' in crawl_result.actions
                            or "noindex" not in crawl_result.actions):
                        self.send_crawl_result_to_analysis(crawl_result)

                elif 400 <= crawl_result.status_code <= 499:
                    self.send_remove_request(crawl_result)
                elif 300 <= crawl_result.status_code <= 399:
                    try_again_tomorrow()
                elif 500 <= crawl_result.status_code <= 599:
                    try_again_tomorrow()

        except NoRobotsForHostError:
            # no robots.txt or it's expired, so we create request
            # for processing
            robots_request = {
                "url": parse.urljoin(crawl_result.url, "/robots.txt"),
                "cookies": crawl_result.cookies,
                "method": "GET",
                "actions": ["robots"],
                "timeout": datetime.datetime.now(
                    datetime.timezone.utc).isoformat()
            }

            host.robots_txt = RobotsTxt(status="waiting")
            host.save()

            self.send_crawl_request_to_host(robots_request, host.host)
            self.log.warning("No robots for {}".format(host.host))
            raise RequeueMessage
        except RequeueMessage as e:
            self.log.exception(e)
            raise e
        except Exception as e:
            self.log.exception(e)
            raise RequeueMessage

        return True
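
For reference, the test in code example #3 drives this method with a UTF-8 encoded JSON payload. A minimal call (queue wiring and mocks omitted; the crawl_manager instance is assumed to be set up as in the tests) looks like:

import ujson
from datetime import datetime, timezone

crawl_response = {
    "url": "http://example.com/home.html",
    "body": "<html></html>",
    "actions": ["index"],
    "status_code": 200,
    "headers": {"content-type": "text/html"},
    "crawl_time": datetime.now(timezone.utc).isoformat(),
}

# Dispatch depends on status_code: 200 -> follow/index, 4xx -> remove, 3xx/5xx -> retry tomorrow.
crawl_manager.process_task(ujson.dumps(crawl_response).encode("utf8"))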