def maintenance_notice(self, task_id, more_info=None):
        """
        Create a new task on the DB with the status maintenance, that signals
        that something is wrong with that task but we can continue processing
        it.

        This can be used for example to signal that the structure of some api
        changed.
        Args:
            task_id: The task id to notice
            more_info: more information about the maintenance notice
        """
        # first() returns None when the task does not exist; get() would
        # raise DoesNotExist instead, making the check below unreachable
        task = Task.objects.filter(task_id=task_id).first()

        if not task:
            raise Exception("No task found with task id %s" % task_id)

        task_data = {
            "status": STATUS_MAINTENANCE,
            "kind": task.kind,
            "options": task.options,
            "params": task.params,
            "type": task.type,
            "user": task.user,
            "more_info": [
                TaskMoreInfo(**{"source": self.__crawler_name__, "created_at": timezone.now(), "details": more_info})
            ],
        }

        Task.create(**task_data)
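
    def _notice_api_change(self, task, ex):
        # Hedged usage sketch for maintenance_notice (the helper name,
        # exception and message here are illustrative assumptions, not
        # project API): flag the task when a crawled page no longer
        # matches the structure we expect, then keep processing.
        self.maintenance_notice(
            task.task_id,
            more_info="unexpected page structure: %s" % ex)
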
    def create_task(
        self,
        kind,
        include_companies=None,
        from_date=None,
        to_date=None,
        crawling_initials=None,
        status=STATUS_CREATED,
        should_fail=False,
    ):
        crawler_options = CRAWLER_OPTIONS.copy()
        # restrict the crawl to the given companies
        crawler_options["include_companies"] = include_companies
        # select which crawler implementation to use
        crawler_options["crawler"] = kind
        # the period whose data should be crawled
        crawler_options["from_date"] = from_date
        crawler_options["to_date"] = to_date
        # restrict the crawl to companies whose names start with these initials
        crawler_options["crawling_initials"] = crawling_initials

        task = {
            "kind": kind,
            "user": "******",
            "type": ON_DEMAND_TASK,
            "status": status,
            # placeholder params for the test task
            "params": {"a": "b"},
            # the options assembled above
            "options": crawler_options,
        }

        return Task.create(**task)
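
    def _example_create_task(self):
        # Hedged usage sketch for create_task (the values are illustrative,
        # reusing the "bovespa" kind and the date format seen in the tests
        # in this file):
        return self.create_task(
            kind="bovespa",
            include_companies=["4170"],
            from_date="2018-01-01T00:00:00.000000Z",
            to_date="2018-12-31T00:00:00.000000Z",
            crawling_initials=["V"],
        )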

# Example #3

    def test_register_differences(self):
        crawler_clazz = CrawlersRegistry().get_crawler("bovespa")

        crawler = crawler_clazz()

        task_data = {
            "user": "******",
            "status": STATUS_CREATED,
            "kind": "bovespa",
            "params": "{}",
            "options": "{}",
            "type": ON_DEMAND_TASK,
        }
        task = Task.create(**task_data)

        bovespa_previous = self._create_bovespa_company(
            "123", "BGDS", "CREATED")
        bovespa_current = self._create_bovespa_company("123", "OpenExchange",
                                                       "CREATED")

        crawler.register_differences(previous_object=bovespa_previous,
                                     current_object=bovespa_current,
                                     task_id=task.task_id)

        task = Task.objects.filter(task_id=task.task_id).first()

        expected = sorted(["created_at", "updated_at", "company_name"])
        updated_fields = sorted(task.updated_fields)
        changed_fields = sorted(task.changed_fields)

        self.assertListEqual(expected, updated_fields)
        self.assertListEqual([], task.deleted_fields)
        self.assertListEqual([], task.inserted_fields)
        self.assertListEqual(expected, changed_fields)
        self.assertIsNotNone(task.differences_from_last_version)

    def test_write_to_task(self):
        @TimeIt()
        def method_to_time_default_parameter(a, b, c, d, execution_times=None):
            # simulate some work so there is a measurable duration to record
            time.sleep(0.1)

        execution_times = []
        for _ in range(10):
            method_to_time_default_parameter(0, 1, 2, 3, execution_times)

        task_data = {
            "user": "******",
            "status": 1,
            "kind": "bovespa",
            "options": "{}",
            "params": "{}",
            "type": 2,
            "more_info": [
                TaskMoreInfo(source="test",
                             created_at=timezone.now(),
                             details="a")
            ],
        }
        task = Task.create(**task_data)
        task = Task.objects.get(task_id=task.task_id)
        TimeIt.write_times_to_more_info(task, execution_times)

        task = Task.objects.get(task_id=task.task_id)
        # the 10 recorded execution times plus the pre-existing more_info entry
        self.assertEqual(len(task.more_info), 11)
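
    def _example_time_it_sketch(self):
        # A minimal sketch of a TimeIt-like decorator, inferred from the
        # test above (an assumption, not the project's implementation):
        # time each call and append the duration to the mutable
        # execution_times list supplied by the caller.
        import functools

        def time_it(func):
            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                start = time.monotonic()
                result = func(*args, **kwargs)
                times = kwargs.get("execution_times",
                                   args[-1] if args else None)
                if isinstance(times, list):
                    times.append(time.monotonic() - start)
                return result

            return wrapper

        @time_it
        def sample(a, b, execution_times=None):
            time.sleep(0.01)

        durations = []
        sample(1, 2, durations)
        self.assertEqual(len(durations), 1)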

# Example #5

    def create_task(
        self,
        kind,
        include_companies=None,
        from_date=None,
        to_date=None,
        crawling_initials=None,
        status=STATUS_CREATED,
        should_fail=False,
    ):
        crawler_options = CRAWLER_OPTIONS.copy()
        # restrict the crawl to the given companies
        crawler_options["include_companies"] = include_companies
        # select which crawler implementation to use
        crawler_options["crawler"] = kind
        # the period whose data should be crawled
        crawler_options["from_date"] = from_date
        crawler_options["to_date"] = to_date
        # restrict the crawl to companies whose names start with these initials
        crawler_options["crawling_initials"] = crawling_initials

        bovespa_crawler = BovespaCrawler()
        bovespa_crawler.crawl_params(TestProducer(), **crawler_options)

        # the crawler reports each unit of work through the producer, which
        # records (params, options) tuples on the module-level test_queue
        for params, options in test_queue:
            task = {
                "options": options,
                "params": params,
                "kind": kind,
                "user": "******",
                "type": ON_DEMAND_TASK,
                "status": status,
            }

            if should_fail:
                # point the task at a nonexistent crawler kind so that
                # processing the task fails downstream
                task["kind"] = "something_else"
            Task.create(**task)
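
# A hedged sketch of the TestProducer/test_queue scaffolding used above
# (the names come from the snippet; add_crawl_params is an assumption about
# the hook the crawler invokes): the producer records each (params, options)
# pair on a module-level list that create_task later drains.
test_queue = []

class TestProducer:
    def add_crawl_params(self, params, options):
        # stored in the order create_task reads them back:
        # params first, options second
        test_queue.append((params, options))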

# Example #6

    def test_register_differences_already_computed(self):
        crawler_clazz = CrawlersRegistry().get_crawler("bovespa")

        crawler = crawler_clazz()

        task_data = {
            "user": "******",
            "status": STATUS_CREATED,
            "kind": "bovespa",
            "params": "{}",
            "options": "{}",
            "type": ON_DEMAND_TASK,
        }
        task = Task.create(**task_data)

        already_computed = {
            "all": {
                "updates": {
                    "created_at": {
                        "new_value": 1,
                        "old_value": 0
                    }
                }
            },
            "inserts": [],
            "updates": ["created_at"],
            "deletes": [],
        }
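        # this dict appears to mirror the diff structure register_differences
        # computes itself: per-field old/new values under "all", plus flat
        # lists of inserted/updated/deleted field names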
        crawler.register_differences(already_computed_diff=already_computed,
                                     task_id=task.task_id)

        task = Task.objects.filter(task_id=task.task_id).first()

        expected = sorted(["created_at"])
        updated_fields = sorted(task.updated_fields)
        changed_fields = sorted(task.changed_fields)

        self.assertListEqual(expected, updated_fields)
        self.assertListEqual([], task.deleted_fields)
        self.assertListEqual([], task.inserted_fields)
        self.assertListEqual(expected, changed_fields)
        self.assertIsNotNone(task.differences_from_last_version)

    def test_create_and_query(self):
        total = Task.objects.all()

        assert len(total) == 0

        options = {
            "workers_num": "4",
            "chromium_bin_file": settings.CHROMIUM_BIN_FILE,
            "cached_dir": "gs://my_crawler_cache",
            "local_dir": "fs:///data/crawler_one/local",
            "included_companies": '["4170", "14249"]',
            "from_date": "2018-01-01T00:00:00.000000Z",
            "crawling_initials": '["V", "P"]',
        }

        params = {"ccvm": 4170, "doc_type": "DFP", "fiscal_date": "2018-12-31", "version": "2.0"}

        task_data = {
            "user": "user1",
            "status": STATUS_CREATED,
            "kind": "bovespa",
            "options": options,
            "params": params,
            "type": ON_DEMAND_TASK,
        }
        Task.create(**task_data)
        # create two more tasks for other users, so that the search for
        # "user:user1" below matches exactly one of the three tasks
        task_data["user"] = "user2"
        Task.create(**task_data)
        task_data["user"] = "user3"
        Task.create(**task_data)
        total = Task.objects.all()

        assert len(total) == 3

        # give the search index a moment to absorb the writes before querying
        time.sleep(1)

        # query the search index for the tasks that belong to user1
        paginator = (
            CaravaggioSearchPaginator(query_string="user:user1", limit=1000, max_limit=1000)
            .models(Task)
            .select("task_id*")
        )
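        # only task_id is selected from the search index; the full Task rows
        # are re-read from the primary store below, so the assertions check
        # authoritative data rather than index documents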

        all_tasks = []
        while paginator.has_next():
            paginator.next()
            results = paginator.get_results()
            for d in results:
                all_tasks.append(d.task_id)

        all_tasks = Task.objects.filter(task_id__in=all_tasks).all()
        assert len(all_tasks) == 1
        for task in all_tasks:
            task_params = json.loads(task.params)
            assert isinstance(task_params, dict)
            assert len(task_params) == len(params)
            assert task_params == params

            task_options = json.loads(task.options)
            assert isinstance(task_options, dict)
            assert len(task_options) == len(options)
            assert task_options == options