Example #1
    def wipe(self):
        logger.info("Wiping process data for %s.", self.process_id)
        request(
            "POST",
            urljoin(settings.KINGFISHER_PROCESS_URL, "/api/v1/wipe_collection"),
            json={"collection_id": self.process_id},
            error_msg="Unable to wipe PROCESS",
            consume_exception=True,
        )
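All of these examples go through a shared request helper that is not shown here. Judging from the call sites, it wraps requests.request, raises the RecoverableException seen in Example #3 for connection errors and non-2xx responses, and logs instead of raising when consume_exception=True. A minimal sketch under those assumptions (the real helper may differ):

import logging

import requests

logger = logging.getLogger(__name__)


class RecoverableException(Exception):
    """An error that a retry of the task may resolve."""


def request(method, url, *, error_msg=None, consume_exception=False, **kwargs):
    # Hypothetical reconstruction from the call sites in these examples.
    try:
        response = requests.request(method, url, **kwargs)
        response.raise_for_status()  # raises requests.HTTPError on 4xx/5xx
        return response
    except requests.RequestException as e:
        message = error_msg or f"{method} {url} failed"
        if consume_exception:
            logger.exception(message)
            return None
        raise RecoverableException(message) from e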
Example #2
    def run(self):
        name = self.get_pelican_dataset_name()

        request(
            "POST",
            urljoin(settings.PELICAN_FRONTEND_URL, "/datasets/"),
            json={"name": name, "collection_id": self.collection_id},
            error_msg=f"Publication {self.job.collection}: Pelican: Unable to create dataset with name {name!r} and "
            f"collection ID {self.collection_id}",
        )
Example #3
    def get_status(self):
        job_id = self.job.context.get("job_id")
        process_id = self.job.context.get("process_id")

        response = request(
            "GET",
            urljoin(self.host, "listjobs.json"),
            params={"project": self.project},
            error_msg=f"Unable to get status of collect job #{job_id}",
        )

        json = response.json()

        if json.get("status") == "error":
            raise Exception(json)

        if any(n["id"] == job_id for n in json.get("pending", [])):
            return Task.Status.WAITING

        # The log file does not exist if the job is pending.
        if not process_id:
            log = self.job.context.get("scrapy_log")

            try:
                response = request("GET", log, error_msg=f"Unable to read scrapy log {log}")
            except RecoverableException as e:
                cause = e.__cause__
                # If the request for the log file returns a 404, something went wrong with Scrapy.
                # The file was probably lost, and the job will never be able to continue.
                if isinstance(cause, HTTPError) and cause.response.status_code == 404:
                    raise Exception("Scrapy log file doesn't exist") from e
                raise

            # Must match
            # https://github.com/open-contracting/kingfisher-collect/blob/7b386e8e7a198a96b733e2d8437a814632db4def/kingfisher_scrapy/extensions.py#L541
            m = re.search("Created collection (.+) in Kingfisher Process", response.text)
            process_id = m.group(1) if m else None

            self.job.context["process_id"] = process_id
            self.job.save()

        if any(n["id"] == job_id for n in json.get("running", [])):
            return Task.Status.RUNNING

        if any(n["id"] == job_id for n in json.get("finished", [])):
            if not process_id:
                raise Exception("Process id is not set")

            return Task.Status.COMPLETED

        raise RecoverableException("Collect job is in an undefined state")
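Two external formats this method depends on are worth spelling out. listjobs.json is Scrapyd's job-listing endpoint; per Scrapyd's documented API, it groups jobs by state, which is what the membership tests above rely on (IDs illustrative):

listjobs_response = {
    "status": "ok",
    "pending": [{"id": "78391cc0fcaf11e1b0090800272a6d06"}],
    "running": [{"id": "422e608f9f28cef127b3d5ef93fe9399"}],
    "finished": [{"id": "2f16646cfcaf11e1b0090800272a6d06"}],
}

And the regex only finds a process ID if Kingfisher Collect's extension logged its "Created collection" line; a synthetic example of a matching line (timestamp and ID invented):

import re

line = "2024-01-01 00:00:00 [kingfisher] INFO: Created collection 123 in Kingfisher Process"
m = re.search("Created collection (.+) in Kingfisher Process", line)
assert m.group(1) == "123"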
Example #4
def update_collection_availability(job):
    # Read the ID before the try block, so the error message below can always use it.
    pelican_id = job.context.get("pelican_id")
    try:
        response = request(
            "GET",
            urljoin(settings.PELICAN_FRONTEND_URL, f"/datasets/{pelican_id}/coverage/"),
        )
    except Exception as e:
        raise Exception(
            f"Publication {job.collection}: Pelican: Unable to get coverage of dataset {pelican_id}"
        ) from e

    counts = response.json()

    job.tenders_count = counts.get("tenders")
    job.tenderers_count = counts.get("tenderers")
    job.tenders_items_count = counts.get("tenders_items")
    job.parties_count = counts.get("parties")
    job.awards_count = counts.get("awards")
    job.awards_items_count = counts.get("awards_items")
    job.awards_suppliers_count = counts.get("awards_suppliers")
    job.contracts_count = counts.get("contracts")
    job.contracts_items_count = counts.get("contracts_items")
    job.contracts_transactions_count = counts.get("contracts_transactions")
    job.documents_count = counts.get("documents")
    job.plannings_count = counts.get("plannings")
    job.milestones_count = counts.get("milestones")
    job.amendments_count = counts.get("amendments")
    job.save()
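The shape of the /coverage/ payload can be inferred from the fields read above: a flat JSON object keyed by the same names, with integer counts. An illustrative (invented) excerpt:

coverage = {
    "tenders": 1500,
    "tenderers": 160,
    "awards": 1200,
    # ...and so on for the remaining keys mirrored into job fields above
}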
Example #5
    def run(self):
        response = request(
            "POST",
            urljoin(self.host, "schedule.json"),
            data={
                "project": self.project,
                "spider": self.spider,
                "steps": "compile",  # no "check"
            },
            error_msg=f"Unable to schedule scraping for project {self.project} and spider {self.spider}",
        )

        json = response.json()
        if json.get("status") == "error":
            raise Exception(json)

        job_id = json.get("jobid")

        self.job.context["job_id"] = job_id
        self.job.context["spider"] = self.spider
        self.job.context["scrapy_log"] = urljoin(
            self.host, f"logs/{self.project}/{self.spider}/{job_id}.log")
        self.job.save()

        self.collection.last_update = date.today()
        self.collection.save()
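schedule.json is Scrapyd's scheduling endpoint; form fields beyond the reserved ones (project, spider, setting, and so on) are passed through to the spider as arguments, which is presumably how "steps" reaches the crawl. Per Scrapyd's documented API, a successful response carries the job ID that the rest of the pipeline tracks (value illustrative):

schedule_response = {"status": "ok", "jobid": "6487ec79947edab326d6db28a2d86511e8247444"}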
Example #6
    def wipe(self):
        logger.info("Wiping Pelican data for %s.", self.collection_id)
        try:
            pelican_id = self.get_pelican_id()
        except RecoverableException:
            pelican_id = None

        if not pelican_id:
            logger.warning("Unable to wipe PELICAN - pelican_id is not set")
            return

        request(
            "DELETE",
            urljoin(settings.PELICAN_FRONTEND_URL, f"/datasets/{pelican_id}/"),
            error_msg=f"Publication {self.job.collection}: Pelican: Unable to wipe dataset with ID {pelican_id}",
            consume_exception=True,
        )
Example #7
    def get_status(self):
        pelican_id = self.get_pelican_id()
        if not pelican_id:
            return Task.Status.WAITING

        response = request(
            "GET",
            urljoin(settings.PELICAN_FRONTEND_URL, f"/datasets/{pelican_id}/status/"),
            error_msg=f"Publication {self.job.collection}: Pelican: Unable to get status of dataset {pelican_id}",
        )

        json = response.json()
        if not json:
            return Task.Status.WAITING
        if json["phase"] == "CHECKED" and json["state"] == "OK":
            return Task.Status.COMPLETED
        return Task.Status.RUNNING
Example #8
    def get_pelican_id(self):
        pelican_id = self.job.context.get("pelican_id")
        if not pelican_id:
            name = self.get_pelican_dataset_name()

            response = request(
                "GET",
                urljoin(settings.PELICAN_FRONTEND_URL, "/datasets/find_by_name/"),
                params={"name": name},
                error_msg=f"Publication {self.job.collection}: Pelican: Unable to get ID for name {name!r}",
            )

            pelican_id = response.json().get("id")
            if pelican_id:
                self.job.context["pelican_id"] = pelican_id
                self.job.save()

        return pelican_id
Example #9
def update_collection_metadata(job):
    # Read the ID before the try block, so the error message below can always use it.
    pelican_id = job.context.get("pelican_id")
    try:
        response = request(
            "GET",
            urljoin(settings.PELICAN_FRONTEND_URL, f"/datasets/{pelican_id}/metadata/"),
        )
    except Exception as e:
        raise Exception(
            f"Publication {job.collection}: Pelican: Unable to get metadata of dataset {pelican_id}"
        ) from e

    meta = response.json()

    if meta:
        job.date_from = parse_date(meta.get("published_from"))
        job.date_to = parse_date(meta.get("published_to"))
        job.license = meta.get("data_license") or ""
        job.ocid_prefix = meta.get("ocid_prefix")
        job.save()
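parse_date is another helper that is not shown. A minimal sketch, assuming Pelican reports published_from/published_to as ISO-formatted strings that may be missing:

from datetime import datetime


def parse_date(value):
    # Hypothetical sketch; the real helper is defined elsewhere in the codebase.
    if not value:
        return None
    return datetime.strptime(value[:10], "%Y-%m-%d").date()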
Example #10
    def get_status(self):
        response = request(
            "GET",
            urljoin(settings.KINGFISHER_PROCESS_URL,
                    f"/api/v1/tree/{self.process_id}/"),
            error_msg=f"Unable to get status of process #{self.process_id}",
        )

        json = response.json()

        compile_releases = next(n for n in json if n.get("transform_type") == "compile-releases")
        is_last_completed = compile_releases.get("completed_at") is not None

        if "process_id_pelican" not in self.job.context:
            self.job.context["process_id_pelican"] = compile_releases.get("id")
            self.job.context["process_data_version"] = compile_releases.get("data_version")
            self.job.save()

        return Task.Status.COMPLETED if is_last_completed else Task.Status.RUNNING
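The /api/v1/tree/ endpoint evidently returns a list of collection nodes; the fields read above imply roughly this shape (values invented). Note that next(...) is called without a default, so a tree with no compile-releases node raises StopIteration:

tree_response = [
    {"id": 1, "transform_type": "", "completed_at": None, "data_version": "2024-01-01 00:00:00"},
    {
        "id": 2,
        "transform_type": "compile-releases",
        "completed_at": "2024-01-02 03:04:05",
        "data_version": "2024-01-01 00:00:00",
    },
]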