def wipe(self): logger.info("Wiping process data for %s.", self.process_id) request( "POST", urljoin(settings.KINGFISHER_PROCESS_URL, "/api/v1/wipe_collection"), json={"collection_id": self.process_id}, error_msg="Unable to wipe PROCESS", consume_exception=True, )
def run(self):
    name = self.get_pelican_dataset_name()
    request(
        "POST",
        urljoin(settings.PELICAN_FRONTEND_URL, "/datasets/"),
        json={"name": name, "collection_id": self.collection_id},
        error_msg=f"Publication {self.job.collection}: Pelican: Unable to create dataset with name {name!r} and "
        f"collection ID {self.collection_id}",
    )
def get_status(self):
    job_id = self.job.context.get("job_id")
    process_id = self.job.context.get("process_id")

    response = request(
        "GET",
        urljoin(self.host, "listjobs.json"),
        params={"project": self.project},
        error_msg=f"Unable to get status of collect job #{job_id}",
    )

    json = response.json()
    if json.get("status") == "error":
        raise Exception(json)

    if any(n["id"] == job_id for n in json.get("pending", [])):
        return Task.Status.WAITING

    # The log file does not exist if the job is pending.
    if not process_id:
        log = self.job.context.get("scrapy_log")

        try:
            response = request("GET", log, error_msg=f"Unable to read Scrapy log {log}")
        except RecoverableException as e:
            # If requesting the log file returns a 404, something went wrong with Scrapy.
            # The file was probably lost, and the job will never be able to continue.
            if isinstance(e.__cause__, HTTPError) and e.__cause__.response.status_code == 404:
                raise Exception("Scrapy log file doesn't exist") from e
            raise

        # Must match
        # https://github.com/open-contracting/kingfisher-collect/blob/7b386e8e7a198a96b733e2d8437a814632db4def/kingfisher_scrapy/extensions.py#L541
        m = re.search("Created collection (.+) in Kingfisher Process", response.text)
        process_id = m.group(1) if m else None
        self.job.context["process_id"] = process_id
        self.job.save()

    if any(n["id"] == job_id for n in json.get("running", [])):
        return Task.Status.RUNNING

    if any(n["id"] == job_id for n in json.get("finished", [])):
        if not process_id:
            raise Exception("Process ID is not set")
        return Task.Status.COMPLETED

    raise RecoverableException("Collect job is in an undefined state")
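# For reference, Scrapyd's listjobs.json returns the job lists consumed above,
# e.g. (abridged):
#
#     {"status": "ok",
#      "pending": [{"id": "78391cc0fcaf11e1b0090800272a6d06", "spider": "spider1"}],
#      "running": [], "finished": []}
#
# and the regex pulls the collection ID out of a log line such as
# "Created collection 123 in Kingfisher Process", leaving process_id == "123".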
def update_collection_availability(job):
    try:
        pelican_id = job.context.get("pelican_id")
        response = request("GET", urljoin(settings.PELICAN_FRONTEND_URL, f"/datasets/{pelican_id}/coverage/"))
    except Exception as e:
        raise Exception(f"Publication {job.collection}: Pelican: Unable to get coverage of dataset {pelican_id}") from e

    counts = response.json()

    job.tenders_count = counts.get("tenders")
    job.tenderers_count = counts.get("tenderers")
    job.tenders_items_count = counts.get("tenders_items")
    job.parties_count = counts.get("parties")
    job.awards_count = counts.get("awards")
    job.awards_items_count = counts.get("awards_items")
    job.awards_suppliers_count = counts.get("awards_suppliers")
    job.contracts_count = counts.get("contracts")
    job.contracts_items_count = counts.get("contracts_items")
    job.contracts_transactions_count = counts.get("contracts_transactions")
    job.documents_count = counts.get("documents")
    job.plannings_count = counts.get("plannings")
    job.milestones_count = counts.get("milestones")
    job.amendments_count = counts.get("amendments")
    job.save()
def run(self):
    response = request(
        "POST",
        urljoin(self.host, "schedule.json"),
        data={
            "project": self.project,
            "spider": self.spider,
            "steps": "compile",  # no "check"
        },
        error_msg=f"Unable to schedule scraping for project {self.project} and spider {self.spider}",
    )

    json = response.json()
    if json.get("status") == "error":
        raise Exception(json)

    job_id = json.get("jobid")
    self.job.context["job_id"] = job_id
    self.job.context["spider"] = self.spider
    self.job.context["scrapy_log"] = urljoin(self.host, f"logs/{self.project}/{self.spider}/{job_id}.log")
    self.job.save()

    self.collection.last_update = date.today()
    self.collection.save()
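# For reference, a successful schedule.json call returns JSON like
# {"status": "ok", "jobid": "6487ec79947edab326d6db28a2d86511e8247444"}, while
# failures come back as {"status": "error", "message": "..."} - hence the
# status check above. The scrapy_log URL follows Scrapyd's default
# logs/<project>/<spider>/<jobid>.log layout, which get_status() reads later.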
def wipe(self): logger.info("Wiping Pelican data for %s.", self.collection_id) try: pelican_id = self.get_pelican_id() except RecoverableException: logger.warning("Unable to wipe PELICAN - pelican_id is not set") return if not pelican_id: logger.warning("Unable to wipe PELICAN - pelican_id is not set") return request( "DELETE", urljoin(settings.PELICAN_FRONTEND_URL, f"/datasets/{pelican_id}/"), error_msg=f"Publication {self.job.collection}: Pelican: Unable to wipe dataset with ID {pelican_id}", consume_exception=True, )
def get_status(self):
    pelican_id = self.get_pelican_id()
    if not pelican_id:
        return Task.Status.WAITING

    response = request(
        "GET",
        urljoin(settings.PELICAN_FRONTEND_URL, f"/datasets/{pelican_id}/status/"),
        error_msg=f"Publication {self.job.collection}: Pelican: Unable to get status of dataset {pelican_id}",
    )

    json = response.json()
    if not json:
        return Task.Status.WAITING
    if json["phase"] == "CHECKED" and json["state"] == "OK":
        return Task.Status.COMPLETED
    return Task.Status.RUNNING
def get_pelican_id(self):
    pelican_id = self.job.context.get("pelican_id")
    if not pelican_id:
        name = self.get_pelican_dataset_name()
        response = request(
            "GET",
            urljoin(settings.PELICAN_FRONTEND_URL, "/datasets/find_by_name/"),
            params={"name": name},
            error_msg=f"Publication {self.job.collection}: Pelican: Unable to get ID for name {name!r}",
        )
        pelican_id = response.json().get("id")
        if pelican_id:
            self.job.context["pelican_id"] = pelican_id
            self.job.save()
    return pelican_id
def update_collection_metadata(job):
    try:
        pelican_id = job.context.get("pelican_id")
        response = request("GET", urljoin(settings.PELICAN_FRONTEND_URL, f"/datasets/{pelican_id}/metadata/"))
    except Exception as e:
        raise Exception(f"Publication {job.collection}: Pelican: Unable to get metadata of dataset {pelican_id}") from e

    meta = response.json()
    if meta:
        job.date_from = parse_date(meta.get("published_from"))
        job.date_to = parse_date(meta.get("published_to"))
        job.license = meta.get("data_license") or ""
        job.ocid_prefix = meta.get("ocid_prefix")
        job.save()
def get_status(self):
    response = request(
        "GET",
        urljoin(settings.KINGFISHER_PROCESS_URL, f"/api/v1/tree/{self.process_id}/"),
        error_msg=f"Unable to get status of process #{self.process_id}",
    )

    json = response.json()
    compile_releases = next(n for n in json if n.get("transform_type") == "compile-releases")
    is_last_completed = compile_releases.get("completed_at") is not None

    if "process_id_pelican" not in self.job.context:
        self.job.context["process_id_pelican"] = compile_releases.get("id")
        self.job.context["process_data_version"] = compile_releases.get("data_version")
        self.job.save()

    return Task.Status.COMPLETED if is_last_completed else Task.Status.RUNNING
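# The /api/v1/tree/ endpoint is assumed to return a list of collection nodes;
# a hypothetical, abridged response using only the fields read above:
#
#     [{"id": 1, "transform_type": None, "completed_at": "..."},
#      {"id": 2, "transform_type": "compile-releases",
#       "data_version": "20230101_000000", "completed_at": None}]
#
# The task reports COMPLETED only once the compile-releases node has a
# completed_at timestamp.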