    def check_timeout(self, task_status):
        """
        See how long since task_status last changed.
        """
        if not ("previous_task_status" in vars(self)
                and "previous_task_status_change" in vars(self)):
            self.previous_task_status = task_status
            self.previous_task_status_change = datetime.datetime.now()
            return False
        if self.previous_task_status == task_status:
            # task status has not changed
            # see how long it has been since the last change
            time_now = datetime.datetime.now()
            if time_now > self.previous_task_status_change \
               + datetime.timedelta(minutes=self.timeout):
                logger.info(
                    "{}: reached timeout of {} minutes since last change. Aborting"
                    .format(self.name, self.timeout)
                )
                return True
        else:
            # task status has changed - reset the timer and the previous_task_status
            self.previous_task_status = task_status
            self.previous_task_status_change = datetime.datetime.now()
        return False
    def check_if_finished(self):
        if self.run_mode == "local":
            return self.is_finished
        elif self.parent and self.parent.batch_job_id:
            job_id = self.parent.batch_job_id
            task_status = batch_utils.check_tasks_status(job_id, self.name)

            logger.info(
                "{} job status: success: {} failed: {} running: {} waiting: {} cannot_run: {}"
                .format(self.name, task_status["num_success"],
                        task_status["num_failed"], task_status["num_running"],
                        task_status["num_waiting"],
                        task_status["num_cannot_run"]))
            self.run_status["succeeded"] = task_status["num_success"]
            self.run_status["failed"] = task_status[
                "num_failed"] + task_status["num_cannot_run"]
            num_incomplete = task_status["num_running"] + task_status[
                "num_waiting"]
            self.run_status["incomplete"] = num_incomplete
            self.is_finished = (num_incomplete == 0)

            # if we have exceeded timeout, say that we are finished.
            if self.check_timeout(task_status):
                self.is_finished = True
        return self.is_finished
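
# Hedged sketch: check_if_finished() and check_timeout() above assume that
# batch_utils.check_tasks_status(job_id, task_name) returns a dict of task
# counts.  This stub only illustrates the assumed shape of that return value;
# the real batch_utils implementation may report more information.
def check_tasks_status_stub(job_id, task_name):
    return {
        "num_success": 0,
        "num_failed": 0,
        "num_running": 0,
        "num_waiting": 0,
        "num_cannot_run": 0,
    }
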
    def run(self):
        self.check_config()
        output_dict = {}
        logger.info("{}: getting weather time series".format(self.name))
        weather_time_series = self.get_weather_time_series()
        output_dict[self.weather_collection] = {
            "type": "weather",
            "time-series-data": weather_time_series,
        }
        logger.info("{}: getting vegetation time series".format(self.name))
        veg_time_series = self.get_veg_time_series()
        output_dict[self.veg_collection] = {
            "type": "vegetation",
            "time-series-data": veg_time_series,
        }
        logger.info("{}: checking combined JSON".format(self.name))
        self.check_output_dict(output_dict)
        logger.info("{}: filling metadata dict".format(self.name))
        metadata_dict = self.get_metadata()
        output_dict["metadata"] = metadata_dict
        self.save_json(
            output_dict,
            self.output_filename,
            self.output_location,
            self.output_location_type,
        )

        logger.info("{}: Wrote output to {}".format(
            self.name,
            self.join_path(self.output_location, self.output_filename)))
        self.is_finished = True
    def run(self):
        self.prepare_for_run()

        start_date, end_date = self.date_range
        date_ranges = slice_time_period(start_date, end_date, self.time_per_point)
        download_locations = []
        for date_range in date_ranges:
            mid_date = find_mid_period(date_range[0], date_range[1])
            location = self.join_path(self.output_location, mid_date, "RAW")
            logger.debug("{} Will check for existing files in {}".format(self.name, location))
            if not self.replace_existing_files and self.check_for_existing_files(
                location, self.num_files_per_point
            ):
                continue
            urls = self.prep_data(date_range)
            logger.debug(
                "{}: got URL {} for date range {}".format(self.name, urls, date_range)
            )
            downloaded_ok = self.download_data(urls, location)
            if downloaded_ok:
                self.run_status["succeeded"] += 1
                logger.info("{}: download succeeded for date range {}".format(self.name, date_range))
                download_locations.append(location)
            else:
                self.run_status["failed"] += 1
                logger.error("{}: download did not succeed for date range {}".format(self.name, date_range))
        self.is_finished = True
        return self.run_status
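
# Hedged sketches of the date helpers used in run() above, assuming dates are
# "YYYY-MM-DD" strings and that time_per_point is a plain number of days; the
# real helpers in this project may instead accept period strings such as "1m".
import datetime


def slice_time_period_sketch(start_date, end_date, days_per_slice):
    """Split [start_date, end_date) into consecutive sub-ranges of equal length."""
    start = datetime.datetime.strptime(start_date, "%Y-%m-%d")
    end = datetime.datetime.strptime(end_date, "%Y-%m-%d")
    step = datetime.timedelta(days=days_per_slice)
    ranges = []
    while start + step <= end:
        ranges.append((start.strftime("%Y-%m-%d"),
                       (start + step).strftime("%Y-%m-%d")))
        start += step
    return ranges


def find_mid_period_sketch(start_date, end_date):
    """Return the date halfway between two "YYYY-MM-DD" date strings."""
    start = datetime.datetime.strptime(start_date, "%Y-%m-%d")
    end = datetime.datetime.strptime(end_date, "%Y-%m-%d")
    return (start + (end - start) / 2).strftime("%Y-%m-%d")
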
    def download_data(self, download_urls, download_location):
        """
        Download zip file(s) from GEE to configured output location.

        Parameters
        ----------
        download_urls: list of strings (URLs) from gee_prep_data
        download_location: str, this will generally be <base_dir>/<date>/RAW

        Returns
        -------
        bool, True if downloaded something, False otherwise
        """
        if len(download_urls) == 0:
            logger.info("{}: No URLs found for {}".format(self.name, self.coords))
            return False

        # download files and unzip to temporary directory
        tempdir = tempfile.TemporaryDirectory()
        for download_url in download_urls:
            try:
                download_and_unzip(download_url, tempdir.name)
            except RuntimeError as e:
                logger.error("{}: failed to download {}: {}".format(
                    self.name, download_url, e))
                return False
        logger.debug("{}: Wrote zipfiles to {}".format(self.name, tempdir.name))
        logger.info("{}: Will download to {}".format(self.name, download_location))
        self.copy_to_output_location(tempdir.name, download_location, [".tif"])
        return True
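
# Hedged sketch of the download_and_unzip helper called above, assuming each URL
# points at a zip archive (as GEE download URLs do) and that `requests` is
# available; the project's real helper may differ.
import io
import zipfile

import requests


def download_and_unzip_sketch(url, output_dir):
    """Fetch a zip archive from url and extract its contents into output_dir."""
    response = requests.get(url)
    if response.status_code != 200:
        raise RuntimeError(
            "Download of {} failed with status {}".format(url, response.status_code))
    with zipfile.ZipFile(io.BytesIO(response.content)) as zf:
        zf.extractall(output_dir)
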
    def process_single_date(self, date_string):
        """
        Each date will have a subdirectory called 'SPLIT' with ~400 NDVI
        sub-images.
        """
        # if we are given a list of date strings to process, and this isn't
        # one of them, skip it.
        if self.dates_to_process and date_string not in self.dates_to_process:
            logger.info("{} will not process date {}".format(
                self.name, date_string))
            return True

        # see if there is already a ndvi.json file in
        # the output location - if so, skip
        output_location = self.join_path(self.output_location, date_string,
                                         *(self.output_location_subdirs))
        if (not self.replace_existing_files) and self.check_for_existing_files(
                output_location, self.num_files_per_point):
            return True

        input_path = self.join_path(self.input_location, date_string,
                                    *(self.input_location_subdirs))
        all_input_files = self.list_directory(input_path,
                                              self.input_location_type)
        logger.info("input path is {}".format(input_path))

        # list all the "NDVI" sub-images where RGB image passes quality check
        input_files = [
            filename for filename in all_input_files if "_NDVI" in filename
            and self.check_sub_image(filename, input_path)
        ]

        if len(input_files) == 0:
            logger.info("{}: No sub-images for date {}".format(
                self.name, date_string))
            return
        else:
            logger.info("{}: found {} sub-images".format(
                self.name, len(input_files)))
        # if we only want a subset of sub-images, truncate the list here
        if self.n_sub_images > 0:
            input_files = input_files[:self.n_sub_images]

        ndvi_vals = []
        for ndvi_file in input_files:
            coords_string = find_coords_string(ndvi_file)
            ndvi_dict = self.process_sub_image(
                self.join_path(input_path, ndvi_file), date_string,
                coords_string)
            ndvi_vals.append(ndvi_dict)

        self.save_json(ndvi_vals, "ndvi_values.json", output_location,
                       self.output_location_type)

        return True
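
# Hedged sketch of find_coords_string, assuming file paths embed coordinates as
# a "<long>_<lat>" substring (e.g. "11.53_27.95"); the real helper may use a
# different pattern.
import re


def find_coords_string_sketch(filepath):
    """Return the first "long_lat" coordinate substring in filepath, or None."""
    match = re.search(r"(-?\d{1,3}\.\d+_-?\d{1,3}\.\d+)", filepath)
    return match.group(1) if match else None
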
    def prep_data(self, date_range):
        """
        Interact with the Google Earth Engine API to get an ImageCollection,
        filter it, and convert (e.g. via median or sum) into a list of Images,
        then get the download URLs for those.

        Parameters
        ----------
        date_range: list of strings 'YYYY-MM-DD'.  Note that this will generally
                    be a sub-range of the overall date-range, as this function
                    is called in the loop over time slices.

        Returns
        -------
        url_list:  a list of URLs from which zipfiles can be downloaded from GEE.
        """
        region = get_region_string(self.coords, self.region_size)
        start_date, end_date = date_range

        image_coll = ee.ImageCollection(self.collection_name)
        geom = ee.Geometry.Point(self.coords)

        dataset = image_coll.filterBounds(geom).filterDate(start_date, end_date)
        dataset_size = dataset.size().getInfo()

        if dataset_size == 0:
            logger.warning("No images found in this date range, skipping.")
            return []
        # concrete class will do more filtering, and prepare Images for download
        image_list = self.prep_images(dataset)
        url_list = []
        for image in image_list:
            # get a URL from which we can download the resulting data
            try:
                url = image.getDownloadURL({"region": region, "scale": self.scale})
                url_list.append(url)
            except Exception as e:
                logger.info("Unable to get URL: {}".format(e))

        logger.info(
            "OK   >>> Found {}/{} valid images after cloud filtering.".format(
                len(image_list), dataset_size))
        return url_list
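
# Hedged sketch of get_region_string, assuming coords is [longitude, latitude]
# and region_size is the side length of a square region in degrees; the real
# helper may build the region string differently.
def get_region_string_sketch(coords, region_size):
    """Return a string describing a square polygon centred on coords."""
    left = coords[0] - region_size / 2
    right = coords[0] + region_size / 2
    top = coords[1] + region_size / 2
    bottom = coords[1] - region_size / 2
    return str([[left, top], [right, top], [right, bottom], [left, bottom]])
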
    def process_single_date(self, date_string):
        """
        Read the tif files downloaded from GEE and extract the values
        (should be the same for all pixels in the image, so just take mean())

        Parameters
        ----------
        date_string: str, format "YYYY-MM-DD"

        """
        metrics_dict = {}
        # if we are given a list of date strings to process, and this isn't
        # one of them, skip it.
        if self.dates_to_process and date_string not in self.dates_to_process:
            logger.info("{} will not process date {}".format(
                self.name, date_string))
            return True
        logger.info("{}: Processing date {}".format(self.name, date_string))
        input_location = self.join_path(self.input_location, date_string,
                                        *(self.input_location_subdirs))
        for filename in self.list_directory(input_location,
                                            self.input_location_type):
            if filename.endswith(".tif"):
                # filenames have the form "<prefix>.<variable>.tif", so the
                # middle component gives the weather variable name
                name_variable = (filename.split("."))[1]
                variable_array = cv.imread(
                    self.get_file(self.join_path(input_location, filename),
                                  self.input_location_type),
                    cv.IMREAD_ANYDEPTH,
                )

                metrics_dict[name_variable] = variable_array.mean().astype(
                    np.float64)
        self.save_json(
            metrics_dict,
            "weather_data.json",
            self.join_path(self.output_location, date_string,
                           *(self.output_location_subdirs)),
            self.output_location_type,
        )
        return True
    def run_local(self):
        """
        loop over dates and call process_single_date on all of them.
        """
        logger.info("{}: Running local".format(self.name))
        if "dates_to_process" in vars(self) and len(self.dates_to_process) > 0:
            date_strings = self.dates_to_process
        else:
            date_strings = sorted(
                self.list_directory(self.input_location,
                                    self.input_location_type))

        for date_string in date_strings:
            date_regex = r"[\d]{4}-[\d]{2}-[\d]{2}"
            if not re.search(date_regex, date_string):
                logger.info("{}: {} not a date string".format(
                    self.name, date_string))
                continue
            logger.debug(
                "{}: date string {} input exists {} output exists {}".format(
                    self.name,
                    date_string,
                    self.check_input_data_exists(date_string),
                    self.check_output_data_exists(date_string),
                ))

            if self.check_input_data_exists(
                    date_string
            ) and not self.check_output_data_exists(date_string):
                succeeded = self.process_single_date(date_string)
                if succeeded:
                    self.run_status["succeeded"] += 1
                else:
                    self.run_status["failed"] += 1
        self.is_finished = True
        return self.run_status
    def save_rgb_image(self, band_dict, date_string, coords_string):
        """
        Merge the separate tif files for the R,G,B bands into
        one image, and save it.
        """
        logger.info("{}: Saving RGB image for {} {}".format(
            self.name, date_string, coords_string))
        rgb_image = convert_to_rgb(band_dict)

        # check image quality on the colour image
        if not check_image_ok(rgb_image, 1.0):
            logger.info("Detected a low quality image, skipping to next date.")
            return False
        rgb_filepath = self.construct_image_savepath(date_string,
                                                     coords_string, "RGB")
        logger.info("Will save image to {} / {}".format(
            os.path.dirname(rgb_filepath), os.path.basename(rgb_filepath)))
        self.save_image(rgb_image, os.path.dirname(rgb_filepath),
                        os.path.basename(rgb_filepath))
        if self.split_RGB_images:
            self.split_and_save_sub_images(rgb_image, date_string,
                                           coords_string, "RGB")
        return True
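
# Hedged sketch of convert_to_rgb, assuming band_dict maps "r"/"g"/"b" to
# {"band": <band name>, "filename": <path to a single-band tif>} as built in
# process_single_date below, and that min-max scaling each band to 8 bits is an
# acceptable normalisation; the real conversion may differ.
import cv2 as cv
import numpy as np


def convert_to_rgb_sketch(band_dict):
    """Merge three single-band tif files into one 8-bit colour image."""
    channels = []
    for col in ["b", "g", "r"]:  # OpenCV stores colour images in BGR order
        band_image = cv.imread(band_dict[col]["filename"], cv.IMREAD_ANYDEPTH)
        scaled = cv.normalize(band_image, None, 0, 255, cv.NORM_MINMAX)
        channels.append(scaled.astype(np.uint8))
    return cv.merge(channels)
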
    def process_single_date(self, date_string):
        """
        Each date will have a subdirectory called 'SPLIT' with ~400 BWNDVI
        sub-images.
        """
        logger.info("{}: processing {}".format(self.name, date_string))
        # if we are given a list of date strings to process, and this isn't
        # one of them, skip it.
        if self.dates_to_process and date_string not in self.dates_to_process:
            logger.info("{} will not process date {}".format(
                self.name, date_string))
            return True
        # see if there is already a network_centralities.json file in
        # the output location - if so, skip
        output_location = self.join_path(self.output_location, date_string,
                                         *(self.output_location_subdirs))
        if (not self.replace_existing_files) and self.check_for_existing_files(
                output_location, self.num_files_per_point):
            return True

        input_path = self.join_path(self.input_location, date_string,
                                    *(self.input_location_subdirs))
        all_input_files = self.list_directory(input_path,
                                              self.input_location_type)

        # list all the "BWNDVI" sub-images where RGB image passes quality check
        input_files = [
            filename for filename in all_input_files if "BWNDVI" in filename
            and self.check_sub_image(filename, input_path)
        ]
        if len(input_files) == 0:
            logger.info("{}: No sub-images for date {}".format(
                self.name, date_string))
            return
        else:
            logger.info("{} found {} sub-images".format(
                self.name, len(input_files)))
        tmp_json_dir = tempfile.mkdtemp()

        # if we only want a subset of sub-images, truncate the list here
        if self.n_sub_images > 0:
            input_files = input_files[:self.n_sub_images]

        # create a multiprocessing pool to handle each sub-image in parallel
        with Pool(processes=self.n_threads) as pool:
            # prepare the arguments for the process_sub_image function
            arguments = [(
                i,
                self.get_file(self.join_path(input_path, filename),
                              self.input_location_type),
                tmp_json_dir,
                date_string,
                find_coords_string(filename),
            ) for i, filename in enumerate(input_files)]
            pool.starmap(process_sub_image, arguments)
        # put all the output json files for subimages together into one for this date
        logger.info("\n Consolidating json from all subimages")
        all_subimages = consolidate_json_to_list(tmp_json_dir)
        self.save_json(
            all_subimages,
            "network_centralities.json",
            output_location,
            self.output_location_type,
        )
        shutil.rmtree(tmp_json_dir)
        return True
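
# Hedged sketch of consolidate_json_to_list, assuming each sub-image worker in
# the pool above writes one .json file into the temporary directory; the real
# helper may aggregate the results differently.
import json
import os


def consolidate_json_to_list_sketch(json_dir):
    """Load every .json file in json_dir and return their contents as one list."""
    results = []
    for filename in sorted(os.listdir(json_dir)):
        if filename.endswith(".json"):
            with open(os.path.join(json_dir, filename)) as json_file:
                results.append(json.load(json_file))
    return results
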
    def process_single_date(self, date_string):
        """
        For a single set of .tif files corresponding to a date range
        (normally a sub-range of the full date range for the pipeline),
        construct RGB, and NDVI greyscale images.
        Then do processing and thresholding to make black+white NDVI images.
        Split the RGB and black+white NDVI ones into small (50x50pix)
        sub-images.

        Parameters
        ==========
        date_string: str, format YYYY-MM-DD

        Returns
        =======
        True if everything was processed and saved OK, False otherwise.
        """
        # first see if there are already files in the output location
        # (in which case we can skip this date)

        # normally the coordinates will be part of the file path
        coords_string = find_coords_string(self.input_location)
        # if not though, we might have coords set explicitly
        if (not coords_string) and "coords" in vars(self):
            coords_string = "{}_{}".format(self.coords[0], self.coords[1])

        if not (coords_string and date_string):
            raise RuntimeError(
                "{}: coords and date need to be defined, through file path or explicitly set"
                .format(self.name))

        output_location = os.path.dirname(
            self.construct_image_savepath(date_string, coords_string))
        if (not self.replace_existing_files) and self.check_for_existing_files(
                output_location, self.num_files_per_point):
            return True

        # If no files already there, proceed.
        input_filepath = self.join_path(self.input_location, date_string,
                                        *(self.input_location_subdirs))
        logger.info("{} processing files in {}".format(self.name,
                                                       input_filepath))
        filenames = [
            filename for filename in self.list_directory(
                input_filepath, self.input_location_type)
            if filename.endswith(".tif")
        ]
        if len(filenames) == 0:
            return True

        # extract this to feed into `convert_to_rgb()`
        band_dict = {}
        for icol, col in enumerate("rgb"):
            band = self.RGB_bands[icol]
            filename = self.get_file(
                self.join_path(input_filepath, "download.{}.tif".format(band)),
                self.input_location_type,
            )
            band_dict[col] = {"band": band, "filename": filename}

        logger.info(filenames)
        tif_filebase = self.join_path(input_filepath,
                                      filenames[0].split(".")[0])

        # save the rgb image
        rgb_ok = self.save_rgb_image(band_dict, date_string, coords_string)
        if not rgb_ok:
            logger.info("Problem with the rgb image?")
            return False

        # save the NDVI image
        ndvi_tif = self.get_file(
            self.join_path(input_filepath, "download.NDVI.tif"),
            self.input_location_type)
        ndvi_image = scale_tif(ndvi_tif)
        ndvi_filepath = self.construct_image_savepath(date_string,
                                                      coords_string, "NDVI")
        self.save_image(ndvi_image, os.path.dirname(ndvi_filepath),
                        os.path.basename(ndvi_filepath))

        # preprocess and threshold the NDVI image
        processed_ndvi = process_and_threshold(ndvi_image)
        ndvi_bw_filepath = self.construct_image_savepath(
            date_string, coords_string, "BWNDVI")
        self.save_image(
            processed_ndvi,
            os.path.dirname(ndvi_bw_filepath),
            os.path.basename(ndvi_bw_filepath),
        )

        # split and save sub-images
        self.split_and_save_sub_images(ndvi_image, date_string, coords_string,
                                       "NDVI")

        self.split_and_save_sub_images(processed_ndvi, date_string,
                                       coords_string, "BWNDVI")

        return True
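
# Hedged sketch of process_and_threshold, assuming the NDVI image is an 8-bit
# single-channel array and that median smoothing followed by adaptive
# thresholding is an acceptable stand-in for the project's real processing.
import cv2 as cv


def process_and_threshold_sketch(ndvi_image):
    """Return a black-and-white version of a greyscale NDVI image."""
    smoothed = cv.medianBlur(ndvi_image, 5)
    return cv.adaptiveThreshold(
        smoothed, 255, cv.ADAPTIVE_THRESH_MEAN_C, cv.THRESH_BINARY, 11, 2)
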
    def run_batch(self):
        """"
        Write a config json file for each set of dates.
        If this module depends on another module running in batch, we first
        get the tasks on which this modules tasks will depend on.
        If not, we look at the input dates subdirectories and divide them up
        amongst the number of batch nodes.

        We want to create a list of dictionaries
        [{"task_id": <task_id>, "config": <config_dict>, "depends_on": [<task_ids>]}]
        to pass to the batch_utils.submit_tasks function.
        """

        logger.info("{} running in batch".format(self.name))
        self.all_tasks_submitted = False
        self.start_time = datetime.datetime.now()
        task_dicts = []
        task_dependencies = self.get_dependent_batch_tasks()

        if len(task_dependencies) == 0:
            # divide up the dates
            date_strings = sorted(
                self.list_directory(self.input_location,
                                    self.input_location_type))
            logger.info("number of date strings in input location {}".format(
                len(date_strings)))
            date_strings = [
                ds for ds in date_strings if self.check_input_data_exists(ds)
            ]
            logger.info("number of date strings with input data {}".format(
                len(date_strings)))
            date_strings = [
                ds for ds in date_strings
                if not self.check_output_data_exists(ds)
            ]
            logger.info("number of date strings without output data {}".format(
                len(date_strings)))
            # split these dates up over the batch tasks
            if self.n_batch_tasks > 0:
                n_batch_tasks = self.n_batch_tasks
            else:
                n_batch_tasks = len(date_strings)
            dates_per_task = assign_dates_to_tasks(date_strings, n_batch_tasks)
            # create a config dict for each task - will correspond to configuration for an
            # instance of this Module.

            for i in range(len(dates_per_task)):
                task_dict = self.create_task_dict("{}_{}".format(self.name, i),
                                                  dates_per_task[i])
                logger.debug("{} adding task_dict {} to list".format(
                    self.name, task_dict))
                task_dicts.append(task_dict)
        else:
            # we have a bunch of tasks from the previous Module in the Sequence
            for i, (k, v) in enumerate(task_dependencies.items()):
                # key k will be the task_id of the old task.  v will be the list of dates.
                task_dict = self.create_task_dict("{}_{}".format(self.name, i),
                                                  v, [k])
                logger.debug(
                    "{} adding task_dict with dependency {} to list".format(
                        self.name, task_dict))
                task_dicts.append(task_dict)
        # Take the job_id from the parent Sequence if there is one
        if self.parent and self.parent.batch_job_id:
            job_id = self.parent.batch_job_id
        else:
            # otherwise create a new job_id just for this module
            job_id = self.name + "_" + time.strftime("%Y-%m-%d_%H-%M-%S")
        logger.info("{}: about to submit tasks for job {}".format(
            self.name, job_id))
        submitted_ok = batch_utils.submit_tasks(task_dicts, job_id)
        if submitted_ok:
            # store the task dict so any dependent modules can query it
            self.batch_task_dict = {
                td["task_id"]: td["config"]["dates_to_process"]
                for td in task_dicts
            }
            self.all_tasks_submitted = True
            logger.debug(
                "{} submitted all tasks ok, my task_dict is now {}".format(
                    self.name, self.batch_task_dict))
        return submitted_ok
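
# Hedged sketch of assign_dates_to_tasks, assuming a simple round-robin split of
# the date strings across the requested number of batch tasks; the real helper
# may balance the work differently.
def assign_dates_to_tasks_sketch(date_strings, n_tasks):
    """Divide date_strings into n_tasks lists, one per batch task."""
    dates_per_task = [[] for _ in range(n_tasks)]
    for i, date_string in enumerate(date_strings):
        dates_per_task[i % n_tasks].append(date_string)
    return dates_per_task
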
    def get_dependent_batch_tasks(self):
        """
        When running in batch, we are likely to depend on tasks submitted by
        the previous Module in the Sequence.  This Module should be in the
        "depends_on" attribute of this one.

        Task dependencies will be a dict of format
        {<task_id>: [<date_strings>]}
        """
        task_dependencies = {}
        if len(self.depends_on) > 0:
            for dependency in self.depends_on:
                if not (self.parent and self.parent.get(dependency)):
                    logger.info("{} couldn't retrieve dependency {}".format(
                        self.name.dependency))
                    continue
                dependency_module = self.parent.get(dependency)
                logger.info("{}: has dependency on {}".format(
                    self.name, dependency_module.name))
                if (not "run_mode" in vars(dependency_module)
                    ) or dependency_module.run_mode == "local":
                    logger.info(
                        "{}: dependency module {} is in local run mode".format(
                            self.name, dependency_module.name))
                    continue
                logger.debug("has {} submitted all tasks? {}".format(
                    dependency_module.name,
                    dependency_module.all_tasks_submitted))
                while not dependency_module.all_tasks_submitted:
                    logger.info(
                        "{}: waiting for {} to submit all batch tasks".format(
                            self.name, dependency_module.name))
                    logger.info(".", end="")
                    sys.stdout.flush()
                    time.sleep(1)
                task_dependencies.update(dependency_module.batch_task_dict)
        logger.info("{} return task_dependencies {}".format(
            self.name, task_dependencies))
        return task_dependencies