Example #1
def run_one(
    tile_string: str,
    base_dir: Path,
    s3_destination: str,
    update_metadata: bool,
    log: Logger,
):
    year = tile_string.split("/")[0]
    tile = tile_string.split("/")[1]

    workdir = base_dir / tile_string / "wrk"
    outdir = base_dir / tile_string / "out"

    s3_destination = f"{s3_destination}/{year}/{tile}"
    file_key = f"{tile}_{year[-2:]}"

    stac_self_href = f"s3://{s3_destination}/{file_key}.stac-item.json"

    if s3_head_object(stac_self_href) is not None and not update_metadata:
        log.info(f"{stac_self_href} already exists, skipping")
        return
    elif update_metadata:
        if int(year) > 2010:
            name = "{}_{}_sl_{}_F02DAR.tif".format(tile, year[-2:], "HH")
        else:
            name = "{}_{}_sl_{}.tif".format(tile, year[-2:], "HH")
        one_file = f"s3://{s3_destination}/{name}"

        if s3_head_object(one_file) is not None:
            # Data file exists, so we can update metadata
            log.info(f"{one_file} exists, updating metadata only")
            write_stac(s3_destination, one_file, file_key, year, log)
            # Finish here, we don't need to create the data files
            return
        else:
            # Nothing to see here, keep on walking!
            log.info(
                f"{one_file} does not exist, continuing with data creation.")

    try:
        log.info(f"Starting up process for tile {tile_string}")
        make_directories([workdir, outdir], log)
        download_files(workdir, year, tile, log)
        list_of_cogs = combine_cog(workdir, outdir, tile, year, log)
        upload_to_s3(s3_destination, list_of_cogs, log)
        write_stac(s3_destination, list_of_cogs[0], file_key, year, log)
        delete_directories([workdir, outdir], log)
    except Exception:
        log.exception(f"Job failed for tile {tile_string}")
        exit(1)
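
A recurring pattern across these examples is using s3_head_object as a cheap existence check to make jobs idempotent: it returns the object's metadata dictionary when the key exists and None when it does not. A minimal sketch of that guard, assuming the odc.aws helper used throughout these examples (output_already_written is a hypothetical wrapper, and the bucket and key are placeholders):

from odc.aws import s3_head_object

def output_already_written(stac_url: str) -> bool:
    # HEAD the STAC document; None means the object does not exist yet
    return s3_head_object(stac_url) is not None

if output_already_written("s3://my-bucket/2020/N00E010/item.stac-item.json"):
    print("Output already exists, skipping")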
Example #2
def exists(self, task: Union[Task, str]) -> bool:
    if isinstance(task, str):
        uri = task
    else:
        uri = self.uri(task)
    _u = urlparse(uri)
    if _u.scheme == 's3':
        s3 = s3_client(creds=self._get_creds(), cache=True)
        meta = s3_head_object(uri, s3=s3)
        return meta is not None
    elif _u.scheme == 'file':
        return Path(_u.path).exists()
    else:
        raise ValueError(f"Can't handle url: {uri}")
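
The scheme dispatch above relies on urllib.parse.urlparse splitting the URI into its components. A quick illustration of what the two branches see (the URIs are illustrative only):

from urllib.parse import urlparse

u = urlparse("s3://my-bucket/prefix/item.json")
print(u.scheme, u.netloc, u.path)   # s3 my-bucket /prefix/item.json

u = urlparse("file:///tmp/item.json")
print(u.scheme, u.path)             # file /tmp/item.json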
Example #3
inventory = list_inventory(manifest, s3=client)

report_every = 10000
count = 0

json_docs = 0
to_fix = 0

for obj in inventory:
    count += 1
    if count % report_every == 0:
        print(f"Processing {count}")
    if obj.Key.endswith(".json"):
        json_docs += 1
        o_dict = s3_head_object(f"s3://{obj.Bucket}/{obj.Key}", s3=client)
        if o_dict["ContentType"] != "application/json":
            try:
                if DO_FIX:
                    client.copy_object(
                        Bucket=obj.Bucket,
                        Key=obj.Key,
                        CopySource=f"{obj.Bucket}/{obj.Key}",
                        ContentType="application/json",
                        MetadataDirective="REPLACE",
                    )
                to_fix += 1
                print(f"Fixed {to_fix} out of {json_docs}")
            except KeyError as e:
                print(f"Failed to find content type for {obj.Key}", o_dict, e)
Example #4
def create_mosaic(
    dc: Datacube,
    product: str,
    out_product: str,
    time: Tuple[str, str],
    time_str: str,
    bands: Tuple[str, ...],
    s3_output_root: str,
    split_bands: bool = False,
    resolution: int = 120,
    overwrite: bool = False,
):
    log = setup_logging()
    log.info(f"Creating mosaic for {product} over {time}")

    client = start_local_dask()

    assets = {}
    data = dc.load(
        product=product,
        time=time,
        resolution=(-resolution, resolution),
        dask_chunks={"x": 2048, "y": 2048},
        measurements=bands,
    )

    # This is a bad idea, we run out of memory
    # data.persist()

    if not split_bands:
        log.info("Creating a single tif file")
        out_file = _get_path(s3_output_root, out_product, time_str, "tif")
        exists = s3_head_object(out_file) is not None
        skip_writing = exists and not overwrite
        try:
            asset, _ = _save_opinionated_cog(
                data,
                out_file,
                skip_writing=skip_writing,
            )
        except ValueError:
            log.exception(
                "Failed to create COG, please check that you only have one timestep in the period."
            )
            exit(1)
        assets[bands[0]] = asset
        if skip_writing:
            log.info(f"File exists, and overwrite is False. Not writing {out_file}")
        else:
            log.info(f"Finished writing: {asset.href}")
    else:
        log.info("Creating multiple tif files")

        for band in bands:
            out_file = _get_path(
                s3_output_root, out_product, time_str, "tif", band=band
            )
            exists = s3_head_object(out_file) is not None
            skip_writing = exists and not overwrite

            try:
                asset, band = _save_opinionated_cog(
                    data=data,
                    out_file=out_file,
                    band=band,
                    skip_writing=skip_writing,
                )
            except ValueError:
                log.exception(
                    "Failed to create COG, please check that you only have one timestep in the period."
                )
                exit(1)
            assets[band] = asset
            if skip_writing:
                log.info(f"File exists, and overwrite is False. Not writing {out_file}")
            else:
                log.info(f"Finished writing: {asset.href}")
                # Aggressively heavy handed, but we get memory leaks otherwise
                client.restart()

    out_stac_file = _get_path(s3_output_root, out_product, time_str, "stac-item.json")
    item = create_stac_item(
        assets[bands[0]].href,
        id=f"{product}_{time_str}",
        assets=assets,
        with_proj=True,
        properties={
            "odc:product": out_product,
            "start_datetime": f"{time[0]}T00:00:00Z",
            "end_datetime": f"{time[1]}T23:59:59Z",
        },
    )
    item.set_self_href(out_stac_file)

    log.info(f"Writing STAC: {out_stac_file}")
    client = s3_client(aws_unsigned=False)
    s3_dump(
        data=json.dumps(item.to_dict(), indent=2),
        url=item.self_href,
        ACL="bucket-owner-full-control",
        ContentType="application/json",
        s3=client,
    )
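
Passing dask_chunks to dc.load makes datacube return a lazily evaluated, dask-backed xarray.Dataset, so nothing is read from storage until the COG is actually written; that is also why persisting the whole dataset (the commented-out data.persist()) runs out of memory. A minimal sketch of the lazy load, with the product and band names as placeholders:

from datacube import Datacube

dc = Datacube()
data = dc.load(
    product="some_product",               # hypothetical product name
    time=("2020-01-01", "2020-12-31"),
    resolution=(-120, 120),
    dask_chunks={"x": 2048, "y": 2048},   # chunked, lazy load
    measurements=["red"],                 # hypothetical band
)
# data.red is a dask-backed DataArray; pixels are only computed when written
# out (or when .compute() / .load() is called explicitly)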
Example #5
def download_gls(year: str, s3_dst: str, workdir: Path, overwrite: bool = False):
    log = setup_logging()
    assets = {}
    out_stac = URL(s3_dst) / year / f"{PRODUCT_NAME}_{year}.stac-item.json"

    if s3_head_object(str(out_stac)) is not None and not overwrite:
        log.info(f"{out_stac} exists, skipping")
        return

    # Download the files
    for name, file in FILES.items():
        # Create a temporary directory to work with
        with TemporaryDirectory(prefix=str(workdir)) as tmpdir:
            log.info(f"Working on {file}")
            url = URL(
                BASE_URL.format(
                    record_id=YEARS[year][1], year_key=YEARS[year][0], file=file
                )
            )

            dest_url = URL(s3_dst) / year / f"{PRODUCT_NAME}_{year}_{name}.tif"

            if s3_head_object(str(dest_url)) is None or overwrite:
                log.info(f"Downloading {url}")

                try:
                    local_file = Path(tmpdir) / str(url.name)
                    # Download the file
                    download_file(url, local_file)

                    log.info(f"Downloaded file to {local_file}")
                    local_file_small = translate_file_deafrica_extent(local_file)
                    log.info(f"Clipped Africa out and saved to {local_file_small}")
                    resampling = "nearest" if name in DO_NEAREST else "bilinear"

                    # Create a COG in memory and upload to S3
                    with MemoryFile() as mem_dst:
                        # Creating the COG, with a memory cache and no download. Shiny.
                        cog_translate(
                            local_file_small,
                            mem_dst.name,
                            cog_profiles.get("deflate"),
                            in_memory=True,
                            nodata=255,
                            overview_resampling=resampling,
                        )
                        mem_dst.seek(0)
                        s3_dump(mem_dst, str(dest_url), ACL="bucket-owner-full-control")
                        log.info(f"File written to {dest_url}")
                except Exception:
                    log.exception(f"Failed to process {url}")
                    exit(1)
            else:
                log.info(f"{dest_url} exists, skipping")

            assets[name] = pystac.Asset(
                href=str(dest_url), roles=["data"], media_type=pystac.MediaType.COG
            )

    # Write STAC document from the last-written file
    source_doc = f"https://zenodo.org/record/{YEARS[year][1]}"
    item = create_stac_item(
        str(dest_url),
        id=str(odc_uuid("Copernicus Global Land Cover", "3.0.1", [source_doc])),
        assets=assets,
        with_proj=True,
        properties={
            "odc:product": PRODUCT_NAME,
            "start_datetime": f"{year}-01-01T00:00:00Z",
            "end_datetime": f"{year}-12-31T23:59:59Z",
        },
    )
    item.add_links(
        [
            pystac.Link(
                target=source_doc,
                title="Source",
                rel=pystac.RelType.DERIVED_FROM,
                media_type="text/html",
            )
        ]
    )
    s3_dump(
        json.dumps(item.to_dict(), indent=2),
        str(out_stac),
        ContentType="application/json",
        ACL="bucket-owner-full-control",
    )
    log.info(f"STAC written to {out_stac}")
Example #6
def download_cci_lc(year: str,
                    s3_dst: str,
                    workdir: str,
                    overwrite: bool = False):
    log = setup_logging()
    assets = {}

    cci_lc_version = get_version_from_year(year)
    name = f"{PRODUCT_NAME}_{year}_{cci_lc_version}"

    out_cog = URL(s3_dst) / year / f"{name}.tif"
    out_stac = URL(s3_dst) / year / f"{name}.stac-item.json"

    if s3_head_object(str(out_stac)) is not None and not overwrite:
        log.info(f"{out_stac} exists, skipping")
        return

    workdir = Path(workdir)
    if not workdir.exists():
        workdir.mkdir(parents=True, exist_ok=True)

    # Create a temporary directory to work with
    tmpdir = mkdtemp(prefix=str(f"{workdir}/"))
    log.info(f"Working on {year} in the path {tmpdir}")

    if s3_head_object(str(out_cog)) is None or overwrite:
        log.info(f"Downloading {year}")
        try:
            local_file = Path(tmpdir) / f"{name}.zip"
            if not local_file.exists():
                # Download the file
                c = cdsapi.Client()

                # We could also retrieve the object metadata from the CDS.
                # e.g. f = c.retrieve("series",{params}) | f.location = URL to download
                c.retrieve(
                    "satellite-land-cover",
                    {
                        "format": "zip",
                        "variable": "all",
                        "version": cci_lc_version,
                        "year": str(year),
                    },
                    local_file,
                )

                log.info(f"Downloaded file to {local_file}")
            else:
                log.info(
                    f"File {local_file} exists, continuing without downloading"
                )

            # Unzip the file
            log.info(f"Unzipping {local_file}")
            unzipped = None
            with zipfile.ZipFile(local_file, "r") as zip_ref:
                unzipped = local_file.parent / zip_ref.namelist()[0]
                zip_ref.extractall(tmpdir)

            # Process data
            ds = xr.open_dataset(unzipped)
            # Subset to Africa
            ulx, uly, lrx, lry = AFRICA_BBOX
            # Note: lats are upside down!
            ds_small = ds.sel(lat=slice(uly, lry), lon=slice(ulx, lrx))
            ds_small = assign_crs(ds_small, crs="epsg:4326")

            # Create cog (in memory - :mem: returns bytes object)
            mem_dst = write_cog(
                ds_small.lccs_class,
                ":mem:",
                nodata=0,
                overview_resampling="nearest",
            )

            # Write to s3
            s3_dump(mem_dst, str(out_cog), ACL="bucket-owner-full-control")
            log.info(f"File written to {out_cog}")

        except Exception:
            log.exception(f"Failed to process {name}")
            exit(1)
    else:
        log.info(f"{out_cog} exists, skipping")

    assets["classification"] = pystac.Asset(href=str(out_cog),
                                            roles=["data"],
                                            media_type=pystac.MediaType.COG)

    # Write STAC document
    source_doc = (
        "https://cds.climate.copernicus.eu/cdsapp#!/dataset/satellite-land-cover"
    )
    item = create_stac_item(
        str(out_cog),
        id=str(
            odc_uuid("Copernicus Land Cover", cci_lc_version,
                     [source_doc, name])),
        assets=assets,
        with_proj=True,
        properties={
            "odc:product": PRODUCT_NAME,
            "start_datetime": f"{year}-01-01T00:00:00Z",
            "end_datetime": f"{year}-12-31T23:59:59Z",
        },
    )
    item.add_links([
        pystac.Link(
            target=source_doc,
            title="Source",
            rel=pystac.RelType.DERIVED_FROM,
            media_type="text/html",
        )
    ])
    s3_dump(
        json.dumps(item.to_dict(), indent=2),
        str(out_stac),
        ContentType="application/json",
        ACL="bucket-owner-full-control",
    )
    log.info(f"STAC written to {out_stac}")
Example #7
def download_and_cog_chirps(
    year: str,
    month: str,
    s3_dst: str,
    day: str = None,
    overwrite: bool = False,
    slack_url: str = None,
):
    # Cleaning and sanity checks
    s3_dst = s3_dst.rstrip("/")

    # Set up file strings
    if day is not None:
        # Set up a daily process
        in_file = f"chirps-v2.0.{year}.{month}.{day}.tif.gz"
        in_href = DAILY_URL_TEMPLATE.format(year=year, in_file=in_file)
        in_data = f"/vsigzip//vsicurl/{in_href}"
        if not check_for_url_existence(in_href):
            log.warning("Couldn't find the gzipped file, trying the .tif")
            in_file = f"chirps-v2.0.{year}.{month}.{day}.tif"
            in_href = DAILY_URL_TEMPLATE.format(year=year, in_file=in_file)
            in_data = f"/vsicurl/{in_href}"

            if not check_for_url_existence(in_href):
                log.error("Couldn't find the .tif file either, aborting")
                sys.exit(1)

        file_base = f"{s3_dst}/{year}/{month}/chirps-v2.0_{year}.{month}.{day}"
        out_data = f"{file_base}.tif"
        out_stac = f"{file_base}.stac-item.json"

        start_datetime = f"{year}-{month}-{day}T00:00:00Z"
        end_datetime = f"{year}-{month}-{day}T23:59:59Z"
        product_name = "rainfall_chirps_daily"
    else:
        # Set up a monthly process
        in_file = f"chirps-v2.0.{year}.{month}.tif.gz"
        in_href = MONTHLY_URL_TEMPLATE.format(in_file=in_file)
        in_data = f"/vsigzip//vsicurl/{in_href}"
        if not check_for_url_existence(in_href):
            log.warning("Couldn't find the gzipped file, trying the .tif")
            in_file = f"chirps-v2.0.{year}.{month}.tif"
            in_href = MONTHLY_URL_TEMPLATE.format(in_file=in_file)
            in_data = f"/vsicurl/{in_href}"

            if not check_for_url_existence(in_href):
                log.error("Couldn't find the .tif file either, aborting")
                sys.exit(1)

        file_base = f"{s3_dst}/chirps-v2.0_{year}.{month}"
        out_data = f"{file_base}.tif"
        out_stac = f"{file_base}.stac-item.json"

        _, end = calendar.monthrange(int(year), int(month))
        start_datetime = f"{year}-{month}-01T00:00:00Z"
        end_datetime = f"{year}-{month}-{end}T23:59:59Z"
        product_name = "rainfall_chirps_monthly"

        # Set to 15 for the STAC metadata
        day = 15

    try:
        # Check if file already exists
        log.info(f"Working on {in_file}")
        if not overwrite and s3_head_object(out_stac) is not None:
            log.warning(f"File {out_stac} already exists. Skipping.")
            return

        # COG and STAC
        with MemoryFile() as mem_dst:
            # Creating the COG, with a memory cache and no download. Shiny.
            cog_translate(
                in_data,
                mem_dst.name,
                cog_profiles.get("deflate"),
                in_memory=True,
                nodata=-9999,
            )
            # Creating the STAC document with appropriate date range
            _, end = calendar.monthrange(int(year), int(month))
            item = create_stac_item(
                mem_dst,
                id=str(odc_uuid("chirps", "2.0", [in_file])),
                with_proj=True,
                input_datetime=datetime(int(year), int(month), int(day)),
                properties={
                    "odc:processing_datetime": datetime_to_str(datetime.now()),
                    "odc:product": product_name,
                    "start_datetime": start_datetime,
                    "end_datetime": end_datetime,
                },
            )
            item.set_self_href(out_stac)
            # Manually redo the asset
            del item.assets["asset"]
            item.assets["rainfall"] = pystac.Asset(
                href=out_data,
                title="CHIRPS-v2.0",
                media_type=pystac.MediaType.COG,
                roles=["data"],
            )
            # Let's add a link to the source
            item.add_links([
                pystac.Link(
                    target=in_href,
                    title="Source file",
                    rel=pystac.RelType.DERIVED_FROM,
                    media_type="application/gzip",
                )
            ])

            # Dump the data to S3
            mem_dst.seek(0)
            log.info(f"Writing DATA to: {out_data}")
            s3_dump(mem_dst, out_data, ACL="bucket-owner-full-control")
            # Write STAC to S3
            log.info(f"Writing STAC to: {out_stac}")
            s3_dump(
                json.dumps(item.to_dict(), indent=2),
                out_stac,
                ContentType="application/json",
                ACL="bucket-owner-full-control",
            )
            # All done!
            log.info(f"Completed work on {in_file}")

    except Exception as e:
        message = f"Failed to handle {in_file} with error {e}"

        if slack_url is not None:
            send_slack_notification(slack_url, "Chirps Rainfall Monthly",
                                    message)
        log.exception(message)

        sys.exit(1)
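
The /vsigzip//vsicurl/ prefix chains two GDAL virtual file systems: /vsicurl/ streams the remote file over HTTP and /vsigzip/ decompresses it on the fly, so the gzipped GeoTIFF never needs to be downloaded or unpacked locally. A small sketch of reading such a path with rasterio (the URL is illustrative only):

import rasterio

url = "https://example.com/chirps-v2.0.2020.01.tif.gz"   # hypothetical source
with rasterio.open(f"/vsigzip//vsicurl/{url}") as src:
    print(src.profile)   # driver, dtype, CRS and transform of the remote raster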