Example 1
# Module-level imports used by this function (the import path of the s3
# helper is assumed from the madmex package layout)
import os
import uuid
from glob import glob

import rasterio
from jinja2 import Environment, PackageLoader

from madmex.util import s3


def metadata_convert(path, bucket=None):
    """Prepare metadata prior to datacube indexing

    Given a directory containing biogeographic zones tiles generated by the
    rasterize_vector_file command line, prepares a metadata string with the
    appropriate formatting for indexing in the datacube.
    The other ``metadata_convert`` functions assume that datasets are stored
    in separate directories, so that the ``prepare_metadata`` command line
    handles the optional iteration based on this assumption. Because all tiles
    of a country mask are in the same folder, iteration has to be handled by
    the present function.
    The tiles are expected to be in the EPSG:4326 crs

    Args:
        path (str): Path of the directory containing the biogeographic zones tiles
        bucket (str or None): Name of the s3 bucket containing the data. If ``None``
            (default), data are considered to be on a mounted filesystem

    Examples:
        >>> from madmex.ingestion.biogeographic_zones import metadata_convert
        >>> path = '/path/to/mask/dir'
        >>> yaml_str = metadata_convert(path)
        >>> with open('/path/to/metadata_out.yaml', 'w') as dst:
        ...     dst.write(yaml_str)

    Returns:
        str: The content of the metadata for later writing to file.
    """
    if bucket is not None:
        file_list = s3.list_files(bucket, path, r'.*_\d+\.tif$')
        file_list = [s3.build_rasterio_path(bucket, x) for x in file_list]
    else:
        file_list = glob(os.path.join(path, '*_*.tif'))

    def build_meta_string(x):
        """Generate the yaml string for a single tile

        Args:
            x (str): The path of the dataset
        """
        try:
            with rasterio.open(x) as src:
                crs = src.crs
                bounds = src.bounds
            meta_out = {
                'id': uuid.uuid5(uuid.NAMESPACE_URL, x),
                'll_lat': bounds.bottom,
                'lr_lat': bounds.bottom,
                'ul_lat': bounds.top,
                'ur_lat': bounds.top,
                'll_lon': bounds.left,
                'lr_lon': bounds.right,
                'ul_lon': bounds.left,
                'ur_lon': bounds.right,
                'crs': crs.wkt,
                'band': x,
            }
            # Load template
            env = Environment(loader=PackageLoader('madmex', 'templates'))
            template = env.get_template('biogeographic_zones.yaml')
            out = template.render(**meta_out)
            return out
        except Exception:
            # Tiles that cannot be opened are skipped; the resulting None
            # entries are filtered out below
            return None

    yaml_list = [build_meta_string(x) for x in file_list]
    yaml_list = [x for x in yaml_list if x is not None]
    return '\n---\n'.join(yaml_list)
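
The `template.render(**meta_out)` call fills a Jinja2 template shipped under `madmex/templates`. The real `biogeographic_zones.yaml` template is not reproduced here; the sketch below uses a hypothetical, minimal template with the same placeholder names to show how the rendering step turns `meta_out` into a datacube-style YAML document.

from jinja2 import Template

# Hypothetical minimal template; the real madmex template contains the full
# datacube dataset document, this one only mirrors a few meta_out fields
template = Template(
    "id: {{ id }}\n"
    "crs: '{{ crs }}'\n"
    "geo_ref_points:\n"
    "  ul: {x: {{ ul_lon }}, y: {{ ul_lat }}}\n"
    "  lr: {x: {{ lr_lon }}, y: {{ lr_lat }}}\n"
    "band: {{ band }}\n")
print(template.render(id='made-up-id', crs='EPSG:4326',
                      ul_lon=-118.4, ul_lat=33.1, lr_lon=-86.7, lr_lat=14.5,
                      band='/path/to/mask/dir/tile_0.tif'))
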
Example 2
# Module-level imports used by this function (the import path of the s3
# helper is assumed from the madmex package layout)
import os
import re
import uuid
import xml.etree.ElementTree as ET
from glob import glob

from jinja2 import Environment, PackageLoader
from pyproj import Proj
from rasterio.crs import CRS

from madmex.util import s3

# LANDSAT_BANDS is assumed to be a module-level dict mapping each instrument
# to the names of its bands, e.g. LANDSAT_BANDS['OLI_TIRS']['blue'] -> 'sr_band2'


def metadata_convert(path, bucket=None):
    """Prepare metatdata prior to datacube indexing

    Given a directory containing landsat surface reflectance bands and a MLT.txt
    file, prepares a metadata string with the appropriate formating.

    Args:
        path (str): Path of the directory containing the surface reflectance bands
            and the Landsat metadata file.
        bucket (str or None): Name of the s3 bucket containing the data. If ``None``
            (default), data are considered to be on a mounted filesystem

    Examples:
        >>> from madmex.ingestion.landsat_espa import metadata_convert
        >>> from glob import glob

        >>> scene_list = glob('/path/to/scenes/*')
        >>> yaml_list = [metadata_convert(x) for x in scene_list]

        >>> with open('/path/to/metadata_out.yaml', 'w') as dst:
        ...     for yaml in yaml_list:
        ...         dst.write(yaml)
        ...         dst.write('\n---\n')

    Returns:
        str: The content of the metadata for later writing to file.
    """
    pattern = re.compile(
        r'[A-Z0-9]{4}_[A-Z0-9]{4}_\d{6}_\d{8}_\d{8}_01_(T1|T2|RT)\.xml')
    if bucket is None:
        # Check that path is a dir and contains appropriate files
        if not os.path.isdir(path):
            raise ValueError('Argument path= is not a directory')
        mtl_file_list = glob(os.path.join(path, '*.xml'))
        # Filter the list of xml files with a regex (there could be more than
        # one, e.g. if some bands have been opened in qgis)
        mtl_file_list = [x for x in mtl_file_list if pattern.search(x)]
        if len(mtl_file_list) != 1:
            raise ValueError('Could not identify a unique xml metadata file')
        mtl_file = mtl_file_list[0]
        # Start parsing xml
        root = ET.parse(mtl_file).getroot()
    else:
        file_list = s3.list_files(bucket=bucket, path=path)
        pattern = re.compile(r'.*\.xml$')
        mtl_file_list = [x for x in file_list if pattern.search(x)]
        if len(mtl_file_list) != 1:
            raise ValueError('Could not identify a unique xml metadata file')
        mtl_file = mtl_file_list[0]
        # Read xml as string
        xml_str = s3.read_file(bucket, mtl_file)
        # Generate the element tree root
        root = ET.fromstring(xml_str)
        path = s3.build_rasterio_path(bucket, path)

    ns = {'ns': 'http://espa.cr.usgs.gov/v2'}
    # Build datetime from date and time
    date_str = root.find('ns:global_metadata/ns:acquisition_date',
                         namespaces=ns).text
    time_str = root.find('ns:global_metadata/ns:scene_center_time',
                         namespaces=ns).text
    dt = '%sT%s' % (date_str, time_str[:8])
    # Satellite and sensor metadata
    instrument = root.find('ns:global_metadata/ns:instrument',
                           namespaces=ns).text
    satellite = root.find('ns:global_metadata/ns:satellite',
                          namespaces=ns).text
    # Scene corners in projected coordinates
    corner_xpath = ('ns:global_metadata/ns:projection_information/'
                    'ns:corner_point[@location="%s"]')
    ul = root.find(corner_xpath % 'UL', namespaces=ns).attrib
    lr = root.find(corner_xpath % 'LR', namespaces=ns).attrib
    ulx, uly = float(ul['x']), float(ul['y'])
    lrx, lry = float(lr['x']), float(lr['y'])
    utm_zone = int(root.find(
        'ns:global_metadata/ns:projection_information/'
        'ns:utm_proj_params/ns:zone_code', namespaces=ns).text)
    crs = CRS({'proj': 'utm', 'zone': utm_zone})
    # Get corner coordinates in long/lat by transforming from projected values
    p = Proj(crs)
    ul_lon, ul_lat = p(ulx, uly, inverse=True)
    lr_lon, lr_lat = p(lrx, lry, inverse=True)
    ll_lon, ll_lat = p(ulx, lry, inverse=True)
    ur_lon, ur_lat = p(lrx, uly, inverse=True)
    # Helper to resolve a band's file path from its name in the xml metadata
    def band_path(name):
        fname = root.find('ns:bands/ns:band[@name="%s"]/ns:file_name' % name,
                          namespaces=ns).text
        return os.path.join(path, fname)

    # Prepare metadata fields
    meta_out = {
        'id': uuid.uuid5(uuid.NAMESPACE_URL, path),
        'dt': dt,
        'll_lat': ll_lat,
        'lr_lat': lr_lat,
        'ul_lat': ul_lat,
        'ur_lat': ur_lat,
        'll_lon': ll_lon,
        'lr_lon': lr_lon,
        'ul_lon': ul_lon,
        'ur_lon': ur_lon,
        'll_x': ulx,
        'lr_x': lrx,
        'ul_x': ulx,
        'ur_x': lrx,
        'll_y': lry,
        'lr_y': lry,
        'ul_y': uly,
        'ur_y': uly,
        'crs': crs.wkt,
        'blue': band_path(LANDSAT_BANDS[instrument]['blue']),
        'green': band_path(LANDSAT_BANDS[instrument]['green']),
        'red': band_path(LANDSAT_BANDS[instrument]['red']),
        'nir': band_path(LANDSAT_BANDS[instrument]['nir']),
        'swir1': band_path(LANDSAT_BANDS[instrument]['swir1']),
        'swir2': band_path(LANDSAT_BANDS[instrument]['swir2']),
        'qual': band_path('pixel_qa'),
        'instrument': instrument,
        'platform': satellite,
    }
    # Load template
    env = Environment(loader=PackageLoader('madmex', 'templates'))
    template = env.get_template('landsat_espa.yaml')
    out = template.render(**meta_out)
    return out
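
The corner handling above relies on pyproj's inverse mode: calling a `Proj` instance with `inverse=True` maps projected x/y back to longitude/latitude, and the LL/UR corners are obtained by mixing the UL and LR projected coordinates. A minimal sketch with made-up UTM zone 14 coordinates:

from pyproj import Proj

# UTM zone 14 projection (WGS84 datum by default)
p = Proj(proj='utm', zone=14)
ulx, uly = 500000.0, 2000000.0  # made-up projected UL corner, in metres
lrx, lry = 700000.0, 1800000.0  # made-up projected LR corner, in metres
ul_lon, ul_lat = p(ulx, uly, inverse=True)  # lon is the central meridian, -99
ll_lon, ll_lat = p(ulx, lry, inverse=True)  # LL mixes ulx with lry
print(ul_lon, ul_lat, ll_lon, ll_lat)
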
Example 3
# Module-level imports used by this function (the import path of the s3
# helper is assumed from the madmex package layout)
import os
import re
import uuid
import xml.etree.ElementTree as ET
from glob import glob

from jinja2 import Environment, PackageLoader
from pyproj import Proj
from rasterio.crs import CRS

from madmex.util import s3


def metadata_convert(path, bucket=None):
    """Prepare metatdata prior to datacube indexing

    Given a directory containing sentinel2 surface reflectance bands processed
    with sen2cor, prepares a metadata string with the appropriate formating.

    Args:
        path (str): Path of the directory containing data and metadata with SAFE
            structure. Output of sen2cor processor
        bucket (str or None): Name of the s3 bucket containing the data. If ``None``
            (default), data are considered to be on a mounted filesystem

    Examples:
        >>> from madmex.ingestion.sentinel2_sr_20m import metadata_convert
        >>> from glob import glob

        >>> scene_list = glob('/path/to/scenes/*')
        >>> yaml_list = [metadata_convert(x) for x in scene_list]

        >>> with open('/path/to/metadata_out.yaml', 'w') as dst:
        ...     for yaml in yaml_list:
        ...         dst.write(yaml)
        ...         dst.write('\n---\n')

    Returns:
        str: The content of the metadata for later writing to file.
    """
    def get_namespace(element):
        """Extract the xml namespace uri from an element's tag"""
        m = re.match(r'\{(.*)\}', element.tag)
        return m.group(1) if m else ''
    # List all files in directory
    if bucket is None:
        all_files = glob(os.path.join(path, '**'), recursive=True)
    else:
        all_files = s3.list_files(bucket, path)
    # Check that path is a dir and contains appropriate files
    if len(all_files) <= 1:
        raise ValueError('Argument path= is not a directory, or doesn\'t contain any files')
    mtl_pattern = re.compile(r'.*GRANULE.*MTD_TL\.xml$')
    mtl_file = [x for x in all_files if mtl_pattern.search(x)][0]
    # Use the generic name 'sentinel2' for both satellites (2A and 2B) to
    # avoid a mismatch with the product description
    satellite = 'sentinel2'
    instrument = 'MSI'
    # Start parsing xml
    if bucket is None:
        root = ET.parse(mtl_file).getroot()
    else:
        xml_str = s3.read_file(bucket, mtl_file)
        root = ET.fromstring(xml_str)
    n1 = get_namespace(root)
    date_str = root.find('n1:General_Info/SENSING_TIME',
                         namespaces={'n1': n1}).text
    dt = date_str[:-5]
    # Scene corners in projected coordinates
    nrow = int(root.find('n1:Geometric_Info/Tile_Geocoding/Size[@resolution="20"]/NROWS',
                         namespaces={'n1': n1}).text)
    ncol = int(root.find('n1:Geometric_Info/Tile_Geocoding/Size[@resolution="20"]/NCOLS',
                         namespaces={'n1': n1}).text)
    ulx = float(root.find('n1:Geometric_Info/Tile_Geocoding/Geoposition[@resolution="20"]/ULX',
                          namespaces={'n1': n1}).text)
    uly = float(root.find('n1:Geometric_Info/Tile_Geocoding/Geoposition[@resolution="20"]/ULY',
                          namespaces={'n1': n1}).text)
    lrx = ulx + ncol * 20
    lry = uly - nrow * 20
    # Get corner coordinates in long/lat by transforming from projected values
    crs = root.find('n1:Geometric_Info/Tile_Geocoding/HORIZONTAL_CS_CODE',
                    namespaces={'n1': n1}).text
    crs_wkt = CRS(init=crs).wkt
    p = Proj(init=crs)
    ul_lon, ul_lat = p(ulx, uly, inverse=True)
    lr_lon, lr_lat = p(lrx, lry, inverse=True)
    ll_lon, ll_lat = p(ulx, lry, inverse=True)
    ur_lon, ur_lat = p(lrx, uly, inverse=True)
    # Helper to get a band's path from its suffix
    def get_band(suffix):
        pattern = re.compile(r'.*GRANULE/.*/IMG_DATA/R20m/.*%s_20m\.jp2$' % suffix)
        band = [x for x in all_files if pattern.search(x)][0]
        if bucket is not None:
            band = s3.build_rasterio_path(bucket, band)
        return band
    # Prepare metadata fields
    meta_out = {
        'id': uuid.uuid5(uuid.NAMESPACE_URL, path),
        'dt': dt,
        'll_lat': ll_lat,
        'lr_lat': lr_lat,
        'ul_lat': ul_lat,
        'ur_lat': ur_lat,
        'll_lon': ll_lon,
        'lr_lon': lr_lon,
        'ul_lon': ul_lon,
        'ur_lon': ur_lon,
        'll_x': ulx,
        'lr_x': lrx,
        'ul_x': ulx,
        'ur_x': lrx,
        'll_y': lry,
        'lr_y': lry,
        'ul_y': uly,
        'ur_y': uly,
        'crs': crs_wkt,
        'blue': get_band('B02'),
        'green': get_band('B03'),
        'red': get_band('B04'),
        're1': get_band('B05'),
        're2': get_band('B06'),
        're3': get_band('B07'),
        'nir': get_band('B8A'),
        'swir1': get_band('B11'),
        'swir2': get_band('B12'),
        'qual': get_band('SCL'),
        'instrument': instrument,
        'platform': satellite,
    }
    # Load template
    env = Environment(loader=PackageLoader('madmex', 'templates'))
    template = env.get_template('s2_l2a_20m.yaml')
    out = template.render(**meta_out)
    return out
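
The `get_namespace` helper exists because sen2cor writes namespaced XML, and ElementTree only matches namespaced tags when the namespace uri is bound to a prefix in the `find` call. A self-contained sketch with a fictional document standing in for MTD_TL.xml:

import re
import xml.etree.ElementTree as ET

# Fictional namespaced document; only the structure matters here
xml_str = ('<n1:Root xmlns:n1="https://example.com/psd">'
           '<n1:General_Info><SENSING_TIME>2018-01-22T17:56:29.027Z'
           '</SENSING_TIME></n1:General_Info></n1:Root>')
root = ET.fromstring(xml_str)
# Namespaced tags read '{uri}name'; extract the uri part
n1 = re.match(r'\{(.*)\}', root.tag).group(1)
# Bind the uri to the 'n1' prefix used in the xpath, as metadata_convert does
print(root.find('n1:General_Info/SENSING_TIME', namespaces={'n1': n1}).text)
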
Example 4
# Module-level imports used by this function (the import path of the s3
# helper is assumed from the madmex package layout)
import os
import uuid
from datetime import datetime
from glob import glob

import rasterio
from jinja2 import Environment, PackageLoader

from madmex.util import s3


def metadata_convert(path, bucket=None):
    """Prepare metadata prior to datacube indexing

    Given a directory containing Sentinel-1 data (polarisations VH and VV)
    preprocessed with snappy, prepares a metadata string with the appropriate
    formatting.

    Args:
        path (str): Path of the directory containing Sentinel-1 data.
        bucket (str or None): Name of the s3 bucket containing the data. If ``None``
            (default), data are considered to be on a mounted filesystem

    Examples:
        >>> from madmex.ingestion.s1_grd_vh_vv import metadata_convert

        >>> path = '/path/to/s1/dir'
        >>> yaml_str = metadata_convert(path)

        >>> with open('/path/to/metadata_out.yaml', 'w') as dst:
        ...     dst.write(yaml_str)

    Returns:
        str: The content of the metadata for later writing to file.
    """
    if bucket is not None:
        file_list = [
            os.path.basename(x)
            for x in s3.list_files(bucket, path, r'.*filtered\.tif$')
        ]
        path = s3.build_rasterio_path(bucket, path)
    else:
        file_list = [
            os.path.basename(x)
            for x in glob(os.path.join(path, '*filtered.tif'))
        ]
    pol_vh = [x for x in file_list if '_VH_' in x][0]
    pol_vv = [x for x in file_list if '_VV_' in x][0]
    pol_vh = os.path.join(path, pol_vh)
    pol_vv = os.path.join(path, pol_vv)

    fname = os.path.basename(pol_vh).split("_")[0]
    if 'T' in fname:
        date_str = fname.replace('T', '')
    else:
        date_str = fname
    dt = datetime.strptime(date_str, '%Y%m%d%H%M%S')
    # Read georeferencing from one of the bands (also checks the file exists)
    with rasterio.open(pol_vh) as src:
        crs = src.crs
        bounds = src.bounds
    meta_out = {
        'id': uuid.uuid5(uuid.NAMESPACE_URL, path),
        'll_lat': bounds.bottom,
        'lr_lat': bounds.bottom,
        'ul_lat': bounds.top,
        'ur_lat': bounds.top,
        'll_lon': bounds.left,
        'lr_lon': bounds.right,
        'ul_lon': bounds.left,
        'ur_lon': bounds.right,
        'dt': dt.strftime('%Y-%m-%dT%H:%M:%S'),  # 2018-01-22T17:56:29
        'crs': crs.wkt,
        'pol_vh': pol_vh,
        'pol_vv': pol_vv,
    }
    # Load template
    env = Environment(loader=PackageLoader('madmex', 'templates'))
    template = env.get_template('s1_grd_vh_vv.yaml')
    out = template.render(**meta_out)
    return out
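
The acquisition timestamp is recovered from the file name itself. A minimal sketch of that parsing step, with a made-up file name following the `<date>T<time>_<polarisation>_filtered.tif` layout the function expects:

import os
from datetime import datetime

# Made-up file name; only the leading timestamp field is parsed
pol_vh = '/path/to/s1/dir/20180122T175629_VH_filtered.tif'
fname = os.path.basename(pol_vh).split('_')[0]  # '20180122T175629'
# Drop the 'T' separator if present, then parse the 14 digit timestamp
date_str = fname.replace('T', '') if 'T' in fname else fname
dt = datetime.strptime(date_str, '%Y%m%d%H%M%S')
print(dt.strftime('%Y-%m-%dT%H:%M:%S'))  # 2018-01-22T17:56:29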