def read_mtd(self) -> (etree._Element, str):
    """
    Read metadata and outputs the metadata XML root and its namespace

    .. code-block:: python

        >>> from eoreader.reader import Reader
        >>> path = r"S2A_MSIL1C_20200824T110631_N0209_R137_T30TTK_20200824T150432.SAFE.zip"
        >>> prod = Reader().open(path)
        >>> prod.read_mtd()
        (<Element {https://psd-14.sentinel2.eo.esa.int/PSD/S2_PDI_Level-2A_Tile_Metadata.xsd}Level-2A_Tile_ID at ...>,
        '{https://psd-14.sentinel2.eo.esa.int/PSD/S2_PDI_Level-2A_Tile_Metadata.xsd}')

    Returns:
        (etree._Element, str): Metadata XML root and its namespace, formatted as ``"{uri}"``
        (empty string if the root tag carries no namespace)

    Raises:
        InvalidProductError: If no XML file is found in ``GRANULE/*/`` of an extracted product
    """
    # Get MTD XML file
    if self.is_archived:
        # Raw string: "\." is a regex escape, not a valid Python string escape
        root = files.read_archived_xml(self.path, r".*GRANULE.*\.xml")
    else:
        # Only the lookup of the first match can raise IndexError: keep the try body minimal
        try:
            mtd_file = glob.glob(
                os.path.join(self.path, "GRANULE", "*", "*.xml"))[0]
        except IndexError as ex:
            raise InvalidProductError(
                f"Metadata file not found in {self.path}") from ex

        # pylint: disable=I1101:
        # Module 'lxml.etree' has no 'parse' member, but source is unavailable.
        root = etree.parse(mtd_file).getroot()

    # Get namespace: everything up to (and including) the closing brace of the root tag.
    # rfind returns -1 when there is no brace, which gives an empty namespace instead of a ValueError.
    namespace = root.tag[:root.tag.rfind("}") + 1]

    return root, namespace
def read_mtd(self) -> (etree._Element, str):
    """
    Read metadata and outputs the metadata XML root and its namespace

    .. code-block:: python

        >>> from eoreader.reader import Reader
        >>> path = r"LC08_L1GT_023030_20200518_20200527_01_T2"
        >>> prod = Reader().open(path)
        >>> prod.read_mtd()
        (<Element {http://www.rsi.ca/rs2/prod/xml/schemas}product at 0x1c0efbd37c8>,
        '{http://www.rsi.ca/rs2/prod/xml/schemas}')

    Returns:
        (etree._Element, str): Metadata XML root and its namespace, formatted as ``"{uri}"``
        (empty string if the root tag carries no namespace)

    Raises:
        InvalidProductError: If ``product.xml`` is not found at the root of an extracted product
    """
    # NOTE(review): the sample path in the docstring looks like a Landsat product name,
    # while the sample output uses the rs2 schema - confirm and replace with a matching product name.

    # Get MTD XML file
    if self.is_archived:
        # Raw string: "\." is a regex escape, not a valid Python string escape
        root = files.read_archived_xml(self.path, r".*product\.xml")
    else:
        # Only the lookup of the first match can raise IndexError: keep the try body minimal
        try:
            mtd_file = glob.glob(os.path.join(self.path, "product.xml"))[0]
        except IndexError as ex:
            raise InvalidProductError(
                f"Metadata file (product.xml) not found in {self.path}"
            ) from ex

        # pylint: disable=I1101:
        # Module 'lxml.etree' has no 'parse' member, but source is unavailable.
        root = etree.parse(mtd_file).getroot()

    # Get namespace: everything up to (and including) the closing brace of the root tag.
    # rfind returns -1 when there is no brace, which gives an empty namespace instead of a ValueError.
    namespace = root.tag[:root.tag.rfind("}") + 1]

    return root, namespace
def _read_mtd(self, mtd_from_path: str, mtd_archived: str = None):
    """
    Load the metadata XML of the product and return its root together with its namespaces map.

    Args:
        mtd_from_path (str): Metadata regex (glob style) to find from extracted product
        mtd_archived (str): Metadata regex (re style) to find from archived product

    Returns:
        (etree._Element, dict): Metadata XML root and its namespaces, as a ``{prefix: "{uri}"}`` dict
        from which the ``xsi``, ``xs`` and ``xlink`` prefixes are removed

    Raises:
        InvalidProductError: If no file matches ``mtd_from_path`` in an extracted product
    """
    if not self.is_archived:
        # Extracted product: take the first file matching the glob pattern
        try:
            pattern = os.path.join(self.path, mtd_from_path)
            first_match = glob.glob(pattern, recursive=True)[0]

            # pylint: disable=I1101:
            # Module 'lxml.etree' has no 'parse' member, but source is unavailable.
            root = etree.parse(first_match).getroot()
        except IndexError as ex:
            raise InvalidProductError(
                f"Metadata file ({mtd_from_path}) not found in {self.path}"
            ) from ex
    else:
        # Archived product: the XML is read directly from the archive
        root = files.read_archived_xml(self.path, mtd_archived)

    # Keep only the useful namespaces, each wrapped in braces so it can prefix a tag name
    ignored_prefixes = ("xsi", "xs", "xlink")
    nsmap = {
        prefix: f"{{{uri}}}"
        for prefix, uri in root.nsmap.items()
        if prefix not in ignored_prefixes
    }

    return root, nsmap
def read_mtd(self) -> (etree._Element, str):
    """
    Read metadata and outputs the metadata XML root and its namespace

    .. code-block:: python

        >>> from eoreader.reader import Reader
        >>> path = r"S1A_IW_GRDH_1SDV_20191215T060906_20191215T060931_030355_0378F7_3696.zip"
        >>> prod = Reader().open(path)
        >>> prod.read_mtd()
        (<Element product at 0x1832895d788>, '')

    Returns:
        (etree._Element, str): Metadata XML root and its namespace (always an empty string here)

    Raises:
        InvalidProductError: If no XML file is found in the ``annotation`` folder of an extracted product
    """
    # Get MTD XML file
    if self.is_archived:
        # Raw string: "\." is a regex escape, not a valid Python string escape
        root = files.read_archived_xml(self.path, r".*annotation.*\.xml")
    else:
        # Only the lookup of the first match can raise IndexError: keep the try body minimal
        try:
            mtd_file = glob.glob(os.path.join(self.path, "annotation", "*.xml"))[0]
        except IndexError as ex:
            # The message names the pattern that is actually searched (it used to mention product.xml)
            raise InvalidProductError(
                f"Metadata file (annotation/*.xml) not found in {self.path}"
            ) from ex

        # pylint: disable=I1101:
        # Module 'lxml.etree' has no 'parse' member, but source is unavailable.
        root = etree.parse(mtd_file).getroot()

    # Get namespace (none is returned for this metadata file)
    namespace = ""

    return root, namespace
def read_mtd(self) -> (etree._Element, str):
    """
    Read metadata and outputs the metadata XML root and its namespace

    .. code-block:: python

        >>> from eoreader.reader import Reader
        >>> path = r"SENTINEL2B_20190401-105726-885_L2A_T31UEQ_D_V2-0.zip"
        >>> prod = Reader().open(path)
        >>> prod.read_mtd()
        (<Element Muscate_Metadata_Document at 0x252d2071e88>, '')

    Returns:
        (etree._Element, str): Metadata XML root and its namespace (always an empty string here)

    Raises:
        InvalidProductError: If no ``*MTD_ALL.xml`` file is found at the root of an extracted product
    """
    # Get MTD XML file
    if self.is_archived:
        # Raw string: "\." is a regex escape, not a valid Python string escape
        root = files.read_archived_xml(self.path, r".*MTD_ALL\.xml")
    else:
        # Only the lookup of the first match can raise IndexError: keep the try body minimal
        try:
            mtd_xml = glob.glob(os.path.join(self.path, "*MTD_ALL.xml"))[0]
        except IndexError as ex:
            raise InvalidProductError(
                f"Metadata file not found in {self.path}"
            ) from ex

        # pylint: disable=I1101:
        # Module 'lxml.etree' has no 'parse' member, but source is unavailable.
        root = etree.parse(mtd_xml).getroot()

    # Get namespace (none is returned for this metadata file)
    namespace = ""

    return root, namespace
def read_mtd(
        self,
        force_pd=False) -> Union[pd.DataFrame, Tuple[etree._Element, str]]:
    """
    Read Landsat metadata as:

    - a `pandas.DataFrame` whatever its collection is (by default for collection 1)
    - a XML root + its namespace if the product is retrieved from the 2nd collection (by default for collection 2)

    .. code-block:: python

        >>> from eoreader.reader import Reader
        >>> path = r"LC08_L1GT_023030_20200518_20200527_01_T2"
        >>> prod = Reader().open(path)

        >>> # COLLECTION 1 : Open metadata as panda DataFrame
        >>> prod.read_mtd()
        NAME                                     ORIGIN  ...    RESAMPLING_OPTION
        value  "Image courtesy of the U.S. Geological Survey"  ...  "CUBIC_CONVOLUTION"
        [1 rows x 197 columns]

        >>> # COLLECTION 2 : Open metadata as XML
        >>> path = r"LC08_L1TP_200030_20201220_20210310_02_T1"  # Collection 2
        >>> prod = Reader().open(path)
        >>> prod.read_mtd()
        (<Element LANDSAT_METADATA_FILE at 0x19229016048>, '')

        >>> # COLLECTION 2 : Force to pandas.DataFrame
        >>> prod.read_mtd(force_pd=True)
        NAME                                     ORIGIN  ...    RESAMPLING_OPTION
        value  "Image courtesy of the U.S. Geological Survey"  ...  "CUBIC_CONVOLUTION"
        [1 rows x 263 columns]

    Args:
        force_pd (bool): If collection 2, return a pandas.DataFrame instead of a XML root + namespace

    Returns:
        Union[pd.DataFrame, Tuple[etree._Element, str]]: Metadata as a one-row Pandas DataFrame
        (collection 1 or ``force_pd``), otherwise the XML root and its (empty) namespace

    Raises:
        FileNotFoundError: If the ``*_MTL.txt`` file cannot be found (extracted or archived product)
        InvalidProductError: If the ``*_MTL.xml`` file cannot be found in an extracted product
    """
    # WARNING: always use force_pd in this class !

    # Local import: only needed to buffer the MTL file of archived products
    import io

    as_pd = (self._collection == LandsatCollection.COL_1) or force_pd

    if as_pd:
        mtd_name = f"{self.name}_MTL.txt"
        mtl_bytes = None  # In-memory content of the MTL file (archived products only)
        mtd_path = None  # Path of the MTL file (extracted products only)

        if self.is_archived:
            # We need to extract the file in memory to be used with pandas.
            # The context manager closes the archive even if the lookup fails.
            with tarfile.open(self.path, "r") as tar_ds:
                member = next(
                    (f.name for f in tar_ds.getmembers() if mtd_name in f.name),
                    None)
                extracted = None if member is None else tar_ds.extractfile(member)
                if extracted is None:
                    raise FileNotFoundError(
                        f"Unable to find the metadata file associated with {self.path}"
                    )
                mtl_bytes = extracted.read()
        else:
            # FOR COLLECTION 1 AND 2
            mtd_path = os.path.join(self.path, mtd_name)

            if not os.path.isfile(mtd_path):
                raise FileNotFoundError(
                    f"Unable to find the metadata file associated with {self.path}"
                )

        def _mtl_source():
            """Return a fresh, unread source for pandas (a new buffer or the file path)."""
            return mtd_path if mtl_bytes is None else io.BytesIO(mtl_bytes)

        # Parse
        mtd_data = pd.read_table(
            _mtl_source(),
            sep=r"\s=\s",
            names=["NAME", "value"],
            skipinitialspace=True,
            engine="python",
        )

        # Workaround an unexpected behaviour in pandas !
        # A fresh source is mandatory here: an already parsed stream would be read from its end.
        if any(mtd_data.NAME == "="):
            mtd_data = pd.read_table(
                _mtl_source(),
                sep="=",
                names=["NAME", "=", "value"],
                usecols=[0, 2],
                skipinitialspace=True,
            )

        # Remove useless rows
        mtd_data = mtd_data[~mtd_data["NAME"].isin(["GROUP", "END_GROUP", "END"])]

        # Set index
        mtd_data = mtd_data.set_index("NAME").T
    else:
        mtd_xml_name = f"{self.name}_MTL.xml"

        if self.is_archived:
            # Raw string with an escaped dot, so that only the ".xml" extension matches
            root = files.read_archived_xml(self.path,
                                           rf".*{self.name}_MTL\.xml")
        else:
            # ONLY FOR COLLECTION 2
            # Only the lookup of the first match can raise IndexError: keep the try body minimal
            try:
                mtd_file = glob.glob(os.path.join(self.path, mtd_xml_name))[0]
            except IndexError as ex:
                # The message names the file that is actually searched
                raise InvalidProductError(
                    f"Metadata file ({mtd_xml_name}) not found in {self.path}"
                ) from ex

            # pylint: disable=I1101:
            # Module 'lxml.etree' has no 'parse' member, but source is unavailable.
            root = etree.parse(mtd_file).getroot()

        # Get namespace
        namespace = ""  # No namespace here

        mtd_data = (root, namespace)

    return mtd_data