Esempio n. 1
0
    def read_mtd(self) -> (etree._Element, str):
        """
        Read the product metadata and return the metadata XML root and its namespace.

        For an archived product, the XML is read through ``files.read_archived_xml``.
        Otherwise the first ``GRANULE/*/*.xml`` file found under the product path is parsed.

        .. code-block:: python

            >>> from eoreader.reader import Reader
            >>> path = r"S2A_MSIL1C_20200824T110631_N0209_R137_T30TTK_20200824T150432.SAFE.zip"
            >>> prod = Reader().open(path)
            >>> prod.read_mtd()
            (<Element {https://psd-14.sentinel2.eo.esa.int/PSD/S2_PDI_Level-2A_Tile_Metadata.xsd}Level-2A_Tile_ID at ...>,
            '{https://psd-14.sentinel2.eo.esa.int/PSD/S2_PDI_Level-2A_Tile_Metadata.xsd}')

        Returns:
            (etree._Element, str): Metadata XML root and its namespace, formatted as ``"{uri}"``
            (empty string if the root tag carries no namespace)

        Raises:
            InvalidProductError: If no metadata file is found in a non-archived product
        """
        # Get MTD XML file
        if self.is_archived:
            # Raw string: "\." is a regex escape, not a valid Python string escape
            root = files.read_archived_xml(self.path, r".*GRANULE.*\.xml")
        else:
            # Look for the metadata file; only the indexing can raise IndexError
            mtd_files = glob.glob(
                os.path.join(self.path, "GRANULE", "*", "*.xml"))
            try:
                mtd_file = mtd_files[0]
            except IndexError as ex:
                raise InvalidProductError(
                    f"Metadata file not found in {self.path}") from ex

            # pylint: disable=I1101:
            # Module 'lxml.etree' has no 'parse' member, but source is unavailable.
            root = etree.parse(mtd_file).getroot()

        # Get namespace: everything up to and including the last "}" of the root tag.
        # rfind returns -1 when there is no "}", which yields an empty namespace.
        namespace = root.tag[:root.tag.rfind("}") + 1]

        return root, namespace
Esempio n. 2
0
    def read_mtd(self) -> (etree._Element, str):
        """
        Read the product metadata and return the metadata XML root and its namespace.

        For an archived product, the XML is read through ``files.read_archived_xml``.
        Otherwise the ``product.xml`` file located directly under the product path is parsed.

        .. code-block:: python

            >>> from eoreader.reader import Reader
            >>> path = r"LC08_L1GT_023030_20200518_20200527_01_T2"
            >>> prod = Reader().open(path)
            >>> prod.read_mtd()
            (<Element {http://www.rsi.ca/rs2/prod/xml/schemas}product at 0x1c0efbd37c8>,
            '{http://www.rsi.ca/rs2/prod/xml/schemas}')

        Returns:
            (etree._Element, str): Metadata XML root and its namespace, formatted as ``"{uri}"``
            (empty string if the root tag carries no namespace)

        Raises:
            InvalidProductError: If ``product.xml`` is not found in a non-archived product
        """
        # Get MTD XML file
        if self.is_archived:
            # Raw string: "\." is a regex escape, not a valid Python string escape
            root = files.read_archived_xml(self.path, r".*product\.xml")
        else:
            # Look for the metadata file; only the indexing can raise IndexError
            mtd_files = glob.glob(os.path.join(self.path, "product.xml"))
            try:
                mtd_file = mtd_files[0]
            except IndexError as ex:
                raise InvalidProductError(
                    f"Metadata file (product.xml) not found in {self.path}"
                ) from ex

            # pylint: disable=I1101:
            # Module 'lxml.etree' has no 'parse' member, but source is unavailable.
            root = etree.parse(mtd_file).getroot()

        # Get namespace: everything up to and including the last "}" of the root tag.
        # rfind returns -1 when there is no "}", which yields an empty namespace.
        namespace = root.tag[:root.tag.rfind("}") + 1]

        return root, namespace
Esempio n. 3
0
    def _read_mtd(self, mtd_from_path: str, mtd_archived: str = None):
        """
        Read metadata and return the metadata XML root together with its namespaces as a dict

        Args:
            mtd_from_path (str): Metadata regex (glob style) to find from extracted product
            mtd_archived (str): Metadata regex (re style) to find from archived product

        Returns:
            (etree._Element, dict): Metadata XML root and its namespaces
            (prefix -> ``"{uri}"``, without the ``xsi``, ``xs`` and ``xlink`` entries)

        Raises:
            InvalidProductError: If no file matches ``mtd_from_path`` in a non-archived product
        """
        if not self.is_archived:
            try:
                # Recursive glob from the product path, first match wins
                candidates = glob.glob(os.path.join(self.path, mtd_from_path),
                                       recursive=True)
                first_match = candidates[0]

                # pylint: disable=I1101:
                # Module 'lxml.etree' has no 'parse' member, but source is unavailable.
                root = etree.parse(first_match).getroot()
            except IndexError as ex:
                raise InvalidProductError(
                    f"Metadata file ({mtd_from_path}) not found in {self.path}"
                ) from ex
        else:
            root = files.read_archived_xml(self.path, mtd_archived)

        # Build the namespaces map, leaving out the prefixes that are not useful
        ignored_prefixes = ("xsi", "xs", "xlink")
        nsmap = {
            prefix: f"{{{uri}}}"
            for prefix, uri in root.nsmap.items()
            if prefix not in ignored_prefixes
        }

        return root, nsmap
Esempio n. 4
0
    def read_mtd(self) -> (etree._Element, str):
        """
        Read the product metadata and return the metadata XML root and its namespace.

        For an archived product, the XML is read through ``files.read_archived_xml``.
        Otherwise the first ``annotation/*.xml`` file returned by ``glob`` is parsed.

        .. code-block:: python

            >>> from eoreader.reader import Reader
            >>> path = r"S1A_IW_GRDH_1SDV_20191215T060906_20191215T060931_030355_0378F7_3696.zip"
            >>> prod = Reader().open(path)
            >>> prod.read_mtd()
            (<Element product at 0x1832895d788>, '')

        Returns:
            (etree._Element, str): Metadata XML root and its namespace (always an empty string here)

        Raises:
            InvalidProductError: If no annotation XML file is found in a non-archived product
        """
        # Get MTD XML file
        if self.is_archived:
            # Raw string: "\." is a regex escape, not a valid Python string escape
            root = files.read_archived_xml(self.path, r".*annotation.*\.xml")
        else:
            # Look for the metadata file; only the indexing can raise IndexError
            mtd_files = glob.glob(os.path.join(self.path, "annotation", "*.xml"))
            try:
                mtd_file = mtd_files[0]
            except IndexError as ex:
                # The message names the pattern that is actually searched
                raise InvalidProductError(
                    f"Metadata file (annotation/*.xml) not found in {self.path}"
                ) from ex

            # pylint: disable=I1101:
            # Module 'lxml.etree' has no 'parse' member, but source is unavailable.
            root = etree.parse(mtd_file).getroot()

        # Get namespace
        namespace = ""

        return root, namespace
Esempio n. 5
0
    def read_mtd(self) -> (etree._Element, str):
        """
        Read the product metadata and return the metadata XML root and its namespace.

        For an archived product, the XML is read through ``files.read_archived_xml``.
        Otherwise the first ``*MTD_ALL.xml`` file found directly under the product path is parsed.

        .. code-block:: python

            >>> from eoreader.reader import Reader
            >>> path = r"SENTINEL2B_20190401-105726-885_L2A_T31UEQ_D_V2-0.zip"
            >>> prod = Reader().open(path)
            >>> prod.read_mtd()
            (<Element Muscate_Metadata_Document at 0x252d2071e88>, '')

        Returns:
            (etree._Element, str): Metadata XML root and its namespace (always an empty string here)

        Raises:
            InvalidProductError: If no ``*MTD_ALL.xml`` file is found in a non-archived product
        """
        # Get MTD XML file
        if self.is_archived:
            # Raw string: "\." is a regex escape, not a valid Python string escape
            root = files.read_archived_xml(self.path, r".*MTD_ALL\.xml")
        else:
            # Look for the metadata file; only the indexing can raise IndexError
            mtd_files = glob.glob(os.path.join(self.path, "*MTD_ALL.xml"))
            try:
                mtd_xml = mtd_files[0]
            except IndexError as ex:
                raise InvalidProductError(
                    f"Metadata file not found in {self.path}"
                ) from ex

            # pylint: disable=I1101:
            # Module 'lxml.etree' has no 'parse' member, but source is unavailable.
            root = etree.parse(mtd_xml).getroot()

        # Get namespace
        namespace = ""

        return root, namespace
Esempio n. 6
0
    def read_mtd(
            self,
            force_pd=False) -> Union[pd.DataFrame, Tuple[etree._Element, str]]:
        """
        Read Landsat metadata as:

         - a `pandas.DataFrame` whatever its collection is (by default for collection 1)
         - a XML root + its namespace if the product is retrieved from the 2nd collection (by default for collection 2)

        .. code-block:: python

            >>> from eoreader.reader import Reader
            >>> path = r"LC08_L1GT_023030_20200518_20200527_01_T2"
            >>> prod = Reader().open(path)

            >>> # COLLECTION 1 : Open metadata as panda DataFrame
            >>> prod.read_mtd()
            NAME                                           ORIGIN  ...    RESAMPLING_OPTION
            value  "Image courtesy of the U.S. Geological Survey"  ...  "CUBIC_CONVOLUTION"
            [1 rows x 197 columns]

            >>> # COLLECTION 2 : Open metadata as XML
            >>> path = r"LC08_L1TP_200030_20201220_20210310_02_T1"  # Collection 2
            >>> prod = Reader().open(path)
            >>> prod.read_mtd()
            (<Element LANDSAT_METADATA_FILE at 0x19229016048>, '')

            >>> # COLLECTION 2 : Force to pandas.DataFrame
            >>> prod.read_mtd(force_pd=True)
            NAME                                           ORIGIN  ...    RESAMPLING_OPTION
            value  "Image courtesy of the U.S. Geological Survey"  ...  "CUBIC_CONVOLUTION"
            [1 rows x 263 columns]

        Args:
            force_pd (bool): If collection 2, return a pandas.DataFrame instead of a XML root + namespace

        Returns:
            Union[pd.DataFrame, Tuple[etree._Element, str]]: Metadata as a Pandas DataFrame
            (collection 1 or `force_pd`), otherwise the metadata XML root and its (empty) namespace

        Raises:
            FileNotFoundError: If the `_MTL.txt` metadata file cannot be found (DataFrame mode)
            InvalidProductError: If the `_MTL.xml` metadata file cannot be found in a
                non-archived product (XML mode)
        """
        # WARNING: always use force_pd in this class !
        as_pd = (self._collection == LandsatCollection.COL_1) or force_pd

        if as_pd:
            mtd_name = f"{self.name}_MTL.txt"
            tar_ds = None
            try:
                if self.is_archived:
                    # We need to extract the file in memory to be used with pandas
                    tar_ds = tarfile.open(self.path, "r")
                    member_name = next(
                        (f.name for f in tar_ds.getmembers()
                         if mtd_name in f.name),
                        None,
                    )
                    if member_name is None:
                        # Same error as for an extracted product
                        raise FileNotFoundError(
                            f"Unable to find the metadata file associated with {self.path}"
                        )
                    mtd_path = tar_ds.extractfile(member_name)
                else:
                    # FOR COLLECTION 1 AND 2
                    mtd_path = os.path.join(self.path, mtd_name)

                    if not os.path.isfile(mtd_path):
                        raise FileNotFoundError(
                            f"Unable to find the metadata file associated with {self.path}"
                        )

                # Parse
                mtd_data = pd.read_table(
                    mtd_path,
                    sep=r"\s=\s",
                    names=["NAME", "value"],
                    skipinitialspace=True,
                    engine="python",
                )

                # Workaround an unexpected behaviour in pandas !
                if (mtd_data.NAME == "=").any():
                    if tar_ds is not None:
                        # The first parse consumed the in-archive stream: rewind it
                        mtd_path.seek(0)

                    mtd_data = pd.read_table(
                        mtd_path,
                        sep="=",
                        names=["NAME", "=", "value"],
                        usecols=[0, 2],
                        skipinitialspace=True,
                    )
            finally:
                # Always release the archive, even if the lookup or the parsing failed
                if tar_ds is not None:
                    tar_ds.close()

            # Remove useless rows
            mtd_data = mtd_data[~mtd_data["NAME"].
                                isin(["GROUP", "END_GROUP", "END"])]

            # Set index
            mtd_data = mtd_data.set_index("NAME").T
        else:
            if self.is_archived:
                # Raw string with an escaped dot so that only a literal ".xml" matches
                root = files.read_archived_xml(self.path,
                                               rf".*{self.name}_MTL\.xml")
            else:
                # ONLY FOR COLLECTION 2
                mtd_files = glob.glob(
                    os.path.join(self.path, f"{self.name}_MTL.xml"))
                try:
                    mtd_file = mtd_files[0]
                except IndexError as ex:
                    raise InvalidProductError(
                        f"Metadata file ({self.name}_MTL.xml) not found in {self.path}"
                    ) from ex

                # pylint: disable=I1101:
                # Module 'lxml.etree' has no 'parse' member, but source is unavailable.
                root = etree.parse(mtd_file).getroot()

            # Get namespace
            namespace = ""  # No namespace here

            mtd_data = (root, namespace)

        return mtd_data