Ejemplo n.º 1
0
    def __init__(self, path, extensions_to_ignore=None):
        """Open the Darwin Core Archive.

        :param path: path to the archive. If it is a directory it is read in place, otherwise it
            is extracted first (see ``_extract``).
        :param extensions_to_ignore: passed to :class:`ArchiveDescriptor` as ``files_to_ignore``.
            Defaults to an empty list.

        :raises InvalidSimpleArchive: if there is no metafile and the working directory is not a
            valid simple archive.
        :raises IOError: if the metafile cannot be read for any reason other than being absent.
        """
        if extensions_to_ignore is None:
            extensions_to_ignore = []

        #: The path to the Darwin Core Archive file, as passed to the constructor.
        self.archive_path = path

        if os.path.isdir(self.archive_path):
            # Archive is a (directly readable) directory
            self._workin_directory_path = self.archive_path
            self._directory_to_clean = None
        else:
            # Archive is zipped/tgzipped, we have to extract it first.
            self._directory_to_clean, self._workin_directory_path = self._extract()

        #: An :class:`descriptors.ArchiveDescriptor` instance giving access to the archive
        #: descriptor/metafile (``meta.xml``), or None if the archive has no metafile.
        try:
            self.descriptor = ArchiveDescriptor(
                self.open_included_file(METAFILE_NAME).read(),
                files_to_ignore=extensions_to_ignore)
        except IOError as exc:
            # Only a missing metafile is acceptable here. Any other I/O failure must surface
            # as-is: swallowing it would leave ``self.descriptor`` unset and trigger an unrelated
            # AttributeError further down.
            if exc.errno != ENOENT:
                raise
            self.descriptor = None

        #: A :class:`xml.etree.ElementTree.Element` instance containing the (scientific) metadata
        #: of the archive, or None if the Archive contains no metadata.
        self.metadata = self._parse_metadata_file()
        #: If the archive contains source metadata (typically, GBIF downloads) this dict will
        #: be something like:
        #: {'dataset1_UUID': <dataset1 EML (xml.etree.ElementTree.Element instance)>,
        #: 'dataset2_UUID': <dataset2 EML (xml.etree.ElementTree.Element instance)>, ...}
        #: see :doc:`gbif_results` for more details.
        self.source_metadata = self._load_source_metadata()

        if self.descriptor:
            # We have an Archive descriptor that we can use to access data files.
            self._corefile = CSVDataFile(self._workin_directory_path,
                                         self.descriptor.core)
            self._extensionfiles = [
                CSVDataFile(work_directory=self._workin_directory_path,
                            file_descriptor=d)
                for d in self.descriptor.extensions
            ]
        else:
            # Archive without descriptor, we'll have to find and inspect the data file
            try:
                datafile_name = self._is_valid_simple_archive()
                descriptor = DataFileDescriptor.make_from_file(
                    os.path.join(self._workin_directory_path, datafile_name))

                self._corefile = CSVDataFile(
                    work_directory=self._workin_directory_path,
                    file_descriptor=descriptor)
                self._extensionfiles = []
            except InvalidSimpleArchive:
                # Re-raise with a message explaining why the archive was rejected.
                msg = "No metafile was found, but archive includes multiple files/directories."
                raise InvalidSimpleArchive(msg)
Ejemplo n.º 2
0
    def _is_valid_simple_archive(self) -> str:
        """Return the data file name if the working directory is a valid simple archive.

        A simple archive holds exactly one data file at the top level of the working directory,
        optionally accompanied by the default metadata file.

        :raises InvalidSimpleArchive: if the top level of the working directory has any other
            layout.
        """
        # Only the first entry yielded by os.walk (the top level) is inspected.
        top_level_files = next(os.walk(self._working_directory_path))[2]
        file_count = len(top_level_files)

        if file_count == 1:
            # A lone file can only be the data file.
            return top_level_files[0]

        if file_count == 2 and self.default_metadata_filename in top_level_files:
            # One of the two files is the metadata document, so the other is the data file.
            data_files = [
                name for name in top_level_files
                if name != self.default_metadata_filename
            ]
            return data_files[0]

        raise InvalidSimpleArchive()
Ejemplo n.º 3
0
    def _is_valid_simple_archive(self):
        """Return the data file name if the working directory is a valid simple archive.

        A valid simple archive has, at the top level of the working directory, either a single
        file (the data file) or two files, one of which is named ``DEFAULT_METADATA_FILENAME``.

        :returns: the name of the data file.
        :raises InvalidSimpleArchive: if the working directory matches neither layout.
        """
        # Only the first entry yielded by os.walk (the top level) is inspected.
        _, _, files = next(os.walk(self._workin_directory_path))

        if len(files) == 1:  # We found a single file
            return files[0]
        elif len(files) == 2:
            # Two files found: if one of them is the metadata file, the other is considered as
            # the data file
            if DEFAULT_METADATA_FILENAME in files:
                # Compare by value, not identity: names returned by os.walk are distinct string
                # objects, so an ``is not`` test would never filter out the metadata file.
                return [
                    f for f in files if f != DEFAULT_METADATA_FILENAME
                ][0]

        raise InvalidSimpleArchive()
Ejemplo n.º 4
0
    def __init__(self, path, extensions_to_ignore=None):
        # type: (str, Optional[List[str]]) -> None
        """Open the Darwin Core Archive.

        :param path: path to the archive. If it is a directory it is read in place, otherwise it
            is extracted first (see ``_extract``).
        :param extensions_to_ignore: passed to :class:`ArchiveDescriptor` as ``files_to_ignore``.
            Defaults to an empty list.

        :raises InvalidSimpleArchive: if there is no Metafile and the working directory is not a
            valid simple archive.
        :raises IOError: if the Metafile cannot be read for any reason other than being absent.
        """
        if extensions_to_ignore is None:
            extensions_to_ignore = []

        #: The path to the Darwin Core Archive file, as passed to the constructor.
        self.archive_path = path  # type: str

        if os.path.isdir(self.archive_path):
            # Archive is a (directly readable) directory
            self._working_directory_path = self.archive_path
            self._directory_to_clean = None  # type: Optional[str]
        else:
            # Archive is zipped/tgzipped, we have to extract it first.
            self._directory_to_clean, self._working_directory_path = self._extract()

        #: An :class:`descriptors.ArchiveDescriptor` instance giving access to the archive
        #: descriptor/metafile (``meta.xml``), or `None` if the archive has no Metafile.
        self.descriptor = None  # type: Optional[ArchiveDescriptor]
        try:
            self.descriptor = ArchiveDescriptor(
                self.open_included_file(self.default_metafile_name).read(),
                files_to_ignore=extensions_to_ignore)
        except IOError as exc:
            # A missing Metafile is fine (the descriptor stays None). Any other I/O failure is
            # re-raised rather than being silently mistaken for an archive without Metafile.
            if exc.errno != ENOENT:
                raise

        #: A :class:`xml.etree.ElementTree.Element` instance containing the (scientific) metadata
        #: of the archive, or `None` if the archive has no metadata.
        self.metadata = self._parse_metadata_file()  # type: Optional[Element]

        #: If the archive contains source-level metadata (typically, GBIF downloads), this is a dict such as::
        #:
        #:      {'dataset1_UUID': <dataset1 EML> (xml.etree.ElementTree.Element object),
        #:       'dataset2_UUID': <dataset2 EML> (xml.etree.ElementTree.Element object), ...}
        #:
        #: See :doc:`gbif_results` for more details.
        self.source_metadata = self._get_source_metadata()  # type: Dict[str, Element]

        if self.descriptor:
            # We have an Archive descriptor that we can use to access data files.

            #: An instance of :class:`dwca.files.CSVDataFile` for the core data file.
            self.core_file = CSVDataFile(
                self._working_directory_path,
                self.descriptor.core)  # type: CSVDataFile

            #: A list of :class:`dwca.files.CSVDataFile`, one entry for each extension data file, sorted by order of
            #: appearance in the Metafile (or an empty list if the archive doesn't use extensions).
            self.extension_files = [
                CSVDataFile(work_directory=self._working_directory_path,
                            file_descriptor=d)
                for d in self.descriptor.extensions
            ]  # type: List[CSVDataFile]
        else:
            # Archive without descriptor, we'll have to find and inspect the data file
            try:
                datafile_name = self._is_valid_simple_archive()
                descriptor = DataFileDescriptor.make_from_file(
                    os.path.join(self._working_directory_path, datafile_name))

                self.core_file = CSVDataFile(
                    work_directory=self._working_directory_path,
                    file_descriptor=descriptor)
                self.extension_files = []
            except InvalidSimpleArchive:
                # Re-raise with a message explaining why the archive was rejected.
                msg = "No Metafile was found, but the archive contains multiple files/directories."
                raise InvalidSimpleArchive(msg)