Example #1
    def __init__(self, path, extensions_to_ignore=None):
        """Open the Darwin Core Archive."""
        if extensions_to_ignore is None:
            extensions_to_ignore = []

        #: The path to the Darwin Core Archive file, as passed to the constructor.
        self.archive_path = path

        if os.path.isdir(self.archive_path):  # Archive is a (directly readable) directory
            self._workin_directory_path = self.archive_path
            self._directory_to_clean = None
        else:  # Archive is zipped/tgzipped, we have to extract it first.
            self._directory_to_clean, self._workin_directory_path = self._extract()

        #: An :class:`descriptors.ArchiveDescriptor` instance giving access to the archive
        #: descriptor/metafile (``meta.xml``)
        try:
            self.descriptor = ArchiveDescriptor(
                self.open_included_file(METAFILE_NAME).read(),
                files_to_ignore=extensions_to_ignore)
        except IOError as exc:
            if exc.errno == ENOENT:
                self.descriptor = None

        #: A :class:`xml.etree.ElementTree.Element` instance containing the (scientific) metadata
        #: of the archive, or None if the Archive contains no metadata.
        self.metadata = self._parse_metadata_file()
        #: If the archive contains source metadata (typically, GBIF downloads) this dict will
        #: be something like:
        #: {'dataset1_UUID': <dataset1 EML (xml.etree.ElementTree.Element instance)>,
        #: 'dataset2_UUID': <dataset2 EML (xml.etree.ElementTree.Element instance)>, ...}
        #: see :doc:`gbif_results` for more details.
        self.source_metadata = self._load_source_metadata()

        if self.descriptor:
            #  We have an Archive descriptor that we can use to access data files.
            self._corefile = CSVDataFile(self._workin_directory_path,
                                         self.descriptor.core)
            self._extensionfiles = [
                CSVDataFile(work_directory=self._workin_directory_path,
                            file_descriptor=d)
                for d in self.descriptor.extensions
            ]
        else:  # Archive without descriptor, we'll have to find and inspect the data file
            try:
                datafile_name = self._is_valid_simple_archive()
                descriptor = DataFileDescriptor.make_from_file(
                    os.path.join(self._workin_directory_path, datafile_name))

                self._corefile = CSVDataFile(
                    work_directory=self._workin_directory_path,
                    file_descriptor=descriptor)
                self._extensionfiles = []
            except InvalidSimpleArchive:
                msg = "No metafile was found, but archive includes multiple files/directories."
                raise InvalidSimpleArchive(msg)
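
A minimal usage sketch of the constructor above. The snippet itself does not name the class; the sketch assumes it is the DwCAReader class from python-dwca-reader (importable from dwca.read) and that the reader can be used as a context manager, and "my_archive.zip" is a placeholder path. Only attributes set in the constructor are accessed.

# Usage sketch (assumptions: class is dwca.read.DwCAReader, it supports the
# context-manager protocol, and "my_archive.zip" is a placeholder path).
from dwca.read import DwCAReader

with DwCAReader("my_archive.zip") as dwca:
    print(dwca.archive_path)     # the path passed to the constructor
    print(dwca.descriptor)       # ArchiveDescriptor, or None for simple archives
    print(dwca.metadata)         # scientific metadata as an ElementTree Element, or None
    print(dwca.source_metadata)  # per-dataset EML for GBIF downloads
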
Example #2
    def test_init_from_file(self):
        """ Ensure a DataFileDescriptor can be constructed directly from a CSV file.

        This is necessary for archives sans metafile.
        """
        with zipfile.ZipFile(sample_data_path('dwca-simple-csv.zip'),
                             'r') as archive:
            datafile_path = archive.extract('0008333-160118175350007.csv')

            d = DataFileDescriptor.make_from_file(datafile_path)
            # Check basic metadata inferred from the file
            self.assertIsNone(d.raw_element)
            self.assertTrue(d.represents_corefile)
            self.assertFalse(d.represents_extension)
            self.assertIsNone(d.type)
            self.assertEqual(d.file_location, '0008333-160118175350007.csv')
            self.assertEqual(d.file_encoding, 'utf-8')
            self.assertEqual(d.lines_terminated_by, "\n")
            self.assertEqual(d.fields_terminated_by, "\t")
            self.assertEqual(d.fields_enclosed_by, '"')

            # Spot-check a few of the detected fields
            expected_fields = (
                {'default': None, 'index': 0, 'term': 'gbifid'},
                {'default': None, 'index': 3, 'term': 'kingdom'},
            )

            for ef in expected_fields:
                self.assertIn(ef, d.fields)

            # In total, there are 42 fields in this data file
            self.assertEqual(len(d.fields), 42)

            # No fields should have a default value (there's no metafile to set it!)
            for f in d.fields:
                self.assertIsNone(f['default'])

            # Ensure .terms is also set:
            self.assertEqual(len(d.terms), 42)

            # Clean up the extracted file
            os.remove(datafile_path)
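
The test above documents what make_from_file infers by inspecting a bare CSV file. Below is a short sketch of the same call outside a test; the dwca.descriptors import path and the "occurrence.txt" path are assumptions, and only attributes asserted in the test are read.

# Sketch: inspecting a CSV data file that has no accompanying meta.xml.
# Assumptions: DataFileDescriptor is importable from dwca.descriptors and
# "occurrence.txt" is a placeholder path to an existing CSV file.
from dwca.descriptors import DataFileDescriptor

descriptor = DataFileDescriptor.make_from_file("occurrence.txt")

print(descriptor.file_encoding)         # e.g. 'utf-8'
print(descriptor.fields_terminated_by)  # sniffed field delimiter, e.g. '\t'
print(descriptor.terms)                 # Darwin Core terms found in the header row
for field in descriptor.fields:
    print(field['index'], field['term'], field['default'])
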
Example #3
    def __init__(self, path, extensions_to_ignore=None):
        """Open the Darwin Core Archive."""
        if extensions_to_ignore is None:
            extensions_to_ignore = []

        #: The path to the Darwin Core Archive file, as passed to the constructor.
        self.archive_path = path

        if os.path.isdir(self.archive_path):  # Archive is a (directly readable) directory
            self._workin_directory_path = self.archive_path
            self._directory_to_clean = None
        else:  # Archive is zipped/tgzipped, we have to extract it first.
            self._directory_to_clean, self._workin_directory_path = self._extract()

        #: An :class:`descriptors.ArchiveDescriptor` instance giving access to the archive
        #: descriptor/metafile (``meta.xml``)
        try:
            self.descriptor = ArchiveDescriptor(self.open_included_file(METAFILE_NAME).read(),
                                                files_to_ignore=extensions_to_ignore)
        except IOError as e:
            if e.errno == ENOENT:
                self.descriptor = None

        #: A :class:`xml.etree.ElementTree.Element` instance containing the (scientific) metadata
        #: of the archive, or None if the Archive contains no metadata.
        self.metadata = self._parse_metadata_file()
        #:
        self.source_metadata = None

        if self.descriptor:
            #  We have an Archive descriptor that we can use to access data files.
            self._corefile = CSVDataFile(self._workin_directory_path, self.descriptor.core)
            self._extensionfiles = [CSVDataFile(work_directory=self._workin_directory_path,
                                                file_descriptor=d)
                                    for d in self.descriptor.extensions]
        else:  # Archive without descriptor, we'll have to find and inspect the data file
            try:
                datafile_name = self._is_valid_simple_archive()
                d = DataFileDescriptor.make_from_file(os.path.join(self._workin_directory_path, datafile_name))

                self._corefile = CSVDataFile(work_directory=self._workin_directory_path,
                                             file_descriptor=d)
                self._extensionfiles = []
            except InvalidSimpleArchive:
                msg = "No metafile was found, but archive includes multiple files/directories."
                raise InvalidSimpleArchive(msg)
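
Both constructor variants rely on errno to tell "the metafile simply is not there" (a legitimate simple archive) apart from other I/O failures. Below is a standalone sketch of that pattern, independent of the library; unlike the constructors above, the sketch re-raises errors other than ENOENT, which is a common variant of the idiom.

# Sketch: treat a missing file as "no data" while other I/O errors propagate.
import errno

def read_optional_file(path):
    try:
        with open(path) as f:
            return f.read()
    except IOError as exc:
        if exc.errno == errno.ENOENT:  # the file does not exist
            return None
        raise                          # permission problems etc. still surface

print(read_optional_file("meta.xml"))  # prints None if meta.xml is absent
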
Example #4
    def test_init_from_file(self):
        """ Ensure a DataFileDescriptor can be constructed directly from a CSV file.

        This is necessary for archives sans metafile.
        """
        with zipfile.ZipFile(SIMPLE_CSV, "r") as archive:
            datafile_path = archive.extract("0008333-160118175350007.csv")

            d = DataFileDescriptor.make_from_file(datafile_path)
            # Check basic metadata inferred from the file
            self.assertIsNone(d.raw_element)
            self.assertTrue(d.represents_corefile)
            self.assertFalse(d.represents_extension)
            self.assertIsNone(d.type)
            self.assertEqual(d.file_location, "0008333-160118175350007.csv")
            self.assertEqual(d.file_encoding, "utf-8")
            self.assertEqual(d.lines_terminated_by, "\n")
            self.assertEqual(d.fields_terminated_by, "\t")
            self.assertEqual(d.fields_enclosed_by, '"')

            # Spot-check a few of the detected fields
            expected_fields = (
                {"default": None, "index": 0, "term": "gbifid"},
                {"default": None, "index": 3, "term": "kingdom"},
            )

            for ef in expected_fields:
                self.assertIn(ef, d.fields)

            # In total, there are 42 fields in this data file
            self.assertEqual(len(d.fields), 42)

            # No fields should have a default value (there's no metafile to set it!)
            for f in d.fields:
                self.assertIsNone(f["default"])

            # Ensure .terms is also set:
            self.assertEqual(len(d.terms), 42)

            # Clean up the extracted file
            os.remove(datafile_path)
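
Both test variants extract the CSV member into the current directory and remove it by hand at the end. The sketch below performs the same extraction with automatic cleanup, using only the standard library; the archive and member names reuse those from the test, and the archive path is assumed to point at the same sample file.

# Sketch: extract a single member into a temporary directory so no manual
# os.remove() call is needed afterwards.
import tempfile
import zipfile

with zipfile.ZipFile("dwca-simple-csv.zip", "r") as archive:
    with tempfile.TemporaryDirectory() as tmp_dir:
        datafile_path = archive.extract("0008333-160118175350007.csv", path=tmp_dir)
        with open(datafile_path, encoding="utf-8") as f:
            print(f.readline())  # header row of the extracted CSV
# the temporary directory and the extracted file are deleted automatically here
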
Example #5
    def __init__(self, path, extensions_to_ignore=None):
        # type: (str, List[str]) -> None
        """Open the Darwin Core Archive."""
        if extensions_to_ignore is None:
            extensions_to_ignore = []

        #: The path to the Darwin Core Archive file, as passed to the constructor.
        self.archive_path = path  # type: str

        if os.path.isdir(self.archive_path):  # Archive is a (directly readable) directory
            self._working_directory_path = self.archive_path
            self._directory_to_clean = None  # type: Optional[str]
        else:  # Archive is zipped/tgzipped, we have to extract it first.
            self._directory_to_clean, self._working_directory_path = self._extract()

        #: An :class:`descriptors.ArchiveDescriptor` instance giving access to the archive
        #: descriptor/metafile (``meta.xml``)
        self.descriptor = None  # type: Optional[ArchiveDescriptor]
        try:
            self.descriptor = ArchiveDescriptor(
                self.open_included_file(self.default_metafile_name).read(),
                files_to_ignore=extensions_to_ignore)
        except IOError as exc:
            if exc.errno == ENOENT:
                pass

        #: A :class:`xml.etree.ElementTree.Element` instance containing the (scientific) metadata
        #: of the archive, or `None` if the archive has no metadata.
        self.metadata = self._parse_metadata_file()  # type: Optional[Element]

        #: If the archive contains source-level metadata (typically, GBIF downloads), this is a dict such as::
        #:
        #:      {'dataset1_UUID': <dataset1 EML> (xml.etree.ElementTree.Element object),
        #:       'dataset2_UUID': <dataset2 EML> (xml.etree.ElementTree.Element object), ...}
        #:
        #: See :doc:`gbif_results` for more details.
        self.source_metadata = self._get_source_metadata()  # type: Dict[str, Element]

        if self.descriptor:  # We have an Archive descriptor that we can use to access data files.
            #: An instance of :class:`dwca.files.CSVDataFile` for the core data file.
            self.core_file = CSVDataFile(
                self._working_directory_path,
                self.descriptor.core)  # type: CSVDataFile

            #: A list of :class:`dwca.files.CSVDataFile`, one entry for each extension data file, sorted by order of
            #: appearance in the Metafile (or an empty list if the archive doesn't use extensions).
            self.extension_files = [
                CSVDataFile(work_directory=self._working_directory_path,
                            file_descriptor=d)
                for d in self.descriptor.extensions
            ]  # type: List[CSVDataFile]
        else:  # Archive without descriptor, we'll have to find and inspect the data file
            try:
                datafile_name = self._is_valid_simple_archive()
                descriptor = DataFileDescriptor.make_from_file(
                    os.path.join(self._working_directory_path, datafile_name))

                self.core_file = CSVDataFile(
                    work_directory=self._working_directory_path,
                    file_descriptor=descriptor)
                self.extension_files = []
            except InvalidSimpleArchive:
                msg = "No Metafile was found, but the archive contains multiple files/directories."
                raise InvalidSimpleArchive(msg)
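
This last variant documents types with PEP 484 type comments (for example # type: (str, List[str]) -> None) so that it stays valid Python 2 syntax. On Python 3-only code the same information can be written as inline annotations; a sketch of the equivalent signature follows, with the class name DwCAReader assumed as above, and Optional[List[str]] used because the parameter defaults to None.

# Sketch: the constructor signature above rewritten with inline annotations
# instead of type comments (Python 3 only; the class name is an assumption).
from typing import List, Optional
from xml.etree.ElementTree import Element


class DwCAReader:
    def __init__(self, path: str, extensions_to_ignore: Optional[List[str]] = None) -> None:
        self.archive_path: str = path
        self.metadata: Optional[Element] = None
        # ... remaining attributes as in the variant above ...
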