Exemple #1
0
    def __init__(self, path, extensions_to_ignore=None):
        """Open the Darwin Core Archive."""
        if extensions_to_ignore is None:
            extensions_to_ignore = []

        #: The path to the Darwin Core Archive file, as passed to the constructor.
        self.archive_path = path

        if os.path.isdir(self.archive_path
                         ):  # Archive is a (directly readable) directory
            self._workin_directory_path = self.archive_path
            self._directory_to_clean = None
        else:  # Archive is zipped/tgzipped, we have to extract it first.
            self._directory_to_clean, self._workin_directory_path = self._extract(
            )

        #: An :class:`descriptors.ArchiveDescriptor` instance giving access to the archive
        #: descriptor/metafile (``meta.xml``)
        try:
            self.descriptor = ArchiveDescriptor(
                self.open_included_file(METAFILE_NAME).read(),
                files_to_ignore=extensions_to_ignore)
        except IOError as exc:
            if exc.errno == ENOENT:
                self.descriptor = None

        #: A :class:`xml.etree.ElementTree.Element` instance containing the (scientific) metadata
        #: of the archive, or None if the Archive contains no metadata.
        self.metadata = self._parse_metadata_file()
        #: If the archive contains source metadata (typically, GBIF downloads) this dict will
        #: be something like:
        #: {'dataset1_UUID': <dataset1 EML (xml.etree.ElementTree.Element instance)>,
        #: 'dataset2_UUID': <dataset2 EML (xml.etree.ElementTree.Element instance)>, ...}
        #: see :doc:`gbif_results` for more details.
        self.source_metadata = self._load_source_metadata()

        if self.descriptor:
            #  We have an Archive descriptor that we can use to access data files.
            self._corefile = CSVDataFile(self._workin_directory_path,
                                         self.descriptor.core)
            self._extensionfiles = [
                CSVDataFile(work_directory=self._workin_directory_path,
                            file_descriptor=d)
                for d in self.descriptor.extensions
            ]
        else:  # Archive without descriptor, we'll have to find and inspect the data file
            try:
                datafile_name = self._is_valid_simple_archive()
                descriptor = DataFileDescriptor.make_from_file(
                    os.path.join(self._workin_directory_path, datafile_name))

                self._corefile = CSVDataFile(
                    work_directory=self._workin_directory_path,
                    file_descriptor=descriptor)
                self._extensionfiles = []
            except InvalidSimpleArchive:
                msg = "No metafile was found, but archive includes multiple files/directories."
                raise InvalidSimpleArchive(msg)
Exemple #2
0
    def test_lines_to_ignore_attribute(self):
        """.lines_to_ignore works as documented"""

        metaxml_section = r"""
        <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
            <files>
                <location>occurrence.txt</location>
            </files>
            <id index="0" />
            <field index="1" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
            <field index="2" term="http://rs.tdwg.org/dwc/terms/locality"/>
            <field index="3" term="http://rs.tdwg.org/dwc/terms/family"/>
            <field index="4" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
        </core>
        """

        descriptor = DataFileDescriptor.make_from_metafile_section(
            ET.fromstring(metaxml_section))
        data_file = CSVDataFile(sample_data_path('dwca-simple-dir'),
                                descriptor)

        self.assertEqual(data_file.lines_to_ignore, 1)

        metaxml_section = r"""
                <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="3" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
                    <files>
                        <location>occurrence.txt</location>
                    </files>
                    <id index="0" />
                    <field index="1" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
                    <field index="2" term="http://rs.tdwg.org/dwc/terms/locality"/>
                    <field index="3" term="http://rs.tdwg.org/dwc/terms/family"/>
                    <field index="4" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
                </core>
                """

        descriptor = DataFileDescriptor.make_from_metafile_section(
            ET.fromstring(metaxml_section))
        data_file = CSVDataFile(sample_data_path('dwca-simple-dir'),
                                descriptor)

        self.assertEqual(data_file.lines_to_ignore, 3)
    def test_iterate(self):
        metaxml_section = r"""
        <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
            <files><location>occurrence.txt</location></files>
                <id index="0" />
                <field index="1" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
                <field index="2" term="http://rs.tdwg.org/dwc/terms/locality"/>
                <field index="3" term="http://rs.tdwg.org/dwc/terms/family"/>
                <field index="4" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
            </core>
         """

        descriptor = DataFileDescriptor.make_from_metafile_section(
            ET.fromstring(metaxml_section))
        data_file = CSVDataFile(sample_data_path("dwca-simple-dir"),
                                descriptor)

        for row in data_file:
            self.assertIsInstance(row, str)
    def test_close(self):
        metaxml_section = r"""
        <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
            <files><location>occurrence.txt</location></files>
            <id index="0" />
            <field index="1" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
            <field index="2" term="http://rs.tdwg.org/dwc/terms/locality"/>
            <field index="3" term="http://rs.tdwg.org/dwc/terms/family"/>
            <field index="4" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
        </core>
        """

        descriptor = DataFileDescriptor.make_from_metafile_section(
            ET.fromstring(metaxml_section))
        data_file = CSVDataFile(DIRECTORY_ARCHIVE_PATH, descriptor)

        data_file.close()

        with self.assertRaises(ValueError):
            # It's not possible anymore to access the data because file has been closed.
            data_file.get_row_by_position(1)
    def test_file_descriptor_attribute(self):
        """The instance of DataFileDescriptor which is passed to the constructor is available in .file_descriptor"""

        metaxml_section = r"""
        <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
            <files>
                <location>occurrence.txt</location>
            </files>
            <id index="0" />
            <field index="1" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
            <field index="2" term="http://rs.tdwg.org/dwc/terms/locality"/>
            <field index="3" term="http://rs.tdwg.org/dwc/terms/family"/>
            <field index="4" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
        </core>
        """

        descriptor = DataFileDescriptor.make_from_metafile_section(
            ET.fromstring(metaxml_section))
        data_file = CSVDataFile(DIRECTORY_ARCHIVE_PATH, descriptor)

        self.assertEqual(data_file.file_descriptor, descriptor)
Exemple #6
0
    def __init__(self, path, extensions_to_ignore=None):
        # type: (str, List[str]) -> None
        """Open the Darwin Core Archive."""
        if extensions_to_ignore is None:
            extensions_to_ignore = []

        #: The path to the Darwin Core Archive file, as passed to the constructor.
        self.archive_path = path  # type: str

        if os.path.isdir(self.archive_path
                         ):  # Archive is a (directly readable) directory
            self._working_directory_path = self.archive_path
            self._directory_to_clean = None  # type: Optional[str]
        else:  # Archive is zipped/tgzipped, we have to extract it first.
            self._directory_to_clean, self._working_directory_path = self._extract(
            )

        #: An :class:`descriptors.ArchiveDescriptor` instance giving access to the archive
        #: descriptor/metafile (``meta.xml``)
        self.descriptor = None  # type: Optional[ArchiveDescriptor]
        try:
            self.descriptor = ArchiveDescriptor(
                self.open_included_file(self.default_metafile_name).read(),
                files_to_ignore=extensions_to_ignore)
        except IOError as exc:
            if exc.errno == ENOENT:
                pass

        #: A :class:`xml.etree.ElementTree.Element` instance containing the (scientific) metadata
        #: of the archive, or `None` if the archive has no metadata.
        self.metadata = self._parse_metadata_file()  # type: Optional[Element]

        #: If the archive contains source-level metadata (typically, GBIF downloads), this is a dict such as::
        #:
        #:      {'dataset1_UUID': <dataset1 EML> (xml.etree.ElementTree.Element object),
        #:       'dataset2_UUID': <dataset2 EML> (xml.etree.ElementTree.Element object), ...}
        #:
        #: See :doc:`gbif_results` for more details.
        self.source_metadata = self._get_source_metadata(
        )  # type: Dict[str, Element]

        if self.descriptor:  # We have an Archive descriptor that we can use to access data files.
            #: An instance of :class:`dwca.files.CSVDataFile` for the core data file.
            self.core_file = CSVDataFile(
                self._working_directory_path,
                self.descriptor.core)  # type: CSVDataFile

            #: A list of :class:`dwca.files.CSVDataFile`, one entry for each extension data file , sorted by order of
            #: appearance in the Metafile (or an empty list if the archive doesn't use extensions).
            self.extension_files = [
                CSVDataFile(work_directory=self._working_directory_path,
                            file_descriptor=d)
                for d in self.descriptor.extensions
            ]  # type: List[CSVDataFile]
        else:  # Archive without descriptor, we'll have to find and inspect the data file
            try:
                datafile_name = self._is_valid_simple_archive()
                descriptor = DataFileDescriptor.make_from_file(
                    os.path.join(self._working_directory_path, datafile_name))

                self.core_file = CSVDataFile(
                    work_directory=self._working_directory_path,
                    file_descriptor=descriptor)
                self.extension_files = []
            except InvalidSimpleArchive:
                msg = "No Metafile was found, but the archive contains multiple files/directories."
                raise InvalidSimpleArchive(msg)