Example #1
0
    def test_lines_to_ignore(self):
        """lines_to_ignore mirrors ignoreHeaderLines and is 0 when the attribute is absent."""
        # Raw literal: keeps the metafile's "\t" / "\n" notation exactly as written.
        # In a non-raw literal Python would embed real tab/newline characters, which
        # XML attribute-value normalization then replaces with plain spaces.
        section_template = r"""
        <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy=""
        %s rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
            <files>
                <location>occurrence.txt</location>
            </files>
            <id index="0" />
            <field default="Belgium" term="http://rs.tdwg.org/dwc/terms/country"/>
        </core>
        """

        # (attribute text inserted in the <core> tag, expected lines_to_ignore)
        cases = (
            ('ignoreHeaderLines="0"', 0),  # With explicit "0"
            ('ignoreHeaderLines="1"', 1),  # With explicit 1
            ('', 0),  # Implicit 0 (when nothing stated)
        )

        for attribute, expected in cases:
            core_descriptor = DataFileDescriptor.make_from_metafile_section(
                ET.fromstring(section_template % attribute))

            self.assertEqual(core_descriptor.lines_to_ignore, expected,
                             msg=attribute or 'ignoreHeaderLines not stated')
    def test_lines_to_ignore(self):
        """lines_to_ignore mirrors ignoreHeaderLines and is 0 when the attribute is absent."""
        # The sections below are raw literals so that "\t" / "\n" reach the XML
        # parser as the two-character metafile notation. In a non-raw literal they
        # become real tab/newline characters, which XML attribute-value
        # normalization replaces with spaces.

        # With explicit "0"
        metaxml_section = r"""
        <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy=""
        ignoreHeaderLines="0" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
            <files>
                <location>occurrence.txt</location>
            </files>
            <id index="0" />
            <field default="Belgium" term="http://rs.tdwg.org/dwc/terms/country"/>
        </core>
        """

        core_descriptor = DataFileDescriptor.make_from_metafile_section(ET.fromstring(metaxml_section))

        self.assertEqual(core_descriptor.lines_to_ignore, 0)

        # With explicit 1
        metaxml_section = r"""
        <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy=""
        ignoreHeaderLines="1" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
            <files>
                <location>occurrence.txt</location>
            </files>
            <id index="0" />
            <field default="Belgium" term="http://rs.tdwg.org/dwc/terms/country"/>
        </core>
        """

        core_descriptor = DataFileDescriptor.make_from_metafile_section(ET.fromstring(metaxml_section))

        self.assertEqual(core_descriptor.lines_to_ignore, 1)

        # Implicit 0 (when nothing stated)
        metaxml_section = r"""
        <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy=""
        rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
            <files>
                <location>occurrence.txt</location>
            </files>
            <id index="0" />
            <field default="Belgium" term="http://rs.tdwg.org/dwc/terms/country"/>
        </core>
        """

        core_descriptor = DataFileDescriptor.make_from_metafile_section(ET.fromstring(metaxml_section))

        self.assertEqual(core_descriptor.lines_to_ignore, 0)
Example #3
0
    def test_headers_unordered(self):
        """headers follow the column index, not the declaration order in the metafile."""
        # Raw literal: keeps the metafile's "\t" / "\n" notation as written instead of
        # embedding real tab/newline characters, which XML attribute-value
        # normalization would replace with spaces.
        metaxml_section = r"""
        <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy=""
        ignoreHeaderLines="1" rowType="http://rs.tdwg.org/dwc/terms/Taxon">
            <files>
                <location>taxon.txt</location>
            </files>
            <field index="4" term="http://rs.tdwg.org/dwc/terms/phylum"/>
            <id index="0" />
            <field index="1" term="http://rs.tdwg.org/dwc/terms/order"/>
            <field index="2" term="http://rs.tdwg.org/dwc/terms/class"/>
            <field index="6" term="http://rs.tdwg.org/dwc/terms/family"/>
            <field index="3" term="http://rs.tdwg.org/dwc/terms/kingdom"/>
            <field index="5" term="http://rs.tdwg.org/dwc/terms/genus"/>
        </core>
        """
        core_descriptor = DataFileDescriptor.make_from_metafile_section(
            ET.fromstring(metaxml_section))

        # Listed by ascending index (0..6), with the <id> column first.
        expected_headers_core = [
            'id',
            'http://rs.tdwg.org/dwc/terms/order',
            'http://rs.tdwg.org/dwc/terms/class',
            'http://rs.tdwg.org/dwc/terms/kingdom',
            'http://rs.tdwg.org/dwc/terms/phylum',
            'http://rs.tdwg.org/dwc/terms/genus',
            'http://rs.tdwg.org/dwc/terms/family',
        ]

        self.assertEqual(core_descriptor.headers, expected_headers_core)
Example #4
0
    def test_short_headers(self):
        """short_headers holds 'id' plus the short term name of each indexed field.

        The ``country`` field only has a default value (no column index) and is
        expected to be absent from the list.
        """
        # Raw literal: keeps the metafile's "\t" / "\n" notation as written instead of
        # embedding real tab/newline characters, which XML attribute-value
        # normalization would replace with spaces.
        metaxml_section = r"""
                <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy=""
                ignoreHeaderLines="0" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
                    <files>
                        <location>occurrence.txt</location>
                    </files>
                    <id index="0" />
                    <field default="Belgium" term="http://rs.tdwg.org/dwc/terms/country"/>
                    <field index="1" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
                    <field index="2" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
                    <field index="3" term="http://rs.tdwg.org/dwc/terms/family"/>
                    <field index="4" term="http://rs.tdwg.org/dwc/terms/locality"/>
                </core>
                """

        core_descriptor = DataFileDescriptor.make_from_metafile_section(
            ET.fromstring(metaxml_section))

        expected_short_headers_core = [
            'id', 'scientificName', 'basisOfRecord', 'family', 'locality'
        ]

        self.assertEqual(core_descriptor.short_headers,
                         expected_short_headers_core)
Example #5
0
    def test_fields(self):
        """fields exposes one term/index/default dict per <field> element of the section."""
        # Raw literal: keeps the metafile's "\t" / "\n" notation as written instead of
        # embedding real tab/newline characters, which XML attribute-value
        # normalization would replace with spaces.
        metaxml_section = r"""
        <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy=""
        ignoreHeaderLines="0" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
            <files>
                <location>occurrence.txt</location>
            </files>
            <id index="0" />
            <field default="Belgium" term="http://rs.tdwg.org/dwc/terms/country"/>
            <field index="1" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
            <field index="2" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
            <field index="3" term="http://rs.tdwg.org/dwc/terms/family"/>
            <field index="4" term="http://rs.tdwg.org/dwc/terms/locality"/>
        </core>
        """

        core_descriptor = DataFileDescriptor.make_from_metafile_section(
            ET.fromstring(metaxml_section))

        # .fields is supposed to return a list of dicts like those
        expected_fields = ({
            'term': 'http://rs.tdwg.org/dwc/terms/country',
            'index': None,
            'default': 'Belgium'
        }, {
            'term': 'http://rs.tdwg.org/dwc/terms/scientificName',
            'index': 1,
            'default': None
        })

        for ef in expected_fields:
            # assertIn reports the missing dict and the actual list on failure.
            self.assertIn(ef, core_descriptor.fields)

        # Five <field> elements are declared above (<id> is not one of them).
        self.assertEqual(len(core_descriptor.fields), 5)
    def test_fields(self):
        """fields exposes one term/index/default dict per <field> element of the section."""
        # Raw literal: keeps the metafile's "\t" / "\n" notation as written instead of
        # embedding real tab/newline characters, which XML attribute-value
        # normalization would replace with spaces.
        metaxml_section = r"""
        <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy=""
        ignoreHeaderLines="0" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
            <files>
                <location>occurrence.txt</location>
            </files>
            <id index="0" />
            <field default="Belgium" term="http://rs.tdwg.org/dwc/terms/country"/>
            <field index="1" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
            <field index="2" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
            <field index="3" term="http://rs.tdwg.org/dwc/terms/family"/>
            <field index="4" term="http://rs.tdwg.org/dwc/terms/locality"/>
        </core>
        """

        core_descriptor = DataFileDescriptor.make_from_metafile_section(ET.fromstring(metaxml_section))

        # .fields is supposed to return a list of dicts like those
        expected_fields = (
            {"term": "http://rs.tdwg.org/dwc/terms/country", "index": None, "default": "Belgium"},
            {"term": "http://rs.tdwg.org/dwc/terms/scientificName", "index": 1, "default": None},
        )

        for ef in expected_fields:
            # assertIn reports the missing dict and the actual list on failure.
            self.assertIn(ef, core_descriptor.fields)

        # Five <field> elements are declared above (<id> is not one of them).
        self.assertEqual(len(core_descriptor.fields), 5)
    def test_headers_defaultvalue(self):
        """Ensure headers work properly when confronted to default values (w/o column in file)."""
        # Raw literal: keeps the metafile's "\t" / "\n" notation as written instead of
        # embedding real tab/newline characters, which XML attribute-value
        # normalization would replace with spaces.
        metaxml_section = r"""
        <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy=""
        ignoreHeaderLines="0" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
            <files>
                <location>occurrence.txt</location>
            </files>
            <id index="0" />
            <field default="Belgium" term="http://rs.tdwg.org/dwc/terms/country"/>
            <field index="1" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
            <field index="2" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
            <field index="3" term="http://rs.tdwg.org/dwc/terms/family"/>
            <field index="4" term="http://rs.tdwg.org/dwc/terms/locality"/>
        </core>
        """

        core_descriptor = DataFileDescriptor.make_from_metafile_section(ET.fromstring(metaxml_section))

        # The default-only "country" field has no index, so it must not show up here.
        expected_headers_core = [
            "id",
            "http://rs.tdwg.org/dwc/terms/scientificName",
            "http://rs.tdwg.org/dwc/terms/basisOfRecord",
            "http://rs.tdwg.org/dwc/terms/family",
            "http://rs.tdwg.org/dwc/terms/locality",
        ]

        self.assertEqual(core_descriptor.headers, expected_headers_core)
    def test_headers_unordered(self):
        """headers follow the column index, not the declaration order in the metafile."""
        # Raw literal: keeps the metafile's "\t" / "\n" notation as written instead of
        # embedding real tab/newline characters, which XML attribute-value
        # normalization would replace with spaces.
        metaxml_section = r"""
        <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy=""
        ignoreHeaderLines="1" rowType="http://rs.tdwg.org/dwc/terms/Taxon">
            <files>
                <location>taxon.txt</location>
            </files>
            <field index="4" term="http://rs.tdwg.org/dwc/terms/phylum"/>
            <id index="0" />
            <field index="1" term="http://rs.tdwg.org/dwc/terms/order"/>
            <field index="2" term="http://rs.tdwg.org/dwc/terms/class"/>
            <field index="6" term="http://rs.tdwg.org/dwc/terms/family"/>
            <field index="3" term="http://rs.tdwg.org/dwc/terms/kingdom"/>
            <field index="5" term="http://rs.tdwg.org/dwc/terms/genus"/>
        </core>
        """
        core_descriptor = DataFileDescriptor.make_from_metafile_section(ET.fromstring(metaxml_section))

        # Listed by ascending index (0..6), with the <id> column first.
        expected_headers_core = [
            "id",
            "http://rs.tdwg.org/dwc/terms/order",
            "http://rs.tdwg.org/dwc/terms/class",
            "http://rs.tdwg.org/dwc/terms/kingdom",
            "http://rs.tdwg.org/dwc/terms/phylum",
            "http://rs.tdwg.org/dwc/terms/genus",
            "http://rs.tdwg.org/dwc/terms/family",
        ]

        self.assertEqual(core_descriptor.headers, expected_headers_core)
Example #9
0
    def __init__(self, path, extensions_to_ignore=None):
        """Open the Darwin Core Archive.

        :param path: path to the archive. A directory is read in place; anything
            else goes through ``self._extract()`` first.
        :param extensions_to_ignore: passed to :class:`ArchiveDescriptor` as
            ``files_to_ignore``. ``None`` (the default) is treated as an empty list.

        :raises InvalidSimpleArchive: when no metafile exists and
            ``self._is_valid_simple_archive()`` rejects the archive.
        :raises IOError: when reading the metafile fails for any reason other than
            the file being absent.
        """
        if extensions_to_ignore is None:
            extensions_to_ignore = []

        #: The path to the Darwin Core Archive file, as passed to the constructor.
        self.archive_path = path

        if os.path.isdir(self.archive_path):
            # Archive is a (directly readable) directory
            self._workin_directory_path = self.archive_path
            self._directory_to_clean = None
        else:
            # Archive is zipped/tgzipped, we have to extract it first.
            self._directory_to_clean, self._workin_directory_path = self._extract()

        #: An :class:`descriptors.ArchiveDescriptor` instance giving access to the archive
        #: descriptor/metafile (``meta.xml``)
        try:
            self.descriptor = ArchiveDescriptor(
                self.open_included_file(METAFILE_NAME).read(),
                files_to_ignore=extensions_to_ignore)
        except IOError as exc:
            if exc.errno != ENOENT:
                # Only a missing metafile is an expected situation. Swallowing any
                # other I/O error would leave self.descriptor unset and surface
                # later as a misleading AttributeError.
                raise
            self.descriptor = None

        #: A :class:`xml.etree.ElementTree.Element` instance containing the (scientific) metadata
        #: of the archive, or None if the Archive contains no metadata.
        self.metadata = self._parse_metadata_file()
        #: If the archive contains source metadata (typically, GBIF downloads) this dict will
        #: be something like:
        #: {'dataset1_UUID': <dataset1 EML (xml.etree.ElementTree.Element instance)>,
        #: 'dataset2_UUID': <dataset2 EML (xml.etree.ElementTree.Element instance)>, ...}
        #: see :doc:`gbif_results` for more details.
        self.source_metadata = self._load_source_metadata()

        if self.descriptor:
            #  We have an Archive descriptor that we can use to access data files.
            self._corefile = CSVDataFile(self._workin_directory_path,
                                         self.descriptor.core)
            self._extensionfiles = [
                CSVDataFile(work_directory=self._workin_directory_path,
                            file_descriptor=d)
                for d in self.descriptor.extensions
            ]
        else:  # Archive without descriptor, we'll have to find and inspect the data file
            try:
                datafile_name = self._is_valid_simple_archive()
                descriptor = DataFileDescriptor.make_from_file(
                    os.path.join(self._workin_directory_path, datafile_name))

                self._corefile = CSVDataFile(
                    work_directory=self._workin_directory_path,
                    file_descriptor=descriptor)
                self._extensionfiles = []
            except InvalidSimpleArchive:
                # Re-raised with a message that explains the missing metafile.
                msg = "No metafile was found, but archive includes multiple files/directories."
                raise InvalidSimpleArchive(msg)
Example #10
0
    def test_lines_to_ignore_attribute(self):
        """CSVDataFile.lines_to_ignore equals the section's ignoreHeaderLines value."""

        # First section: one header line to skip.
        one_line_section = r"""
        <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
            <files>
                <location>occurrence.txt</location>
            </files>
            <id index="0" />
            <field index="1" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
            <field index="2" term="http://rs.tdwg.org/dwc/terms/locality"/>
            <field index="3" term="http://rs.tdwg.org/dwc/terms/family"/>
            <field index="4" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
        </core>
        """

        parsed_section = ET.fromstring(one_line_section)
        file_descriptor = DataFileDescriptor.make_from_metafile_section(parsed_section)
        csv_file = CSVDataFile(sample_data_path('dwca-simple-dir'), file_descriptor)

        self.assertEqual(csv_file.lines_to_ignore, 1)

        # Second section: identical except that three lines are to be skipped.
        three_lines_section = r"""
                <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="3" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
                    <files>
                        <location>occurrence.txt</location>
                    </files>
                    <id index="0" />
                    <field index="1" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
                    <field index="2" term="http://rs.tdwg.org/dwc/terms/locality"/>
                    <field index="3" term="http://rs.tdwg.org/dwc/terms/family"/>
                    <field index="4" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
                </core>
                """

        parsed_section = ET.fromstring(three_lines_section)
        file_descriptor = DataFileDescriptor.make_from_metafile_section(parsed_section)
        csv_file = CSVDataFile(sample_data_path('dwca-simple-dir'), file_descriptor)

        self.assertEqual(csv_file.lines_to_ignore, 3)
Example #11
0
    def test_init_from_file(self):
        """Ensure a DataFileDescriptor can be constructed directly from a CSV file.

        This is necessary for archives sans metafile.
        """
        with zipfile.ZipFile(sample_data_path('dwca-simple-csv.zip'),
                             'r') as archive:
            datafile_path = archive.extract('0008333-160118175350007.csv')

        # try/finally: the extracted file is removed even when an assertion fails,
        # so a failing run does not leave it behind.
        try:
            d = DataFileDescriptor.make_from_file(datafile_path)
            # Check basic metadata with the file
            self.assertIsNone(d.raw_element)
            self.assertTrue(d.represents_corefile)
            self.assertFalse(d.represents_extension)
            self.assertIsNone(d.type)
            self.assertEqual(d.file_location, '0008333-160118175350007.csv')
            self.assertEqual(d.file_encoding, 'utf-8')
            self.assertEqual(d.lines_terminated_by, "\n")
            self.assertEqual(d.fields_terminated_by, "\t")
            self.assertEqual(d.fields_enclosed_by, '"')

            # Some checks on fields...

            # A few fields are checked
            expected_fields = ({
                'default': None,
                'index': 0,
                'term': 'gbifid'
            }, {
                'default': None,
                'index': 3,
                'term': 'kingdom'
            })

            for ef in expected_fields:
                # assertIn reports the missing dict and the actual list on failure.
                self.assertIn(ef, d.fields)

            # In total, there are 42 fields in this data file
            self.assertEqual(len(d.fields), 42)

            # No fields should have a default value (there's no metafile to set it!)
            for f in d.fields:
                self.assertIsNone(f['default'])

            # Ensure .terms is also set:
            self.assertEqual(len(d.terms), 42)
        finally:
            # Cleanup extracted file
            os.remove(datafile_path)
    def __init__(self, path, extensions_to_ignore=None):
        """Open the Darwin Core Archive.

        :param path: path to the archive. A directory is read in place; anything
            else goes through ``self._extract()`` first.
        :param extensions_to_ignore: passed to :class:`ArchiveDescriptor` as
            ``files_to_ignore``. ``None`` (the default) is treated as an empty list.

        :raises InvalidSimpleArchive: when no metafile exists and
            ``self._is_valid_simple_archive()`` rejects the archive.
        :raises IOError: when reading the metafile fails for any reason other than
            the file being absent.
        """
        if extensions_to_ignore is None:
            extensions_to_ignore = []

        #: The path to the Darwin Core Archive file, as passed to the constructor.
        self.archive_path = path

        if os.path.isdir(self.archive_path):  # Archive is a (directly readable) directory
            self._workin_directory_path = self.archive_path
            self._directory_to_clean = None
        else:  # Archive is zipped/tgzipped, we have to extract it first.
            self._directory_to_clean, self._workin_directory_path = self._extract()

        #: An :class:`descriptors.ArchiveDescriptor` instance giving access to the archive
        #: descriptor/metafile (``meta.xml``)
        try:
            self.descriptor = ArchiveDescriptor(self.open_included_file(METAFILE_NAME).read(),
                                                files_to_ignore=extensions_to_ignore)
        except IOError as e:
            if e.errno != ENOENT:
                # Only a missing metafile is an expected situation. Swallowing any
                # other I/O error would leave self.descriptor unset and surface
                # later as a misleading AttributeError.
                raise
            self.descriptor = None

        #: A :class:`xml.etree.ElementTree.Element` instance containing the (scientific) metadata
        #: of the archive, or None if the Archive contains no metadata.
        self.metadata = self._parse_metadata_file()
        #: Always None after construction; nothing in this constructor populates it.
        self.source_metadata = None

        if self.descriptor:
            #  We have an Archive descriptor that we can use to access data files.
            self._corefile = CSVDataFile(self._workin_directory_path, self.descriptor.core)
            self._extensionfiles = [CSVDataFile(work_directory=self._workin_directory_path,
                                                file_descriptor=d)
                                    for d in self.descriptor.extensions]
        else:  # Archive without descriptor, we'll have to find and inspect the data file
            try:
                datafile_name = self._is_valid_simple_archive()
                descriptor = DataFileDescriptor.make_from_file(
                    os.path.join(self._workin_directory_path, datafile_name))

                self._corefile = CSVDataFile(work_directory=self._workin_directory_path,
                                             file_descriptor=descriptor)
                self._extensionfiles = []
            except InvalidSimpleArchive:
                # Re-raised with a message that explains the missing metafile.
                msg = "No metafile was found, but archive includes multiple files/directories."
                raise InvalidSimpleArchive(msg)
    def test_exposes_coreid_index_of_extensions(self):
        """An extension descriptor exposes coreid_index and has no id_index."""
        # Raw literal: keeps the metafile's "\t" / "\n" notation as written instead of
        # embedding real tab/newline characters, which XML attribute-value
        # normalization would replace with spaces.
        ext_section = r"""
        <extension encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.gbif.org/terms/1.0/Description">
            <files><location>description.txt</location></files>
            <coreid index="0" />
            <field index="1" term="http://purl.org/dc/terms/type"/>
            <field index="2" term="http://purl.org/dc/terms/language"/>
            <field index="3" term="http://purl.org/dc/terms/description"/>
        </extension>
        """

        ext_descriptor = DataFileDescriptor.make_from_metafile_section(ET.fromstring(ext_section))

        self.assertEqual(ext_descriptor.coreid_index, 0)

        # ... but it doesn't have .id_index (only for core!)
        self.assertIsNone(ext_descriptor.id_index)
Example #14
0
    def test_exposes_coreid_index_of_extensions(self):
        """An extension descriptor exposes coreid_index and has no id_index."""
        # Raw literal: keeps the metafile's "\t" / "\n" notation as written instead of
        # embedding real tab/newline characters, which XML attribute-value
        # normalization would replace with spaces.
        ext_section = r"""
        <extension encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.gbif.org/terms/1.0/Description">
            <files><location>description.txt</location></files>
            <coreid index="0" />
            <field index="1" term="http://purl.org/dc/terms/type"/>
            <field index="2" term="http://purl.org/dc/terms/language"/>
            <field index="3" term="http://purl.org/dc/terms/description"/>
        </extension>
        """

        ext_descriptor = DataFileDescriptor.make_from_metafile_section(
            ET.fromstring(ext_section))

        self.assertEqual(ext_descriptor.coreid_index, 0)

        # ... but it doesn't have .id_index (only for core!)
        self.assertIsNone(ext_descriptor.id_index)
    def test_content_raw_element_tag(self):
        """Test the content of raw_element seems decent."""
        # Raw literal: keeps the metafile's "\t" / "\n" notation as written instead of
        # embedding real tab/newline characters, which XML attribute-value
        # normalization would replace with spaces.
        ext_section = r"""
        <extension encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n"
        fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.gbif.org/terms/1.0/Description">
            <files><location>description.txt</location></files>
            <coreid index="0" />
            <field index="1" term="http://purl.org/dc/terms/type"/>
            <field index="2" term="http://purl.org/dc/terms/language"/>
            <field index="3" term="http://purl.org/dc/terms/description"/>
        </extension>
        """

        ext_descriptor = DataFileDescriptor.make_from_metafile_section(ET.fromstring(ext_section))

        # raw_element is queried like an ElementTree element: tag, attribute, children.
        self.assertEqual(ext_descriptor.raw_element.tag, "extension")
        self.assertEqual(ext_descriptor.raw_element.get("encoding"), "utf-8")
        self.assertEqual(len(ext_descriptor.raw_element.findall("field")), 3)
    def test_init_from_file(self):
        """Ensure a DataFileDescriptor can be constructed directly from a CSV file.

        This is necessary for archives sans metafile.
        """
        with zipfile.ZipFile(SIMPLE_CSV, "r") as archive:
            datafile_path = archive.extract("0008333-160118175350007.csv")

        # try/finally: the extracted file is removed even when an assertion fails,
        # so a failing run does not leave it behind.
        try:
            d = DataFileDescriptor.make_from_file(datafile_path)
            # Check basic metadata with the file
            self.assertIsNone(d.raw_element)
            self.assertTrue(d.represents_corefile)
            self.assertFalse(d.represents_extension)
            self.assertIsNone(d.type)
            self.assertEqual(d.file_location, "0008333-160118175350007.csv")
            self.assertEqual(d.file_encoding, "utf-8")
            self.assertEqual(d.lines_terminated_by, "\n")
            self.assertEqual(d.fields_terminated_by, "\t")
            self.assertEqual(d.fields_enclosed_by, '"')

            # Some checks on fields...

            # A few fields are checked
            expected_fields = (
                {"default": None, "index": 0, "term": "gbifid"},
                {"default": None, "index": 3, "term": "kingdom"},
            )

            for ef in expected_fields:
                # assertIn reports the missing dict and the actual list on failure.
                self.assertIn(ef, d.fields)

            # In total, there are 42 fields in this data file
            self.assertEqual(len(d.fields), 42)

            # No fields should have a default value (there's no metafile to set it!)
            for f in d.fields:
                self.assertIsNone(f["default"])

            # Ensure .terms is also set:
            self.assertEqual(len(d.terms), 42)
        finally:
            # Cleanup extracted file
            os.remove(datafile_path)
    def test_iterate(self):
        """Iterating over a CSVDataFile yields str objects."""
        core_xml = r"""
        <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
            <files><location>occurrence.txt</location></files>
                <id index="0" />
                <field index="1" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
                <field index="2" term="http://rs.tdwg.org/dwc/terms/locality"/>
                <field index="3" term="http://rs.tdwg.org/dwc/terms/family"/>
                <field index="4" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
            </core>
         """

        parsed_section = ET.fromstring(core_xml)
        file_descriptor = DataFileDescriptor.make_from_metafile_section(parsed_section)
        csv_file = CSVDataFile(sample_data_path("dwca-simple-dir"), file_descriptor)

        # Every item produced by the iteration must be a str.
        for line in csv_file:
            self.assertIsInstance(line, str)
Example #18
0
    def test_content_raw_element_tag(self):
        """Test the content of raw_element seems decent."""
        # Raw literal: keeps the metafile's "\t" / "\n" notation as written instead of
        # embedding real tab/newline characters, which XML attribute-value
        # normalization would replace with spaces.
        ext_section = r"""
        <extension encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n"
        fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.gbif.org/terms/1.0/Description">
            <files><location>description.txt</location></files>
            <coreid index="0" />
            <field index="1" term="http://purl.org/dc/terms/type"/>
            <field index="2" term="http://purl.org/dc/terms/language"/>
            <field index="3" term="http://purl.org/dc/terms/description"/>
        </extension>
        """

        ext_descriptor = DataFileDescriptor.make_from_metafile_section(
            ET.fromstring(ext_section))

        # raw_element is queried like an ElementTree element: tag, attribute, children.
        self.assertEqual(ext_descriptor.raw_element.tag, 'extension')
        self.assertEqual(ext_descriptor.raw_element.get('encoding'), 'utf-8')
        self.assertEqual(len(ext_descriptor.raw_element.findall('field')), 3)
    def test_file_details(self):
        """file_location and file_encoding reflect <location> and the encoding attribute."""
        # Raw literal: keeps the metafile's "\t" / "\n" notation as written instead of
        # embedding real tab/newline characters, which XML attribute-value
        # normalization would replace with spaces.
        metaxml_section = r"""
        <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy=""
        ignoreHeaderLines="0" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
            <files>
                <location>occurrence.txt</location>
            </files>
            <id index="0" />
            <field default="Belgium" term="http://rs.tdwg.org/dwc/terms/country"/>
            <field index="1" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
            <field index="2" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
            <field index="3" term="http://rs.tdwg.org/dwc/terms/family"/>
            <field index="4" term="http://rs.tdwg.org/dwc/terms/locality"/>
        </core>
        """

        core_descriptor = DataFileDescriptor.make_from_metafile_section(ET.fromstring(metaxml_section))

        self.assertEqual(core_descriptor.file_location, "occurrence.txt")
        self.assertEqual(core_descriptor.file_encoding, "utf-8")
Example #20
0
    def test_close(self):
        """Once close() has been called, fetching a row raises ValueError."""
        core_xml = r"""
        <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
            <files><location>occurrence.txt</location></files>
            <id index="0" />
            <field index="1" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
            <field index="2" term="http://rs.tdwg.org/dwc/terms/locality"/>
            <field index="3" term="http://rs.tdwg.org/dwc/terms/family"/>
            <field index="4" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
        </core>
        """

        parsed_section = ET.fromstring(core_xml)
        file_descriptor = DataFileDescriptor.make_from_metafile_section(parsed_section)
        csv_file = CSVDataFile(DIRECTORY_ARCHIVE_PATH, file_descriptor)

        csv_file.close()

        # The data is out of reach now that the file has been closed.
        with self.assertRaises(ValueError):
            csv_file.get_row_by_position(1)
Example #21
0
    def test_file_descriptor_attribute(self):
        """CSVDataFile.file_descriptor is the DataFileDescriptor given to the constructor."""

        core_xml = r"""
        <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
            <files>
                <location>occurrence.txt</location>
            </files>
            <id index="0" />
            <field index="1" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
            <field index="2" term="http://rs.tdwg.org/dwc/terms/locality"/>
            <field index="3" term="http://rs.tdwg.org/dwc/terms/family"/>
            <field index="4" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
        </core>
        """

        parsed_section = ET.fromstring(core_xml)
        given_descriptor = DataFileDescriptor.make_from_metafile_section(parsed_section)
        csv_file = CSVDataFile(DIRECTORY_ARCHIVE_PATH, given_descriptor)

        # The attribute must compare equal to what was handed in.
        self.assertEqual(csv_file.file_descriptor, given_descriptor)
    def test_exposes_id_index_of_core(self):
        """A core descriptor exposes id_index and has no coreid_index."""
        # Raw literal: keeps the metafile's "\t" / "\n" notation as written instead of
        # embedding real tab/newline characters, which XML attribute-value
        # normalization would replace with spaces.
        metaxml_section = r"""
        <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy=""
        ignoreHeaderLines="0" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
            <files>
                <location>occurrence.txt</location>
            </files>
            <id index="0" />
            <field default="Belgium" term="http://rs.tdwg.org/dwc/terms/country"/>
            <field index="1" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
            <field index="2" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
            <field index="3" term="http://rs.tdwg.org/dwc/terms/family"/>
            <field index="4" term="http://rs.tdwg.org/dwc/terms/locality"/>
        </core>
        """

        core_descriptor = DataFileDescriptor.make_from_metafile_section(ET.fromstring(metaxml_section))

        self.assertEqual(core_descriptor.id_index, 0)

        # ... but it doesn't have .coreid_index (only for extensions!)
        self.assertIsNone(core_descriptor.coreid_index)
    def test_tell_if_represents_core(self):
        """Descriptors tell apart core and extension via .represents_corefile / .represents_extension."""
        # 1. Test with core
        with DwCAReader(BASIC_ARCHIVE_PATH) as dwca:
            core_descriptor = dwca.descriptor.core
            self.assertTrue(core_descriptor.represents_corefile)
            self.assertFalse(core_descriptor.represents_extension)

        # Raw string literal: in a regular literal Python would turn \t and \n into
        # real tab/newline characters, which an XML parser normalizes to spaces
        # inside attribute values. Keeping the backslash sequences verbatim makes
        # the section look like what a meta.xml file actually contains.
        ext_section = r"""
        <extension encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n"
        fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.gbif.org/terms/1.0/Description">
            <files><location>description.txt</location></files>
            <coreid index="0" />
            <field index="1" term="http://purl.org/dc/terms/type"/>
            <field index="2" term="http://purl.org/dc/terms/language"/>
            <field index="3" term="http://purl.org/dc/terms/description"/>
        </extension>
        """

        # 2. And with extension
        ext_descriptor = DataFileDescriptor.make_from_metafile_section(ET.fromstring(ext_section))
        self.assertFalse(ext_descriptor.represents_corefile)
        self.assertTrue(ext_descriptor.represents_extension)
Example #24
0
    def test_tell_if_represents_core(self):
        """Descriptors tell apart core and extension via .represents_corefile / .represents_extension."""
        # 1. Test with core
        with DwCAReader(BASIC_ARCHIVE_PATH) as dwca:
            core_descriptor = dwca.descriptor.core
            self.assertTrue(core_descriptor.represents_corefile)
            self.assertFalse(core_descriptor.represents_extension)

        # Raw string literal: in a regular literal Python would turn \t and \n into
        # real tab/newline characters, which an XML parser normalizes to spaces
        # inside attribute values. Keeping the backslash sequences verbatim makes
        # the section look like what a meta.xml file actually contains.
        ext_section = r"""
        <extension encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n"
        fieldsEnclosedBy="" ignoreHeaderLines="1" rowType="http://rs.gbif.org/terms/1.0/Description">
            <files><location>description.txt</location></files>
            <coreid index="0" />
            <field index="1" term="http://purl.org/dc/terms/type"/>
            <field index="2" term="http://purl.org/dc/terms/language"/>
            <field index="3" term="http://purl.org/dc/terms/description"/>
        </extension>
        """

        # 2. And with extension
        ext_descriptor = DataFileDescriptor.make_from_metafile_section(
            ET.fromstring(ext_section))
        self.assertFalse(ext_descriptor.represents_corefile)
        self.assertTrue(ext_descriptor.represents_extension)
Example #25
0
    def test_exposes_id_index_of_core(self):
        """A core descriptor exposes .id_index, and its .coreid_index is None."""
        # Raw string literal: in a regular literal Python would turn \t and \n into
        # real tab/newline characters, which an XML parser normalizes to spaces
        # inside attribute values. Keeping the backslash sequences verbatim makes
        # the section look like what a meta.xml file actually contains.
        metaxml_section = r"""
        <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n" fieldsEnclosedBy=""
        ignoreHeaderLines="0" rowType="http://rs.tdwg.org/dwc/terms/Occurrence">
            <files>
                <location>occurrence.txt</location>
            </files>
            <id index="0" />
            <field default="Belgium" term="http://rs.tdwg.org/dwc/terms/country"/>
            <field index="1" term="http://rs.tdwg.org/dwc/terms/scientificName"/>
            <field index="2" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>
            <field index="3" term="http://rs.tdwg.org/dwc/terms/family"/>
            <field index="4" term="http://rs.tdwg.org/dwc/terms/locality"/>
        </core>
        """

        core_descriptor = DataFileDescriptor.make_from_metafile_section(
            ET.fromstring(metaxml_section))

        # The <id index="0" /> element of the section is reflected by .id_index
        self.assertEqual(core_descriptor.id_index, 0)

        # ... but it doesn't have .coreid_index (only for extensions!)
        self.assertIsNone(core_descriptor.coreid_index)
Example #26
0
    def __init__(self, path, extensions_to_ignore=None):
        # type: (str, List[str]) -> None
        """Open the Darwin Core Archive.

        :param path: path to the archive. A directory is read in place; anything else is
            extracted first via ``self._extract()``.
        :param extensions_to_ignore: forwarded to the archive descriptor as ``files_to_ignore``
            (defaults to an empty list).

        :raises InvalidSimpleArchive: if no Metafile was found and the archive doesn't qualify as a
            simple archive either.
        :raises IOError: if reading the Metafile fails for any reason other than the file being
            absent.
        """
        if extensions_to_ignore is None:
            extensions_to_ignore = []

        #: The path to the Darwin Core Archive file, as passed to the constructor.
        self.archive_path = path  # type: str

        if os.path.isdir(self.archive_path):  # Archive is a (directly readable) directory
            self._working_directory_path = self.archive_path
            self._directory_to_clean = None  # type: Optional[str]
        else:  # Archive is zipped/tgzipped, we have to extract it first.
            self._directory_to_clean, self._working_directory_path = self._extract()

        #: An :class:`descriptors.ArchiveDescriptor` instance giving access to the archive
        #: descriptor/metafile (``meta.xml``)
        self.descriptor = None  # type: Optional[ArchiveDescriptor]
        try:
            self.descriptor = ArchiveDescriptor(
                self.open_included_file(self.default_metafile_name).read(),
                files_to_ignore=extensions_to_ignore)
        except IOError as exc:
            # A missing Metafile is tolerated (self.descriptor stays None and the
            # archive is inspected as a simple archive below). Any other I/O
            # failure is a real problem and must not be silently swallowed.
            if exc.errno != ENOENT:
                raise

        #: A :class:`xml.etree.ElementTree.Element` instance containing the (scientific) metadata
        #: of the archive, or `None` if the archive has no metadata.
        self.metadata = self._parse_metadata_file()  # type: Optional[Element]

        #: If the archive contains source-level metadata (typically, GBIF downloads), this is a dict such as::
        #:
        #:      {'dataset1_UUID': <dataset1 EML> (xml.etree.ElementTree.Element object),
        #:       'dataset2_UUID': <dataset2 EML> (xml.etree.ElementTree.Element object), ...}
        #:
        #: See :doc:`gbif_results` for more details.
        self.source_metadata = self._get_source_metadata()  # type: Dict[str, Element]

        if self.descriptor:  # We have an Archive descriptor that we can use to access data files.
            #: An instance of :class:`dwca.files.CSVDataFile` for the core data file.
            self.core_file = CSVDataFile(
                self._working_directory_path,
                self.descriptor.core)  # type: CSVDataFile

            #: A list of :class:`dwca.files.CSVDataFile`, one entry for each extension data file , sorted by order of
            #: appearance in the Metafile (or an empty list if the archive doesn't use extensions).
            self.extension_files = [
                CSVDataFile(work_directory=self._working_directory_path,
                            file_descriptor=d)
                for d in self.descriptor.extensions
            ]  # type: List[CSVDataFile]
        else:  # Archive without descriptor, we'll have to find and inspect the data file
            try:
                datafile_name = self._is_valid_simple_archive()
                descriptor = DataFileDescriptor.make_from_file(
                    os.path.join(self._working_directory_path, datafile_name))

                self.core_file = CSVDataFile(
                    work_directory=self._working_directory_path,
                    file_descriptor=descriptor)
                self.extension_files = []
            except InvalidSimpleArchive:
                # Re-raise the same exception type with a message describing the situation.
                msg = "No Metafile was found, but the archive contains multiple files/directories."
                raise InvalidSimpleArchive(msg)