def test_manual_cleanup_zipped(self):
        """Check that an explicit close() leaves no extra entries in the working directory."""
        entries_at_start = os.listdir(".")

        reader = DwCAReader(BASIC_ARCHIVE_PATH)
        reader.close()

        entries_at_end = os.listdir(".")

        # Only the number of directory entries is compared, not their names.
        self.assertEqual(len(entries_at_start), len(entries_at_end))
    def test_exception_invalid_archives_missing_metadata(self):
        """InvalidArchive must be raised when the referenced metadata file is absent."""
        # The metafile of an archive may point to a metadata file that is not actually
        # shipped in the archive. See for example http://dev.gbif.org/issues/browse/PF-2125
        missing_metadata_msg = "eml.xml is referenced in the archive descriptor but missing."

        with self.assertRaises(InvalidArchive) as raised:
            reader = DwCAReader(INVALID_LACKS_METADATA)
            reader.close()

        self.assertEqual(str(raised.exception), missing_metadata_msg)
Ejemplo n.º 3
0
def dwca_metadata(dwca_file):
    """Open a Darwin Core archive and return its metadata.

    Parameters
    ----------
    dwca_file
        Archive location, passed unchanged to ``DwCAReader``.

    Returns
    -------
    The ``metadata`` attribute of the opened archive, or ``None`` when the
    reader object evaluates as false.
    """
    # The context manager closes the archive on every exit path, including
    # the early return and any exception raised while reading the metadata.
    with DwCAReader(dwca_file) as dwca:
        if not dwca:
            return None

        return dwca.metadata
Ejemplo n.º 4
0
    def test_pd_read_default_values(self):
        """Every value of the 'country' column read via pd_read() is 'Belgium'."""
        archive = sample_data_path('dwca-test-default.zip')
        with DwCAReader(archive) as dwca:
            frame = dwca.pd_read('occurrence.txt')

            column_names = frame.columns.values.tolist()
            self.assertIn('country', column_names)

            country_values = frame['country'].values.tolist()
            for value in country_values:
                self.assertEqual(value, 'Belgium')
Ejemplo n.º 5
0
def whip_dwca(dwca_zip, specifications, maxentries=None):
    """Validate the core file of a zipped Darwin Core Archive with whip.

    The column names are read from the core file descriptor exposed by
    :class:`~dwca.read.DwCAReader`; only the core file is handled here. See
    the `Darwin Core Archive`_ description for background.

    .. _Darwin Core Archive: https://en.wikipedia.org/wiki/Darwin_Core_Archive

    Parameters
    ----------
    dwca_zip : str
        Filename of the zipped Darwin Core Archive.
    specifications : dict
        Whip specifications, handed to the ``Whip`` constructor.
    maxentries : int
        Limit on the number of records to validate from the archive, useful
        for a quick check on the first subset of data.

    Returns
    -------
    whip_it : Whip
        Whip validator class instance, containing the errors and reporting
        capabilities.
    """
    def _short_name(descriptor_field):
        # Keep only what follows the last '/' of the field's term.
        return descriptor_field['term'].split('/')[-1]

    # Read the header of the core file; the archive is closed right after.
    with DwCAReader(dwca_zip) as dwca:
        descriptor_fields = dwca.core_file.file_descriptor.fields
        field_names = [_short_name(entry) for entry in descriptor_fields]

    # Run the validation
    whip_it = Whip(specifications)
    records = whip_it.generate_dwca(dwca_zip)
    whip_it._whip(records, field_names, maxentries)
    return whip_it
Ejemplo n.º 6
0
    def test_no_temporary_dir_directory(self):
        """Opening a directory archive must not add entries to the working directory."""
        entries_before = os.listdir(".")
        archive_dir = sample_data_path("dwca-simple-dir")
        with DwCAReader(archive_dir):
            entries_during = os.listdir(".")

        # Only the entry counts are compared.
        self.assertEqual(len(entries_before), len(entries_during))
Ejemplo n.º 7
0
    def test_get_corerow_by_id_other(self):
        """get_corerow_by_id() accepts an id given as an integer."""
        archive = sample_data_path("dwca-ids.zip")
        with DwCAReader(archive) as dwca:
            # The id is an int here; a conversion will be attempted.
            row = dwca.get_corerow_by_id(3)
            genus = row.data["http://rs.tdwg.org/dwc/terms/genus"]
            self.assertEqual("Peliperdix", genus)
Ejemplo n.º 8
0
 def test_csv_quote_dir_archive(self):
     """A field separator inside a quoted field must not split that field."""
     archive = sample_data_path("dwca-csv-quote-dir")
     with DwCAReader(archive) as dwca:
         all_rows = list(dwca)
         self.assertEqual(len(all_rows), 2)

         first_basis = all_rows[0].data[qn("basisOfRecord")]
         self.assertEqual(first_basis, "Observation, something")
    def load_rows(self):
        """Read the archive at ``self.gbif_path`` and keep its rows in ``self.gbif``.

        Along the way, the core type and whether the locality term is present
        in the core file are printed.
        """
        locality_term = 'http://rs.tdwg.org/dwc/terms/locality'

        with DwCAReader(self.gbif_path) as dwca:
            # From here on the 'dwca' object can be queried
            core_type_text = dwca.descriptor.core.type.__str__()
            print("Read core type: " + core_type_text + "! :)")

            # Report whether a given Darwin Core term is part of the core file
            if locality_term not in dwca.descriptor.core.terms:
                print("Locality term is not present.  :(")
            else:
                print("Locality term is present! :)")

            # Spelling out full qualnames (e.g. 'http://rs.tdwg.org/dwc/terms/country')
            # is verbose; the qualname() helper (imported here as 'qn') shortens
            # common terms:
            qn('locality')
            # => u'http://rs.tdwg.org/dwc/terms/locality'

            # The helper makes membership and type checks easier to read.
            # The two checks below are evaluated but have no effect:
            if qn('locality') in dwca.descriptor.core.terms:
                pass

            if dwca.descriptor.core.type == qn('Occurrence'):
                pass

            # Keep the row data in memory
            self.gbif = dwca.rows
Ejemplo n.º 10
0
    def test_core_file(self):
        """The core_file attribute is a CSVDataFile with one line to ignore."""
        archive = sample_data_path("dwca-simple-test-archive.zip")
        with DwCAReader(archive) as dwca:
            core = dwca.core_file
            self.assertIsInstance(core, CSVDataFile)

            # Light sanity check on the content
            self.assertEqual(core.lines_to_ignore, 1)
Ejemplo n.º 11
0
    def test_pd_read_default_values(self):
        """Every value of the "country" column read via pd_read() is "Belgium"."""
        archive = sample_data_path("dwca-test-default.zip")
        with DwCAReader(archive) as dwca:
            frame = dwca.pd_read("occurrence.txt")

            column_names = frame.columns.values.tolist()
            self.assertIn("country", column_names)

            country_values = frame["country"].values.tolist()
            for value in country_values:
                self.assertEqual(value, "Belgium")
Ejemplo n.º 12
0
    def test_exposes_core_terms(self):
        """The descriptor exposes exactly the six terms declared for the core file."""
        # The Core file contains the following rows
        # <field index="1" term="http://rs.tdwg.org/dwc/terms/family"/>
        # <field index="2" term="http://rs.tdwg.org/dwc/terms/phylum"/>
        # <field index="3" term="http://rs.tdwg.org/dwc/terms/order"/>
        # <field index="4" term="http://rs.tdwg.org/dwc/terms/genus"/>
        # <field index="5" term="http://rs.tdwg.org/dwc/terms/kingdom"/>
        # <field index="6" term="http://rs.tdwg.org/dwc/terms/class"/>
        #
        # The id column and the 3 fields of the extension should not appear.
        expected_terms = {
            'http://rs.tdwg.org/dwc/terms/family',
            'http://rs.tdwg.org/dwc/terms/phylum',
            'http://rs.tdwg.org/dwc/terms/order',
            'http://rs.tdwg.org/dwc/terms/genus',
            'http://rs.tdwg.org/dwc/terms/kingdom',
            'http://rs.tdwg.org/dwc/terms/class',
        }

        archive = sample_data_path('dwca-star-test-archive.zip')
        with DwCAReader(archive) as star_dwca:
            core_terms = star_dwca.descriptor.core.terms

            # Size first, then content (compared as an unordered set)
            self.assertEqual(6, len(core_terms))
            self.assertEqual(expected_terms, core_terms)
Ejemplo n.º 13
0
    def test_get_corerow_by_id_string(self):
        """get_corerow_by_id() accepts a numeric id given as a string."""
        archive = sample_data_path('dwca-ids.zip')
        with DwCAReader(archive) as dwca:
            # The number is passed as a string here.
            row = dwca.get_corerow_by_id('3')
            genus = row.data['http://rs.tdwg.org/dwc/terms/genus']
            self.assertEqual('Peliperdix', genus)
Ejemplo n.º 14
0
    def test_get_corerow_by_id_string(self):
        """get_corerow_by_id() accepts a numeric id given as a string."""
        archive = sample_data_path("dwca-ids.zip")
        with DwCAReader(archive) as dwca:
            # The number is passed as a string here.
            row = dwca.get_corerow_by_id("3")
            genus = row.data["http://rs.tdwg.org/dwc/terms/genus"]
            self.assertEqual("Peliperdix", genus)
Ejemplo n.º 15
0
    def test_deprecated_row_by_position(self):
        """Check that the deprecated get_row_by_index() still works and warns.

        get_row_by_index() has been renamed get_corerow_by_position(); the old
        name must keep returning rows while issuing a DeprecationWarning.
        """
        with warnings.catch_warnings(record=True) as caught:
            # "always" turns off the once-per-location suppression of warnings.
            warnings.simplefilter("always", DeprecationWarning)

            # Copy-pasted code from the long term test_get_corerow_by_position()
            with DwCAReader(sample_data_path('dwca-ids.zip')) as dwca:
                # Row IDs are ordered like this in core: id 4-1-3-2
                first_row = dwca.get_row_by_index(0)
                self.assertEqual(4, int(first_row.id))

                self.assertEqual(1, len(caught))  # Warning was issued
                the_warning = caught[0]
                # A unittest assertion is used instead of a bare ``assert`` so
                # the check still runs when Python is started with -O.
                self.assertTrue(
                    issubclass(the_warning.category, DeprecationWarning))
                self.assertEqual(
                    "This method has been renamed to get_corerow_by_position().",
                    str(the_warning.message))

                last_row = dwca.get_row_by_index(3)
                self.assertEqual(2, int(last_row.id))

                # Exception raised if bigger than archive (last index: 3)
                with self.assertRaises(RowNotFound):
                    dwca.get_row_by_index(4)

                with self.assertRaises(RowNotFound):
                    dwca.get_row_by_index(1000)
Ejemplo n.º 16
0
    def test_pd_read_no_data_files(self):
        """pd_read() raises NotADataFile for both file names tried here."""
        archive = sample_data_path('dwca-simple-test-archive.zip')
        with DwCAReader(archive) as dwca:
            for file_name in ('imaginary_file.txt', 'eml.xml'):
                with self.assertRaises(NotADataFile):
                    dwca.pd_read(file_name)
Ejemplo n.º 17
0
    def test_dont_enclose_unenclosed(self):
        """With fields_enclosed_by set to an empty string, quotes in values are kept as data."""
        archive = sample_data_path('dwca-simple-dir')
        with DwCAReader(archive) as dwca:
            all_rows = list(dwca)

            double_quoted = all_rows[2].data[qn('scientificName')]
            self.assertEqual('"betta" splendens', double_quoted)

            single_quoted = all_rows[3].data[qn('scientificName')]
            self.assertEqual("'betta' splendens", single_quoted)
Ejemplo n.º 18
0
    def test_explicit_encoding_metadata(self):
        """An encoding explicitly declared in the metadata file (<xml ...>) must be honoured."""
        archive = sample_data_path('dwca-metadata-windows1252-encoding')
        with DwCAReader(archive) as dwca:
            # Walk down dataset > creator > individualName > surName
            node = dwca.metadata
            for tag in ('dataset', 'creator', 'individualName', 'surName'):
                node = node.find(tag)

            self.assertEqual(node.text, u'Noé')  # Is the accent properly interpreted?
Ejemplo n.º 19
0
    def test_read_core_value(self):
        """Read plain values from the core file."""
        archive = sample_data_path('dwca-simple-test-archive.zip')
        with DwCAReader(archive) as dwca:
            all_rows = list(dwca)

            # Locality of the first two rows of the sample file
            for index, expected in enumerate(('Borneo', 'Mumbai')):
                self.assertEqual(expected, all_rows[index].data[qn('locality')])
Ejemplo n.º 20
0
    def test_read_core_value(self):
        """Read plain values from the core file."""
        with DwCAReader(BASIC_ARCHIVE_PATH) as dwca:
            all_rows = list(dwca)

            # Locality of the first two rows of the sample file
            first_locality = all_rows[0].data[qn('locality')]
            self.assertEqual('Borneo', first_locality)

            second_locality = all_rows[1].data[qn('locality')]
            self.assertEqual('Mumbai', second_locality)
Ejemplo n.º 21
0
 def test_orphaned_extension_rows(self):
     """orphaned_extension_rows() returns the expected mapping for the sample archive."""
     expected_orphans = {
         "vernacularname.txt": {u"7": [4]},
         "description.txt": {u"6": [5], u"5": [3, 4]},
     }

     # This archive has extensions and orphaned rows
     archive = sample_data_path("dwca-orphaned-rows.zip")
     with DwCAReader(archive) as dwca:
         self.assertEqual(expected_orphans, dwca.orphaned_extension_rows())
Ejemplo n.º 22
0
    def test_use_extensions(self):
        """Check the .use_extensions attribute of DwCAReader on several archives."""
        # A basic archive without extensions, then a lone CSV file (so no extensions)
        for archive_name in ('dwca-simple-test-archive.zip', 'dwca-simple-csv.zip'):
            with DwCAReader(sample_data_path(archive_name)) as dwca:
                self.assertFalse(dwca.use_extensions)

        for archive_name in ('dwca-star-test-archive.zip', 'dwca-2extensions.zip'):
            with DwCAReader(sample_data_path(archive_name)) as dwca:
                self.assertTrue(dwca.use_extensions)

        # With its extension ignored, the archive appears to have none
        star_archive = sample_data_path('dwca-star-test-archive.zip')
        with DwCAReader(star_archive, extensions_to_ignore="vernacularname.txt") as dwca:
            self.assertFalse(dwca.use_extensions)
Ejemplo n.º 23
0
 def test_partial_default(self):
     """Check the country value of the first two rows of the partial-default archive."""
     archive = sample_data_path("dwca-partial-default.zip")
     with DwCAReader(archive) as dwca:
         # Value comes from data file
         from_data_file = dwca.rows[0].data[qn("country")]
         self.assertEqual(from_data_file, "France")

         # Value is field default
         from_default = dwca.rows[1].data[qn("country")]
         self.assertEqual(from_default, "Belgium")
Ejemplo n.º 24
0
    def test_row_class(self):
        """Iterated rows are CoreRow instances and their extensions are ExtensionRow instances."""
        archive = sample_data_path('dwca-star-test-archive.zip')
        with DwCAReader(archive) as star_dwca:
            for core_row in star_dwca:
                self.assertIsInstance(core_row, CoreRow)

                # Whereas what hangs off a core row is an extension row
                for extension_row in core_row.extensions:
                    self.assertIsInstance(extension_row, ExtensionRow)
Ejemplo n.º 25
0
    def test_open_included_file(self):
        """Ensure DwCAReader.open_included_file works as expected."""
        # Let's use it to read the raw core data file:
        with DwCAReader(DIRECTORY_ARCHIVE_PATH) as dwca:
            # The returned file object is closed by the inner ``with`` so the
            # test does not leave an open handle behind.
            with dwca.open_included_file('occurrence.txt') as f:
                raw_occ = f.read()

            self.assertTrue(raw_occ.endswith('betta splendens\n'))
Ejemplo n.º 26
0
 def test_orphaned_extension_rows(self):
     """orphaned_extension_rows() returns the expected mapping for the sample archive."""
     expected_orphans = {
         'vernacularname.txt': {u'7': [4]},
         'description.txt': {u'6': [5], u'5': [3, 4]},
     }

     # This archive has extensions and orphaned rows
     archive = sample_data_path('dwca-orphaned-rows.zip')
     with DwCAReader(archive) as dwca:
         self.assertEqual(expected_orphans, dwca.orphaned_extension_rows())
Ejemplo n.º 27
0
    def test_row_class(self):
        """Iterated rows are CoreRow instances and their extensions are ExtensionRow instances."""
        with DwCAReader(EXTENSION_ARCHIVE_PATH) as star_dwca:
            for core_row in star_dwca:
                self.assertIsInstance(core_row, CoreRow)

                # Whereas what hangs off a core row is an extension row
                for extension_row in core_row.extensions:
                    self.assertIsInstance(extension_row, ExtensionRow)
Ejemplo n.º 28
0
 def test_pd_read_quotedir(self):
     """pd_read() must not split a quoted field that contains the field separator."""
     archive = sample_data_path("dwca-csv-quote-dir")
     with DwCAReader(archive) as dwca:
         frame = dwca.pd_read("occurrence.txt")

         # The separator occurs inside a quoted field, so the shape must hold
         self.assertEqual(frame.shape, (2, 5))

         first_basis = frame["basisOfRecord"].values.tolist()[0]
         self.assertEqual(first_basis, "Observation, something")
Ejemplo n.º 29
0
    def test_position(self):
        """Each row's position equals its index in iteration order."""
        # Archives with and without headers are both exercised
        for archive_path in (BASIC_ARCHIVE_PATH, NOHEADERS1_PATH):
            with DwCAReader(archive_path) as dwca:
                expected_position = 0
                for row in dwca:
                    self.assertEqual(expected_position, row.position)
                    expected_position += 1
    def test_open_included_file(self):
        """Ensure DwCAReader.open_included_file works as expected."""
        # Let's use it to read the raw core data file:
        with DwCAReader(sample_data_path('dwca-simple-dir')) as dwca:
            # The returned file object is closed by the inner ``with`` so the
            # test does not leave an open handle behind.
            with dwca.open_included_file('occurrence.txt') as f:
                raw_occ = f.read()

            self.assertTrue(raw_occ.endswith("'betta' splendens\n"))
    def test_simplecsv_archive(self):
        """Ensure the reader works with archives consisting of a single CSV file.

        As described on page #2 of http://www.gbif.org/resource/80639, such archives consist
        of a single core data file whose first line provides the names of the Darwin Core terms
        represented in the published data. That also seems to match quite well the definition of
        Simple Darwin Core expressed as text: http://rs.tdwg.org/dwc/terms/simple/index.htm.

        The same checks run on three archives, in this order: the plain one, one with DOS line
        endings in the data file, and one whose fields are not enclosed in double quotes.
        """
        archive_names = (
            'dwca-simple-csv.zip',
            'dwca-simple-csv-dos.zip',
            'dwca-simple-csv-notenclosed.zip',
        )

        for archive_name in archive_names:
            with DwCAReader(sample_data_path(archive_name)) as dwca:
                # The number of rows must be right
                self.assertEqual(len(dwca.rows), 3)

                # Arbitrary data must be reachable
                second_row = dwca.get_corerow_by_position(1)
                self.assertEqual(second_row.data['decimallatitude'], '-31.98333')

                # Neither an archive descriptor nor (scientific) metadata is expected
                self.assertIsNone(dwca.descriptor)
                self.assertIsNone(dwca.metadata)
Ejemplo n.º 32
0
    def test_ignore_extension(self):
        """Check that the extensions_to_ignore argument works as expected."""
        two_extensions = sample_data_path("dwca-2extensions.zip")

        # Two extensions in the archive, one of them ignored on request.
        # Expected per core row: 3 vernacular names, 1 vernacular name, nothing.
        with DwCAReader(two_extensions, extensions_to_ignore="description.txt") as multi_dwca:
            rows = list(multi_dwca)

            for index, expected_count in enumerate((3, 1, 0)):
                self.assertEqual(expected_count, len(rows[index].extensions))

        # The only extension of this archive is ignored
        star_archive = sample_data_path("dwca-star-test-archive.zip")
        with DwCAReader(star_archive, extensions_to_ignore="vernacularname.txt") as star_dwca:
            rows = list(star_dwca)

            for index in range(4):
                self.assertEqual(0, len(rows[index].extensions))

        # Asking to ignore an extension that does not exist is silently accepted.
        # Expected per core row: 3 vernacular names + 2 taxon descriptions,
        # 1 vernacular name without taxon description, nothing.
        two_extensions = sample_data_path("dwca-2extensions.zip")
        with DwCAReader(two_extensions, extensions_to_ignore="helloworld.txt") as multi_dwca:
            rows = list(multi_dwca)

            for index, expected_count in enumerate((5, 1, 0)):
                self.assertEqual(expected_count, len(rows[index].extensions))
    def test_auto_cleanup_directory(self):
        """With a directory as source, nothing has to be created or cleaned up."""
        entries_before = os.listdir('.')

        archive_dir = sample_data_path('dwca-simple-dir')
        with DwCAReader(archive_dir):
            pass

        entries_after = os.listdir('.')

        # Only the entry counts are compared.
        self.assertEqual(len(entries_before), len(entries_after))
    def test_source_data_not_destroyed_directory(self):
        """A directory archive must survive being opened and closed.

        (guards against the cleanup routine for zipped files being called by accident)
        """
        first_reader = DwCAReader(DIRECTORY_ARCHIVE_PATH)
        first_reader.close()

        # Opening it again would fail had the directory been destroyed above
        second_reader = DwCAReader(DIRECTORY_ARCHIVE_PATH)
        self.assertIsInstance(second_reader.metadata, ET.Element)
        second_reader.close()
    def test_exception_invalid_simple_archives(self):
        """InvalidArchive must be raised for simple archives that can't be interpreted.

        Without a metafile, an archive is expected to consist of a single core data file,
        possibly with some metadata in EML.xml. When that structure is not followed,
        python-dwca-reader can't detect the data file and should throw InvalidArchive.
        """
        # The first archive holds a random file in addition to the data file and EML.xml,
        # so the data file can't be chosen.
        for invalid_path in (INVALID_SIMPLE_TOOMUCH, INVALID_SIMPLE_TWO):
            with self.assertRaises(InvalidArchive):
                reader = DwCAReader(invalid_path)
                reader.close()
    fullpath = tsvoutputfile

    if not os.path.isfile(inputfile):
        return None

    # Make an appropriate reader based on whether the archive is standard or a GBIF
    # download.
    dwcareader = None
    if type=='gbif':
        try:
            dwcareader = GBIFResultsReader(inputfile)
        except Exception, e:
            logging.error('GBIF archive %s has an exception: %s ' % (inputfile, e))
            pass
    else:
        dwcareader = DwCAReader(inputfile)
    if dwcareader is None:
        print 'No viable archive found at %s' % inputfile
        return None

    termnames=list(dwcareader.descriptor.core.terms)
    shorttermnames=short_term_names(termnames)
    dialect = csv.excel
    dialect.lineterminator='\r'
    dialect.delimiter='\t'
    with open(fullpath, 'w') as tsvfile:
        writer = csv.DictWriter(tsvfile, dialect=dialect, fieldnames=shorttermnames, 
            quoting=csv.QUOTE_NONE, quotechar='')
        writer.writeheader()
 
    rowcount = 0
 def test_classic_opening(self):
     """The reader is also usable without the 'with' statement."""
     reader = DwCAReader(BASIC_ARCHIVE_PATH)
     metadata = reader.metadata
     self.assertIsInstance(metadata, ET.Element)
     reader.close()