def test_core_contains_term(self):
    """Verify that core_contains_term() reports which terms the core declares."""
    with DwCAReader(BASIC_ARCHIVE_PATH) as archive:
        # The sample archive has a locality column but no country column.
        has_locality = archive.core_contains_term(qn('locality'))
        self.assertTrue(has_locality)
        has_country = archive.core_contains_term(qn('country'))
        self.assertFalse(has_country)
def load_rows(self):
    """Open the archive at self.gbif_path, report on its core and keep its rows.

    Prints the core type and whether the locality term is declared in the
    core file, walks through a few qn() usage examples, and finally stores
    dwca.rows on self.gbif.
    """
    with DwCAReader(self.gbif_path) as dwca:
        # From here on the 'dwca' object is ready for use.
        core_type_text = str(dwca.descriptor.core.type)
        print("Read core type: " + core_type_text + "! :)")

        # Is a given Darwin Core term declared in the core file?
        if 'http://rs.tdwg.org/dwc/terms/locality' in dwca.descriptor.core.terms:
            locality_report = "Locality term is present! :)"
        else:
            locality_report = "Locality term is not present. :("
        print(locality_report)

        # Fully qualified Darwin Core names (such as
        # 'http://rs.tdwg.org/dwc/terms/country') are verbose. The qualname()
        # helper, imported here as 'qn', shortens the common ones:
        qn('locality')  # => u'http://rs.tdwg.org/dwc/terms/locality'

        # Combined with the checks above, this reads more clearly, e.g.:
        if qn('locality') in dwca.descriptor.core.terms:
            pass

        # Or:
        if dwca.descriptor.core.type == qn('Occurrence'):
            pass

        # Keep the row data in memory on the instance.
        self.gbif = dwca.rows
def test_dont_enclose_unenclosed(self):
    """With fields_enclosed_by set to an empty string, quote characters stay in the data."""
    expected_names = {2: '"betta" splendens', 3: "'betta' splendens"}
    with DwCAReader(sample_data_path('dwca-simple-dir')) as archive:
        core_rows = list(archive)

        for index in (2, 3):
            self.assertEqual(expected_names[index],
                             core_rows[index].data[qn('scientificName')])
def test_read_core_value(self):
    """Read plain values straight from the core file."""
    with DwCAReader(BASIC_ARCHIVE_PATH) as archive:
        core_rows = list(archive)

        # Locality of the first two rows of the sample file.
        for index, locality in enumerate(('Borneo', 'Mumbai')):
            self.assertEqual(locality, core_rows[index].data[qn('locality')])
def test_partial_default(self):
    """Check the country of the first two rows: one from the data file, one from the field default."""
    with DwCAReader(sample_data_path("dwca-partial-default.zip")) as archive:
        # First row: the value is read from the data file.
        first_country = archive.rows[0].data[qn("country")]
        self.assertEqual(first_country, "France")

        # Second row: the value is supplied by the field default.
        second_country = archive.rows[1].data[qn("country")]
        self.assertEqual(second_country, "Belgium")
def test_read_core_value(self):
    """Read plain values straight from the core file."""
    with DwCAReader(BASIC_ARCHIVE_PATH) as archive:
        core_rows = list(archive)

        # Locality of the first two rows of the sample file.
        first_locality = core_rows[0].data[qn("locality")]
        self.assertEqual("Borneo", first_locality)
        second_locality = core_rows[1].data[qn("locality")]
        self.assertEqual("Mumbai", second_locality)
def test_read_core_value(self):
    """Read plain values straight from the core file."""
    archive_path = sample_data_path('dwca-simple-test-archive.zip')
    with DwCAReader(archive_path) as archive:
        core_rows = list(archive)

        # Locality of the first two rows of the sample file.
        for index, locality in enumerate(('Borneo', 'Mumbai')):
            self.assertEqual(locality, core_rows[index].data[qn('locality')])
def test_qn(self):
    """Exercise the qn() shortcut helper with a known and an unknown name."""
    # A known short name resolves to its full URI.
    occurrence_uri = qn("Occurrence")
    self.assertEqual("http://rs.tdwg.org/dwc/terms/Occurrence", occurrence_uri)

    # An unknown name is expected to raise StopIteration.
    self.assertRaises(StopIteration, qn, "dsfsdfsdfsdfsdfsd")
def set_term_value(rowdata, term, value):
    """Overwrite the existing entry for term in rowdata with value.

    The entry is looked up under term as given and, failing that, under
    the name returned by qn(term). Nothing is added when neither key is
    present, and a rowdata of None is ignored. Always returns None.
    """
    if rowdata is None:
        return
    # Prefer the key exactly as supplied; otherwise try the qualified name.
    key = term if term in rowdata else qn(term)
    if key in rowdata:
        rowdata[key] = value
def set_term_value(rowdata, term, value):
    """Replace the value already stored for term in rowdata.

    The key tried first is term itself; when that is absent the name
    returned by qn(term) is tried. If neither key exists, or rowdata is
    None, nothing is changed. Always returns None.
    """
    if rowdata is None:
        return
    if term in rowdata:
        target_key = term
    else:
        target_key = qn(term)
    if target_key in rowdata:
        rowdata[target_key] = value
def test_archives_without_metadata(self):
    """An archive that has a metafile but no metadata must still be readable."""
    with DwCAReader(NOMETADATA_PATH) as archive:
        self.assertIsNone(archive.metadata)

        # The rows remain accessible even without metadata.
        core_rows = list(archive)
        self.assertEqual(len(core_rows), 2)
        for index, locality in enumerate(("Borneo", "Mumbai")):
            self.assertEqual(locality, core_rows[index].data[qn("locality")])
def test_archives_without_metadata(self):
    """An archive that has a metafile but no metadata must still be readable."""
    archive_path = sample_data_path('dwca-nometadata.zip')
    with DwCAReader(archive_path) as archive:
        self.assertIsNone(archive.metadata)

        # The rows remain accessible even without metadata.
        core_rows = list(archive)
        self.assertEqual(len(core_rows), 2)
        for index, locality in enumerate(('Borneo', 'Mumbai')):
            self.assertEqual(locality, core_rows[index].data[qn('locality')])
def dwcaline_to_epsg4326(line):
    """Build a {'lat': X, 'lon': Y} dict of floats from the given DwCALine.

    The decimalLatitude and decimalLongitude entries of line.data are
    converted with float(); CannotConvertException is raised when that
    conversion raises ValueError.
    """
    coordinates = {}
    try:
        coordinates['lat'] = float(line.data[qn('decimalLatitude')])
        coordinates['lon'] = float(line.data[qn('decimalLongitude')])
    except ValueError:
        raise CannotConvertException()
    return coordinates
def applicable_to_archive(self, archive):
    """Tell whether this check can be run against archive.

    Returns True only when the archive core is of Occurrence type and
    contains the kingdom term. Otherwise an APPLICABILITY error is logged
    through self.logger and False is returned.
    """
    if archive.core_rowtype != qn('Occurrence'):
        self.logger.log("Archive core should be of Occurrence type.",
                        MessageTypes.APPLICABILITY, MessageLevels.ERROR)
        return False

    if not archive.core_contains_term(qn('kingdom')):
        self.logger.log("Core should contain the 'kingdom' term",
                        MessageTypes.APPLICABILITY, MessageLevels.ERROR)
        return False

    return True
def test_core_contains_term(self):
    """Verify core_contains_term() on an archive with a metafile and on one without."""
    # The first sample archive has a locality column but no country column.
    with DwCAReader(sample_data_path('dwca-simple-test-archive.zip')) as archive:
        has_locality = archive.core_contains_term(qn('locality'))
        self.assertTrue(has_locality)
        has_country = archive.core_contains_term(qn('country'))
        self.assertFalse(has_country)

    # Same check against a simple archive (one without a metafile).
    with DwCAReader(sample_data_path('dwca-simple-csv.zip')) as simple_archive:
        has_datasetkey = simple_archive.core_contains_term('datasetkey')
        self.assertTrue(has_datasetkey)
        has_unknown = simple_archive.core_contains_term('trucmachin')
        self.assertFalse(has_unknown)
def row_has_term_value(rowdata, term):
    """Tell whether rowdata holds a value other than '' for term.

    The term is looked up as given; only when that key is absent is the
    name returned by qn(term) tried instead. Returns False for a rowdata
    of None or when neither key is present.
    """
    if rowdata is None:
        return False
    # An entry stored under the plain term takes precedence, even if empty.
    key = term if term in rowdata else qn(term)
    if key not in rowdata:
        return False
    return bool(rowdata[key] != '')
def test_core_contains_term(self):
    """Verify core_contains_term() on an archive with a metafile and on one without."""
    # The first sample archive has a locality column but no country column.
    with DwCAReader(BASIC_ARCHIVE_PATH) as archive:
        has_locality = archive.core_contains_term(qn("locality"))
        self.assertTrue(has_locality)
        has_country = archive.core_contains_term(qn("country"))
        self.assertFalse(has_country)

    # Same check against a simple archive (one without a metafile).
    with DwCAReader(SIMPLE_CSV) as simple_archive:
        has_datasetkey = simple_archive.core_contains_term("datasetkey")
        self.assertTrue(has_datasetkey)
        has_unknown = simple_archive.core_contains_term("trucmachin")
        self.assertFalse(has_unknown)
def row_has_term_value(rowdata, term):
    """Report whether the row carries a non-'' value for term.

    A key equal to term is checked first; the name returned by qn(term)
    is consulted only when that key is missing. A rowdata of None, or a
    row with neither key, yields False.
    """
    if rowdata is None:
        return False
    if term in rowdata:
        lookup_key = term
    else:
        lookup_key = qn(term)
        if lookup_key not in rowdata:
            return False
    return bool(rowdata[lookup_key] != '')
def test_core_contains_term(self):
    """Verify core_contains_term() on an archive with a metafile and on one without."""
    # The first sample archive has a locality column but no country column.
    with DwCAReader(BASIC_ARCHIVE_PATH) as archive:
        has_locality = archive.core_contains_term(qn('locality'))
        self.assertTrue(has_locality)
        has_country = archive.core_contains_term(qn('country'))
        self.assertFalse(has_country)

    # Same check against a simple archive (one without a metafile).
    with DwCAReader(SIMPLE_CSV) as simple_archive:
        has_datasetkey = simple_archive.core_contains_term('datasetkey')
        self.assertTrue(has_datasetkey)
        has_unknown = simple_archive.core_contains_term('trucmachin')
        self.assertFalse(has_unknown)
def test_enclosed_data(self):
    """Values must be stripped of their enclosing characters when fieldsEnclosedBy is used."""
    with DwCAReader(BASIC_ENCLOSED_ARCHIVE_PATH) as archive:
        core_rows = list(archive)

        # Locality values are wrapped in "'" characters, which must be removed.
        for index, locality in enumerate(('Borneo', 'Mumbai')):
            self.assertEqual(locality, core_rows[index].data[qn('locality')])

        # Family values are not enclosed and must come through untouched.
        for index, family in enumerate(('Tetraodontidae', 'Osphronemidae')):
            self.assertEqual(family, core_rows[index].data[qn('family')])
def test_tgz_archives(self):
    """The reader's basic features must work on a .tgz archive."""
    with DwCAReader(BASIC_ARCHIVE_TGZ_PATH) as archive:
        self.assertIsInstance(archive.metadata, ET.Element)

        for core_row in archive:
            self.assertIsInstance(core_row, CoreRow)

        all_rows = list(archive)
        self.assertEqual(len(all_rows), 2)
        for index, locality in enumerate(('Borneo', 'Mumbai')):
            self.assertEqual(locality, all_rows[index].data[qn('locality')])
def test_enclosed_data(self):
    """Values must be stripped of their enclosing characters when fieldsEnclosedBy is used."""
    with DwCAReader(BASIC_ENCLOSED_ARCHIVE_PATH) as archive:
        core_rows = list(archive)

        # Locality values are wrapped in "'" characters, which must be removed.
        for index, locality in enumerate(("Borneo", "Mumbai")):
            self.assertEqual(locality, core_rows[index].data[qn("locality")])

        # Family values are not enclosed and must come through untouched.
        for index, family in enumerate(("Tetraodontidae", "Osphronemidae")):
            self.assertEqual(family, core_rows[index].data[qn("family")])
def test_tgz_archives(self):
    """The reader's basic features must work on a .tgz archive."""
    with DwCAReader(BASIC_ARCHIVE_TGZ_PATH) as archive:
        self.assertIsInstance(archive.metadata, ET.Element)

        for core_row in archive:
            self.assertIsInstance(core_row, CoreRow)

        all_rows = list(archive)
        self.assertEqual(len(all_rows), 2)
        for index, locality in enumerate(("Borneo", "Mumbai")):
            self.assertEqual(locality, all_rows[index].data[qn("locality")])
def test_tgz_archives(self):
    """The reader's basic features must work on a .tgz archive."""
    archive_path = sample_data_path('dwca-simple-test-archive.tgz')
    with DwCAReader(archive_path) as archive:
        self.assertIsInstance(archive.metadata, ET.Element)

        for core_row in archive:
            self.assertIsInstance(core_row, CoreRow)

        all_rows = list(archive)
        self.assertEqual(len(all_rows), 2)
        for index, locality in enumerate(('Borneo', 'Mumbai')):
            self.assertEqual(locality, all_rows[index].data[qn('locality')])
def test_enclosed_data(self):
    """Values must be stripped of their enclosing characters when fieldsEnclosedBy is used."""
    archive_path = sample_data_path('dwca-simple-test-archive-enclosed.zip')
    with DwCAReader(archive_path) as archive:
        core_rows = list(archive)

        # Locality values are wrapped in "'" characters, which must be removed.
        for index, locality in enumerate(('Borneo', 'Mumbai')):
            self.assertEqual(locality, core_rows[index].data[qn('locality')])

        # Family values are not enclosed and must come through untouched.
        for index, family in enumerate(('Tetraodontidae', 'Osphronemidae')):
            self.assertEqual(family, core_rows[index].data[qn('family')])
def check_datatype(self, row, term, dtype):
    """Check that the row's value for term matches the named data type.

    The type names are broader than Python's own types (e.g. 'json').
    A mismatch is recorded through self._add_failure() under the
    'ValidDataType' test.

    Supported dtype values:
      * 'json'             - the value must be parseable by json.loads()
      * 'int' / 'integer'  - the value must be an int instance

    Raises:
        Exception: if dtype is not one of the supported names.
    """
    if dtype == 'json':
        # Look the value up outside the try block so that a lookup problem
        # is not mistaken for invalid JSON.
        raw_value = row.data[qn(term)]
        try:
            json.loads(raw_value)
        except (ValueError, TypeError):
            # ValueError covers malformed JSON (json.JSONDecodeError is a
            # subclass); TypeError covers values json.loads() cannot accept.
            self._add_failure(row, term, 'ValidDataType')
    elif dtype == 'int' or dtype == 'integer':
        if not isinstance(row.data[qn(term)], int):
            self._add_failure(row, term, 'ValidDataType')
    else:
        raise Exception("{} not supported".format(dtype))
def create_occurrence_from_dwcaline(line):
    """Create and save an Occurrence populated from the given archive line.

    Copies the catalog number and (when not '') the event date, links the
    MGRS square named by verbatimCoordinates (fetched or created), resolves
    the species through get_or_create_taxonomy() and saves the record.
    """
    record = line.data
    occ = Occurrence()

    # Simple fields
    # TODO: move these long Dwc strings to a specific module ?
    occ.catalog_number = record[qn('catalogNumber')]
    occ.scientificname = ''  # TODO: Remove this field

    event_date = record[qn('eventDate')]
    if event_date != '':
        occ.event_date = event_date

    # Foreign keys
    square_label = record[qn('verbatimCoordinates')]
    square, _created = MGRSSquare.objects.get_or_create(label=square_label)
    occ.square = square

    species = record[qn('specificEpithet')]
    genus = record[qn('genus')]
    family = record[qn('family')]
    scientificname = record[qn('scientificName')]
    specificepithet = record[qn('specificEpithet')]

    occ.species = get_or_create_taxonomy(family, genus, species,
                                         scientificname, specificepithet)
    occ.save()
def row_has_term(rowdata, term):
    """Tell whether rowdata has a key for term, either as given or as returned by qn(term).

    Returns False when rowdata is None.
    """
    if rowdata is None:
        return False
    if term in rowdata:
        return True
    # Not stored under the name as given; try the qualified identifier.
    return qn(term) in rowdata
def test_csv_quote_dir_archive(self):
    """A field separator inside a quoted field must not split that field."""
    with DwCAReader(sample_data_path('dwca-csv-quote-dir')) as archive:
        core_rows = list(archive)
        self.assertEqual(len(core_rows), 2)

        basis_of_record = core_rows[0].data[qn('basisOfRecord')]
        self.assertEqual(basis_of_record, 'Observation, something')
def test_exposes_core_type(self):
    """The archive's core type must be available as descriptor.core.type."""
    occurrence_uri = "http://rs.tdwg.org/dwc/terms/Occurrence"
    with DwCAReader(BASIC_ARCHIVE_PATH) as archive:
        core = archive.descriptor.core

        # dwca-simple-test-archive.zip is expected to be of Occurrence type.
        self.assertEqual(core.type, occurrence_uri)

        # The qn() shortcut must compare equal as well.
        self.assertEqual(core.type, qn("Occurrence"))
def test_dwcareader_features(self):
    """The basic DwCAReader features must still work through GBIFResultsReader."""
    with GBIFResultsReader(GBIF_RESULTS_PATH) as reader:
        row_count = len(reader.rows)
        self.assertEqual(158, row_count)

        core_type = reader.descriptor.core.type
        self.assertEqual('http://rs.tdwg.org/dwc/terms/Occurrence', core_type)

        first_row = reader.rows[0]
        self.assertEqual('Tetraodontidae', first_row.data[qn('family')])
        self.assertEqual([], first_row.extensions)
def test_exposes_core_type(self):
    """The archive's core type must be available as descriptor.core.type."""
    occurrence_uri = 'http://rs.tdwg.org/dwc/terms/Occurrence'
    with DwCAReader(BASIC_ARCHIVE_PATH) as archive:
        core = archive.descriptor.core

        # dwca-simple-test-archive.zip is expected to be of Occurrence type.
        self.assertEqual(core.type, occurrence_uri)

        # The qn() shortcut must compare equal as well.
        self.assertEqual(core.type, qn('Occurrence'))
def get_term_value(rowdata, term):
    """Return the value of the term in the given rowdata.

    The term is looked up as given first. If that key is absent, the
    fully qualified Darwin Core name returned by qn(term) is tried.

    Returns:
        The stored value, or None when rowdata is None, when qn() cannot
        resolve the term, or when neither key is present.
    """
    if rowdata is None:
        return None
    if term in rowdata:
        return rowdata[term]
    # Try a Darwin Core fully qualified term if it wasn't found as is.
    try:
        qualified = qn(term)
    except Exception:
        # Any failure to resolve the name means there is no other key to try.
        return None
    if qualified in rowdata:
        return rowdata[qualified]
    return None
def test_read_core_value_default(self):
    """Read a core value that is supplied by a default.

    Like test_read_core_value(), except that the value is taken from a
    default declared in meta.xml rather than from the core text file.
    Defaults are part of the standard and were produced by IPT prior to
    version 2.0.3.
    """
    with DwCAReader(DEFAULT_VAL_PATH) as archive:
        for core_row in archive:
            country = core_row.data[qn("country")]
            self.assertEqual("Belgium", country)
def test_read_core_value_default(self):
    """Read a core value that is supplied by a default.

    Like test_read_core_value(), except that the value is taken from a
    default declared in meta.xml rather than from the core text file.
    Defaults are part of the standard and were produced by IPT prior to
    version 2.0.3.
    """
    archive_path = sample_data_path('dwca-test-default.zip')
    with DwCAReader(archive_path) as archive:
        for core_row in archive:
            country = core_row.data[qn('country')]
            self.assertEqual('Belgium', country)
def archive_has_core_term(dwcareader, term):
    """Return True if the core file contains a column for the term name or identifier.

    The term is checked against dwcareader.descriptor.core.terms as given
    first, then as the fully qualified name returned by qn(term).

    Returns:
        True when either form is among the core terms; False otherwise,
        including when dwcareader or term is None or when qn() cannot
        resolve the term (in which case an error is also logged).
    """
    if dwcareader is None or term is None:
        return False
    core_terms = dwcareader.descriptor.core.terms
    if term in core_terms:
        return True
    try:
        qualified = qn(term)
    except Exception:
        logging.error(
            'archive_has_core_term(): %s is not a Simple Darwin Core term. The search is case-sensitive.',
            term)
        return False
    # The short name resolved, so look for its qualified form in the core.
    return qualified in core_terms
def get_term_value(rowdata, term):
    """Return the value of the term in the given rowdata.

    The term is looked up as given first. If that key is absent, the
    fully qualified Darwin Core name returned by qn(term) is tried.

    Returns:
        The stored value, or None when rowdata is None, when qn() cannot
        resolve the term, or when neither key is present.
    """
    if rowdata is None:
        return None
    if term in rowdata:
        return rowdata[term]
    # Try a Darwin Core fully qualified term if it wasn't found as is.
    try:
        qualified = qn(term)
    except Exception:
        # Any failure to resolve the name means there is no other key to try.
        return None
    if qualified in rowdata:
        return rowdata[qualified]
    return None
def _add_failure(self, row, term, test): """add the row id to the specific term and the sample if news """ if term in self.log.keys(): if not test in self.log[term].keys(): self.log[term][test] = {test : self._setup_termtest_dict()} else: self.log[term] = {test : self._setup_termtest_dict()} self.log[term][test]["ids"].append(row.id) if not self._check_if_new_failure(row, term, test): self.log[term][test]["sample"].append(row.data[qn(term)])
def archive_has_core_term(dwcareader, term):
    """Return True if the core file contains a column for the term name or identifier.

    The term is checked against dwcareader.descriptor.core.terms as given
    first, then as the fully qualified name returned by qn(term).

    Returns:
        True when either form is among the core terms; False otherwise,
        including when dwcareader or term is None or when qn() cannot
        resolve the term (in which case an error is also logged).
    """
    if dwcareader is None or term is None:
        return False
    core_terms = dwcareader.descriptor.core.terms
    if term in core_terms:
        return True
    try:
        qualified = qn(term)
    except Exception:
        logging.error(
            'archive_has_core_term(): %s is not a Simple Darwin Core term. The search is case-sensitive.',
            term)
        return False
    # The short name resolved, so look for its qualified form in the core.
    return qualified in core_terms
def test_subdirectory_archive(self):
    """Archives whose whole content sits under a single directory must be supported.

    Also checks that the listing of "." gains exactly one entry while the
    archive is open and returns to its initial size afterwards.
    """
    entries_before = len(os.listdir("."))
    entries_during = None

    with DwCAReader(SUBDIR_ARCHIVE_PATH) as archive:
        # The metadata must be reachable...
        self.assertIsInstance(archive.metadata, ET.Element)

        # ...and so must the rows.
        for core_row in archive:
            self.assertIsInstance(core_row, CoreRow)

        core_rows = list(archive)
        self.assertEqual("Borneo", core_rows[0].data[qn("locality")])

        entries_during = len(os.listdir("."))

    entries_after = len(os.listdir("."))

    # The temporary directory must exist while open and be removed afterwards.
    self.assertEqual(entries_before + 1, entries_during)
    self.assertEqual(entries_before, entries_after)
def test_subdirectory_archive(self):
    """Archives whose whole content sits under a single directory must be supported.

    Also checks that the system temporary directory gains exactly one entry
    while the archive is open and returns to its initial size afterwards.
    """
    tmp_dir = tempfile.gettempdir()
    entries_before = len(os.listdir(tmp_dir))

    archive_path = sample_data_path('dwca-simple-subdir.zip')
    with DwCAReader(archive_path) as archive:
        # The metadata must be reachable...
        self.assertIsInstance(archive.metadata, ET.Element)

        # ...and so must the rows.
        for core_row in archive:
            self.assertIsInstance(core_row, CoreRow)

        core_rows = list(archive)
        self.assertEqual('Borneo', core_rows[0].data[qn('locality')])

        entries_during = len(os.listdir(tmp_dir))

    entries_after = len(os.listdir(tmp_dir))

    # The temporary directory must exist while open and be removed afterwards.
    self.assertEqual(entries_before + 1, entries_during)
    self.assertEqual(entries_before, entries_after)
def valid_dwca(dwca):
    """Tell whether dwca has an Occurrence core that contains both decimal coordinate terms.

    The checks short-circuit in order: core row type, decimalLatitude,
    decimalLongitude. The result of the first failing check (or of the
    last check) is returned as is.
    """
    is_occurrence = dwca.core_rowtype == qn('Occurrence')
    if not is_occurrence:
        return is_occurrence

    has_latitude = dwca.core_contains_term(qn('decimalLatitude'))
    if not has_latitude:
        return has_latitude

    return dwca.core_contains_term(qn('decimalLongitude'))
def _check_if_new_failure(self, row, term, test):
    """Tell whether the row's value for term is already among the stored samples.

    Looks in self.log[term][test]["sample"].

    NOTE(review): despite the name, a True result means the value has been
    seen before for this term/test combination, i.e. it is not new.
    """
    current_value = row.data[qn(term)]
    recorded_samples = self.log[term][test]["sample"]
    return current_value in recorded_samples
def test_partial_default(self):
    """Check the country of the first two rows: one from the data file, one from the field default."""
    archive_path = sample_data_path('dwca-partial-default.zip')
    with DwCAReader(archive_path) as archive:
        # First row: the value is read from the data file.
        first_country = archive.rows[0].data[qn('country')]
        self.assertEqual(first_country, 'France')

        # Second row: the value is supplied by the field default.
        second_country = archive.rows[1].data[qn('country')]
        self.assertEqual(second_country, 'Belgium')
def check_not_equal(self, row, term, value):
    """Require that the row's value for term differs from value.

    When the two are equal, a 'NotEqual' failure is recorded for the row
    through self._add_failure().
    """
    actual = row.data[qn(term)]
    if actual == value:
        self._add_failure(row, term, 'NotEqual')
def check_equal_options(self, row, term, values):
    """Require that the row's value for term is one of the options in values.

    When it is not, an 'EqualList' failure is recorded for the row through
    self._add_failure().
    """
    actual = row.data[qn(term)]
    if actual in values:
        return
    self._add_failure(row, term, 'EqualList')
dwca_url = csvrow['dataset_url'] filename = csvrow['id'] dwca_file = f'./data/dwca/{filename}.zip' # Download if not path.exists(dwca_file): r = requests.get(dwca_url, stream=True) with open(dwca_file, 'wb') as fd: for byte in r.raw: fd.write(byte) with DwCAReader(dwca_file) as dwca: core_type = dwca.descriptor.core.type has_term = { t: qn(t) in dwca.descriptor.core.terms for t in lookup_terms } for row in dwca: potential_interaction_terms = [] for term in lookup_terms: if has_term[term] == True: term_value = row.data[qn(term)] if term_value: potential_interaction_terms.append(term) for ext in row.extensions: for interaction_ext in extensions: if qn(interaction_ext) == ext.rowtype: saveToDB(csvrow, { **row.data,
def assess_line(self, line):
    """Log a message when the line's kingdom is non-empty and not in self._accepted.

    The kingdom value is lower-cased and stripped of leading and trailing
    newline and tab characters before it is compared.
    """
    raw_kingdom = line.data[qn('kingdom')]
    kingdom = raw_kingdom.lower().strip("\n\t")

    # Nothing to report for an empty or an accepted kingdom.
    if not kingdom or kingdom in self._accepted:
        return

    self.logger.log("'{kingdom}' not in accepted list.".format(kingdom=kingdom))