def test_row_source_missing_metadata(self):
    """Rows from datasets that lack an EML file expose source_metadata as None."""
    archive_path = sample_data_path('gbif-results-lacks-s-metadata.zip')
    with GBIFResultsReader(archive_path) as results:
        # The archive carries source metadata, but not for every
        # dataset/row; such rows should report None.
        row = results.get_corerow_by_id('607759330')
        self.assertEqual(None, row.source_metadata)
def test_dwcareader_features(self):
    """Ensure we didn't break inherited basic DwCAReader features."""
    with GBIFResultsReader(GBIF_RESULTS_PATH) as dwca:
        # Row count and core type come straight from the base reader.
        self.assertEqual(158, len(dwca.rows))
        core_type = dwca.descriptor.core.type
        self.assertEqual('http://rs.tdwg.org/dwc/terms/Occurrence', core_type)

        first_row = dwca.rows[0]
        self.assertEqual('Tetraodontidae', first_row.data[qn('family')])
        self.assertEqual([], first_row.extensions)
def test_row_human_representation(self):
    """str() of core/extension rows mentions rowtype, source, ids and links."""
    # Plain archive: core row, no extensions, no source metadata.
    with DwCAReader(sample_data_path(
            'dwca-simple-test-archive.zip')) as basic_dwca:
        row = basic_dwca.rows[0]
        text = str(row)
        self.assertIn("Rowtype: http://rs.tdwg.org/dwc/terms/Occurrence", text)
        self.assertIn("Source: Core file", text)
        self.assertIn("Row id:", text)
        self.assertIn("Reference extension rows: No", text)
        self.assertIn("Reference source metadata: No", text)
        # repr() of unicode values differs between Python 2 and Python 3.
        if sys.version_info[0] == 2:  # Python 2
            self.assertIn(
                "http://rs.tdwg.org/dwc/terms/scientificName': u'tetraodon fluviatilis'",
                text)
        else:
            self.assertIn(
                "http://rs.tdwg.org/dwc/terms/scientificName': 'tetraodon fluviatilis'",
                text)

    # Star archive: the core row references extension rows.
    with DwCAReader(
            sample_data_path('dwca-star-test-archive.zip')) as star_dwca:
        row = star_dwca.rows[0]
        text = str(row)
        self.assertIn("Rowtype: http://rs.tdwg.org/dwc/terms/Taxon", text)
        self.assertIn("Source: Core file", text)
        self.assertIn("Row id: 1", text)
        self.assertIn("Reference extension rows: Yes", text)
        self.assertIn("Reference source metadata: No", text)

        extension_text = str(row.extensions[0])
        self.assertIn(
            "Rowtype: http://rs.gbif.org/terms/1.0/VernacularName",
            extension_text)
        self.assertIn("Source: Extension file", extension_text)
        self.assertIn("Core row id: 1", extension_text)
        self.assertIn("ostrich", extension_text)
        self.assertIn("Reference extension rows: No", extension_text)
        self.assertIn("Reference source metadata: No", extension_text)

    # GBIF results archive: rows reference per-dataset source metadata.
    with GBIFResultsReader(
            sample_data_path('gbif-results.zip')) as gbif_dwca:
        row = gbif_dwca.rows[0]
        text = str(row)
        self.assertIn("Rowtype: http://rs.tdwg.org/dwc/terms/Occurrence", text)
        self.assertIn("Source: Core file", text)
        self.assertIn("Reference source metadata: Yes", text)
def main():
    """Command-line entry point: open a Darwin Core Archive named on the
    command line, using a GBIF-download-aware reader when requested.

    Prints a usage message and returns if no archive file was given.
    """
    logging.basicConfig(level=logging.DEBUG)
    options = _getoptions()

    if options.dwca_file is None:
        # print() call form works on both Python 2 and Python 3.
        print('syntax: dwca_utils.py -f dwca_file [-v vocab_path] [-t archive_type]')
        return

    # Make an appropriate reader based on whether the archive is standard or a GBIF
    # download.
    dwcareader = None
    if options.archive_type == 'gbif':
        try:
            dwcareader = GBIFResultsReader(options.dwca_file)
        # 'except ... as e' (PEP 3110) replaces the Python-2-only comma form.
        except Exception as e:
            # Lazy %-style args let logging skip formatting when disabled.
            logging.error('GBIF archive %s has an exception: %s ',
                          options.dwca_file, e)
def dwca_metadata_from_file(inputfile, archivetype=None):
    ''' Return metadata from a Darwin Core Archive file.

    inputfile - full path to the archive; None or blank returns None
    archivetype - 'gbif' (case-insensitive) selects the GBIF results reader;
        anything else is treated as a standard archive (optional)
    '''
    # Guard against a missing or blank path before touching the filesystem.
    if inputfile is None or len(inputfile.strip()) == 0:
        return None

    # Make an appropriate reader based on whether the archive is standard or a GBIF
    # download.
    dwcareader = None
    if archivetype is not None and archivetype.lower() == 'gbif':
        try:
            dwcareader = GBIFResultsReader(inputfile)
        # 'except ... as e' (PEP 3110) replaces the Python-2-only comma form;
        # the redundant trailing 'pass' after logging was removed.
        except Exception as e:
            s = 'Unable to read GBIF archive %s. %s %s' % (inputfile, e, __version__)
            logging.error(s)
def test_row_source_metadata(self):
    """Core rows expose the parsed EML of the dataset they came from."""
    with GBIFResultsReader(GBIF_RESULTS_PATH) as results:
        row = results.get_corerow_by_id('607759330')
        eml = row.source_metadata
        self.assertIsInstance(eml, ET.Element)
        given_name = (eml.find('dataset').find('creator')
                      .find('individualName').find('givenName').text)
        self.assertEqual(given_name, 'Stanley')

        row = results.get_corerow_by_id('782700656')
        eml = row.source_metadata
        self.assertIsInstance(eml, ET.Element)
        language = eml.find('dataset').find('language').text
        self.assertEqual(language, 'en')
def dwca_core_to_tsv():
    """Save the core of the archive to a tsv file with short DwC term names
    as headers.

    NOTE(review): this reads module-level globals `dwcafile`, `tsvoutputfile`
    and `type` (which shadows the builtin and should be renamed) — confirm
    the caller sets them before invoking this function.
    """
    inputfile = dwcafile
    fullpath = tsvoutputfile
    if not os.path.isfile(inputfile):
        return None

    # Make an appropriate reader based on whether the archive is standard or a GBIF
    # download.
    dwcareader = None
    if type == 'gbif':
        try:
            dwcareader = GBIFResultsReader(inputfile)
        # 'except ... as e' (PEP 3110) replaces the Python-2-only comma form;
        # the redundant trailing 'pass' after logging was removed.
        except Exception as e:
            # Lazy %-style args let logging skip formatting when disabled.
            logging.error('GBIF archive %s has an exception: %s ', inputfile, e)
def test_source_metadata(self):
    """results.source_metadata maps dataset UUIDs to parsed EML trees."""
    with GBIFResultsReader(GBIF_RESULTS_PATH) as results:
        source_metadata = results.source_metadata

        # We have 23 EML files in dataset/
        self.assertEqual(23, len(source_metadata))

        # Known dataset UUIDs are present; bogus keys are not.
        self.assertTrue(
            'eccf4b09-f0c8-462d-a48c-41a7ce36815a' in source_metadata)
        self.assertFalse('incorrect-UUID' in source_metadata)

        # The value is the parsed EML document for that dataset (content!)
        metadata = source_metadata['eccf4b09-f0c8-462d-a48c-41a7ce36815a']
        self.assertIsInstance(metadata, ET.Element)

        # Basic fields can be read straight from the EML tree.
        given_name = (metadata.find('dataset').find('creator')
                      .find('individualName').find('givenName').text)
        self.assertEqual(given_name, 'Rob')
def dwca_core_to_tsv(options):
    ''' Save the core of the archive to a tsv file with DwC term names as headers.
    options - a dictionary of parameters
        loglevel - the level at which to log (e.g., DEBUG)
        workspace - path to a directory for the outputfile (optional)
        inputfile - full path to the input Darwin Core archive file (required)
        outputfile - file name of the tsv output file, no path (optional)
        archivetype - archive type ('standard' or 'gbif') (optional; default 'standard')
    returns a dictionary with information about the results
        workspace - actual path to the directory where the outputfile was written
        outputfile - actual full path to the output tsv file
        rowcount - the number of rows in the Darwin Core archive file
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
        artifacts - a dictionary of persistent objects created
    '''
    #print '%s options: %s' % (__version__, options)
    # NOTE(review): setup_actor_logging, response and write_core_csv_file are
    # helpers defined elsewhere in this module.
    setup_actor_logging(options)

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Make a list for the response
    # Field names for the response dictionary built by response().
    returnvars = ['workspace', 'outputfile', 'rowcount', 'success', 'message',
        'artifacts']

    ### Standard outputs ###
    success = False
    message = None

    ### Custom outputs ###
    rowcount = None

    # Make a dictionary for artifacts left behind
    artifacts = {}

    ### Establish variables ###
    # Defaults; overridden below by any matching keys in options.
    workspace = './'
    inputfile = None
    outputfile = None
    archivetype = 'standard'

    ### Required inputs ###
    # Missing keys are tolerated; the defaults above remain in effect.
    try:
        workspace = options['workspace']
    except:
        pass

    try:
        inputfile = options['inputfile']
    except:
        pass

    # Fail early with an explanatory response if no input file was supplied.
    if inputfile is None or len(inputfile) == 0:
        message = 'No input file given. %s' % __version__
        returnvals = [workspace, outputfile, rowcount, success, message,
            artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # Look to see if the input file is at the absolute path or in the workspace.
    if os.path.isfile(inputfile) == False:
        if os.path.isfile(workspace + '/' + inputfile) == True:
            inputfile = workspace + '/' + inputfile
        else:
            message = 'Input file %s not found. %s' % (inputfile, __version__)
            returnvals = [workspace, outputfile, rowcount, success, message,
                artifacts]
            logging.debug('message:\n%s' % message)
            return response(returnvars, returnvals)

    try:
        outputfile = options['outputfile']
    except:
        pass

    # Generate a unique output file name when none was given.
    if outputfile is None or len(outputfile) == 0:
        outputfile = 'dwca_%s.txt' % str(uuid.uuid1())

    # The output file always lives in the workspace directory.
    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    try:
        archivetype = options['archivetype']
    except:
        pass

    # Note: The DwCAReader creates a temporary directory of its own and cleans it up
    # Make a reader based on whether the archive is standard or a GBIF download.
    dwcareader = None
    if archivetype is not None and archivetype.lower() == 'gbif':
        try:
            with GBIFResultsReader(inputfile) as dwcareader:
                rowcount = write_core_csv_file(dwcareader, outputfile)
        except Exception, e:
            # On failure, return an error response rather than raising.
            message = 'Error %s ' % e
            message += 'reading GBIF archive: %s. %s' % (inputfile, __version__)
            returnvals = [workspace, outputfile, rowcount, success, message,
                artifacts]
            logging.debug('message:\n%s' % message)
            return response(returnvars, returnvals)
def test_row_source_missing_metadata(self):
    """Rows from datasets without an EML file report source_metadata as None."""
    with GBIFResultsReader(MISSINGMETA_PATH) as results:
        # Source metadata exists for the archive, but not for every
        # dataset/row; those rows should expose None.
        row = results.get_corerow_by_id('607759330')
        self.assertEqual(None, row.source_metadata)
def test_rights_access(self):
    """Check the content of rights.txt is accessible."""
    with GBIFResultsReader(GBIF_RESULTS_PATH) as reader:
        rights = reader.rights
        self.assertEqual(self.RIGHTS_CONTENT, rights)
def test_citations_access(self):
    """Check the content of citations.txt is accessible."""
    with GBIFResultsReader(GBIF_RESULTS_PATH) as reader:
        citations = reader.citations
        self.assertEqual(self.CITATIONS_CONTENT, citations)
def test_rights_access(self):
    """Check the content of rights.txt is accessible."""
    archive_path = sample_data_path('gbif-results.zip')
    with GBIFResultsReader(archive_path) as reader:
        self.assertEqual(self.RIGHTS_CONTENT, reader.rights)
def test_citations_access(self):
    """Check the content of citations.txt is accessible."""
    archive_path = sample_data_path('gbif-results.zip')
    with GBIFResultsReader(archive_path) as reader:
        self.assertEqual(self.CITATIONS_CONTENT, reader.citations)