def __init__(self, path, extensions_to_ignore=None):
    """Open the Darwin Core Archive.

    :param path: path to the archive. If it is a directory it is read in place, otherwise it is
        handed to ``self._extract()`` first.
    :param extensions_to_ignore: passed to :class:`ArchiveDescriptor` as ``files_to_ignore``.
        ``None`` (the default) is treated as an empty list.

    :raises InvalidSimpleArchive: if there is no metafile and the archive is not a valid
        simple (single data file) archive.
    :raises IOError: if the metafile cannot be read for any reason other than not existing.
    """
    if extensions_to_ignore is None:
        extensions_to_ignore = []

    #: The path to the Darwin Core Archive file, as passed to the constructor.
    self.archive_path = path

    if os.path.isdir(self.archive_path):
        # Archive is a (directly readable) directory: nothing to extract, nothing to clean.
        self._workin_directory_path = self.archive_path
        self._directory_to_clean = None
    else:
        # Archive is zipped/tgzipped, we have to extract it first.
        self._directory_to_clean, self._workin_directory_path = self._extract()

    #: An :class:`descriptors.ArchiveDescriptor` instance giving access to the archive
    #: descriptor/metafile (``meta.xml``)
    try:
        self.descriptor = ArchiveDescriptor(
            self.open_included_file(METAFILE_NAME).read(),
            files_to_ignore=extensions_to_ignore)
    except IOError as exc:
        # Only a *missing* metafile means "archive without descriptor". Any other I/O
        # failure must propagate: swallowing it used to leave ``self.descriptor`` unset,
        # which surfaced later as an unrelated AttributeError.
        if exc.errno != ENOENT:
            raise
        self.descriptor = None

    #: A :class:`xml.etree.ElementTree.Element` instance containing the (scientific) metadata
    #: of the archive, or None if the Archive contains no metadata.
    self.metadata = self._parse_metadata_file()

    #: If the archive contains source metadata (typically, GBIF downloads) this dict will
    #: be something like:
    #: {'dataset1_UUID': <dataset1 EML (xml.etree.ElementTree.Element instance)>,
    #: 'dataset2_UUID': <dataset2 EML (xml.etree.ElementTree.Element instance)>, ...}
    #: see :doc:`gbif_results` for more details.
    self.source_metadata = self._load_source_metadata()

    if self.descriptor:
        # We have an Archive descriptor that we can use to access data files.
        self._corefile = CSVDataFile(self._workin_directory_path, self.descriptor.core)
        self._extensionfiles = [
            CSVDataFile(work_directory=self._workin_directory_path, file_descriptor=d)
            for d in self.descriptor.extensions
        ]
    else:
        # Archive without descriptor, we'll have to find and inspect the data file
        try:
            datafile_name = self._is_valid_simple_archive()
            descriptor = DataFileDescriptor.make_from_file(
                os.path.join(self._workin_directory_path, datafile_name))

            self._corefile = CSVDataFile(
                work_directory=self._workin_directory_path,
                file_descriptor=descriptor)
            self._extensionfiles = []
        except InvalidSimpleArchive:
            # Re-raised with a message that explains the situation to the caller.
            msg = "No metafile was found, but archive includes multiple files/directories."
            raise InvalidSimpleArchive(msg)
def test_exposes_extensions_none(self):
    """A metafile declaring only a core yields an empty ``extensions`` collection."""
    # NOTE: "\t" and "\n" are ordinary (non-raw) Python escapes here, so the XML
    # attributes contain a literal tab / newline character.
    metafile_xml = (
        ' <archive xmlns="http://rs.tdwg.org/dwc/text/" metadata="eml.xml">'
        ' <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n"'
        ' fieldsEnclosedBy="" ignoreHeaderLines="1"'
        ' rowType="http://rs.tdwg.org/dwc/terms/Occurrence">'
        ' <files>'
        ' <location>occurrence.txt</location>'
        ' </files>'
        ' <id index="0" />'
        ' <field index="1" term="http://rs.tdwg.org/dwc/terms/basisOfRecord"/>'
        ' <field index="2" term="http://rs.tdwg.org/dwc/terms/locality"/>'
        ' <field index="3" term="http://rs.tdwg.org/dwc/terms/family"/>'
        ' <field index="4" term="http://rs.tdwg.org/dwc/terms/scientificName"/>'
        ' </core>'
        ' </archive> '
    )

    descriptor = ArchiveDescriptor(metafile_xml)
    extension_count = len(descriptor.extensions)

    self.assertEqual(extension_count, 0)
def test_exposes_extensions_2ext(self):
    """A metafile declaring a core and two extensions exposes exactly those two extensions."""
    # NOTE: "\t" and "\n" are ordinary (non-raw) Python escapes here, so the XML
    # attributes contain a literal tab / newline character.
    all_metaxml = (
        ' <archive xmlns="http://rs.tdwg.org/dwc/text/" metadata="eml.xml">'
        ' <core encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n"'
        ' fieldsEnclosedBy="" ignoreHeaderLines="1"'
        ' rowType="http://rs.tdwg.org/dwc/terms/Taxon">'
        ' <files>'
        ' <location>taxon.txt</location>'
        ' </files>'
        ' <id index="0" />'
        ' <field index="1" term="http://rs.tdwg.org/dwc/terms/order"/>'
        ' <field index="2" term="http://rs.tdwg.org/dwc/terms/class"/>'
        ' <field index="3" term="http://rs.tdwg.org/dwc/terms/kingdom"/>'
        ' <field index="4" term="http://rs.tdwg.org/dwc/terms/phylum"/>'
        ' <field index="5" term="http://rs.tdwg.org/dwc/terms/genus"/>'
        ' <field index="6" term="http://rs.tdwg.org/dwc/terms/family"/>'
        ' </core>'
        ' <extension encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n"'
        ' fieldsEnclosedBy="" ignoreHeaderLines="1"'
        ' rowType="http://rs.gbif.org/terms/1.0/Description">'
        ' <files>'
        ' <location>description.txt</location>'
        ' </files>'
        ' <coreid index="0" />'
        ' <field index="1" term="http://purl.org/dc/terms/type"/>'
        ' <field index="2" term="http://purl.org/dc/terms/language"/>'
        ' <field index="3" term="http://purl.org/dc/terms/description"/>'
        ' </extension>'
        ' <extension encoding="utf-8" fieldsTerminatedBy="\t" linesTerminatedBy="\n"'
        ' fieldsEnclosedBy="" ignoreHeaderLines="1"'
        ' rowType="http://rs.gbif.org/terms/1.0/VernacularName">'
        ' <files>'
        ' <location>vernacularname.txt</location>'
        ' </files>'
        ' <coreid index="0" />'
        ' <field index="1" term="http://rs.tdwg.org/dwc/terms/countryCode"/>'
        ' <field index="2" term="http://purl.org/dc/terms/language"/>'
        ' <field index="3" term="http://rs.tdwg.org/dwc/terms/vernacularName"/>'
        ' </extension>'
        ' </archive> '
    )

    d = ArchiveDescriptor(all_metaxml)

    expected_extensions_files = ('description.txt', 'vernacularname.txt')

    for ext in d.extensions:
        # assertIn reports the offending value on failure, unlike assertTrue(x in y).
        self.assertIn(ext.file_location, expected_extensions_files)

    # The loop above passes vacuously on an empty collection; the count check pins the size.
    self.assertEqual(len(d.extensions), 2)
def __init__(self, path, extensions_to_ignore=None):
    # type: (str, List[str]) -> None
    """Open the Darwin Core Archive.

    :param path: path to the archive. If it is a directory it is read in place, otherwise it is
        handed to ``self._extract()`` first.
    :param extensions_to_ignore: passed to :class:`ArchiveDescriptor` as ``files_to_ignore``.
        ``None`` (the default) is treated as an empty list.

    :raises InvalidSimpleArchive: if there is no Metafile and the archive is not a valid
        simple (single data file) archive.
    :raises IOError: if the Metafile cannot be read for any reason other than not existing.
    """
    if extensions_to_ignore is None:
        extensions_to_ignore = []

    #: The path to the Darwin Core Archive file, as passed to the constructor.
    self.archive_path = path  # type: str

    if os.path.isdir(self.archive_path):
        # Archive is a (directly readable) directory: nothing to extract, nothing to clean.
        self._working_directory_path = self.archive_path
        self._directory_to_clean = None  # type: Optional[str]
    else:
        # Archive is zipped/tgzipped, we have to extract it first.
        self._directory_to_clean, self._working_directory_path = self._extract()

    #: An :class:`descriptors.ArchiveDescriptor` instance giving access to the archive
    #: descriptor/metafile (``meta.xml``)
    self.descriptor = None  # type: Optional[ArchiveDescriptor]
    try:
        # NOTE(review): the object returned by open_included_file() is never explicitly
        # closed here; its definition is outside this block, so that is left untouched.
        self.descriptor = ArchiveDescriptor(
            self.open_included_file(self.default_metafile_name).read(),
            files_to_ignore=extensions_to_ignore)
    except IOError as exc:
        # Only a *missing* Metafile means "archive without descriptor" (descriptor stays
        # None). Any other I/O failure must propagate instead of being silently swallowed,
        # otherwise a broken archive would be misread as a simple one.
        if exc.errno != ENOENT:
            raise

    #: A :class:`xml.etree.ElementTree.Element` instance containing the (scientific) metadata
    #: of the archive, or `None` if the archive has no metadata.
    self.metadata = self._parse_metadata_file()  # type: Optional[Element]

    #: If the archive contains source-level metadata (typically, GBIF downloads), this is a dict such as::
    #:
    #:      {'dataset1_UUID': <dataset1 EML> (xml.etree.ElementTree.Element object),
    #:       'dataset2_UUID': <dataset2 EML> (xml.etree.ElementTree.Element object), ...}
    #:
    #: See :doc:`gbif_results` for more details.
    self.source_metadata = self._get_source_metadata()  # type: Dict[str, Element]

    if self.descriptor:
        # We have an Archive descriptor that we can use to access data files.

        #: An instance of :class:`dwca.files.CSVDataFile` for the core data file.
        self.core_file = CSVDataFile(
            self._working_directory_path, self.descriptor.core)  # type: CSVDataFile

        #: A list of :class:`dwca.files.CSVDataFile`, one entry for each extension data file, sorted by order of
        #: appearance in the Metafile (or an empty list if the archive doesn't use extensions).
        self.extension_files = [
            CSVDataFile(work_directory=self._working_directory_path, file_descriptor=d)
            for d in self.descriptor.extensions
        ]  # type: List[CSVDataFile]
    else:
        # Archive without descriptor, we'll have to find and inspect the data file
        try:
            datafile_name = self._is_valid_simple_archive()
            descriptor = DataFileDescriptor.make_from_file(
                os.path.join(self._working_directory_path, datafile_name))

            self.core_file = CSVDataFile(
                work_directory=self._working_directory_path,
                file_descriptor=descriptor)
            self.extension_files = []
        except InvalidSimpleArchive:
            # Re-raised with a message that explains the situation to the caller.
            msg = "No Metafile was found, but the archive contains multiple files/directories."
            raise InvalidSimpleArchive(msg)
def read_meta_xml(metaxml):
    """Build an ``ArchiveDescriptor`` from the content of the file located at *metaxml*.

    :param metaxml: path of the metafile to read (opened in text mode).
    :returns: the ``ArchiveDescriptor`` constructed from the file's full content.
    """
    with open(metaxml, 'r') as metafile:
        xml_content = metafile.read()
    return ArchiveDescriptor(xml_content)