def _parse_metadata_file(self):
    """Load the archive (scientific) metadata file and parse it with ElementTree.

    The filename is taken from the archive descriptor when it declares one;
    otherwise ``DEFAULT_METADATA_FILENAME`` is tried.

    :returns: whatever :meth:`_parse_xml_included_file` returns for the
        metadata file, or `None` if the archive has no descriptor-declared
        metadata and the default metadata file does not exist.

    :raises: :class:`dwca.exceptions.InvalidArchive` if the archive descriptor
        references a non-existent metadata file.
    :raises: `IOError` for any other I/O failure while reading the file.
    """
    # If the archive has a descriptor, look for the metadata filename there.
    if self.descriptor and self.descriptor.metadata_filename:
        filename = self.descriptor.metadata_filename

        try:
            return self._parse_xml_included_file(filename)
        except IOError as exc:
            if exc.errno == ENOENT:  # File not found
                msg = "{} is referenced in the archive descriptor but missing.".format(
                    filename)
                raise InvalidArchive(msg)
            # Any other I/O problem (permissions, ...) is a genuine error:
            # do not hide it behind a "no metadata" `None`.
            raise
    else:  # Otherwise, the metadata file has to be named 'EML.xml'
        try:
            return self._parse_xml_included_file(DEFAULT_METADATA_FILENAME)
        except IOError as exc:
            if exc.errno == ENOENT:
                # File not found, this is an archive without metadata
                return None
            raise
def _parse_metadata_file(self):
    # type: () -> Optional[Element]
    """Load the archive (scientific) metadata file and parse it with ElementTree.

    The filename is taken from the archive descriptor when it declares one;
    otherwise ``self.default_metadata_filename`` is tried.

    :returns: whatever :meth:`_parse_xml_included_file` returns for the
        metadata file, or `None` if the archive has no descriptor-declared
        metadata and the default metadata file does not exist.

    :raises: :class:`dwca.exceptions.InvalidArchive` if the archive descriptor
        references a non-existent metadata file.
    :raises: `IOError` for any other I/O failure while reading the file.
    """
    # If the archive has a descriptor, look for the metadata filename there.
    if self.descriptor and self.descriptor.metadata_filename:
        filename = self.descriptor.metadata_filename

        try:
            return self._parse_xml_included_file(filename)
        except IOError as exc:
            if exc.errno == ENOENT:  # File not found
                msg = "{} is referenced in the archive descriptor but missing.".format(
                    filename)
                raise InvalidArchive(msg)
            # Re-raise instead of falling through: every path now explicitly
            # returns or raises, so no trailing `assert False` is needed to
            # keep MyPy happy, and the real I/O error is not masked.
            raise
    else:  # Otherwise, the metadata file has to be named 'EML.xml'
        try:
            return self._parse_xml_included_file(
                self.default_metadata_filename)
        except IOError as exc:
            if exc.errno == ENOENT:
                # File not found, this is an archive without metadata
                return None
            raise
def __init__(self, csv_line, position, datafile_descriptor):
    # type: (str, int, DataFileDescriptor) -> None
    #: An instance of :class:`dwca.descriptors.DataFileDescriptor` describing the originating
    #: data file.
    self.descriptor = datafile_descriptor  # type: DataFileDescriptor

    #: The row position/index (starting at 0) in the source data file. This can be used, for example with
    #: :meth:`dwca.read.DwCAReader.get_corerow_by_position` or :meth:`dwca.files.CSVDataFile.get_row_by_position`.
    self.position = position  # type: int

    #: The csv line type as stated in the archive descriptor.
    #: (or None if the archive has no descriptor). Examples:
    #: http://rs.tdwg.org/dwc/terms/Occurrence,
    #: http://rs.gbif.org/terms/1.0/VernacularName, ...
    self.rowtype = self.descriptor.type  # type: Optional[str]

    # The raw, per-column content of csv_line, split according to the
    # delimiters declared in the data file descriptor.
    # TODO: raw_fields is a new property: to test
    # TODO: Consistency check ?? self.raw_fields length should be :
    # num of self.raw_fields described in core_meta + 2 (id and \n)
    source = self.descriptor
    #:
    self.raw_fields = csv_line_to_fields(
        csv_line,
        line_ending=source.lines_terminated_by,
        field_ending=source.fields_terminated_by,
        fields_enclosed_by=source.fields_enclosed_by)

    #: A dict containing the Row data, such as::
    #:
    #:     {'dwc_term_1': 'value',
    #:      'dwc_term_2': 'value',
    #:      ...}
    #:
    #: Usage::
    #:
    #:     myrow.data['http://rs.tdwg.org/dwc/terms/locality']  # => "Brussels"
    #:
    #: .. note:: The :func:`dwca.darwincore.utils.qualname` helper is available to make such calls less verbose.
    self.data = {}  # type: Dict[str, str]

    for spec in source.fields:
        try:
            column_index = int(spec['index'])
            cell = self.raw_fields[column_index]
        except TypeError:
            # int() rejected the index (e.g. it is None): this field has no
            # column in the data file.
            cell = None
        except IndexError:
            msg = 'The descriptor references a non-existent field (index={i})'.format(
                i=column_index)
            raise InvalidArchive(msg)

        fallback = spec['default']
        # Precedence: non-empty row value, then declared default, then ''.
        self.data[spec['term']] = cell or fallback or ''
def __init__(self, csv_line, position, descriptor):
    #: An instance of :class:`dwca.descriptors.DataFileDescriptor` describing the originating
    #: data file.
    self.descriptor = descriptor

    #: The row position/index (starting at 0) in the source data file. This can be used, for example with
    #: :meth:`DwCAReader.get_corerow_by_position` or :meth:`CSVDataFile.get_row_by_position`.
    self.position = position

    #: The csv line type as stated in the archive descriptor.
    #: Examples: http://rs.tdwg.org/dwc/terms/Occurrence,
    #: http://rs.gbif.org/terms/1.0/VernacularName, ...
    self.rowtype = self.descriptor.type

    line_ending = self.descriptor.lines_terminated_by
    field_ending = self.descriptor.fields_terminated_by
    fields_enclosed_by = self.descriptor.fields_enclosed_by

    # self.raw_fields is a list of the csv_line's content: the line ending is
    # removed, the line is split on the field delimiter and the enclosing
    # characters are stripped from each value.
    # TODO: raw_fields is a new property: to test
    # TODO: Consistency check ?? self.raw_fields length should be :
    # num of self.raw_fields described in core_meta + 2 (id and \n)
    #:
    self.raw_fields = [
        raw_value.strip(fields_enclosed_by)
        for raw_value in csv_line.rstrip(line_ending).split(field_ending)
    ]

    #: A dict containing the Row data, such as::
    #:
    #:     {'dwc_term_1': 'value',
    #:      'dwc_term_2': 'value',
    #:      ...}
    #:
    #: Example::
    #:
    #:     myrow.data['http://rs.tdwg.org/dwc/terms/locality']  # => "Brussels"
    #:
    #: .. note:: The :func:`dwca.darwincore.utils.qualname` helper is available to make such calls less verbose.
    self.data = {}

    for field in self.descriptor.fields:
        raw_index = field['index']

        if raw_index is None:
            # No column for this field: only its default (if any) applies.
            field_value = None
        else:
            field_index = int(raw_index)
            try:
                field_value = self.raw_fields[field_index]
            except IndexError:
                msg = 'The descriptor references a non-existent field (index={i})'.format(i=field_index)
                raise InvalidArchive(msg)

        # The default is only a fallback: a non-empty value found in the row
        # wins, then the declared default, then the empty string.
        self.data[field['term']] = field_value or field['default'] or ''
def __init__(self, metaxml_content: str, files_to_ignore: List[str] = None) -> None:
    ignored_files = [] if files_to_ignore is None else files_to_ignore

    # Remove the (first) default XML namespace declaration so that elements
    # can be looked up without a namespace prefix.
    namespace_free_xml = re.sub(' xmlns="[^"]+"', '', metaxml_content, count=1)

    #: A :class:`xml.etree.ElementTree.Element` instance containing the complete Archive Descriptor.
    self.raw_element = ET.fromstring(namespace_free_xml)  # type: Element
    root = self.raw_element

    #: The path (relative to archive root) of the (scientific) metadata of the archive.
    self.metadata_filename = root.get('metadata')

    #: An instance of :class:`dwca.descriptors.DataFileDescriptor` describing the core data file.
    self.core = DataFileDescriptor.make_from_metafile_section(
        root.find('core'))  # type: DataFileDescriptor

    #: A list of :class:`dwca.descriptors.DataFileDescriptor` instances describing each of the archive's extension
    #: data files.
    self.extensions = []  # type: List[DataFileDescriptor]
    for extension_tag in root.findall('extension'):  # type: Element
        location_tag = extension_tag.find('./files/location')
        if location_tag is None:
            raise InvalidArchive(
                "An extension file is referenced in Metafile, but its path is not specified."
            )

        # Extensions whose data file is listed in files_to_ignore are skipped.
        if location_tag.text in ignored_files:
            continue

        self.extensions.append(
            DataFileDescriptor.make_from_metafile_section(extension_tag))

    #: A list of extension (types) in use in the archive.
    #:
    #: Example::
    #:
    #:     ["http://rs.gbif.org/terms/1.0/VernacularName",
    #:      "http://rs.gbif.org/terms/1.0/Description"]
    self.extensions_type = [extension.type for extension in self.extensions]
def _unzip_or_untar(self) -> str:
    """Create a temporary dir. and uncompress/unarchive self.archive_path there.

    The archive is first opened as a zip file; if it is not one, it is opened
    as a (possibly compressed) tar archive.

    :returns: the path to the temporary directory holding the extracted files.

    :raises: InvalidArchive if the file is neither a zip nor a tar archive.
    """
    import shutil  # local import: only needed to clean up after a failure

    tmp_dir = mkdtemp()

    try:
        # We first try to unzip (most common archives)
        try:
            # Security note: with Python < 2.7.4, a zip file may be able to write outside of the
            # directory using absolute paths, parent (..) path, ... See note in ZipFile.extract doc
            with zipfile.ZipFile(self.archive_path, 'r') as zipped:
                zipped.extractall(tmp_dir)
        except zipfile.BadZipfile:
            # Doesn't look like a valid zip, let's see if it's a tar archive (possibly compressed)
            try:
                # NOTE(review): extractall() on an untrusted tar archive can write outside
                # tmp_dir on Python versions without extraction filters - consider passing
                # an extraction filter once the minimum supported Python allows it.
                with tarfile.open(self.archive_path, 'r:*') as tarred:
                    tarred.extractall(tmp_dir)
            except tarfile.ReadError:
                raise InvalidArchive("The archive cannot be read. Is it a .zip or .tgz file?")
    except Exception:
        # The caller never receives tmp_dir on failure, so remove it here
        # rather than leaking a (possibly partially filled) directory.
        shutil.rmtree(tmp_dir, ignore_errors=True)
        raise

    return tmp_dir