def _retrieve_version(self) -> str:
    try:
        metadata = json.loads(read_file_in_tar(self.filepath, 'metadata.json'))
    except (IOError, FileNotFoundError) as error:
        raise CorruptArchive(str(error))
    if 'export_version' not in metadata:
        raise CorruptArchive("metadata.json does not contain an 'export_version' key")
    return metadata['export_version']
def metadata(self) -> ArchiveMetadata:
    metadata = self._get_metadata()
    export_parameters = metadata.get('export_parameters', {})
    output = {
        'export_version': metadata['export_version'],
        'aiida_version': metadata['aiida_version'],
        'all_fields_info': metadata['all_fields_info'],
        'unique_identifiers': metadata['unique_identifiers'],
        'graph_traversal_rules': export_parameters.get('graph_traversal_rules', None),
        'entities_starting_set': export_parameters.get('entities_starting_set', None),
        'include_comments': export_parameters.get('include_comments', None),
        'include_logs': export_parameters.get('include_logs', None),
        'conversion_info': metadata.get('conversion_info', [])
    }
    try:
        return ArchiveMetadata(**output)
    except TypeError as error:
        raise CorruptArchive(f'Metadata invalid: {error}')
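# A hedged sketch of the `ArchiveMetadata` container implied by the keys
# assembled above. The real definition lives elsewhere in the package; the
# field names are taken from `output`, but the exact types are assumptions.
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional

@dataclass
class ArchiveMetadata:
    export_version: str
    aiida_version: str
    all_fields_info: Dict[str, Any]
    unique_identifiers: Dict[str, str]
    graph_traversal_rules: Optional[Dict[str, bool]] = None
    entities_starting_set: Optional[Dict[str, Any]] = None
    include_comments: Optional[bool] = None
    include_logs: Optional[bool] = None
    conversion_info: List[str] = field(default_factory=list)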
def iter_node_repos(
    self,
    uuids: Iterable[str],
    callback: Callable[[str, Any], None] = null_callback,
) -> Iterator[Folder]:
    path_prefixes = [os.path.join(self.REPO_FOLDER, export_shard_uuid(uuid)) for uuid in uuids]

    if not path_prefixes:
        return
    self.assert_within_context()
    assert self._sandbox is not None  # required by mypy

    # unarchive the common folder if it does not exist
    common_prefix = os.path.commonpath(path_prefixes)
    if not self._sandbox.get_subfolder(common_prefix).exists():
        self._extract(path_prefix=common_prefix, callback=callback)

    callback('init', {'total': len(path_prefixes), 'description': 'Iterating node repositories'})
    for uuid, path_prefix in zip(uuids, path_prefixes):
        callback('update', 1)
        subfolder = self._sandbox.get_subfolder(path_prefix)
        if not subfolder.exists():
            raise CorruptArchive(
                f'Unable to find the repository folder for Node with UUID={uuid} in the exported file'
            )
        yield subfolder
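# A minimal sketch of a callback compatible with the ('init'/'update')
# protocol used by `iter_node_repos` above: 'init' carries a dict with
# 'total' and 'description', 'update' carries an increment. The tqdm
# wiring here is illustrative, not the callback AiiDA itself installs.
from typing import Any

from tqdm import tqdm

_bar = None

def tqdm_callback(action: str, value: Any) -> None:
    """Render 'init'/'update' progress events as a tqdm progress bar."""
    global _bar
    if action == 'init':
        _bar = tqdm(total=value['total'], desc=value['description'])
    elif action == 'update':
        _bar.update(value)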
def _extract_archive(self, filepath: Path, callback: Callable[[str, Any], None]):
    try:
        ZipPath(self.filepath, mode='r', allow_zip64=True).extract_tree(filepath, callback=callback)
    except zipfile.BadZipfile as error:
        raise CorruptArchive(f'The input file cannot be read: {error}')
def _extract(self, *, path_prefix: str, callback: Callable[[str, Any], None] = null_callback):
    self.assert_within_context()
    assert self._sandbox is not None  # required by mypy
    try:
        ZipPath(self.filename, mode='r', allow_zip64=True).joinpath(path_prefix).extract_tree(
            self._sandbox.abspath, callback=callback, cb_descript='Extracting repository files'
        )
    except zipfile.BadZipfile as error:
        raise CorruptArchive(f'The input file cannot be read: {error}')
    except NotADirectoryError as error:
        raise CorruptArchive(f'Unable to find required folder in archive: {error}')
def _get_data(self):
    if self._data is None:
        try:
            self._data = json.loads(read_file_in_tar(self.filename, self.FILENAME_DATA))
        except (IOError, FileNotFoundError) as error:
            raise CorruptArchive(str(error))
    return self._data
def _get_data(self):
    if self._data is None:
        path = Path(self.filename) / self.FILENAME_DATA
        if not path.exists():
            raise CorruptArchive(f'required file `{self.FILENAME_DATA}` is not included')
        self._data = json.loads(path.read_text(encoding='utf8'))
    return self._data
def extract_tar(infile, folder, nodes_export_subfolder=None, **kwargs):
    """
    Extract the nodes to be imported from a (possibly zipped) tar file.

    :param infile: file path
    :type infile: str
    :param folder: a temporary folder used to extract the file tree
    :type folder: :py:class:`~aiida.common.folders.SandboxFolder`
    :param nodes_export_subfolder: name of the subfolder for AiiDA nodes
    :type nodes_export_subfolder: str
    :param silent: suppress progress bar
    :type silent: bool

    :raises TypeError: if parameter types are not respected
    :raises `~aiida.tools.importexport.common.exceptions.CorruptArchive`: if the archive misses files or files have
        incorrect formats
    """
    # pylint: disable=fixme
    if nodes_export_subfolder:
        if not isinstance(nodes_export_subfolder, str):
            raise TypeError('nodes_export_subfolder must be a string')
    else:
        nodes_export_subfolder = NODES_EXPORT_SUBFOLDER

    try:
        with tarfile.open(infile, 'r:*', format=tarfile.PAX_FORMAT) as handle:
            if len(handle.getmembers()) == 1 and handle.getmembers()[0].size == 0:
                raise CorruptArchive('no files detected in archive')

            file_iterator = get_file_iterator(file_handle=handle, folderpath=folder.abspath, **kwargs)
            for member in file_iterator:
                if member.isdev():
                    # safety: skip if character device, block device or FIFO
                    print('WARNING, device found inside the import file: {}'.format(member.name), file=sys.stderr)
                    continue
                if member.issym() or member.islnk():
                    # safety: in export, I set dereference=True therefore
                    # there should be no symbolic or hard links.
                    print('WARNING, symlink found inside the import file: {}'.format(member.name), file=sys.stderr)
                    continue
                # Check that we are only exporting nodes within the subfolder!
                # TODO: better check such that there are no .. in the
                # path; use probably the folder limit checks
                if not member.name.startswith(nodes_export_subfolder + os.sep):
                    continue
                update_description(member.name, file_iterator)
                handle.extract(path=folder.abspath, member=member)
    except tarfile.ReadError:
        raise ValueError('The input file format for import is not valid (not a tar file)')

    close_progress_bar(leave=False)
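# Hedged usage sketch: extracting an archive into AiiDA's temporary sandbox
# folder. The archive path is hypothetical; `SandboxFolder` is the
# aiida.common.folders abstraction named in the docstring above.
from aiida.common.folders import SandboxFolder

with SandboxFolder() as folder:
    extract_tar('/path/to/export.aiida', folder, silent=True)
    print(folder.get_content_list())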
def _extract_archive(self, filepath: Path, callback: Callable[[str, Any], None]):
    try:
        TarPath(self.filepath, mode='r:*', pax_format=tarfile.PAX_FORMAT).extract_tree(
            filepath, allow_dev=False, allow_symlink=False, callback=callback
        )
    except tarfile.ReadError as error:
        raise CorruptArchive(f'The input file cannot be read: {error}')
def iter_entity_fields(
    self,
    name: str,
    fields: Optional[Tuple[str, ...]] = None
) -> Iterator[Tuple[int, Dict[str, Any]]]:
    if name not in self.entity_names:
        raise ValueError(f'Unknown entity name: {name}')
    data = self._get_data()['export_data'].get(name, {})
    if name == NODE_ENTITY_NAME:
        # here we merge in the attributes and extras before yielding
        attributes = self._get_data().get('node_attributes', {})
        extras = self._get_data().get('node_extras', {})
        for pk, all_fields in data.items():
            if pk not in attributes:
                raise CorruptArchive(f'Unable to find attributes info for Node with Pk={pk}')
            if pk not in extras:
                raise CorruptArchive(f'Unable to find extra info for Node with Pk={pk}')
            all_fields = {**all_fields, **{'attributes': attributes[pk], 'extras': extras[pk]}}
            if fields is not None:
                all_fields = {k: v for k, v in all_fields.items() if k in fields}
            yield int(pk), all_fields
    else:
        for pk, all_fields in data.items():
            if fields is not None:
                all_fields = {k: v for k, v in all_fields.items() if k in fields}
            yield int(pk), all_fields
def unpack(self):
    """Unpack the archive and store the contents in a sandbox."""
    if os.path.isdir(self.filepath):
        extract_tree(self.filepath, self.folder)
    elif tarfile.is_tarfile(self.filepath):
        extract_tar(self.filepath, self.folder, silent=self._silent, nodes_export_subfolder=NODES_EXPORT_SUBFOLDER)
    elif zipfile.is_zipfile(self.filepath):
        extract_zip(self.filepath, self.folder, silent=self._silent, nodes_export_subfolder=NODES_EXPORT_SUBFOLDER)
    else:
        raise CorruptArchive('unrecognized archive format')

    if not self.folder.get_content_list():
        raise ContentNotExistent('the provided archive {} is empty'.format(self.filepath))

    self._unpacked = True
def extract_zip(infile, folder, nodes_export_subfolder=None, **kwargs):
    """Extract the nodes to be imported from a zip file.

    :param infile: file path
    :type infile: str
    :param folder: a temporary folder used to extract the file tree
    :type folder: :py:class:`~aiida.common.folders.SandboxFolder`
    :param nodes_export_subfolder: name of the subfolder for AiiDA nodes
    :type nodes_export_subfolder: str
    :param silent: suppress progress bar
    :type silent: bool

    :raises TypeError: if parameter types are not respected
    :raises `~aiida.tools.importexport.common.exceptions.CorruptArchive`: if the archive misses files or files have
        incorrect formats
    """
    # pylint: disable=fixme
    if nodes_export_subfolder:
        if not isinstance(nodes_export_subfolder, str):
            raise TypeError('nodes_export_subfolder must be a string')
    else:
        nodes_export_subfolder = NODES_EXPORT_SUBFOLDER

    try:
        with zipfile.ZipFile(infile, 'r', allowZip64=True) as handle:
            if not handle.namelist():
                raise CorruptArchive('no files detected in archive')

            file_iterator = get_file_iterator(file_handle=handle, folderpath=folder.abspath, **kwargs)
            for membername in file_iterator:
                # Check that we are only exporting nodes within the subfolder!
                # TODO: better check such that there are no .. in the
                # path; use probably the folder limit checks
                if not membername.startswith(nodes_export_subfolder + os.sep):
                    continue
                update_description(membername, file_iterator)
                handle.extract(path=folder.abspath, member=membername)
    except zipfile.BadZipfile:
        raise ValueError('The input file format for import is not valid (not a zip file)')

    close_progress_bar(leave=False)
def get_file_iterator(file_handle, folderpath, silent=True, **kwargs):  # pylint: disable=unused-argument
    """Extract the JSON files, then return a new file iterator over the archive members.

    :param file_handle: A file handle returned from `with open() as file_handle:`.
    :type file_handle: `tarfile.TarFile`, `zipfile.ZipFile`
    :param folderpath: Path to folder.
    :type folderpath: str
    :param silent: suppress progress bar.
    :type silent: bool

    :return: List of filenames in the archive, wrapped in the `tqdm` progress bar.
    :rtype: `tqdm.tqdm`
    """
    json_files = {'metadata.json', 'data.json'}

    if isinstance(file_handle, tarfile.TarFile):
        file_format = 'tar'
    elif isinstance(file_handle, zipfile.ZipFile):
        file_format = 'zip'
    else:
        raise TypeError('Can only handle Tar or Zip files.')

    close_progress_bar(leave=False)
    file_iterator = get_progress_bar(iterable=json_files, leave=False, disable=silent)
    for json_file in file_iterator:
        update_description(json_file, file_iterator)
        try:
            if file_format == 'tar':
                file_handle.extract(path=folderpath, member=file_handle.getmember(json_file))
            else:
                file_handle.extract(path=folderpath, member=json_file)
        except KeyError:
            raise CorruptArchive('required file `{}` is not included'.format(json_file))

    close_progress_bar(leave=False)
    if file_format == 'tar':
        return get_progress_bar(iterable=file_handle.getmembers(), unit='files', leave=False, disable=silent)
    # zip
    return get_progress_bar(iterable=file_handle.namelist(), unit='files', leave=False, disable=silent)
def data_migration_legacy_process_attributes(data):
    """Apply migration 0040 - REV. 1.0.40

    Data migration for some legacy process attributes.

    Attribute keys that are renamed:

    * `_sealed` -> `sealed`

    Attribute keys that are removed entirely:

    * `_finished`
    * `_failed`
    * `_aborted`
    * `_do_abort`

    Finally, after these first migrations, any remaining process nodes are screened for the existence of the
    `process_state` attribute. If they have it, it is checked whether the state is active or not. If it is not
    active, the `sealed` attribute is created and set to `True`.

    :raises `~aiida.tools.importexport.common.exceptions.CorruptArchive`: if a Node, found to have attributes,
        cannot be found in the list of exported entities.
    :raises `~aiida.tools.importexport.common.exceptions.CorruptArchive`: if the 'sealed' attribute does not exist and
        the ProcessNode is in an active state, i.e. `process_state` is one of ('created', 'running', 'waiting').
        A log-file, listing all illegal ProcessNodes, will be produced in the current directory.
    """
    from aiida.tools.importexport.common.exceptions import CorruptArchive
    from aiida.manage.database.integrity import write_database_integrity_violation

    attrs_to_remove = ['_sealed', '_finished', '_failed', '_aborted', '_do_abort']
    active_states = {'created', 'running', 'waiting'}
    illegal_cases = []

    for node_pk, content in data['node_attributes'].items():
        try:
            if data['export_data']['Node'][node_pk]['node_type'].startswith('process.'):
                # Check if the ProcessNode has a 'process_state' attribute, and if it's non-active.
                # Raise if the ProcessNode is in an active state, otherwise set `'sealed' = True`
                process_state = content.get('process_state', '')
                if process_state in active_states:
                    # The ProcessNode is in an active state, and should therefore never have been allowed
                    # to be exported. The Node will be added to a log that is saved in the working directory,
                    # then a CorruptArchive will be raised, since the archive needs to be migrated manually.
                    uuid_pk = data['export_data']['Node'][node_pk].get('uuid', node_pk)
                    illegal_cases.append([uuid_pk, process_state])
                    continue  # No reason to do more now

                # Either the ProcessNode is in a non-active state or its 'process_state' hasn't been set.
                # In both cases we claim the ProcessNode 'sealed' and make it importable.
                content['sealed'] = True

                # Remove attributes
                for attr in attrs_to_remove:
                    content.pop(attr, None)
        except KeyError as exc:
            raise CorruptArchive(f'Your export archive is corrupt! Original exception: {exc}')

    if illegal_cases:
        headers = ['UUID/PK', 'process_state']
        warning_message = (
            'Found ProcessNodes with active process states that should never have been allowed to be exported.'
        )
        write_database_integrity_violation(illegal_cases, headers, warning_message)

        raise CorruptArchive(
            'Your export archive is corrupt! Please see the log-file in your current directory for more details.'
        )
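# Illustrative sketch of the transformation performed by the migration above,
# run on a minimal hand-built `data` dictionary (not real archive content):
# the legacy `_sealed`/`_finished` keys are dropped and `sealed` is set to
# True, because 'finished' is not an active process state.
data = {
    'export_data': {
        'Node': {'1': {'node_type': 'process.calculation.calcjob.CalcJobNode.', 'uuid': 'abc'}}
    },
    'node_attributes': {'1': {'_sealed': True, '_finished': True, 'process_state': 'finished'}},
}
data_migration_legacy_process_attributes(data)
assert data['node_attributes']['1'] == {'process_state': 'finished', 'sealed': True}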
def extract_zip(infile, folder, nodes_export_subfolder=None, silent=False):
    """
    Extract the nodes to be imported from a zip file.

    :param infile: file path
    :type infile: str
    :param folder: a temporary folder used to extract the file tree
    :type folder: :py:class:`~aiida.common.folders.SandboxFolder`
    :param nodes_export_subfolder: name of the subfolder for AiiDA nodes
    :type nodes_export_subfolder: str
    :param silent: suppress debug print
    :type silent: bool

    :raises TypeError: if parameter types are not respected
    :raises `~aiida.tools.importexport.common.exceptions.CorruptArchive`: if the archive misses files or files have
        incorrect formats
    """
    # pylint: disable=fixme
    if not silent:
        print('READING DATA AND METADATA...')

    if nodes_export_subfolder:
        if not isinstance(nodes_export_subfolder, str):
            raise TypeError('nodes_export_subfolder must be a string')
    else:
        nodes_export_subfolder = NODES_EXPORT_SUBFOLDER

    try:
        with zipfile.ZipFile(infile, 'r', allowZip64=True) as handle:
            if not handle.namelist():
                raise CorruptArchive('no files detected')

            try:
                handle.extract(path=folder.abspath, member='metadata.json')
            except KeyError:
                raise CorruptArchive('required file `metadata.json` is not included')

            try:
                handle.extract(path=folder.abspath, member='data.json')
            except KeyError:
                raise CorruptArchive('required file `data.json` is not included')

            if not silent:
                print('EXTRACTING NODE DATA...')

            for membername in handle.namelist():
                # Check that we are only exporting nodes within the subfolder!
                # TODO: better check such that there are no .. in the
                # path; use probably the folder limit checks
                if not membername.startswith(nodes_export_subfolder + os.sep):
                    continue
                handle.extract(path=folder.abspath, member=membername)
    except zipfile.BadZipfile:
        raise ValueError('The input file format for import is not valid (not a zip file)')
def extract_tar(infile, folder, nodes_export_subfolder=None, silent=False):
    """
    Extract the nodes to be imported from a (possibly zipped) tar file.

    :param infile: file path
    :type infile: str
    :param folder: a temporary folder used to extract the file tree
    :type folder: :py:class:`~aiida.common.folders.SandboxFolder`
    :param nodes_export_subfolder: name of the subfolder for AiiDA nodes
    :type nodes_export_subfolder: str
    :param silent: suppress debug print
    :type silent: bool

    :raises TypeError: if parameter types are not respected
    :raises `~aiida.tools.importexport.common.exceptions.CorruptArchive`: if the archive misses files or files have
        incorrect formats
    """
    # pylint: disable=fixme
    if not silent:
        print('READING DATA AND METADATA...')

    if nodes_export_subfolder:
        if not isinstance(nodes_export_subfolder, str):
            raise TypeError('nodes_export_subfolder must be a string')
    else:
        nodes_export_subfolder = NODES_EXPORT_SUBFOLDER

    try:
        with tarfile.open(infile, 'r:*', format=tarfile.PAX_FORMAT) as handle:
            try:
                handle.extract(path=folder.abspath, member=handle.getmember('metadata.json'))
            except KeyError:
                raise CorruptArchive('required file `metadata.json` is not included')

            try:
                handle.extract(path=folder.abspath, member=handle.getmember('data.json'))
            except KeyError:
                raise CorruptArchive('required file `data.json` is not included')

            if not silent:
                print('EXTRACTING NODE DATA...')

            for member in handle.getmembers():
                if member.isdev():
                    # safety: skip if character device, block device or FIFO
                    print('WARNING, device found inside the import file: {}'.format(member.name), file=sys.stderr)
                    continue
                if member.issym() or member.islnk():
                    # safety: in export, I set dereference=True therefore
                    # there should be no symbolic or hard links.
                    print('WARNING, link found inside the import file: {}'.format(member.name), file=sys.stderr)
                    continue
                # Check that we are only exporting nodes within the subfolder!
                # TODO: better check such that there are no .. in the
                # path; use probably the folder limit checks
                if not member.name.startswith(nodes_export_subfolder + os.sep):
                    continue
                handle.extract(path=folder.abspath, member=member)
    except tarfile.ReadError:
        raise ValueError('The input file format for import is not valid (not a tar file)')
def extract_tar(infile, folder, nodes_export_subfolder=None, check_files=('data.json', 'metadata.json'), **kwargs):
    """
    Extract the nodes to be imported from a (possibly zipped) tar file.

    :param infile: file path
    :type infile: str
    :param folder: a temporary folder used to extract the file tree
    :type folder: :py:class:`~aiida.common.folders.SandboxFolder`
    :param nodes_export_subfolder: name of the subfolder for AiiDA nodes
    :type nodes_export_subfolder: str
    :param check_files: list of files to check are present
    :param silent: suppress progress bar
    :type silent: bool

    :raises TypeError: if parameter types are not respected
    :raises `~aiida.tools.importexport.common.exceptions.CorruptArchive`: if the archive misses files or files have
        incorrect formats
    """
    warnings.warn(
        'extract_tar function is deprecated and will be removed in AiiDA v2.0.0, '
        'use extract_tree in the archive-path package instead', AiidaDeprecationWarning
    )  # pylint: disable=no-member

    if nodes_export_subfolder:
        if not isinstance(nodes_export_subfolder, str):
            raise TypeError('nodes_export_subfolder must be a string')
    else:
        nodes_export_subfolder = NODES_EXPORT_SUBFOLDER

    if not kwargs.get('silent', False):
        set_progress_bar_tqdm(unit='files')
    else:
        set_progress_reporter(None)

    data_files = set()

    try:
        with tarfile.open(infile, 'r:*', format=tarfile.PAX_FORMAT) as handle:
            members = handle.getmembers()
            if len(members) == 1 and members[0].size == 0:
                raise CorruptArchive('no files detected in archive')

            with get_progress_reporter()(total=len(members)) as progress:
                for member in members:
                    progress.update()
                    if member.isdev():
                        # safety: skip if character device, block device or FIFO
                        print(f'WARNING, device found inside the import file: {member.name}', file=sys.stderr)
                        continue
                    if member.issym() or member.islnk():
                        # safety: in export, I set dereference=True therefore
                        # there should be no symbolic or hard links.
                        print(f'WARNING, symlink found inside the import file: {member.name}', file=sys.stderr)
                        continue
                    # Check that we are only exporting nodes within the subfolder!
                    # better check such that there are no .. in the
                    # path; use probably the folder limit checks
                    if member.name in check_files:
                        data_files.add(member.name)
                    elif not member.name.startswith(nodes_export_subfolder + os.sep):
                        continue
                    _update_description(member.name, progress)
                    handle.extract(path=folder.abspath, member=member)
    except tarfile.ReadError:
        raise ValueError('The input file format for import is not valid (not a tar file)')

    for name in check_files:
        if name not in data_files:
            raise CorruptArchive(f'Archive missing required file: {name}')
def extract_zip(infile, folder, nodes_export_subfolder=None, check_files=('data.json', 'metadata.json'), **kwargs):
    """Extract the nodes to be imported from a zip file.

    :param infile: file path
    :type infile: str
    :param folder: a temporary folder used to extract the file tree
    :type folder: :py:class:`~aiida.common.folders.SandboxFolder`
    :param nodes_export_subfolder: name of the subfolder for AiiDA nodes
    :type nodes_export_subfolder: str
    :param check_files: list of files to check are present
    :param silent: suppress progress bar
    :type silent: bool

    :raises TypeError: if parameter types are not respected
    :raises `~aiida.tools.importexport.common.exceptions.CorruptArchive`: if the archive misses files or files have
        incorrect formats
    """
    warnings.warn(
        'extract_zip function is deprecated and will be removed in AiiDA v2.0.0, '
        'use extract_tree in the archive-path package instead', AiidaDeprecationWarning
    )  # pylint: disable=no-member

    if nodes_export_subfolder:
        if not isinstance(nodes_export_subfolder, str):
            raise TypeError('nodes_export_subfolder must be a string')
    else:
        nodes_export_subfolder = NODES_EXPORT_SUBFOLDER

    if not kwargs.get('silent', False):
        set_progress_bar_tqdm(unit='files')
    else:
        set_progress_reporter(None)

    data_files = set()

    try:
        with zipfile.ZipFile(infile, 'r', allowZip64=True) as handle:
            members = handle.namelist()
            if not members:
                raise CorruptArchive('no files detected in archive')

            with get_progress_reporter()(total=len(members)) as progress:
                for membername in members:
                    progress.update()
                    # Check that we are only exporting nodes within the subfolder!
                    # better check such that there are no .. in the
                    # path; use probably the folder limit checks
                    if membername in check_files:
                        data_files.add(membername)
                    elif not membername.startswith(nodes_export_subfolder + os.sep):
                        continue
                    _update_description(membername, progress)
                    handle.extract(path=folder.abspath, member=membername)
    except zipfile.BadZipfile:
        raise ValueError('The input file format for import is not valid (not a zip file)')

    for name in check_files:
        if name not in data_files:
            raise CorruptArchive(f'Archive missing required file: {name}')
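# Hedged sketch of the replacement suggested by the deprecation warnings
# above: the `archive-path` package's extract_tree, with arguments mirrored
# from the `_extract_archive` methods earlier in this module. The archive
# and output paths are hypothetical.
import tarfile
from pathlib import Path

from archive_path import TarPath, ZipPath

out = Path('/tmp/extracted')
ZipPath('/path/to/export.zip', mode='r', allow_zip64=True).extract_tree(out)
TarPath('/path/to/export.tar.gz', mode='r:*', pax_format=tarfile.PAX_FORMAT).extract_tree(
    out, allow_dev=False, allow_symlink=False
)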
def export_version(self) -> str:
    metadata = self._get_metadata()
    if 'export_version' not in metadata:
        raise CorruptArchive('export_version missing from metadata.json')
    return metadata['export_version']
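# Hedged usage sketch: reading the version (and full metadata) from an
# archive before deciding whether a migration is needed. The reader class
# name and import path are assumptions, as are context-manager support
# (suggested by `assert_within_context` above) and `export_version` being
# exposed as a property rather than a plain method.
from aiida.tools.importexport.archive.readers import ReaderJsonZip  # path assumed

with ReaderJsonZip('/path/to/export.aiida') as reader:
    print(reader.export_version)
    print(reader.metadata)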