def get_use_cache(*, identifier=None):
    """Return whether the caching mechanism should be used for the given process type according to the configuration.

    :param identifier: process type string of the node
    :type identifier: str
    :return: boolean, True if caching is enabled, False otherwise
    :raises: `~aiida.common.exceptions.ConfigurationError` if the configuration is invalid, either due to a general
        configuration error, or because the identifier matches both an enabled and a disabled pattern
    """
    type_check(identifier, str, allow_none=True)

    if identifier is not None:
        enable_matches = [
            pattern for pattern in _CONFIG[ConfigKeys.ENABLED.value]
            if _match_wildcard(string=identifier, pattern=pattern)
        ]
        disable_matches = [
            pattern for pattern in _CONFIG[ConfigKeys.DISABLED.value]
            if _match_wildcard(string=identifier, pattern=pattern)
        ]

        if enable_matches and disable_matches:
            # If both enable and disable have matching patterns, we search for the most specific one. This is
            # determined by checking whether all other patterns match the specific pattern.
            PatternWithResult = namedtuple('PatternWithResult', ['pattern', 'use_cache'])
            most_specific = []
            for specific_pattern in enable_matches:
                if all(
                    _match_wildcard(string=specific_pattern, pattern=other_pattern)
                    for other_pattern in enable_matches + disable_matches
                ):
                    most_specific.append(PatternWithResult(pattern=specific_pattern, use_cache=True))
            for specific_pattern in disable_matches:
                if all(
                    _match_wildcard(string=specific_pattern, pattern=other_pattern)
                    for other_pattern in enable_matches + disable_matches
                ):
                    most_specific.append(PatternWithResult(pattern=specific_pattern, use_cache=False))

            if len(most_specific) > 1:
                raise exceptions.ConfigurationError((
                    'Invalid configuration: multiple matches for identifier {}'
                    ', but the most specific identifier is not unique. Candidates: {}'
                ).format(identifier, [match.pattern for match in most_specific]))
            if not most_specific:
                raise exceptions.ConfigurationError(
                    'Invalid configuration: multiple matches for identifier {}, but none of them is most specific.'.
                    format(identifier)
                )
            return most_specific[0].use_cache
        if enable_matches:
            return True
        if disable_matches:
            return False
    return _CONFIG[ConfigKeys.DEFAULT.value]
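# A self-contained sketch of the "most specific pattern" rule implemented above, assuming
# `_match_wildcard` behaves like `fnmatch.fnmatch` (an assumption for illustration only; the
# identifier and patterns below are hypothetical). A disabled exact identifier overrides a broader
# enabled wildcard, because every other matching pattern also matches the exact identifier.
from fnmatch import fnmatch

enabled = ['aiida.calculations:*']
disabled = ['aiida.calculations:arithmetic.add']
identifier = 'aiida.calculations:arithmetic.add'

enable_matches = [p for p in enabled if fnmatch(identifier, p)]
disable_matches = [p for p in disabled if fnmatch(identifier, p)]

# The exact identifier is the unique most specific match, so caching would be disabled here
assert all(fnmatch(disabled[0], other) for other in enable_matches + disable_matches)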
def _get_config(config_file):
    """Return the caching configuration.

    :param config_file: the absolute path to the caching configuration file
    :return: the configuration dictionary
    """
    from aiida.manage.configuration import get_profile
    from aiida.plugins.entry_point import is_valid_entry_point_string, load_entry_point_from_string

    profile = get_profile()

    if profile is None:
        raise exceptions.ConfigurationError('no profile has been loaded')

    try:
        with open(config_file, 'r', encoding='utf8') as handle:
            config = yaml.safe_load(handle)[profile.name]
    except (OSError, IOError, KeyError):
        # No config file, or no config for this profile
        return DEFAULT_CONFIG

    # Validate configuration
    for key in config:
        if key not in DEFAULT_CONFIG:
            raise ValueError("Configuration error: Invalid key '{}' in cache_config.yml".format(key))

    # Add defaults where a key is either completely missing or specifies no values, in which case it will be `None`
    for key, default_config in DEFAULT_CONFIG.items():
        if key not in config or config[key] is None:
            config[key] = default_config

    # Validate the entry point identifiers
    for key in [ConfigKeys.ENABLED.value, ConfigKeys.DISABLED.value]:

        # If the key is defined in the file but contains no values, it will be `None`
        if config[key] is None:
            continue

        for identifier in config[key]:
            if not is_valid_entry_point_string(identifier):
                raise exceptions.ConfigurationError(
                    "entry point '{}' in 'cache_config.yml' is not a valid entry point string.".format(identifier)
                )

            try:
                load_entry_point_from_string(identifier)
            except exceptions.EntryPointError as exception:
                raise exceptions.ConfigurationError(
                    "entry point '{}' in 'cache_config.yml' cannot be loaded: {}.".format(identifier, exception)
                )

    return config
def _get_config(config_file):
    """Return the caching configuration.

    :param config_file: the absolute path to the caching configuration file
    :return: the configuration dictionary
    """
    from aiida.manage.configuration import get_profile

    profile = get_profile()

    if profile is None:
        raise exceptions.ConfigurationError('no profile has been loaded')

    try:
        with open(config_file, 'r', encoding='utf8') as handle:
            config = yaml.safe_load(handle)[profile.name]
    except (OSError, IOError, KeyError):
        # No config file, or no config for this profile
        return DEFAULT_CONFIG

    # Validate configuration
    for key in config:
        if key not in DEFAULT_CONFIG:
            raise exceptions.ConfigurationError(
                "Configuration error: Invalid key '{}' in cache_config.yml".format(key)
            )

    # Add defaults where a key is either completely missing or specifies no values, in which case it will be `None`
    for key, default_config in DEFAULT_CONFIG.items():
        if key not in config or config[key] is None:
            config[key] = default_config

    try:
        type_check(config[ConfigKeys.DEFAULT.value], bool)
        type_check(config[ConfigKeys.ENABLED.value], list)
        type_check(config[ConfigKeys.DISABLED.value], list)
    except TypeError as exc:
        raise exceptions.ConfigurationError('Invalid type in caching configuration file.') from exc

    # Check validity of enabled and disabled entries
    try:
        for identifier in config[ConfigKeys.ENABLED.value] + config[ConfigKeys.DISABLED.value]:
            _validate_identifier_pattern(identifier=identifier)
    except ValueError as exc:
        raise exceptions.ConfigurationError('Invalid identifier pattern in enable or disable list.') from exc

    return config
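# For reference, a hypothetical `cache_config.yml` that the `_get_config` variants above would
# parse for a profile named `default`; the `default`/`enabled`/`disabled` key names are assumed
# to mirror the `ConfigKeys` enum used in both functions.
import yaml

example_config = yaml.safe_load("""
default:
  default: false
  enabled:
    - aiida.calculations:arithmetic.add
  disabled: []
""")
assert example_config['default']['enabled'] == ['aiida.calculations:arithmetic.add']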
def get(self, computer, user):
    """
    Return an AuthInfo given a computer and a user

    :param computer: a Computer instance
    :param user: a User instance
    :return: an AuthInfo object associated with the given computer and user
    :raise aiida.common.exceptions.NotExistent: if the user is not configured to use the computer
    :raise aiida.common.exceptions.ConfigurationError: if the user is configured more than once to use the
        computer; should never happen
    """
    from django.core.exceptions import ObjectDoesNotExist, MultipleObjectsReturned

    try:
        authinfo = DbAuthInfo.objects.get(dbcomputer=computer.dbcomputer, aiidauser=user.id)
        return self.from_dbmodel(authinfo)
    except ObjectDoesNotExist:
        raise exceptions.NotExistent(
            "The aiida user {} is not configured to use computer {}".format(user.email, computer.name)
        )
    except MultipleObjectsReturned:
        raise exceptions.ConfigurationError(
            "The aiida user {} is configured more than once to use "
            "computer {}! Only one configuration is allowed".format(user.email, computer.name)
        )
def load_config(create=False):
    """Instantiate Config object representing an AiiDA configuration file.

    Warning: Contrary to :func:`~aiida.manage.configuration.get_config`, this function is uncached and will always
    create a new Config object. You may want to call :func:`~aiida.manage.configuration.get_config` instead.

    :param create: if True, will create the configuration file if it does not already exist
    :type create: bool
    :return: the config
    :rtype: :class:`~aiida.manage.configuration.config.Config`
    :raises aiida.common.MissingConfigurationError: if the configuration file could not be found and create=False
    """
    import os
    from aiida.common import exceptions
    from .config import Config

    filepath = get_config_path()

    if not os.path.isfile(filepath) and not create:
        raise exceptions.MissingConfigurationError('configuration file {} does not exist'.format(filepath))

    try:
        config = Config.from_file(filepath)
    except ValueError:
        raise exceptions.ConfigurationError('configuration file {} contains invalid JSON'.format(filepath))

    return config
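# A minimal usage sketch for `load_config`, assuming the import path suggested by the docstring
# references (an assumption) and an AiiDA installation where the configuration folder exists.
from aiida.manage.configuration import load_config

config = load_config(create=True)  # create the configuration file first if it does not exist
print(config.filepath)  # assumed attribute exposing the path the Config was loaded from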
def get(self, computer, user):
    """
    Return a SqlaAuthInfo given a computer and a user

    :param computer: a Computer instance
    :param user: a User instance
    :return: an AuthInfo object associated with the given computer and user
    :raise aiida.common.exceptions.NotExistent: if the user is not configured to use the computer
    :raise aiida.common.exceptions.ConfigurationError: if the user is configured more than once to use the
        computer; should never happen
    """
    from aiida.backends.sqlalchemy.models.authinfo import DbAuthInfo
    from aiida.backends.sqlalchemy import get_scoped_session
    from sqlalchemy.orm.exc import MultipleResultsFound, NoResultFound

    session = get_scoped_session()

    try:
        authinfo = session.query(DbAuthInfo).filter_by(
            dbcomputer_id=computer.id,
            aiidauser_id=user.id,
        ).one()
        return self.from_dbmodel(authinfo)
    except NoResultFound:
        raise exceptions.NotExistent(
            "The aiida user {} is not configured to use computer {}".format(user.email, computer.name)
        )
    except MultipleResultsFound:
        raise exceptions.ConfigurationError(
            "The aiida user {} is configured more than once to use "
            "computer {}! Only one configuration is allowed".format(user.email, computer.name)
        )
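# A usage sketch covering both backend `get` implementations above, assuming an AiiDA 1.x API,
# a loaded profile, and an existing computer labelled 'localhost' (illustrative names only).
from aiida import orm
from aiida.common import exceptions

computer = orm.load_computer('localhost')
user = orm.User.objects.get_default()

try:
    authinfo = computer.get_authinfo(user)  # delegates to the backend `get` shown above
except exceptions.NotExistent:
    print('user {} is not configured to use computer {}'.format(user.email, computer.label))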
def configure_repository(self):
    """Validates the configured repository and, in the case of a file system repo, makes sure the folder exists."""
    import errno

    try:
        os.makedirs(self.repository_path)
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            raise exceptions.ConfigurationError(
                'could not create the configured repository `{}`: {}'.format(self.repository_path, str(exception))
            )
def get_transport_class(self):
    """
    Get the transport class for this computer. Can be used to instantiate a transport instance.

    :return: the transport class
    """
    try:
        return TransportFactory(self.transport_type)
    except exceptions.EntryPointError as exception:
        raise exceptions.ConfigurationError(
            f'No transport found for {self.label} [type {self.transport_type}], message: {exception}'
        )
def _parse_repository_uri(self):
    """
    This function validates the REPOSITORY_URI, which should be in the format protocol://address

    :note: At the moment, only the file protocol is supported.

    :return: a tuple (protocol, address).
    """
    import uritools

    parts = uritools.urisplit(self.repository_uri)

    if parts.scheme != u'file':
        raise exceptions.ConfigurationError('invalid repository protocol, only the local `file://` is supported')

    if not os.path.isabs(parts.path):
        raise exceptions.ConfigurationError('invalid repository URI: the path has to be absolute')

    return parts.scheme, os.path.expanduser(parts.path)
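# Quick illustration of the URI splitting relied on above: `uritools.urisplit` returns a split
# result whose `scheme` and `path` attributes are the ones `_parse_repository_uri` validates.
import uritools

parts = uritools.urisplit('file:///home/aiida/.aiida/repository')
assert parts.scheme == 'file'
assert parts.path == '/home/aiida/.aiida/repository'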
def load_config(create=False):
    """Instantiate the Config object representing the configuration file of the current AiiDA instance.

    :param create: if True, will create the configuration file if it does not already exist
    :type create: bool
    :return: the config
    :rtype: :class:`~aiida.manage.configuration.config.Config`
    :raises aiida.common.MissingConfigurationError: if the configuration file could not be found and create=False
    """
    import os
    from aiida.common import exceptions
    from .config import Config
    from .settings import AIIDA_CONFIG_FOLDER, DEFAULT_CONFIG_FILE_NAME

    filepath = os.path.join(AIIDA_CONFIG_FOLDER, DEFAULT_CONFIG_FILE_NAME)

    if IN_RT_DOC_MODE:
        # The following is a dummy config.json configuration that is used for the proper compilation of the
        # documentation on readthedocs.
        from aiida.manage.external.postgres import DEFAULT_DBINFO
        import tempfile
        return Config(
            tempfile.mkstemp()[1], {
                'default_profile': 'default',
                'profiles': {
                    'default': {
                        'AIIDADB_ENGINE': 'postgresql_psycopg2',
                        'AIIDADB_BACKEND': 'django',
                        'AIIDADB_HOST': DEFAULT_DBINFO['host'],
                        'AIIDADB_PORT': DEFAULT_DBINFO['port'],
                        'AIIDADB_NAME': 'aiidadb',
                        'AIIDADB_PASS': '******',
                        'default_user_email': '*****@*****.**',
                        'TIMEZONE': 'Europe/Zurich',
                        'AIIDADB_REPOSITORY_URI': 'file:///tmp/repository',
                        'AIIDADB_USER': '******'
                    }
                }
            }
        )

    if not os.path.isfile(filepath) and not create:
        raise exceptions.MissingConfigurationError('configuration file {} does not exist'.format(filepath))

    try:
        config = Config.from_file(filepath)
    except ValueError:
        raise exceptions.ConfigurationError('configuration file {} contains invalid JSON'.format(filepath))

    return config
def get_transport_class(self):
    """
    Get the transport class for this computer. Can be used to instantiate a transport instance.

    :return: the transport class
    """
    try:
        return TransportFactory(self.get_transport_type())
    except exceptions.EntryPointError as exception:
        raise exceptions.ConfigurationError(
            'No transport found for {} [type {}], message: {}'.format(
                self.name, self.get_transport_type(), exception
            )
        )
def get_scheduler(self):
    """
    Get a scheduler instance for this computer

    :return: the scheduler instance
    :rtype: :class:`aiida.schedulers.Scheduler`
    """
    try:
        scheduler_class = SchedulerFactory(self.scheduler_type)
        # I call the init without any parameter
        return scheduler_class()
    except exceptions.EntryPointError as exception:
        raise exceptions.ConfigurationError(
            f'No scheduler found for {self.label} [type {self.scheduler_type}], message: {exception}'
        )
def get_transport(self):
    """Return a fully configured transport that can be used to connect to the computer set for this instance.

    :rtype: :class:`aiida.transports.Transport`
    """
    computer = self.computer
    transport_type = computer.get_transport_type()

    try:
        transport_class = TransportFactory(transport_type)
    except exceptions.EntryPointError as exception:
        raise exceptions.ConfigurationError(
            'transport type `{}` could not be loaded: {}'.format(transport_type, exception)
        )

    return transport_class(machine=computer.hostname, **self.get_auth_params())
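# How the transport machinery above is typically exercised, as a hedged sketch assuming an
# AiiDA 1.x API and a configured computer (the 'localhost' label is illustrative): the factory
# loads the class from the entry point and the stored AuthInfo supplies the connection parameters.
from aiida import orm

computer = orm.load_computer('localhost')
authinfo = computer.get_authinfo(orm.User.objects.get_default())

with authinfo.get_transport() as transport:  # the transport opens the connection on entry
    print(transport.whoami())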
def upload_calculation(node, transport, calc_info, script_filename, dry_run=False):
    """Upload a `CalcJob` instance

    :param node: the `CalcJobNode`.
    :param transport: an already opened transport to use to submit the calculation.
    :param calc_info: the calculation info datastructure returned by `CalcJobNode.presubmit`
    :param script_filename: the job launch script returned by `CalcJobNode.presubmit`
    :return: tuple of ``calc_info`` and ``script_filename``
    """
    from logging import LoggerAdapter
    from tempfile import NamedTemporaryFile
    from aiida.orm import load_node, Code, RemoteData

    # If the calculation already has a `remote_folder`, simply return. The upload was apparently already completed
    # before, which can happen if the daemon is restarted and it shuts down after uploading but before getting the
    # chance to perform the state transition. Upon reloading this calculation, it will re-attempt the upload.
    link_label = 'remote_folder'
    if node.get_outgoing(RemoteData, link_label_filter=link_label).first():
        execlogger.warning('CalcJobNode<{}> already has a `{}` output: skipping upload'.format(node.pk, link_label))
        return calc_info, script_filename

    computer = node.computer

    codes_info = calc_info.codes_info
    input_codes = [load_node(_.code_uuid, sub_classes=(Code,)) for _ in codes_info]

    logger_extra = get_dblogger_extra(node)
    transport.set_logger_extra(logger_extra)
    logger = LoggerAdapter(logger=execlogger, extra=logger_extra)

    if not dry_run and node.has_cached_links():
        raise ValueError(
            'Cannot submit calculation {} because it has cached input links! If you just want to test the '
            'submission, set `metadata.dry_run` to True in the inputs.'.format(node.pk)
        )

    folder = node._raw_input_folder

    # If we are performing a dry-run, the working directory should actually be a local folder that should already exist
    if dry_run:
        workdir = transport.getcwd()
    else:
        remote_user = transport.whoami()
        # TODO Doc: {username} field
        # TODO: if something is changed here, fix also 'verdi computer test'
        remote_working_directory = computer.get_workdir().format(username=remote_user)
        if not remote_working_directory.strip():
            raise exceptions.ConfigurationError(
                "[submission of calculation {}] No remote_working_directory configured for computer '{}'".format(
                    node.pk, computer.name
                )
            )

        # If it already exists, no exception is raised
        try:
            transport.chdir(remote_working_directory)
        except IOError:
            logger.debug(
                '[submission of calculation {}] Unable to chdir in {}, trying to create it'.format(
                    node.pk, remote_working_directory
                )
            )
            try:
                transport.makedirs(remote_working_directory)
                transport.chdir(remote_working_directory)
            except EnvironmentError as exc:
                raise exceptions.ConfigurationError(
                    '[submission of calculation {}] '
                    'Unable to create the remote directory {} on '
                    "computer '{}': {}".format(node.pk, remote_working_directory, computer.name, exc)
                )

        # Store remotely with sharding (here is where we choose the folder structure of remote jobs; then I store
        # this in the calculation properties using _set_remote_dir and I do not have to know the logic, but I just
        # need to read the absolute path from the calculation properties).
        transport.mkdir(calc_info.uuid[:2], ignore_existing=True)
        transport.chdir(calc_info.uuid[:2])
        transport.mkdir(calc_info.uuid[2:4], ignore_existing=True)
        transport.chdir(calc_info.uuid[2:4])

        try:
            # The final directory may already exist, most likely because this function was already executed once,
            # but failed and as a result was rescheduled by the engine. In this case it would be fine to delete the
            # folder and create it from scratch, except that we cannot be sure that this is the actual case.
            # Therefore, to err on the safe side, we move the folder to the lost+found directory before recreating
            # the folder from scratch
            transport.mkdir(calc_info.uuid[4:])
        except OSError:
            # Move the existing directory to lost+found, log a warning and create a clean directory anyway
            path_existing = os.path.join(transport.getcwd(), calc_info.uuid[4:])
            path_lost_found = os.path.join(remote_working_directory, REMOTE_WORK_DIRECTORY_LOST_FOUND)
            path_target = os.path.join(path_lost_found, calc_info.uuid)
            logger.warning(
                'tried to create path {} but it already exists, moving the entire folder to {}'.format(
                    path_existing, path_target
                )
            )

            # Make sure the lost+found directory exists, then copy the existing folder there and delete the original
            transport.mkdir(path_lost_found, ignore_existing=True)
            transport.copytree(path_existing, path_target)
            transport.rmtree(path_existing)

            # Now we can create a clean folder for this calculation
            transport.mkdir(calc_info.uuid[4:])
        finally:
            transport.chdir(calc_info.uuid[4:])

        # I store the workdir of the calculation for later file retrieval
        workdir = transport.getcwd()
        node.set_remote_workdir(workdir)

    # I first create the code files, so that the code can put default files to be overwritten by the plugin itself.
    # Still, beware! The code file itself could be overwritten... But I checked for this earlier.
    for code in input_codes:
        if code.is_local():
            # Note: this will possibly overwrite files
            for f in code.get_folder_list():
                transport.put(code.get_abs_path(f), f)
            transport.chmod(code.get_local_executable(), 0o755)  # rwxr-xr-x

    # In a dry_run, the working directory is the raw input folder, which will already contain these resources
    if not dry_run:
        for filename in folder.get_content_list():
            logger.debug('[submission of calculation {}] copying file/folder {}...'.format(node.pk, filename))
            transport.put(folder.get_abs_path(filename), filename)

    # local_copy_list is a list of tuples, each with (uuid, dest_rel_path)
    # NOTE: validation of these lists is done inside calculation.presubmit()
    local_copy_list = calc_info.local_copy_list or []
    remote_copy_list = calc_info.remote_copy_list or []
    remote_symlink_list = calc_info.remote_symlink_list or []

    for uuid, filename, target in local_copy_list:
        logger.debug('[submission of calculation {}] copying local file/folder to {}'.format(node.pk, target))

        try:
            data_node = load_node(uuid=uuid)
        except exceptions.NotExistent:
            logger.warning('failed to load Node<{}> specified in the `local_copy_list`'.format(uuid))
        else:
            # Note, once #2579 is implemented, use the `node.open` method instead of the named temporary file in
            # combination with the new `Transport.put_object_from_filelike`
            # Since the content of the node could potentially be binary, we read the raw bytes and pass them on
            with NamedTemporaryFile(mode='wb+') as handle:
                handle.write(data_node.get_object_content(filename, mode='rb'))
                handle.flush()
                handle.seek(0)
                transport.put(handle.name, target)

    if dry_run:
        if remote_copy_list:
            with open(os.path.join(workdir, '_aiida_remote_copy_list.txt'), 'w') as handle:
                for remote_computer_uuid, remote_abs_path, dest_rel_path in remote_copy_list:
                    handle.write(
                        'would have copied {} to {} in working directory on remote {}'.format(
                            remote_abs_path, dest_rel_path, computer.name
                        )
                    )

        if remote_symlink_list:
            with open(os.path.join(workdir, '_aiida_remote_symlink_list.txt'), 'w') as handle:
                for remote_computer_uuid, remote_abs_path, dest_rel_path in remote_symlink_list:
                    handle.write(
                        'would have created symlinks from {} to {} in working directory on remote {}'.format(
                            remote_abs_path, dest_rel_path, computer.name
                        )
                    )
    else:
        for (remote_computer_uuid, remote_abs_path, dest_rel_path) in remote_copy_list:
            if remote_computer_uuid == computer.uuid:
                logger.debug(
                    '[submission of calculation {}] copying {} remotely, directly on the machine {}'.format(
                        node.pk, dest_rel_path, computer.name
                    )
                )
                try:
                    transport.copy(remote_abs_path, dest_rel_path)
                except (IOError, OSError):
                    logger.warning(
                        '[submission of calculation {}] Unable to copy remote resource from {} to {}! '
                        'Stopping.'.format(node.pk, remote_abs_path, dest_rel_path)
                    )
                    raise
            else:
                raise NotImplementedError(
                    '[submission of calculation {}] Remote copy between two different machines is '
                    'not implemented yet'.format(node.pk)
                )

        for (remote_computer_uuid, remote_abs_path, dest_rel_path) in remote_symlink_list:
            if remote_computer_uuid == computer.uuid:
                logger.debug(
                    '[submission of calculation {}] copying {} remotely, directly on the machine {}'.format(
                        node.pk, dest_rel_path, computer.name
                    )
                )
                try:
                    transport.symlink(remote_abs_path, dest_rel_path)
                except (IOError, OSError):
                    logger.warning(
                        '[submission of calculation {}] Unable to create remote symlink from {} to {}! '
                        'Stopping.'.format(node.pk, remote_abs_path, dest_rel_path)
                    )
                    raise
            else:
                raise IOError(
                    'It is not possible to create a symlink between two different machines for '
                    'calculation {}'.format(node.pk)
                )

    if not dry_run:
        # Make sure that attaching the `remote_folder` with a link is the last thing we do. This gives the biggest
        # chance of making this method idempotent. That is to say, if a runner gets interrupted during this action,
        # it will simply retry the upload, unless we got here and managed to link it up, in which case we move to
        # the next task. Because in that case, the check for the existence of this link at the top of this function
        # will exit early from this command.
        remotedata = RemoteData(computer=computer, remote_path=workdir)
        remotedata.add_incoming(node, link_type=LinkType.CREATE, link_label='remote_folder')
        remotedata.store()

    return calc_info, script_filename
def submit_calculation(calculation, transport):
    """
    Submit a calculation

    :param calculation: the instance of JobCalculation to submit.
    :param transport: an already opened transport to use to submit the calculation.
    """
    from aiida.orm import Code
    from aiida.common.exceptions import InputValidationError
    from aiida.orm.data.remote import RemoteData

    computer = calculation.get_computer()

    if not computer.is_enabled():
        return

    logger_extra = get_dblogger_extra(calculation)
    transport._set_logger_extra(logger_extra)

    if calculation._has_cached_links():
        raise ValueError(
            "Cannot submit calculation {} because it has cached input links! If you just want to test the "
            "submission, use the test_submit() method, otherwise store all links first".format(calculation.pk)
        )

    s = computer.get_scheduler()
    s.set_transport(transport)

    with SandboxFolder() as folder:
        calcinfo, script_filename = calculation._presubmit(folder, use_unstored_links=False)

        codes_info = calcinfo.codes_info
        input_codes = [load_node(_.code_uuid, sub_class=Code) for _ in codes_info]

        for code in input_codes:
            if not code.can_run_on(computer):
                raise InputValidationError(
                    "The selected code {} for calculation {} cannot run on computer {}".format(
                        code.pk, calculation.pk, computer.name
                    )
                )

        # After this call, no modifications to the folder should be done
        calculation._store_raw_input_folder(folder.abspath)

        # NOTE: some logic is partially replicated in the 'test_submit' method of JobCalculation. If major logic
        # changes are done here, make sure to update also the test_submit routine
        remote_user = transport.whoami()
        # TODO Doc: {username} field
        # TODO: if something is changed here, fix also 'verdi computer test'
        remote_working_directory = computer.get_workdir().format(username=remote_user)
        if not remote_working_directory.strip():
            raise exceptions.ConfigurationError(
                "[submission of calculation {}] No remote_working_directory configured for "
                "computer '{}'".format(calculation.pk, computer.name)
            )

        # If it already exists, no exception is raised
        try:
            transport.chdir(remote_working_directory)
        except IOError:
            execlogger.debug(
                "[submission of calculation {}] Unable to chdir in {}, trying to create it".format(
                    calculation.pk, remote_working_directory
                ),
                extra=logger_extra
            )
            try:
                transport.makedirs(remote_working_directory)
                transport.chdir(remote_working_directory)
            except (IOError, OSError) as e:
                raise exceptions.ConfigurationError(
                    "[submission of calculation {}] Unable to create the remote directory {} on "
                    "computer '{}': {}".format(calculation.pk, remote_working_directory, computer.name, e)
                )

        # Store remotely with sharding (here is where we choose the folder structure of remote jobs; then I store
        # this in the calculation properties using _set_remote_dir and I do not have to know the logic, but I just
        # need to read the absolute path from the calculation properties).
        transport.mkdir(calcinfo.uuid[:2], ignore_existing=True)
        transport.chdir(calcinfo.uuid[:2])
        transport.mkdir(calcinfo.uuid[2:4], ignore_existing=True)
        transport.chdir(calcinfo.uuid[2:4])
        transport.mkdir(calcinfo.uuid[4:])
        transport.chdir(calcinfo.uuid[4:])
        workdir = transport.getcwd()

        # I store the workdir of the calculation for later file retrieval
        calculation._set_remote_workdir(workdir)

        # I first create the code files, so that the code can put default files to be overwritten by the plugin
        # itself. Still, beware! The code file itself could be overwritten... But I checked for this earlier.
        for code in input_codes:
            if code.is_local():
                # Note: this will possibly overwrite files
                for f in code.get_folder_list():
                    transport.put(code.get_abs_path(f), f)
                transport.chmod(code.get_local_executable(), 0o755)  # rwxr-xr-x

        # Copy all files, recursively with folders
        for f in folder.get_content_list():
            execlogger.debug(
                "[submission of calculation {}] copying file/folder {}...".format(calculation.pk, f),
                extra=logger_extra
            )
            transport.put(folder.get_abs_path(f), f)

        # local_copy_list is a list of tuples, each with (src_abs_path, dest_rel_path)
        # NOTE: validation of these lists is done inside calculation._presubmit()
        local_copy_list = calcinfo.local_copy_list
        remote_copy_list = calcinfo.remote_copy_list
        remote_symlink_list = calcinfo.remote_symlink_list

        if local_copy_list is not None:
            for src_abs_path, dest_rel_path in local_copy_list:
                execlogger.debug(
                    "[submission of calculation {}] copying local file/folder to {}".format(
                        calculation.pk, dest_rel_path
                    ),
                    extra=logger_extra
                )
                transport.put(src_abs_path, dest_rel_path)

        if remote_copy_list is not None:
            for (remote_computer_uuid, remote_abs_path, dest_rel_path) in remote_copy_list:
                if remote_computer_uuid == computer.uuid:
                    execlogger.debug(
                        "[submission of calculation {}] copying {} remotely, directly on the machine "
                        "{}".format(calculation.pk, dest_rel_path, computer.name)
                    )
                    try:
                        transport.copy(remote_abs_path, dest_rel_path)
                    except (IOError, OSError):
                        execlogger.warning(
                            "[submission of calculation {}] Unable to copy remote resource from {} to {}! "
                            "Stopping.".format(calculation.pk, remote_abs_path, dest_rel_path),
                            extra=logger_extra
                        )
                        raise
                else:
                    # TODO: implement copy between two different machines!
                    raise NotImplementedError(
                        "[presubmission of calculation {}] Remote copy between two different machines is "
                        "not implemented yet".format(calculation.pk)
                    )

        if remote_symlink_list is not None:
            for (remote_computer_uuid, remote_abs_path, dest_rel_path) in remote_symlink_list:
                if remote_computer_uuid == computer.uuid:
                    execlogger.debug(
                        "[submission of calculation {}] copying {} remotely, directly on the machine "
                        "{}".format(calculation.pk, dest_rel_path, computer.name)
                    )
                    try:
                        transport.symlink(remote_abs_path, dest_rel_path)
                    except (IOError, OSError):
                        execlogger.warning(
                            "[submission of calculation {}] Unable to create remote symlink from {} to {}! "
                            "Stopping.".format(calculation.pk, remote_abs_path, dest_rel_path),
                            extra=logger_extra
                        )
                        raise
                else:
                    raise IOError(
                        "It is not possible to create a symlink between two different machines for "
                        "calculation {}".format(calculation.pk)
                    )

        remotedata = RemoteData(computer=computer, remote_path=workdir)
        remotedata.add_link_from(calculation, label='remote_folder', link_type=LinkType.CREATE)
        remotedata.store()

        job_id = s.submit_from_script(transport.getcwd(), script_filename)
        calculation._set_job_id(job_id)
def upload_calculation(node, transport, calc_info, folder, inputs=None, dry_run=False):
    """Upload a `CalcJob` instance

    :param node: the `CalcJobNode`.
    :param transport: an already opened transport to use to submit the calculation.
    :param calc_info: the calculation info datastructure returned by `CalcJob.presubmit`
    :param folder: temporary local file system folder containing the inputs written by `CalcJob.prepare_for_submission`
    """
    # pylint: disable=too-many-locals,too-many-branches,too-many-statements
    from logging import LoggerAdapter
    from tempfile import NamedTemporaryFile
    from aiida.orm import load_node, Code, RemoteData

    # If the calculation already has a `remote_folder`, simply return. The upload was apparently already completed
    # before, which can happen if the daemon is restarted and it shuts down after uploading but before getting the
    # chance to perform the state transition. Upon reloading this calculation, it will re-attempt the upload.
    link_label = 'remote_folder'
    if node.get_outgoing(RemoteData, link_label_filter=link_label).first():
        execlogger.warning(f'CalcJobNode<{node.pk}> already has a `{link_label}` output: skipping upload')
        return calc_info

    computer = node.computer

    codes_info = calc_info.codes_info
    input_codes = [load_node(_.code_uuid, sub_classes=(Code,)) for _ in codes_info]

    logger_extra = get_dblogger_extra(node)
    transport.set_logger_extra(logger_extra)
    logger = LoggerAdapter(logger=execlogger, extra=logger_extra)

    if not dry_run and node.has_cached_links():
        raise ValueError(
            'Cannot submit calculation {} because it has cached input links! If you just want to test the '
            'submission, set `metadata.dry_run` to True in the inputs.'.format(node.pk)
        )

    # If we are performing a dry-run, the working directory should actually be a local folder that should already exist
    if dry_run:
        workdir = transport.getcwd()
    else:
        remote_user = transport.whoami()
        remote_working_directory = computer.get_workdir().format(username=remote_user)
        if not remote_working_directory.strip():
            raise exceptions.ConfigurationError(
                "[submission of calculation {}] No remote_working_directory configured for computer '{}'".format(
                    node.pk, computer.label
                )
            )

        # If it already exists, no exception is raised
        try:
            transport.chdir(remote_working_directory)
        except IOError:
            logger.debug(
                '[submission of calculation {}] Unable to chdir in {}, trying to create it'.format(
                    node.pk, remote_working_directory
                )
            )
            try:
                transport.makedirs(remote_working_directory)
                transport.chdir(remote_working_directory)
            except EnvironmentError as exc:
                raise exceptions.ConfigurationError(
                    '[submission of calculation {}] '
                    'Unable to create the remote directory {} on '
                    "computer '{}': {}".format(node.pk, remote_working_directory, computer.label, exc)
                )

        # Store remotely with sharding (here is where we choose the folder structure of remote jobs; then I store
        # this in the calculation properties using _set_remote_dir and I do not have to know the logic, but I just
        # need to read the absolute path from the calculation properties).
        transport.mkdir(calc_info.uuid[:2], ignore_existing=True)
        transport.chdir(calc_info.uuid[:2])
        transport.mkdir(calc_info.uuid[2:4], ignore_existing=True)
        transport.chdir(calc_info.uuid[2:4])

        try:
            # The final directory may already exist, most likely because this function was already executed once,
            # but failed and as a result was rescheduled by the engine. In this case it would be fine to delete the
            # folder and create it from scratch, except that we cannot be sure that this is the actual case.
            # Therefore, to err on the safe side, we move the folder to the lost+found directory before recreating
            # the folder from scratch
            transport.mkdir(calc_info.uuid[4:])
        except OSError:
            # Move the existing directory to lost+found, log a warning and create a clean directory anyway
            path_existing = os.path.join(transport.getcwd(), calc_info.uuid[4:])
            path_lost_found = os.path.join(remote_working_directory, REMOTE_WORK_DIRECTORY_LOST_FOUND)
            path_target = os.path.join(path_lost_found, calc_info.uuid)
            logger.warning(
                f'tried to create path {path_existing} but it already exists, moving the entire folder to {path_target}'
            )

            # Make sure the lost+found directory exists, then copy the existing folder there and delete the original
            transport.mkdir(path_lost_found, ignore_existing=True)
            transport.copytree(path_existing, path_target)
            transport.rmtree(path_existing)

            # Now we can create a clean folder for this calculation
            transport.mkdir(calc_info.uuid[4:])
        finally:
            transport.chdir(calc_info.uuid[4:])

        # I store the workdir of the calculation for later file retrieval
        workdir = transport.getcwd()
        node.set_remote_workdir(workdir)

    # I first create the code files, so that the code can put default files to be overwritten by the plugin itself.
    # Still, beware! The code file itself could be overwritten... But I checked for this earlier.
    for code in input_codes:
        if code.is_local():
            # Note: this will possibly overwrite files
            for filename in code.list_object_names():
                # Note, once #2579 is implemented, use the `node.open` method instead of the named temporary file in
                # combination with the new `Transport.put_object_from_filelike`
                # Since the content of the node could potentially be binary, we read the raw bytes and pass them on
                with NamedTemporaryFile(mode='wb+') as handle:
                    handle.write(code.get_object_content(filename, mode='rb'))
                    handle.flush()
                    transport.put(handle.name, filename)
            transport.chmod(code.get_local_executable(), 0o755)  # rwxr-xr-x

    # local_copy_list is a list of tuples, each with (uuid, dest_rel_path)
    # NOTE: validation of these lists is done inside calculation.presubmit()
    local_copy_list = calc_info.local_copy_list or []
    remote_copy_list = calc_info.remote_copy_list or []
    remote_symlink_list = calc_info.remote_symlink_list or []
    provenance_exclude_list = calc_info.provenance_exclude_list or []

    for uuid, filename, target in local_copy_list:
        logger.debug(f'[submission of calculation {node.uuid}] copying local file/folder to {target}')

        def find_data_node(inputs, uuid):
            """Find and return the node with the given UUID from a nested mapping of input nodes.

            :param inputs: (nested) mapping of nodes
            :param uuid: UUID of the node to find
            :return: instance of `Node` or `None` if not found
            """
            from collections.abc import Mapping
            data_node = None

            for input_node in inputs.values():
                if isinstance(input_node, Mapping):
                    data_node = find_data_node(input_node, uuid)
                elif isinstance(input_node, Node) and input_node.uuid == uuid:
                    data_node = input_node
                if data_node is not None:
                    break

            return data_node

        try:
            data_node = load_node(uuid=uuid)
        except exceptions.NotExistent:
            data_node = find_data_node(inputs, uuid)

        if data_node is None:
            logger.warning(f'failed to load Node<{uuid}> specified in the `local_copy_list`')
        else:
            dirname = os.path.dirname(target)
            if dirname:
                os.makedirs(os.path.join(folder.abspath, dirname), exist_ok=True)
            with folder.open(target, 'wb') as handle:
                with data_node.open(filename, 'rb') as source:
                    shutil.copyfileobj(source, handle)
            provenance_exclude_list.append(target)

    # In a dry_run, the working directory is the raw input folder, which will already contain these resources
    if not dry_run:
        for filename in folder.get_content_list():
            logger.debug(f'[submission of calculation {node.pk}] copying file/folder {filename}...')
            transport.put(folder.get_abs_path(filename), filename)

        for (remote_computer_uuid, remote_abs_path, dest_rel_path) in remote_copy_list:
            if remote_computer_uuid == computer.uuid:
                logger.debug(
                    '[submission of calculation {}] copying {} remotely, directly on the machine {}'.format(
                        node.pk, dest_rel_path, computer.label
                    )
                )
                try:
                    transport.copy(remote_abs_path, dest_rel_path)
                except (IOError, OSError):
                    logger.warning(
                        '[submission of calculation {}] Unable to copy remote resource from {} to {}! '
                        'Stopping.'.format(node.pk, remote_abs_path, dest_rel_path)
                    )
                    raise
            else:
                raise NotImplementedError(
                    '[submission of calculation {}] Remote copy between two different machines is '
                    'not implemented yet'.format(node.pk)
                )

        for (remote_computer_uuid, remote_abs_path, dest_rel_path) in remote_symlink_list:
            if remote_computer_uuid == computer.uuid:
                logger.debug(
                    '[submission of calculation {}] copying {} remotely, directly on the machine {}'.format(
                        node.pk, dest_rel_path, computer.label
                    )
                )
                try:
                    transport.symlink(remote_abs_path, dest_rel_path)
                except (IOError, OSError):
                    logger.warning(
                        '[submission of calculation {}] Unable to create remote symlink from {} to {}! '
                        'Stopping.'.format(node.pk, remote_abs_path, dest_rel_path)
                    )
                    raise
            else:
                raise IOError(
                    f'It is not possible to create a symlink between two different machines for calculation {node.pk}'
                )
    else:
        if remote_copy_list:
            with open(os.path.join(workdir, '_aiida_remote_copy_list.txt'), 'w') as handle:
                for remote_computer_uuid, remote_abs_path, dest_rel_path in remote_copy_list:
                    handle.write(
                        'would have copied {} to {} in working directory on remote {}'.format(
                            remote_abs_path, dest_rel_path, computer.label
                        )
                    )

        if remote_symlink_list:
            with open(os.path.join(workdir, '_aiida_remote_symlink_list.txt'), 'w') as handle:
                for remote_computer_uuid, remote_abs_path, dest_rel_path in remote_symlink_list:
                    handle.write(
                        'would have created symlinks from {} to {} in working directory on remote {}'.format(
                            remote_abs_path, dest_rel_path, computer.label
                        )
                    )

    # Loop recursively over content of the sandbox folder copying all that are not in `provenance_exclude_list`. Note
    # that directories are not created explicitly. The `node.put_object_from_filelike` call will create intermediate
    # directories for nested files automatically when needed. This means though that empty folders in the sandbox or
    # folders that would be empty when considering the `provenance_exclude_list` will *not* be copied to the repo. The
    # advantage of this explicit copying instead of deleting the files from `provenance_exclude_list` from the sandbox
    # first before moving the entire remaining content to the node's repository, is that in this way we are guaranteed
    # not to accidentally move files to the repository that should not go there at all cost. Note that all entries in
    # the provenance exclude list are normalized first, just as the paths that are in the sandbox folder, otherwise the
    # direct equality test may fail, e.g.: './path/file.txt' != 'path/file.txt' even though they reference the same file
    provenance_exclude_list = [os.path.normpath(entry) for entry in provenance_exclude_list]

    for root, _, filenames in os.walk(folder.abspath):
        for filename in filenames:
            filepath = os.path.join(root, filename)
            relpath = os.path.normpath(os.path.relpath(filepath, folder.abspath))
            if relpath not in provenance_exclude_list:
                with open(filepath, 'rb') as handle:
                    node._repository.put_object_from_filelike(handle, relpath, 'wb', force=True)  # pylint: disable=protected-access

    if not dry_run:
        # Make sure that attaching the `remote_folder` with a link is the last thing we do. This gives the biggest
        # chance of making this method idempotent. That is to say, if a runner gets interrupted during this action,
        # it will simply retry the upload, unless we got here and managed to link it up, in which case we move to
        # the next task. Because in that case, the check for the existence of this link at the top of this function
        # will exit early from this command.
        remotedata = RemoteData(computer=computer, remote_path=workdir)
        remotedata.add_incoming(node, link_type=LinkType.CREATE, link_label='remote_folder')
        remotedata.store()
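# Both `upload_calculation` variants and `submit_calculation` above shard the remote working
# directory on the calculation UUID as <workdir>/<uuid[:2]>/<uuid[2:4]>/<uuid[4:]>. A
# self-contained illustration of the resulting layout (the base path is hypothetical):
import os
import uuid

calc_uuid = str(uuid.uuid4())
workdir = os.path.join('/scratch/aiida', calc_uuid[:2], calc_uuid[2:4], calc_uuid[4:])
# Two-character shard levels keep any single remote directory from accumulating too many entries.
print(workdir)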