def logger(self):
    """
    Return a logger adapter bound to the associated calculation.

    The adapter wraps ``self._logger`` and embeds, as ``extra``, whatever
    ``get_dblogger_extra`` returns for ``self._calc``.
    """
    calc_extras = get_dblogger_extra(self._calc)
    return logging.LoggerAdapter(logger=self._logger, extra=calc_extras)
def logger(self):
    """
    Get the logger of the Calculation object, so that it also logs to the DB.

    :return: LoggerAdapter object, that works like a logger, but also has
        the 'extra' (as computed by ``get_dblogger_extra`` for this object)
        embedded
    """
    own_extras = get_dblogger_extra(self)
    return logging.LoggerAdapter(logger=self._logger, extra=own_extras)
def logger(self):
    """
    Return a logger adapter bound to the associated calculation.

    The adapter wraps ``self._logger`` and embeds, as ``extra``, whatever
    ``get_dblogger_extra`` returns for ``self._calc``.
    """
    import logging
    from aiida.common.log import get_dblogger_extra

    calc_extras = get_dblogger_extra(self._calc)
    adapter = logging.LoggerAdapter(logger=self._logger, extra=calc_extras)
    return adapter
def logger(self):
    """
    Get the logger of the Workflow object, so that it also logs to the DB.

    :return: LoggerAdapter object, that works like a logger, but also has
        the 'extra' (as computed by ``get_dblogger_extra`` for this object)
        embedded
    """
    import logging
    from aiida.common.log import get_dblogger_extra

    own_extras = get_dblogger_extra(self)
    adapter = logging.LoggerAdapter(logger=self._logger, extra=own_extras)
    return adapter
def get_log_messages(obj):
    """
    Get the log messages for the object (Django backend).

    The DbLog rows matching ``get_dblogger_extra(obj)`` are fetched ordered
    by 'time'; the 'metadata' field of each is deserialized from JSON.

    :param obj: the object whose log messages should be returned
    :return: a list of dictionaries with the keys 'loggername', 'levelname',
        'message', 'metadata' and 'time'
    """
    from aiida.backends.djsite.db.models import DbLog
    import json

    filters = get_dblogger_extra(obj)
    queryset = DbLog.objects.filter(**filters).order_by('time')

    # Materialise the query into a plain list of dictionaries
    entries = list(queryset.values(
        'loggername', 'levelname', 'message', 'metadata', 'time'))

    # The metadata is stored serialized: decode it in place
    for entry in entries:
        entry['metadata'] = json.loads(entry['metadata'])

    return entries
def get_log_messages(obj):
    """
    Get the log messages for the object (SQLAlchemy backend).

    The DbLog rows matching ``get_dblogger_extra(obj)`` are fetched ordered
    by 'time' and converted to plain dictionaries.

    :param obj: the object whose log messages should be returned
    :return: a list of dictionaries with the keys 'loggername', 'levelname',
        'message', 'metadata' and 'time'
    """
    from aiida.backends.sqlalchemy.models.log import DbLog
    from aiida.backends.sqlalchemy import get_scoped_session

    session = get_scoped_session()
    filters = get_dblogger_extra(obj)

    # (key in the returned dictionary, attribute name on the DbLog row)
    field_map = (
        ("loggername", "loggername"),
        ("levelname", "levelname"),
        ("message", "message"),
        ("metadata", "_metadata"),
        ("time", "time"),
    )

    rows = session.query(DbLog).filter_by(**filters).order_by('time').all()

    return [
        {key: row.__dict__[attribute] for key, attribute in field_map}
        for row in rows
    ]
def parse_results(job, retrieved_temporary_folder=None):
    """
    Parse the results for a given JobCalculation (job)

    The job is first put in the PARSING state. If the job defines a parser
    class, it is instantiated and its `parse_from_calc` is called; the nodes
    it returns are linked to the job (CREATE link) and stored. Finally the
    job state is set to FINISHED or FAILED depending on the exit status.

    :param job: the JobCalculation whose results should be parsed
    :param retrieved_temporary_folder: optional path of the folder with the
        temporarily retrieved files; it is passed on to `parse_from_calc`
    :returns: an ExitCode instance, whose `status` is 0 to indicate success
        and non-zero to indicate failure
    :raises ValueError: if the parser returns an exit code that is not a
        boolean, an integer or an ExitCode instance
    """
    from aiida.orm.calculation.job import JobCalculationExitStatus
    from aiida.work import ExitCode

    logger_extra = get_dblogger_extra(job)

    job._set_state(calc_states.PARSING)

    Parser = job.get_parserclass()
    # If no parser is defined, the default ExitCode is returned
    exit_code = ExitCode()

    if retrieved_temporary_folder:
        # Log the full content of the temporary folder for debugging
        files = []
        for root, directories, filenames in os.walk(
                retrieved_temporary_folder):
            for directory in directories:
                files.append("- [D] {}".format(os.path.join(root, directory)))
            for filename in filenames:
                files.append("- [F] {}".format(os.path.join(root, filename)))

        execlogger.debug("[parsing of calc {}] "
                         "Content of the retrieved_temporary_folder: \n"
                         "{}".format(job.pk, "\n".join(files)),
                         extra=logger_extra)
    else:
        execlogger.debug("[parsing of calc {}] "
                         "No retrieved_temporary_folder.".format(job.pk),
                         extra=logger_extra)

    if Parser is not None:

        parser = Parser(job)
        exit_code, new_nodes_tuple = parser.parse_from_calc(
            retrieved_temporary_folder)

        # Some implementations of parse_from_calc may still return a plain boolean or integer for the exit_code.
        # In the case of a boolean: True should be mapped to the default ExitCode which corresponds to an exit
        # status of 0. False values are mapped to the value that is mapped onto the FAILED calculation state
        # through the JobCalculationExitStatus. Plain integers are directly used to construct an ExitCode tuple.
        # NOTE: bool is a subclass of int, so it has to be checked first.
        if isinstance(exit_code, bool):
            if exit_code:
                exit_code = ExitCode(0)
            else:
                exit_code = ExitCode(
                    JobCalculationExitStatus[calc_states.FAILED].value)
        elif isinstance(exit_code, int):
            exit_code = ExitCode(exit_code)
        elif not isinstance(exit_code, ExitCode):
            raise ValueError(
                "parse_from_calc returned an 'exit_code' of invalid_type: {}. It should "
                "return a boolean, integer or ExitCode instance".format(
                    type(exit_code)))

        for label, n in new_nodes_tuple:
            n.add_link_from(job, label=label, link_type=LinkType.CREATE)
            n.store()

    try:
        if exit_code.status == 0:
            job._set_state(calc_states.FINISHED)
        else:
            job._set_state(calc_states.FAILED)
    except exceptions.ModificationNotAllowed:
        # I should have been the only one to set it, but
        # in order to avoid useless error messages, I just ignore
        pass

    # Compare by value: an identity check ('is not 0') against an integer
    # literal relies on an implementation detail of the interpreter
    if exit_code.status != 0:
        execlogger.error("[parsing of calc {}] "
                         "The parser returned an error, but it should have "
                         "created an output node with some partial results "
                         "and warnings. Check there for more information on "
                         "the problem".format(job.pk), extra=logger_extra)

    return exit_code
def submit_calculation(calculation, transport):
    """
    Submit a calculation

    The input files are generated in a sandbox folder, copied (together with
    local codes and the local/remote copy and symlink lists of the CalcInfo)
    to a sharded working directory on the remote computer, a RemoteData node
    pointing to that directory is stored, and the job is finally submitted
    through the scheduler of the computer.

    If the computer of the calculation is not enabled, nothing is done.

    :param calculation: the instance of JobCalculation to submit.
    :param transport: an already opened transport to use to submit the calculation.
    :raises ValueError: if the calculation has cached input links
    :raises InputValidationError: if one of the codes cannot run on the computer
    :raises ConfigurationError: if no remote working directory is configured
        for the computer, or it cannot be created
    :raises NotImplementedError: if a remote copy between two different
        machines is requested
    :raises IOError: if a symlink between two different machines is requested
    """
    from aiida.orm import Code
    from aiida.common.exceptions import InputValidationError
    from aiida.orm.data.remote import RemoteData

    computer = calculation.get_computer()

    if not computer.is_enabled():
        return

    logger_extra = get_dblogger_extra(calculation)
    transport._set_logger_extra(logger_extra)

    if calculation._has_cached_links():
        raise ValueError("Cannot submit calculation {} because it has "
                         "cached input links! If you "
                         "just want to test the submission, use the "
                         "test_submit() method, otherwise store all links "
                         "first".format(calculation.pk))

    s = computer.get_scheduler()
    s.set_transport(transport)

    with SandboxFolder() as folder:
        calcinfo, script_filename = calculation._presubmit(
            folder, use_unstored_links=False)

        codes_info = calcinfo.codes_info
        input_codes = [
            load_node(_.code_uuid, sub_class=Code) for _ in codes_info
        ]

        for code in input_codes:
            if not code.can_run_on(computer):
                raise InputValidationError(
                    "The selected code {} for calculation "
                    "{} cannot run on computer {}".format(
                        code.pk, calculation.pk, computer.name))

        # After this call, no modifications to the folder should be done
        calculation._store_raw_input_folder(folder.abspath)

        # NOTE: some logic is partially replicated in the 'test_submit'
        # method of JobCalculation. If major logic changes are done
        # here, make sure to update also the test_submit routine
        remote_user = transport.whoami()
        # TODO Doc: {username} field
        # TODO: if something is changed here, fix also 'verdi computer test'
        remote_working_directory = computer.get_workdir().format(
            username=remote_user)
        if not remote_working_directory.strip():
            raise exceptions.ConfigurationError(
                "[submission of calculation {}] "
                "No remote_working_directory configured for computer "
                "'{}'".format(calculation.pk, computer.name))

        # If it already exists, no exception is raised
        try:
            transport.chdir(remote_working_directory)
        except IOError:
            execlogger.debug(
                "[submission of calculation {}] "
                "Unable to chdir in {}, trying to create it".format(
                    calculation.pk, remote_working_directory),
                extra=logger_extra)
            try:
                transport.makedirs(remote_working_directory)
                transport.chdir(remote_working_directory)
            except (IOError, OSError) as e:
                # Format the exception itself rather than 'e.message': the
                # latter is empty for errno-style errors and does not exist
                # in Python 3
                raise exceptions.ConfigurationError(
                    "[submission of calculation {}] "
                    "Unable to create the remote directory {} on "
                    "computer '{}': {}".format(calculation.pk,
                                               remote_working_directory,
                                               computer.name, e))

        # Store remotely with sharding (here is where we choose
        # the folder structure of remote jobs; then I store this
        # in the calculation properties using _set_remote_dir
        # and I do not have to know the logic, but I just need to
        # read the absolute path from the calculation properties.
        transport.mkdir(calcinfo.uuid[:2], ignore_existing=True)
        transport.chdir(calcinfo.uuid[:2])
        transport.mkdir(calcinfo.uuid[2:4], ignore_existing=True)
        transport.chdir(calcinfo.uuid[2:4])
        transport.mkdir(calcinfo.uuid[4:])
        transport.chdir(calcinfo.uuid[4:])
        workdir = transport.getcwd()
        # I store the workdir of the calculation for later file
        # retrieval
        calculation._set_remote_workdir(workdir)

        # I first create the code files, so that the code can put
        # default files to be overwritten by the plugin itself.
        # Still, beware! The code file itself could be overwritten...
        # But I checked for this earlier.
        for code in input_codes:
            if code.is_local():
                # Note: this will possibly overwrite files
                for f in code.get_folder_list():
                    transport.put(code.get_abs_path(f), f)
                transport.chmod(code.get_local_executable(),
                                0o755)  # rwxr-xr-x

        # copy all files, recursively with folders
        for f in folder.get_content_list():
            execlogger.debug("[submission of calculation {}] "
                             "copying file/folder {}...".format(
                                 calculation.pk, f),
                             extra=logger_extra)
            transport.put(folder.get_abs_path(f), f)

        # local_copy_list is a list of tuples,
        # each with (src_abs_path, dest_rel_path)
        # NOTE: validation of these lists are done
        # inside calculation._presubmit()
        local_copy_list = calcinfo.local_copy_list
        remote_copy_list = calcinfo.remote_copy_list
        remote_symlink_list = calcinfo.remote_symlink_list

        if local_copy_list is not None:
            for src_abs_path, dest_rel_path in local_copy_list:
                execlogger.debug("[submission of calculation {}] "
                                 "copying local file/folder to {}".format(
                                     calculation.pk, dest_rel_path),
                                 extra=logger_extra)
                transport.put(src_abs_path, dest_rel_path)

        if remote_copy_list is not None:
            for (remote_computer_uuid, remote_abs_path,
                 dest_rel_path) in remote_copy_list:
                if remote_computer_uuid == computer.uuid:
                    execlogger.debug(
                        "[submission of calculation {}] "
                        "copying {} remotely, directly on the machine "
                        "{}".format(calculation.pk, dest_rel_path,
                                    computer.name),
                        extra=logger_extra)
                    try:
                        transport.copy(remote_abs_path, dest_rel_path)
                    except (IOError, OSError):
                        execlogger.warning(
                            "[submission of calculation {}] "
                            "Unable to copy remote resource from {} to {}! "
                            "Stopping.".format(calculation.pk,
                                               remote_abs_path,
                                               dest_rel_path),
                            extra=logger_extra)
                        raise
                else:
                    # TODO: implement copy between two different
                    # machines!
                    raise NotImplementedError(
                        "[presubmission of calculation {}] "
                        "Remote copy between two different machines is "
                        "not implemented yet".format(calculation.pk))

        if remote_symlink_list is not None:
            for (remote_computer_uuid, remote_abs_path,
                 dest_rel_path) in remote_symlink_list:
                if remote_computer_uuid == computer.uuid:
                    execlogger.debug(
                        "[submission of calculation {}] "
                        "creating symlink {} remotely, directly on the "
                        "machine {}".format(calculation.pk, dest_rel_path,
                                            computer.name),
                        extra=logger_extra)
                    try:
                        transport.symlink(remote_abs_path, dest_rel_path)
                    except (IOError, OSError):
                        execlogger.warning(
                            "[submission of calculation {}] "
                            "Unable to create remote symlink from {} to {}! "
                            "Stopping.".format(calculation.pk,
                                               remote_abs_path,
                                               dest_rel_path),
                            extra=logger_extra)
                        raise
                else:
                    raise IOError("It is not possible to create a symlink "
                                  "between two different machines for "
                                  "calculation {}".format(calculation.pk))

        # Output node pointing to the remote working directory of the job
        remotedata = RemoteData(computer=computer, remote_path=workdir)
        remotedata.add_link_from(calculation, label='remote_folder',
                                 link_type=LinkType.CREATE)
        remotedata.store()

        job_id = s.submit_from_script(transport.getcwd(), script_filename)
        calculation._set_job_id(job_id)
def retrieve_calculation(calculation, transport, retrieved_temporary_folder):
    """
    Retrieve all the files of a completed job calculation using the given transport.

    If the job defined anything in the `retrieve_temporary_list`, those entries will be stored in the
    `retrieved_temporary_folder`. The caller is responsible for creating and destroying this folder.

    The files of the retrieve list are stored in a new FolderData node, that is linked to the
    calculation (CREATE link, with the label given by `calculation._get_linkname_retrieved()`)
    and stored at the end.

    :param calculation: the instance of JobCalculation to update.
    :param transport: an already opened transport to use for the retrieval.
    :param retrieved_temporary_folder: the absolute path to a directory in which to store the files
        listed, if any, in the `retrieve_temporary_list` of the jobs CalcInfo
    """
    logger_extra = get_dblogger_extra(calculation)

    execlogger.debug("Retrieving calc {}".format(calculation.pk),
                     extra=logger_extra)
    workdir = calculation._get_remote_workdir()

    execlogger.debug("[retrieval of calc {}] chdir {}".format(
        calculation.pk, workdir), extra=logger_extra)

    # Create the FolderData node to attach everything to
    retrieved_files = FolderData()
    retrieved_files.add_link_from(calculation,
                                  label=calculation._get_linkname_retrieved(),
                                  link_type=LinkType.CREATE)

    with transport:
        transport.chdir(workdir)

        # First, retrieve the files of folderdata
        retrieve_list = calculation._get_retrieve_list()
        retrieve_temporary_list = calculation._get_retrieve_temporary_list()
        retrieve_singlefile_list = calculation._get_retrieve_singlefile_list()

        with SandboxFolder() as folder:
            retrieve_files_from_list(calculation, transport, folder.abspath,
                                     retrieve_list)
            # Here I retrieved everything; now I store them inside the calculation
            retrieved_files.replace_with_folder(folder.abspath,
                                                overwrite=True)

        # Second, retrieve the singlefiles
        with SandboxFolder() as folder:
            _retrieve_singlefiles(calculation, transport, folder,
                                  retrieve_singlefile_list, logger_extra)

        # Retrieve the temporary files in the retrieved_temporary_folder if any files were
        # specified in the 'retrieve_temporary_list' key
        if retrieve_temporary_list:
            retrieve_files_from_list(calculation, transport,
                                     retrieved_temporary_folder,
                                     retrieve_temporary_list)

            # Log the files that were retrieved in the temporary folder
            for filename in os.listdir(retrieved_temporary_folder):
                execlogger.debug(
                    "[retrieval of calc {}] Retrieved temporary file or folder '{}'"
                    .format(calculation.pk, filename), extra=logger_extra)

        # Store everything
        execlogger.debug("[retrieval of calc {}] "
                         "Storing retrieved_files={}".format(
                             calculation.pk, retrieved_files.dbnode.pk),
                         extra=logger_extra)
        retrieved_files.store()
def _prepare_for_submission(self, tempfolder, inputdict):
    """
    This is the routine to be called when you want to create
    the input files and related stuff with a plugin.

    The input nodes are validated and popped from `inputdict`, the input
    file (and, if requested through the settings, the EXIT and ENVIRON
    files) is written in `tempfolder`, and a CalcInfo describing the files
    to copy, symlink and retrieve is returned.

    :param tempfolder: a aiida.common.folders.Folder subclass where
        the plugin should put all its files.
    :param inputdict: a dictionary with the input nodes, as they would
        be returned by get_inputs_dict (without the Code!). Note that the
        recognized entries are removed from this dictionary.
    :return: the CalcInfo instance for this calculation
    :raises InputValidationError: if a required input is missing, an input
        has the wrong type, unrecognized inputs or settings are present, or
        the pseudos do not match the kinds of the structure
    """
    local_copy_list = []
    remote_copy_list = []
    remote_symlink_list = []

    try:
        parameters = inputdict.pop(self.get_linkname('parameters'))
    except KeyError:
        raise InputValidationError(
            "No parameters specified for this calculation")
    if not isinstance(parameters, ParameterData):
        raise InputValidationError(
            "parameters is not of type ParameterData")

    try:
        structure = inputdict.pop(self.get_linkname('structure'))
    except KeyError:
        raise InputValidationError(
            "No structure specified for this calculation")
    if not isinstance(structure, StructureData):
        raise InputValidationError(
            "structure is not of type StructureData")

    if self._use_kpoints:
        try:
            kpoints = inputdict.pop(self.get_linkname('kpoints'))
        except KeyError:
            raise InputValidationError(
                "No kpoints specified for this calculation")
        if not isinstance(kpoints, KpointsData):
            raise InputValidationError(
                "kpoints is not of type KpointsData")
    else:
        kpoints = None

    # Settings can be undefined, and defaults to an empty dictionary
    settings = inputdict.pop(self.get_linkname('settings'), None)
    if settings is None:
        settings_dict = {}
    else:
        if not isinstance(settings, ParameterData):
            raise InputValidationError(
                "settings, if specified, must be of "
                "type ParameterData")
        # Settings converted to uppercase
        settings_dict = _uppercase_dict(settings.get_dict(),
                                        dict_name='settings')

    pseudos = {}
    # I create here a dictionary that associates each kind name to a pseudo.
    # The keys are snapshotted in a list, because entries are popped from
    # inputdict inside the loop.
    pseudo_prefix = self._get_linkname_pseudo_prefix()
    for link in list(inputdict.keys()):
        if link.startswith(pseudo_prefix):
            kindstring = link[len(pseudo_prefix):]
            kinds = kindstring.split('_')
            the_pseudo = inputdict.pop(link)
            if not isinstance(the_pseudo, UpfData):
                raise InputValidationError(
                    "Pseudo for kind(s) {} is not of "
                    "type UpfData".format(",".join(kinds)))
            for kind in kinds:
                if kind in pseudos:
                    raise InputValidationError(
                        "Pseudo for kind {} passed "
                        "more than one time".format(kind))
                pseudos[kind] = the_pseudo

    parent_calc_folder = inputdict.pop(self.get_linkname('parent_folder'),
                                       None)
    if parent_calc_folder is not None:
        if not isinstance(parent_calc_folder, RemoteData):
            raise InputValidationError("parent_calc_folder, if specified, "
                                       "must be of type RemoteData")

    vdw_table = inputdict.pop(self.get_linkname('vdw_table'), None)
    if vdw_table is not None:
        if not isinstance(vdw_table, SinglefileData):
            raise InputValidationError("vdw_table, if specified, "
                                       "must be of type SinglefileData")

    try:
        code = inputdict.pop(self.get_linkname('code'))
    except KeyError:
        raise InputValidationError(
            "No code specified for this calculation")

    # Here, there should be no more parameters...
    if inputdict:
        raise InputValidationError("The following input data nodes are "
                                   "unrecognized: {}".format(
                                       list(inputdict.keys())))

    # Check structure, get species, check pseudos
    kindnames = [k.name for k in structure.kinds]
    if set(kindnames) != set(pseudos.keys()):
        err_msg = ("Mismatch between the defined pseudos and the list of "
                   "kinds of the structure. Pseudos: {}; kinds: {}".format(
                       ",".join(pseudos.keys()),
                       ",".join(list(kindnames))))
        raise InputValidationError(err_msg)

    ##############################
    # END OF INITIAL INPUT CHECK #
    ##############################

    # I create the subfolder that will contain the pseudopotentials
    tempfolder.get_subfolder(self._PSEUDO_SUBFOLDER, create=True)
    # I create the subfolder with the output data (sometimes Quantum
    # Espresso codes crash if an empty folder is not already there
    tempfolder.get_subfolder(self._OUTPUT_SUBFOLDER, create=True)

    # If present, add also the Van der Waals table to the pseudo dir
    # Note that the name of the table is not checked but should be the
    # one expected by QE.
    if vdw_table:
        local_copy_list.append(
            (vdw_table.get_file_abs_path(),
             os.path.join(self._PSEUDO_SUBFOLDER,
                          os.path.split(
                              vdw_table.get_file_abs_path())[1])))

    input_filecontent, local_copy_pseudo_list = self._generate_PWCPinputdata(
        parameters, settings_dict, pseudos, structure, kpoints)
    local_copy_list += local_copy_pseudo_list

    input_filename = tempfolder.get_abs_path(self._INPUT_FILE_NAME)
    with open(input_filename, 'w') as infile:
        infile.write(input_filecontent)

    # operations for restart
    symlink = settings_dict.pop('PARENT_FOLDER_SYMLINK',
                                self._default_symlink_usage)  # a boolean
    if symlink:
        if parent_calc_folder is not None:
            # I put the symlink to the old parent ./out folder
            remote_symlink_list.append(
                (parent_calc_folder.get_computer().uuid,
                 os.path.join(parent_calc_folder.get_remote_path(),
                              self._restart_copy_from),
                 self._restart_copy_to))
    else:
        # copy remote output dir, if specified
        if parent_calc_folder is not None:
            remote_copy_list.append(
                (parent_calc_folder.get_computer().uuid,
                 os.path.join(parent_calc_folder.get_remote_path(),
                              self._restart_copy_from),
                 self._restart_copy_to))

    # here we may create an aiida.EXIT file
    create_exit_file = settings_dict.pop('ONLY_INITIALIZATION', False)
    if create_exit_file:
        exit_filename = tempfolder.get_abs_path('{}.EXIT'.format(
            self._PREFIX))
        with open(exit_filename, 'w') as f:
            f.write('\n')

    # Check if specific inputs for the ENVIRON module where specified
    environ_namelist = settings_dict.pop('ENVIRON', None)
    if environ_namelist is not None:
        if not isinstance(environ_namelist, dict):
            raise InputValidationError(
                "ENVIRON namelist should be specified as a dictionary")
        # We first add the environ flag to the command-line options (if not already present)
        try:
            if '-environ' not in settings_dict['CMDLINE']:
                settings_dict['CMDLINE'].append('-environ')
        except KeyError:
            settings_dict['CMDLINE'] = ['-environ']
        # To create a mapping from the species to an incremental fortran 1-based index
        # we use the alphabetical order as in the inputdata generation
        mapping_species = {
            sp_name: (idx + 1)
            for idx, sp_name in enumerate(
                sorted([kind.name for kind in structure.kinds]))
        }
        environ_input_filename = tempfolder.get_abs_path(
            self._ENVIRON_INPUT_FILE_NAME)
        with open(environ_input_filename, 'w') as environ_infile:
            environ_infile.write("&ENVIRON\n")
            # items() rather than iteritems(): works on Python 2 and 3
            for k, v in sorted(environ_namelist.items()):
                environ_infile.write(
                    get_input_data_text(k, v, mapping=mapping_species))
            environ_infile.write("/\n")

    # Check for the deprecated 'ALSO_BANDS' setting and if present fire a deprecation log message
    also_bands = settings_dict.pop('ALSO_BANDS', None)
    if also_bands:
        import logging
        from aiida.common.log import get_dblogger_extra

        logger = logging.LoggerAdapter(logger=self.logger,
                                       extra=get_dblogger_extra(self))
        logger.warning(
            "The '{}' setting is deprecated as bands are now parsed by default. "
            "If you do not want the bands to be parsed set the '{}' to True {}. "
            "Note that the eigenvalue.xml files are also no longer stored in the repository"
            .format('also_bands', 'no_bands', type(self)))

    calcinfo = CalcInfo()

    calcinfo.uuid = self.uuid
    # Empty command line by default
    cmdline_params = settings_dict.pop('CMDLINE', [])
    # we commented calcinfo.stin_name and added it here in cmdline_params
    # in this way the mpirun ... pw.x ... < aiida.in
    # is replaced by mpirun ... pw.x ... -in aiida.in
    # in the scheduler, _get_run_line, if cmdline_params is empty, it
    # simply uses < calcinfo.stin_name
    calcinfo.cmdline_params = (list(cmdline_params) +
                               ["-in", self._INPUT_FILE_NAME])

    codeinfo = CodeInfo()
    codeinfo.cmdline_params = (list(cmdline_params) +
                               ["-in", self._INPUT_FILE_NAME])
    codeinfo.stdout_name = self._OUTPUT_FILE_NAME
    codeinfo.code_uuid = code.uuid
    calcinfo.codes_info = [codeinfo]

    calcinfo.local_copy_list = local_copy_list
    calcinfo.remote_copy_list = remote_copy_list
    calcinfo.remote_symlink_list = remote_symlink_list

    # Retrieve by default the output file and the xml file
    calcinfo.retrieve_list = []
    calcinfo.retrieve_list.append(self._OUTPUT_FILE_NAME)
    calcinfo.retrieve_list.append(self._DATAFILE_XML)
    calcinfo.retrieve_list += settings_dict.pop('ADDITIONAL_RETRIEVE_LIST',
                                                [])
    calcinfo.retrieve_list += self._internal_retrieve_list

    # Retrieve the k-point directories with the xml files to the temporary folder
    # to parse the band eigenvalues and occupations but not to have to save the raw files
    # if and only if the 'no_bands' key was not set to true in the settings
    no_bands = settings_dict.pop('NO_BANDS', False)
    if no_bands is False:
        xmlpaths = os.path.join(self._OUTPUT_SUBFOLDER,
                                self._PREFIX + '.save', 'K*[0-9]',
                                'eigenval*.xml')
        calcinfo.retrieve_temporary_list = [[xmlpaths, '.', 2]]

    # The settings key reserved for the parser is not an error: drop it
    try:
        Parserclass = self.get_parserclass()
        parser = Parserclass(self)
        parser_opts = parser.get_parser_settings_key().upper()
        settings_dict.pop(parser_opts)
    except (KeyError, AttributeError):
        # the key parser_opts isn't inside the dictionary
        pass

    # Any setting left at this point was not consumed above
    if settings_dict:
        raise InputValidationError(
            "The following keys have been found in "
            "the settings input node, but were not understood: {}".format(
                ",".join(settings_dict.keys())))

    return calcinfo
def retrieve_computed_for_authinfo(authinfo):
    """
    Retrieve and parse the calculations in the COMPUTED state for the
    computer and user of the given authinfo.

    For each calculation the state is set to RETRIEVING, the files of the
    retrieve lists are fetched through the transport of the authinfo and
    stored, and, if the calculation defines a parser class, the results are
    parsed. The state is finally set to FINISHED or FAILED; if an exception
    occurs, it is logged with its traceback and the state is set to
    PARSINGFAILED or RETRIEVALFAILED.

    :param authinfo: the authinfo whose computer and user select the
        calculations to retrieve
    :return: the list of calculations that were retrieved and parsed, or
        None if the authinfo is not enabled
    """
    from aiida.orm import JobCalculation
    from aiida.common.folders import SandboxFolder
    from aiida.orm.data.folder import FolderData
    from aiida.common.log import get_dblogger_extra
    from aiida.orm import DataFactory

    from aiida.backends.utils import QueryFactory

    import os

    if not authinfo.enabled:
        return

    qmanager = QueryFactory()()
    # I create a unique set of pairs (computer, aiidauser)
    calcs_to_retrieve = qmanager.query_jobcalculations_by_computer_user_state(
        state=calc_states.COMPUTED,
        computer=authinfo.dbcomputer,
        user=authinfo.aiidauser)

    retrieved = []

    # I avoid to open an ssh connection if there are no
    # calcs in the COMPUTED state
    if len(calcs_to_retrieve):

        # Open connection
        with authinfo.get_transport() as t:
            for calc in calcs_to_retrieve:
                logger_extra = get_dblogger_extra(calc)
                t._set_logger_extra(logger_extra)

                try:
                    calc._set_state(calc_states.RETRIEVING)
                except ModificationNotAllowed:
                    # Someone else has already started to retrieve it,
                    # just log and continue
                    execlogger.debug("Attempting to retrieve more than once "
                                     "calculation {}: skipping!".format(
                                         calc.pk), extra=logger_extra)
                    continue  # with the next calculation to retrieve

                try:
                    execlogger.debug("Retrieving calc {}".format(calc.pk),
                                     extra=logger_extra)
                    workdir = calc._get_remote_workdir()
                    retrieve_list = calc._get_retrieve_list()
                    retrieve_temporary_list = calc._get_retrieve_temporary_list()
                    retrieve_singlefile_list = calc._get_retrieve_singlefile_list()
                    execlogger.debug("[retrieval of calc {}] "
                                     "chdir {}".format(calc.pk, workdir),
                                     extra=logger_extra)
                    t.chdir(workdir)

                    # Node that will contain the files of the retrieve_list
                    retrieved_files = FolderData()
                    retrieved_files.add_link_from(
                        calc, label=calc._get_linkname_retrieved(),
                        link_type=LinkType.CREATE)

                    # First, retrieve the files of folderdata
                    with SandboxFolder() as folder:
                        retrieve_files_from_list(calc, t, folder,
                                                 retrieve_list)
                        # Here I retrieved everything; now I store them inside the calculation
                        retrieved_files.replace_with_folder(folder.abspath,
                                                            overwrite=True)

                    # Second, retrieve the singlefiles
                    with SandboxFolder() as folder:
                        singlefile_list = []
                        for (linkname, subclassname,
                             filename) in retrieve_singlefile_list:
                            execlogger.debug(
                                "[retrieval of calc {}] Trying "
                                "to retrieve remote singlefile '{}'".format(
                                    calc.pk, filename), extra=logger_extra)
                            localfilename = os.path.join(
                                folder.abspath, os.path.split(filename)[1])
                            t.get(filename, localfilename,
                                  ignore_nonexisting=True)
                            singlefile_list.append(
                                (linkname, subclassname, localfilename))

                        # ignore files that have not been retrieved
                        singlefile_list = [
                            i for i in singlefile_list
                            if os.path.exists(i[2])
                        ]

                        # after retrieving from the cluster, I create the objects
                        singlefiles = []
                        for (linkname, subclassname,
                             filename) in singlefile_list:
                            SinglefileSubclass = DataFactory(subclassname)
                            singlefile = SinglefileSubclass()
                            singlefile.set_file(filename)
                            singlefile.add_link_from(calc, label=linkname,
                                                     link_type=LinkType.CREATE)
                            singlefiles.append(singlefile)

                    # Retrieve the temporary files in a separate temporary folder if any files were
                    # specified in the 'retrieve_temporary_list' key
                    if retrieve_temporary_list:
                        retrieved_temporary_folder = FolderData()
                        with SandboxFolder() as folder:
                            retrieve_files_from_list(calc, t, folder,
                                                     retrieve_temporary_list)
                            retrieved_temporary_folder.replace_with_folder(
                                folder.abspath, overwrite=True)

                        # Log the files that were retrieved in the temporary folder
                        for entry in retrieved_temporary_folder.get_folder_list():
                            execlogger.debug(
                                "[retrieval of calc {}] Retrieved temporary file or folder '{}'"
                                .format(calc.pk, entry), extra=logger_extra)
                    else:
                        retrieved_temporary_folder = None

                    # Finally, store the retrieved_files node. The retrieved_temporary_folder node
                    # is explicitly not stored, but will just be passed to the parser.parse_from calc call
                    execlogger.debug(
                        "[retrieval of calc {}] Storing retrieved_files={}".
                        format(calc.pk, retrieved_files.dbnode.pk),
                        extra=logger_extra)
                    retrieved_files.store()

                    for fil in singlefiles:
                        execlogger.debug(
                            "[retrieval of calc {}] Storing retrieved_singlefile={}"
                            .format(calc.pk, fil.dbnode.pk),
                            extra=logger_extra)
                        fil.store()

                    # If I was the one retrieving, I should also be the only one parsing! I do not check
                    calc._set_state(calc_states.PARSING)

                    Parser = calc.get_parserclass()
                    # If no parser is set, the calculation is successful
                    successful = True
                    if Parser is not None:
                        parser = Parser(calc)
                        successful, new_nodes_tuple = parser.parse_from_calc(
                            retrieved_temporary_folder)

                        # Link the nodes created by the parser to the calculation and store them
                        for label, n in new_nodes_tuple:
                            n.add_link_from(calc, label=label,
                                            link_type=LinkType.CREATE)
                            n.store()

                    if successful:
                        try:
                            calc._set_state(calc_states.FINISHED)
                        except ModificationNotAllowed:
                            # I should have been the only one to set it, but
                            # in order to avoid useless error messages, I
                            # just ignore
                            pass
                    else:
                        try:
                            calc._set_state(calc_states.FAILED)
                        except ModificationNotAllowed:
                            # I should have been the only one to set it, but
                            # in order to avoid useless error messages, I
                            # just ignore
                            pass
                        execlogger.error(
                            "[parsing of calc {}] "
                            "The parser returned an error, but it should have "
                            "created an output node with some partial results "
                            "and warnings. Check there for more information on "
                            "the problem".format(calc.pk), extra=logger_extra)
                    retrieved.append(calc)
                except Exception:
                    import traceback

                    # Attach the full traceback to the extras of the log record
                    tb = traceback.format_exc()
                    newextradict = logger_extra.copy()
                    newextradict['full_traceback'] = tb
                    # The current state tells whether the failure happened
                    # while parsing or while retrieving
                    if calc.get_state() == calc_states.PARSING:
                        execlogger.error("Error parsing calc {}. "
                                         "Traceback: {}".format(calc.pk, tb),
                                         extra=newextradict)
                        # TODO: add a 'comment' to the calculation
                        try:
                            calc._set_state(calc_states.PARSINGFAILED)
                        except ModificationNotAllowed:
                            pass
                    else:
                        execlogger.error("Error retrieving calc {}. "
                                         "Traceback: {}".format(calc.pk, tb),
                                         extra=newextradict)
                        try:
                            calc._set_state(calc_states.RETRIEVALFAILED)
                        except ModificationNotAllowed:
                            pass
                        raise

    return retrieved
def submit_calc(calc, authinfo, transport=None):
    """
    Submit a calculation.

    The input files are generated in a sandbox folder, copied to a
    sharded working directory on the remote computer (created from the
    calculation UUID), and the submission script is handed to the scheduler.
    On success the calculation is put in the WITHSCHEDULER state; on any
    error it is put in the SUBMISSIONFAILED state and the exception is
    re-raised.

    :note: if no transport is passed, a new transport is opened and then
        closed within this function. If you want to use an already opened
        transport, pass it as further parameter. In this case, the transport
        has to be already open, and must coincide with the transport of the
        computer defined by the authinfo.

    :param calc: the calculation to submit
        (an instance of the aiida.orm.JobCalculation class)
    :param authinfo: the authinfo for this calculation. If it is not
        enabled, the function returns immediately without doing anything.
    :param transport: if passed, must be an already opened transport. No
        checks are done on the consistency of the given transport with the
        transport of the computer defined in the authinfo.

    :raise ValueError: if the calculation has cached links, is not in the
        TOSUBMIT state, or cannot be moved to the SUBMITTING state.
    :raise InputValidationError: if one of the input codes cannot run on
        the computer of the calculation.
    :raise ConfigurationError: if the remote working directory is not
        configured or cannot be created.
    """
    from aiida.orm import Code, Computer
    from aiida.common.folders import SandboxFolder
    from aiida.common.exceptions import (InputValidationError)
    from aiida.orm.data.remote import RemoteData
    from aiida.common.log import get_dblogger_extra

    if not authinfo.enabled:
        return

    logger_extra = get_dblogger_extra(calc)

    if transport is None:
        t = authinfo.get_transport()
        must_open_t = True
    else:
        t = transport
        must_open_t = False

    t._set_logger_extra(logger_extra)

    if calc._has_cached_links():
        raise ValueError("Cannot submit calculation {} because it has "
                         "cached input links! If you "
                         "just want to test the submission, use the "
                         "test_submit() method, otherwise store all links "
                         "first".format(calc.pk))

    # Double check, in the case the calculation was 'killed' (and therefore
    # put in the 'FAILED' state) in the meantime
    # Do it as near as possible to the state change below (it would be
    # even better to do it with some sort of transaction)
    if calc.get_state() != calc_states.TOSUBMIT:
        raise ValueError("Can only submit calculations with state=TOSUBMIT! "
                         "(state of calc {} is {} instead)".format(
                             calc.pk, calc.get_state()))

    # I start to submit the calculation: I set the state
    try:
        calc._set_state(calc_states.SUBMITTING)
    except ModificationNotAllowed:
        raise ValueError("The calculation has already been submitted by "
                         "someone else!")

    try:
        if must_open_t:
            t.open()

        s = Computer(dbcomputer=authinfo.dbcomputer).get_scheduler()
        s.set_transport(t)

        computer = calc.get_computer()

        with SandboxFolder() as folder:
            calcinfo, script_filename = calc._presubmit(
                folder, use_unstored_links=False)

            codes_info = calcinfo.codes_info
            input_codes = [
                load_node(_.code_uuid, parent_class=Code)
                for _ in codes_info
            ]

            for code in input_codes:
                if not code.can_run_on(computer):
                    raise InputValidationError(
                        "The selected code {} for calculation "
                        "{} cannot run on computer {}".format(
                            code.pk, calc.pk, computer.name))

            # After this call, no modifications to the folder should be done
            calc._store_raw_input_folder(folder.abspath)

            # NOTE: some logic is partially replicated in the 'test_submit'
            # method of JobCalculation. If major logic changes are done
            # here, make sure to update also the test_submit routine
            remote_user = t.whoami()
            # TODO Doc: {username} field
            # TODO: if something is changed here, fix also 'verdi computer test'
            remote_working_directory = authinfo.get_workdir().format(
                username=remote_user)
            if not remote_working_directory.strip():
                raise ConfigurationError(
                    "[submission of calc {}] "
                    "No remote_working_directory configured for computer "
                    "'{}'".format(calc.pk, computer.name))

            # If it already exists, no exception is raised
            try:
                t.chdir(remote_working_directory)
            except IOError:
                execlogger.debug(
                    "[submission of calc {}] "
                    "Unable to chdir in {}, trying to create it".format(
                        calc.pk, remote_working_directory),
                    extra=logger_extra)
                try:
                    t.makedirs(remote_working_directory)
                    t.chdir(remote_working_directory)
                except (IOError, OSError) as e:
                    # Format the exception itself rather than e.message:
                    # the latter is empty for exceptions built with more
                    # than one argument (e.g. errno + strerror).
                    raise ConfigurationError(
                        "[submission of calc {}] "
                        "Unable to create the remote directory {} on "
                        "computer '{}': {}".format(calc.pk,
                                                   remote_working_directory,
                                                   computer.name, e))

            # Store remotely with sharding (here is where we choose
            # the folder structure of remote jobs; then I store this
            # in the calculation properties using _set_remote_dir
            # and I do not have to know the logic, but I just need to
            # read the absolute path from the calculation properties.
            t.mkdir(calcinfo.uuid[:2], ignore_existing=True)
            t.chdir(calcinfo.uuid[:2])
            t.mkdir(calcinfo.uuid[2:4], ignore_existing=True)
            t.chdir(calcinfo.uuid[2:4])
            t.mkdir(calcinfo.uuid[4:])
            t.chdir(calcinfo.uuid[4:])
            workdir = t.getcwd()
            # I store the workdir of the calculation for later file
            # retrieval
            calc._set_remote_workdir(workdir)

            # I first create the code files, so that the code can put
            # default files to be overwritten by the plugin itself.
            # Still, beware! The code file itself could be overwritten...
            # But I checked for this earlier.
            for code in input_codes:
                if code.is_local():
                    # Note: this will possibly overwrite files
                    for f in code.get_folder_list():
                        t.put(code.get_abs_path(f), f)
                    t.chmod(code.get_local_executable(), 0o755)  # rwxr-xr-x

            # copy all files, recursively with folders
            for f in folder.get_content_list():
                execlogger.debug("[submission of calc {}] "
                                 "copying file/folder {}...".format(
                                     calc.pk, f),
                                 extra=logger_extra)
                t.put(folder.get_abs_path(f), f)

            # local_copy_list is a list of tuples,
            # each with (src_abs_path, dest_rel_path)
            # NOTE: validation of these lists are done
            # inside calc._presubmit()
            local_copy_list = calcinfo.local_copy_list
            remote_copy_list = calcinfo.remote_copy_list
            remote_symlink_list = calcinfo.remote_symlink_list

            if local_copy_list is not None:
                for src_abs_path, dest_rel_path in local_copy_list:
                    execlogger.debug("[submission of calc {}] "
                                     "copying local file/folder to {}".format(
                                         calc.pk, dest_rel_path),
                                     extra=logger_extra)
                    t.put(src_abs_path, dest_rel_path)

            if remote_copy_list is not None:
                for (remote_computer_uuid, remote_abs_path,
                     dest_rel_path) in remote_copy_list:
                    if remote_computer_uuid == computer.uuid:
                        execlogger.debug(
                            "[submission of calc {}] "
                            "copying {} remotely, directly on the machine "
                            "{}".format(calc.pk, dest_rel_path,
                                        computer.name),
                            extra=logger_extra)
                        try:
                            t.copy(remote_abs_path, dest_rel_path)
                        except (IOError, OSError):
                            execlogger.warning(
                                "[submission of calc {}] "
                                "Unable to copy remote resource from {} to {}! "
                                "Stopping.".format(calc.pk, remote_abs_path,
                                                   dest_rel_path),
                                extra=logger_extra)
                            raise
                    else:
                        # TODO: implement copy between two different
                        # machines!
                        raise NotImplementedError(
                            "[presubmission of calc {}] "
                            "Remote copy between two different machines is "
                            "not implemented yet".format(calc.pk))

            if remote_symlink_list is not None:
                for (remote_computer_uuid, remote_abs_path,
                     dest_rel_path) in remote_symlink_list:
                    if remote_computer_uuid == computer.uuid:
                        execlogger.debug(
                            "[submission of calc {}] "
                            "creating symlink {} remotely, directly on the "
                            "machine {}".format(calc.pk, dest_rel_path,
                                                computer.name),
                            extra=logger_extra)
                        try:
                            t.symlink(remote_abs_path, dest_rel_path)
                        except (IOError, OSError):
                            execlogger.warning(
                                "[submission of calc {}] "
                                "Unable to create remote symlink from {} to {}! "
                                "Stopping.".format(calc.pk, remote_abs_path,
                                                   dest_rel_path),
                                extra=logger_extra)
                            raise
                    else:
                        raise IOError("It is not possible to create a symlink "
                                      "between two different machines for "
                                      "calculation {}".format(calc.pk))

            remotedata = RemoteData(computer=computer, remote_path=workdir)
            remotedata.add_link_from(calc, label='remote_folder',
                                     link_type=LinkType.CREATE)
            remotedata.store()

            job_id = s.submit_from_script(t.getcwd(), script_filename)
            calc._set_job_id(job_id)
            # This should always be possible, because we should be
            # the only ones submitting this calculations,
            # so I do not check the ModificationNotAllowed
            calc._set_state(calc_states.WITHSCHEDULER)
            # I do not set the scheduler state to queued; in this way, if
            # the daemon is down, the user sees '(unknown)' as last state
            # and understands that the daemon is not running.

            execlogger.debug("submitted calculation {} on {} with "
                             "jobid {}".format(calc.pk, computer.name, job_id),
                             extra=logger_extra)

    except Exception:
        import traceback

        # Format the traceback before touching the state: on Python 2 a
        # ModificationNotAllowed handled below would otherwise become the
        # 'current' exception and be the one reported in the log.
        tb = traceback.format_exc()
        try:
            calc._set_state(calc_states.SUBMISSIONFAILED)
        except ModificationNotAllowed:
            # Someone already set it, just skip
            pass

        execlogger.error("Submission of calc {} failed, check also the "
                         "log file! Traceback: {}".format(calc.pk, tb),
                         extra=logger_extra)
        raise
    finally:
        # close the transport, but only if it was opened within this function
        if must_open_t:
            t.close()
def submit_jobs_with_authinfo(authinfo):
    """
    Submit jobs in TOSUBMIT status belonging
    to user and machine as defined in the 'dbauthinfo' table.

    A single transport is opened for all the calculations of the given
    authinfo. A failure while submitting one calculation is logged and the
    loop proceeds with the next one; any exception raised outside the
    per-calculation handler (e.g. while opening the transport) marks every
    queried calculation as SUBMISSIONFAILED and is then re-raised.

    :param authinfo: the authinfo (user + computer) whose calculations have
        to be submitted. If it is not enabled, nothing is done.
    """
    from aiida.orm import JobCalculation
    from aiida.common.log import get_dblogger_extra
    from aiida.backends.utils import QueryFactory

    if not authinfo.enabled:
        return

    execlogger.debug("Submitting jobs for user {} "
                     "and machine {}".format(authinfo.aiidauser.email,
                                             authinfo.dbcomputer.name))

    qmanager = QueryFactory()()
    # All the calculations of this (computer, aiidauser) pair that are
    # waiting to be submitted
    calcs_to_inquire = qmanager.query_jobcalculations_by_computer_user_state(
        state=calc_states.TOSUBMIT,
        computer=authinfo.dbcomputer,
        user=authinfo.aiidauser)

    # I avoid to open an ssh connection if there are
    # no calcs with state TOSUBMIT
    if len(calcs_to_inquire):
        # Open connection
        try:
            # I do it here so that the transport is opened only once per
            # computer
            with authinfo.get_transport() as t:
                for c in calcs_to_inquire:
                    logger_extra = get_dblogger_extra(c)
                    t._set_logger_extra(logger_extra)

                    try:
                        submit_calc(calc=c, authinfo=authinfo, transport=t)
                    except Exception as e:
                        # TODO: implement a counter, after N retrials
                        # set it to a status that
                        # requires the user intervention
                        # The exception itself is formatted (e.message is
                        # empty for multi-argument exceptions).
                        execlogger.warning("There was an exception for "
                                           "calculation {} ({}): {}".format(
                                               c.pk, e.__class__.__name__, e),
                                           extra=logger_extra)
                        # I just proceed to the next calculation
                        continue
        # Catch exceptions also at this level (this happens only if there is
        # a problem opening the transport in the 'with t' statement,
        # because any other exception is caught and skipped above
        except Exception:
            import traceback

            # Format the traceback once, before the loop: on Python 2 a
            # ModificationNotAllowed handled below would otherwise replace
            # the exception being reported.
            tb = traceback.format_exc()

            for calc in calcs_to_inquire:
                logger_extra = get_dblogger_extra(calc)
                try:
                    calc._set_state(calc_states.SUBMISSIONFAILED)
                except ModificationNotAllowed:
                    # Someone already set it, just skip
                    pass

                execlogger.error(
                    "Submission of calc {} failed, check also the "
                    "log file! Traceback: {}".format(calc.pk, tb),
                    extra=logger_extra)
            raise
def update_running_calcs_status(authinfo):
    """
    Update the states of calculations in WITHSCHEDULER status belonging
    to user and machine as defined in the 'dbauthinfo' table.

    The scheduler is queried once for the jobs of interest; calculations
    whose job is reported as DONE, or is no longer listed by the scheduler,
    are considered computed. For those, the detailed job info is fetched
    (when the scheduler implements it) and the state is set to COMPUTED.

    :param authinfo: the authinfo (user + computer) whose calculations have
        to be checked. If it is not enabled, nothing is done.
    :return: the list of calculations found to be computed (None if the
        authinfo is not enabled).
    """
    from aiida.orm import JobCalculation, Computer
    from aiida.scheduler.datastructures import JobInfo
    from aiida.common.log import get_dblogger_extra
    from aiida.backends.utils import QueryFactory

    if not authinfo.enabled:
        return

    execlogger.debug("Updating running calc status for user {} "
                     "and machine {}".format(authinfo.aiidauser.email,
                                             authinfo.dbcomputer.name))

    qmanager = QueryFactory()()
    calcs_to_inquire = qmanager.query_jobcalculations_by_computer_user_state(
        state=calc_states.WITHSCHEDULER,
        computer=authinfo.dbcomputer,
        user=authinfo.aiidauser)

    # NOTE: no further check is done that machine and
    # aiidauser are correct for each calc in calcs
    s = Computer(dbcomputer=authinfo.dbcomputer).get_scheduler()
    t = authinfo.get_transport()

    computed = []

    # I avoid to open an ssh connection if there are
    # no calcs with state WITHSCHEDULER
    if len(calcs_to_inquire):
        jobids_to_inquire = [str(c.get_job_id()) for c in calcs_to_inquire]

        # Open connection
        with t:
            s.set_transport(t)
            # TODO: Check if we are ok with filtering by job (to make this
            # work, I had to remove the check on the retval for getJobs,
            # because if the job has computed and is not in the output of
            # qstat, it gives a nonzero retval)

            # TODO: catch SchedulerError exception and do something
            # sensible (at least, skip this computer but continue with
            # following ones, and set a counter; set calculations to
            # UNKNOWN after a while?
            if s.get_feature('can_query_by_user'):
                # NOTE(review): the user literal is kept verbatim; confirm
                # it is the value the scheduler plugins expect.
                found_jobs = s.getJobs(user="******", as_dict=True)
            else:
                found_jobs = s.getJobs(jobs=jobids_to_inquire, as_dict=True)

            # I update the status of jobs
            for c in calcs_to_inquire:
                # Reset per calculation, so that the handler below never
                # sees an undefined name or the extras of a previous
                # calculation if get_dblogger_extra itself fails.
                logger_extra = None
                try:
                    logger_extra = get_dblogger_extra(c)
                    t._set_logger_extra(logger_extra)

                    jobid = c.get_job_id()
                    if jobid is None:
                        execlogger.error("JobCalculation {} is WITHSCHEDULER "
                                         "but no job id was found!".format(
                                             c.pk), extra=logger_extra)
                        continue

                    # I check if the calculation to be checked (c)
                    # is in the output of qstat
                    if jobid in found_jobs:
                        # jobinfo: the information returned by
                        # qstat for this job
                        jobinfo = found_jobs[jobid]
                        execlogger.debug("Inquirying calculation {} (jobid "
                                         "{}): it has job_state={}".format(
                                             c.pk, jobid, jobinfo.job_state),
                                         extra=logger_extra)
                        # For the moment, FAILED is not defined
                        if jobinfo.job_state in [job_states.DONE]:
                            computed.append(c)
                            try:
                                c._set_state(calc_states.COMPUTED)
                            except ModificationNotAllowed:
                                # Someone already set it, just skip
                                pass

                        # The WITHSCHEDULER state is not set again here:
                        # setting it multiple times would raise a
                        # ModificationNotAllowed

                        c._set_scheduler_state(jobinfo.job_state)

                        c._set_last_jobinfo(jobinfo)
                    else:
                        execlogger.debug("Inquirying calculation {} (jobid "
                                         "{}): not found, assuming "
                                         "job_state={}".format(
                                             c.pk, jobid, job_states.DONE),
                                         extra=logger_extra)

                        # calculation c is not found in the output of qstat
                        computed.append(c)
                        c._set_scheduler_state(job_states.DONE)
                except Exception as e:
                    # TODO: implement a counter, after N retrials
                    # set it to a status that
                    # requires the user intervention
                    # The exception itself is formatted (e.message is empty
                    # for multi-argument exceptions).
                    execlogger.warning("There was an exception for "
                                       "calculation {} ({}): {}".format(
                                           c.pk, e.__class__.__name__, e),
                                       extra=logger_extra)
                    continue

            for c in computed:
                # Same reasoning as in the loop above
                logger_extra = None
                try:
                    logger_extra = get_dblogger_extra(c)
                    try:
                        detailed_jobinfo = s.get_detailed_jobinfo(
                            jobid=c.get_job_id())
                    except NotImplementedError:
                        detailed_jobinfo = (
                            u"AiiDA MESSAGE: This scheduler does not implement "
                            u"the routine get_detailed_jobinfo to retrieve "
                            u"the information on "
                            u"a job after it has finished.")
                    last_jobinfo = c._get_last_jobinfo()
                    if last_jobinfo is None:
                        last_jobinfo = JobInfo()
                        last_jobinfo.job_id = c.get_job_id()
                        last_jobinfo.job_state = job_states.DONE
                    last_jobinfo.detailedJobinfo = detailed_jobinfo
                    c._set_last_jobinfo(last_jobinfo)
                except Exception as e:
                    execlogger.warning("There was an exception while "
                                       "retrieving the detailed jobinfo "
                                       "for calculation {} ({}): {}".format(
                                           c.pk, e.__class__.__name__, e),
                                       extra=logger_extra)
                    continue
                finally:
                    # Set the state to COMPUTED as the very last thing
                    # of this routine; no further change should be done after
                    # this, so that in general the retriever can just
                    # poll for this state, if we want to.
                    try:
                        c._set_state(calc_states.COMPUTED)
                    except ModificationNotAllowed:
                        # Someone already set it, just skip
                        pass

    return computed
def submit_jobs():
    """
    Submit all jobs in the TOSUBMIT state.

    The (computer, user) pairs having calculations in the TOSUBMIT state are
    queried, and for each of them the jobs are submitted using the
    corresponding authinfo. If no authinfo can be obtained for a pair, all
    its TOSUBMIT calculations are put in the SUBMISSIONFAILED state. Any
    other exception is printed and logged, and the loop continues with the
    next pair.
    """
    from aiida.orm import JobCalculation, Computer, User
    from aiida.common.log import get_dblogger_extra
    from aiida.backends.utils import get_authinfo, QueryFactory

    qmanager = QueryFactory()()
    # I create a unique set of pairs (computer, aiidauser)
    computers_users_to_check = qmanager.query_jobcalculations_by_computer_user_state(
        state=calc_states.TOSUBMIT,
        only_computer_user_pairs=True,
        only_enabled=True)

    for computer, aiidauser in computers_users_to_check:
        execlogger.debug("({},{}) pair to submit".format(
            aiidauser.email, computer.name))

        try:
            try:
                authinfo = get_authinfo(computer.dbcomputer,
                                        aiidauser._dbuser)
            except AuthenticationError:
                # TODO!!
                # Put each calculation in the SUBMISSIONFAILED state because
                # I do not have AuthInfo to submit them
                calcs_to_inquire = qmanager.query_jobcalculations_by_computer_user_state(
                    state=calc_states.TOSUBMIT,
                    computer=computer,
                    user=aiidauser)
                for calc in calcs_to_inquire:
                    try:
                        calc._set_state(calc_states.SUBMISSIONFAILED)
                    except ModificationNotAllowed:
                        # Someone already set it, just skip
                        pass
                    logger_extra = get_dblogger_extra(calc)
                    execlogger.error("Submission of calc {} failed, "
                                     "computer pk= {} ({}) is not configured "
                                     "for aiidauser {}".format(
                                         calc.pk, computer.pk,
                                         computer.get_name(),
                                         aiidauser.email),
                                     extra=logger_extra)
                # Go to the next (dbcomputer,aiidauser) pair
                continue

            submit_jobs_with_authinfo(authinfo)
        except Exception as e:
            import traceback

            msg = ("Error while submitting jobs "
                   "for aiidauser={} on computer={}, "
                   "error type is {}, traceback: {}".format(
                       aiidauser.email,
                       computer.name,
                       e.__class__.__name__, traceback.format_exc()))
            # Call form with a single argument: same output with the
            # Python 2 print statement and the Python 3 print function.
            print(msg)
            execlogger.error(msg)
            # Continue with next computer
            continue