Example #1
    def kill_processing_force(self, processing):
        try:
            if processing:
                from pandaclient import Client
                proc = processing['processing_metadata']['processing']
                task_id = proc.workload_id
                # task_id = processing['processing_metadata']['task_id']
                Client.killTask(task_id)
                # Client.finishTask(task_id, soft=True)
        except Exception as ex:
            msg = "Failed to kill the processing (%s): %s" % (str(processing['processing_id']), str(ex))
            raise exceptions.IDDSException(msg)
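
A minimal standalone sketch of the same kill call, assuming Client.killTask follows the (status, output) return convention that Client.retryTask shows in Example #10; the task ID is a placeholder:

from pandaclient import Client

task_id = 12345678  # hypothetical JEDI task ID
status, out = Client.killTask(task_id)
if status != 0:
    print("killTask failed with status %s: %s" % (status, out))
else:
    print("killTask accepted: %s" % str(out))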
Example #2
    def get_panda_task_id(self, processing):
        from pandaclient import Client

        start_time = datetime.datetime.utcnow() - datetime.timedelta(hours=10)
        start_time = start_time.strftime('%Y-%m-%d %H:%M:%S')
        status, results = Client.getJobIDsJediTasksInTimeRange(
            start_time, task_type=self.task_type, verbose=False)
        if status != 0:
            self.logger.warn(
                "Failed to poll tasks from the last ten hours: %s, %s" %
                (status, results))
            return None

        proc = processing['processing_metadata']['processing']
        task_id = None
        for req_id in results:
            task_name = results[req_id]['taskName']
            if proc.workload_id is None and task_name == self.task_name:
                task_id = results[req_id]['jediTaskID']
                # processing['processing_metadata']['task_id'] = task_id
                # processing['processing_metadata']['workload_id'] = task_id
                proc.workload_id = task_id
                if task_id:
                    proc.submitted_at = datetime.datetime.utcnow()

        return task_id
Example #3
    def submit_panda_task(self, processing):
        try:
            from pandaclient import Client

            proc = processing['processing_metadata']['processing']
            task_param = proc.processing_metadata['task_param']
            return_code = Client.insertTaskParams(task_param, verbose=True)
            if return_code[0] == 0:
                try:
                    task_id = int(return_code[1][1])
                    return task_id
                except Exception as ex:
                    self.logger.warn(
                        "task id is not returned: (%s) is not a task id: %s" %
                        (return_code[1][1], str(ex)))
                    # jediTaskID=26468582
                    if return_code[1][1] and 'jediTaskID=' in return_code[1][1]:
                        parts = return_code[1][1].split(" ")
                        for part in parts:
                            if 'jediTaskID=' in part:
                                task_id = int(part.split("=")[1])
                                return task_id
            else:
                self.logger.warn("submit_panda_task, return_code: %s" %
                                 str(return_code))
        except Exception as ex:
            self.logger.error(ex)
            self.logger.error(traceback.format_exc())
            # raise exceptions.AgentPluginError('%s: %s' % (str(ex), traceback.format_exc()))
        return None
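
The fallback parsing above could equivalently use the regular expression applied later in Example #12; a sketch against a message of the form noted in the comment (the value is purely illustrative):

import re

# hypothetical message of the form returned by Client.insertTaskParams
out_str = "jediTaskID=26468582"
m = re.search(r'jediTaskID=(\d+)', out_str)
if m:
    task_id = int(m.group(1))
    print(task_id)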
Example #4
def get_refresh_token_string(verbose=False):
    try:
        curl = Client._Curl()
        curl.verbose = verbose
        tmp_log = PLogger.getPandaLogger()
        oidc = curl.get_oidc(tmp_log)
        token_file = oidc.get_token_path()
        if os.path.exists(token_file):
            with open(token_file) as f:
                data = json.load(f)
                enc = data['id_token'].split('.')[1]
                enc += '=' * (-len(enc) % 4)
                dec = json.loads(base64.urlsafe_b64decode(enc.encode()))
                exp_time = datetime.datetime.utcfromtimestamp(dec['exp'])
                delta = exp_time - datetime.datetime.utcnow()
                minutes = delta.total_seconds() / 60
                print('Token will expire in %s minutes.' % minutes)
                print('Token expiration time : {0} UTC'.format(
                    exp_time.strftime("%Y-%m-%d %H:%M:%S")))
                if delta < datetime.timedelta(minutes=0):
                    print("Token already expired. Cannot refresh.")
                    return False, None, None
                return True, data['refresh_token'], delta
        else:
            print("Cannot find token file.")
    except Exception as e:
        print('failed to decode cached token with {0}'.format(e))
    return False, None, None
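
A short usage sketch for the helper above (assumes the module-level imports used by the function, i.e. Client, PLogger, os, json, base64, and datetime, are already in place):

ok, refresh_tok, delta = get_refresh_token_string(verbose=True)
if ok:
    print("Refresh token found; id_token still valid for %.1f minutes" % (delta.total_seconds() / 60))
else:
    print("No usable refresh token available.")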
Example #5
    def poll_panda_task_output(self, processing=None, input_output_maps=None):
        task_id = None
        try:
            from pandaclient import Client

            if processing:
                output_metadata = {}
                proc = processing['processing_metadata']['processing']
                task_id = proc.workload_id
                if task_id is None:
                    task_id = self.get_panda_task_id(processing)

                if task_id:
                    # ret_ids = Client.getPandaIDsWithTaskID(task_id, verbose=False)
                    task_info = Client.getJediTaskDetails(
                        {'jediTaskID': task_id}, True, True, verbose=False)
                    self.logger.info("poll_panda_task, task_info: %s" %
                                     str(task_info))
                    if task_info[0] != 0:
                        self.logger.warn(
                            "poll_panda_task %s, error getting task status, task_info: %s"
                            % (task_id, str(task_info)))
                        return ProcessingStatus.Submitting, [], {}, {}

                    task_info = task_info[1]

                    processing_status = self.get_processing_status_from_panda_status(
                        task_info["status"])

                    if processing_status in [ProcessingStatus.SubFinished]:
                        if self.retry_number < self.num_retries:
                            self.reactivate_processing(processing)
                            processing_status = ProcessingStatus.Submitted
                            self.retry_number += 1
                    if processing_status in [
                            ProcessingStatus.SubFinished,
                            ProcessingStatus.Finished
                    ]:
                        output_status, output_metadata = self.process_outputs(
                            processing)
                        if not output_status:
                            err = "Failed to process processing(processing_id: %s, task_id: %s) outputs" % (
                                processing['processing_id'], task_id)
                            self.logger.error(err)
                            self.add_errors(err)
                            processing_status = ProcessingStatus.Failed

                    return processing_status, [], {}, output_metadata
                else:
                    return ProcessingStatus.Failed, [], {}, output_metadata
        except Exception as ex:
            msg = "Failed to check the processing (%s) status: %s" % (str(
                processing['processing_id']), str(ex))
            self.logger.error(msg)
            self.logger.error(ex)
            self.logger.error(traceback.format_exc())
            # raise exceptions.IDDSException(msg)
        return ProcessingStatus.Submitting, [], {}, {}
Example #6
    def poll_panda_task_status(self, processing):
        if 'processing' in processing['processing_metadata']:
            from pandaclient import Client

            proc = processing['processing_metadata']['processing']
            status, task_status = Client.getTaskStatus(proc.workload_id)
            if status == 0:
                return task_status
        else:
            return 'failed'
        return None
Example #7
def get_token_info(verbose=False):
    # c = panda_api.get_api()
    curl = Client._Curl()
    curl.verbose = verbose
    token_info = curl.get_token_info()
    # print(token_info)
    if token_info and isinstance(token_info, dict):
        for key in token_info:
            print("%s: %s" % (key, token_info[key]))
        get_expire_time()
    else:
        print(token_info)
Example #8
    def submit_panda_task(self, processing):
        try:
            from pandaclient import Client

            task_param = processing['processing_metadata']['task_param']
            return_code = Client.insertTaskParams(task_param, verbose=True)
            if return_code[0] == 0:
                return return_code[1][1]
        except Exception as ex:
            self.logger.error(ex)
            self.logger.error(traceback.format_exc())
            # raise exceptions.AgentPluginError('%s: %s' % (str(ex), traceback.format_exc()))
        return None
Example #9
def refresh_token(minutes=30, verbose=False):
    curl = Client._Curl()
    curl.verbose = verbose
    tmp_log = PLogger.getPandaLogger()
    oidc = curl.get_oidc(tmp_log)

    status, refresh_token, delta = get_refresh_token_string()
    if not status:
        print("Cannot refresh token.")
        return False

    print("Fetching auth configuration from: %s" % str(oidc.auth_config_url))
    s, o = oidc.fetch_page(oidc.auth_config_url)
    if not s:
        print("Failed to get Auth configuration: " + o)
        return False
    auth_config = o

    print("Fetching endpoint configuration from: %s" %
          str(auth_config['oidc_config_url']))
    s, o = oidc.fetch_page(auth_config['oidc_config_url'])
    if not s:
        print("Failed to get endpoint configuration: " + o)
        return False
    endpoint_config = o

    # s, o = oidc.refresh_token(endpoint_config['token_endpoint'], auth_config['client_id'],
    #                           auth_config['client_secret'], refresh_token)
    s, o = oidc_refresh_token(oidc, endpoint_config['token_endpoint'],
                              auth_config['client_id'],
                              auth_config['client_secret'], refresh_token)
    if not s:
        print("Failed to refresh token: " + o)
    else:
        print("Successfully refreshed token: " + o)
    if delta < datetime.timedelta(minutes=minutes):
        print(
            "The remaining token lifetime is less than the required %s minutes"
            % minutes)
        return False
    return True
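
A hedged usage sketch of the function above; per the logic shown, the boolean reflects whether the cached token's remaining lifetime meets the requested minimum:

if refresh_token(minutes=60, verbose=False):
    print("Token lifetime requirement satisfied.")
else:
    print("Token could not be refreshed or is about to expire; re-authenticate.")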
Example #10
    def reactivate_processing(self, processing):
        try:
            if processing:
                from pandaclient import Client
                # task_id = processing['processing_metadata']['task_id']
                proc = processing['processing_metadata']['processing']
                task_id = proc.workload_id

                # Client.retryTask(task_id)
                status, out = Client.retryTask(task_id, newParams={})
                self.logger.warn("Retry processing(%s) with task id(%s): %s, %s" % (processing['processing_id'], task_id, status, out))
                # Client.reactivateTask(task_id)
                # Client.resumeTask(task_id)
        except Exception as ex:
            msg = "Failed to check the processing (%s) status: %s" % (str(processing['processing_id']), str(ex))
            raise exceptions.IDDSException(msg)
Example #11
def get_expire_time(verbose=False):
    try:
        # token_file = openidc_utils.OpenIdConnect_Utils().get_token_path()
        curl = Client._Curl()
        curl.verbose = verbose
        tmp_log = PLogger.getPandaLogger()
        oidc = curl.get_oidc(tmp_log)
        token_file = oidc.get_token_path()
        if os.path.exists(token_file):
            with open(token_file) as f:
                data = json.load(f)
                enc = data['id_token'].split('.')[1]
                enc += '=' * (-len(enc) % 4)
                dec = json.loads(base64.urlsafe_b64decode(enc.encode()))
                exp_time = datetime.datetime.utcfromtimestamp(dec['exp'])
                delta = exp_time - datetime.datetime.utcnow()
                minutes = delta.total_seconds() / 60
                print('Token will expire in %s minutes.' % minutes)
                print('Token expiration time : {0} UTC'.format(
                    exp_time.strftime("%Y-%m-%d %H:%M:%S")))
        else:
            print("Cannot find token file.")
    except Exception as e:
        print('failed to decode cached token with {0}'.format(e))
Example #12
def main(get_taskparams=False, ext_args=None, dry_mode=False):

    # tweak sys.argv
    sys.argv.pop(0)
    sys.argv.insert(0, 'phpo')

    usage = """phpo [options]
    """

    optP = GroupArgParser(usage=usage, conflict_handler="resolve")

    group_input = optP.add_group('input', 'input dataset(s)/files/format')
    group_output = optP.add_group('output', 'output dataset/files')
    group_config = optP.add_group(
        'config', 'single configuration file to set multiple options')
    group_submit = optP.add_group('submit', 'job submission/site/retry')
    group_expert = optP.add_group('expert', 'for experts/developers only')

    optP.add_helpGroup()

    group_config.add_argument('--version',
                              action='store_const',
                              const=True,
                              dest='version',
                              default=False,
                              help='Displays version')
    group_config.add_argument(
        '--loadJson',
        action='store',
        dest='loadJson',
        default=None,
        help=
        'Read task parameters from a json file. Some parameters can be overridden '
        'by using command-line arguments')
    group_config.add_argument(
        '--dumpJson',
        action='store',
        dest='dumpJson',
        default=None,
        help='Dump all command-line parameters and submission result '
        'such as returnCode, returnOut, and jediTaskID to a json file')
    group_config.add_argument(
        '--nParallelEvaluation',
        action='store',
        dest='nParallelEvaluation',
        default=1,
        type=int,
        help=
        'The number of hyperparameter points being evaluated concurrently. 1 by default'
    )
    group_config.add_argument(
        '--maxPoints',
        action='store',
        dest='maxPoints',
        default=10,
        type=int,
        help=
        'The max number of hyperparameter points to be evaluated in the entire search '
        '(for each segment in segmented HPO). '
        '10 by default')
    group_config.add_argument(
        '--maxEvaluationJobs',
        action='store',
        dest='maxEvaluationJobs',
        default=None,
        type=int,
        help='The max number of evaluation jobs in the entire search '
        '(for each segment in segmented HPO). 2*maxPoints by default. '
        'The task is terminated when all hyperparameter points are evaluated or '
        'the number of evaluation jobs reaches maxEvaluationJobs')
    group_config.add_argument(
        '--maxPointsPerEvaluationJob',
        action='store',
        dest='maxPointsPerEvaluationJob',
        default=None,
        type=int,
        help=
        'The max number of hyperparameter points taken in each evaluation job')
    group_config.add_argument(
        '--nPointsPerIteration',
        action='store',
        dest='nPointsPerIteration',
        default=2,
        type=int,
        help=
        'The max number of hyperparameter points generated in each iteration. 2 by default. '
        'Simply speaking, the steering container is executed maxPoints/nPointsPerIteration '
        'times when minUnevaluatedPoints is 0. The number of new points is '
        'nPointsPerIteration-minUnevaluatedPoints')
    group_config.add_argument(
        '--minUnevaluatedPoints',
        action='store',
        dest='minUnevaluatedPoints',
        default=None,
        type=int,
        help=
        'The next iteration is triggered to generate new hyperparameter points when the number '
        'of unevaluated hyperparameter points goes below minUnevaluatedPoints. 0 by default'
    )
    group_config.add_argument(
        '--steeringContainer',
        action='store',
        dest='steeringContainer',
        default=None,
        help='The container image for steering run by docker')
    group_config.add_argument(
        '--steeringExec',
        action='store',
        dest='steeringExec',
        default=None,
        help=
        'Execution string for steering. If --steeringContainer is specified, the string '
        'is executed inside of the container. Otherwise, the string is used as command-line '
        'arguments for the docker command')
    group_config.add_argument(
        '--searchSpaceFile',
        action='store',
        dest='searchSpaceFile',
        default=None,
        help=
        'External json filename to define the search space which is described as a dictionary. '
        'None by default. '
        'If this option is used together with --segmentSpecFile the json file contains a list '
        'of search space dictionaries. It is possible to contain only one search space '
        'dictionary if all segments use the same search space. In this case the search space '
        'dictionary is cloned for every segment')
    group_config.add_argument('--evaluationContainer',
                              action='store',
                              dest='evaluationContainer',
                              default=None,
                              help='The container image for evaluation')
    group_config.add_argument(
        '--evaluationExec',
        action='store',
        dest='evaluationExec',
        default=None,
        help='Execution string to run evaluation in singularity')
    group_config.add_argument(
        '--evaluationInput',
        action='store',
        dest='evaluationInput',
        default='input.json',
        help=
        'Input filename for evaluation where a json-formatted hyperparameter point is placed. '
        'input.json by default')
    group_config.add_argument(
        '--evaluationTrainingData',
        action='store',
        dest='evaluationTrainingData',
        default='input_ds.json',
        help=
        'Input filename for evaluation where a json-formatted list of training data filenames '
        'is placed. input_ds.json by default. Can be omitted if the payload directly fetches '
        'the training data using wget or something')
    group_config.add_argument(
        '--evaluationOutput',
        action='store',
        dest='evaluationOutput',
        default='output.json',
        help='Output filename of evaluation. output.json by default')
    group_config.add_argument(
        '--evaluationMeta',
        action='store',
        dest='evaluationMeta',
        default=None,
        help='The name of metadata file produced by evaluation')
    group_config.add_argument(
        '--evaluationMetrics',
        action='store',
        dest='evaluationMetrics',
        default=None,
        help='The name of metrics file produced by evaluation')
    group_config.add_argument('--checkPointToSave', action='store', dest='checkPointToSave', default=None,
                              help='A comma-separated list of files and/or directories to be periodically saved ' \
                                   'to a tarball for checkpointing. Note that those files and directories must be placed ' \
                                   'in the working directory. None by default')
    group_config.add_argument('--checkPointToLoad', action='store', dest='checkPointToLoad', default=None,
                              help='The name of the saved tarball for checkpointing. The tarball is given to ' \
                                   'the evaluation container when the training is resumed, if this option is specified. '
                                   'Otherwise, the tarball is automatically extracted in the working directories')
    group_config.add_argument(
        '--checkPointInterval',
        action='store',
        dest='checkPointInterval',
        default=None,
        type=int,
        help='Frequency to check files for checkpointing in minutes. '
        '5 by default')
    group_config.add_argument('--alrbArgs', action='store', dest='alrbArgs', default=None,
                              help='Additional arguments for ALRB to run the evaluation container. ' \
                                   '"setupATLAS -c --help" shows available ALRB arguments. For example, ' \
                                   '--alrbArgs "--nocvmfs --nohome" to skip mounting /cvmfs and $HOME. ' \
                                   'This option is mainly for experts who know how the system and the container ' \
                                   'communicates with each other and how additional ALRB arguments affect ' \
                                   'the consequence')
    group_config.add_argument(
        '--architecture',
        action='store',
        dest='architecture',
        default='',
        help=
        "CPU and/or GPU requirements. #CPU_spec&GPU_spec where CPU or GPU spec can be "
        "omitted. CPU_spec = architecture<-vendor<-instruction set>>, "
        "GPU_spec = vendor<-model>. A wildcards can be used if there is no special "
        "requirement for the attribute. E.g., #x86_64-*-avx2&nvidia to ask for x86_64 "
        "CPU with avx2 support and nvidia GPU")
    group_config.add_argument(
        '--segmentSpecFile',
        action='store',
        dest='segmentSpecFile',
        default=None,
        help=
        'External json filename to define segments for segmented HPO which has one model '
        'for each segment to be optimized independently. The file '
        "contains a list of dictionaries {'name': arbitrary_unique_segment_name, "
        "'files': [filename_used_for_the_segment_in_the_training_dataset, ... ]}. "
        "It is possible to specify 'datasets' instead of 'files' in those dictionaries "
        "if the training dataset has constituent datasets and "
        "is partitioned with the constituent dataset boundaries. "
        'None by default')
    group_config.add_argument('-v',
                              action='store_const',
                              const=True,
                              dest='verbose',
                              default=False,
                              help='Verbose')

    group_input.add_argument('--trainingDS',
                             action='store',
                             dest='trainingDS',
                             default=None,
                             help='Name of training dataset')

    group_output.add_argument(
        '--outDS',
        action='store',
        dest='outDS',
        default=None,
        help='Name of the dataset for output and log files')
    group_output.add_argument('--official',
                              action='store_const',
                              const=True,
                              dest='official',
                              default=False,
                              help='Produce official dataset')

    group_submit.add_argument(
        '--site',
        action='store',
        dest='site',
        default=None,
        help=
        'The site name where jobs are sent. If omitted, jobs are automatically sent to sites '
        'where input is available. A comma-separated list of sites can be specified '
        '(e.g. siteA,siteB,siteC), so that best sites are chosen from the given site list'
    )
    group_submit.add_argument('--workingGroup',
                              action='store',
                              dest='workingGroup',
                              default=None,
                              help="set working group")
    group_submit.add_argument('--noSubmit',
                              action='store_const',
                              const=True,
                              dest='noSubmit',
                              default=False,
                              help="Dry-run")
    group_submit.add_argument("-3",
                              action="store_true",
                              dest="python3",
                              default=False,
                              help="Use python3")
    group_submit.add_argument(
        '--voms',
        action='store',
        dest='vomsRoles',
        default=None,
        type=str,
        help="generate proxy with paticular roles. "
        "e.g., atlas:/atlas/ca/Role=production,atlas:/atlas/fr/Role=pilot")
    group_submit.add_argument('--noEmail',
                              action='store_const',
                              const=True,
                              dest='noEmail',
                              default=False,
                              help='Suppress email notification')

    group_expert.add_argument(
        '--intrSrv',
        action='store_const',
        const=True,
        dest='intrSrv',
        default=False,
        help=
        "Please don't use this option. Only for developers to use the intr panda server"
    )

    # get logger
    tmpLog = PLogger.getPandaLogger()

    options = optP.parse_args(ext_args)
    option_names = set(vars(options).keys())

    jsonExecStr = ''
    if options.loadJson is not None:
        with open(os.path.expanduser(options.loadJson)) as f:
            json_options = json.load(f)
            for k in json_options:
                if k in option_names:
                    v = json_options[k]
                    if isinstance(v, (str, unicode)):
                        try:
                            v = int(v)
                        except Exception:
                            pass
                    setattr(options, k, v)
                    if v is True:
                        jsonExecStr += ' --{0}'.format(k)
                    else:
                        if isinstance(v, (str, unicode)):
                            jsonExecStr += " --{0}='{1}'".format(k, v)
                        else:
                            jsonExecStr += " --{0}={1}".format(k, v)
                else:
                    tmpLog.warning('ignore unknown option {0} in {1}'.format(
                        k, options.loadJson))

    if options.version:
        print("Version: %s" % PandaToolsPkgInfo.release_version)
        sys.exit(0)

    # check grid-proxy
    if not dry_mode:
        PsubUtils.check_proxy(options.verbose, options.vomsRoles)

    # check options
    # non_null_opts = ['outDS', 'evaluationContainer', 'evaluationExec', 'steeringContainer', 'steeringExec']
    non_null_opts = [
        'outDS', 'evaluationContainer', 'evaluationExec', 'steeringExec'
    ]
    for opt_name in non_null_opts:
        if getattr(options, opt_name) is None:
            tmpLog.error('--{0} is not specified'.format(opt_name))
            sys.exit(1)

    if not options.outDS.endswith('/'):
        options.outDS += '/'

    if options.maxEvaluationJobs is None:
        options.maxEvaluationJobs = 2 * options.maxPoints

    # check output name

    if not dry_mode:
        nickName = PsubUtils.getNickname()
        if not PsubUtils.checkOutDsName(
                options.outDS, options.official, nickName,
                verbose=options.verbose):
            tmpStr = "invalid output dataset name: %s" % options.outDS
            tmpLog.error(tmpStr)
            sys.exit(1)

    # full execution string
    fullExecString = PsubUtils.convSysArgv(ext_args)
    fullExecString += jsonExecStr

    # use INTR server
    if options.intrSrv:
        Client.useIntrServer()

    # create tmp dir
    curDir = os.getcwd()
    tmpDir = os.path.join(curDir, MiscUtils.wrappedUuidGen())
    os.makedirs(tmpDir)

    # exit action
    def _onExit(dir, del_command):
        del_command('rm -rf %s' % dir)

    atexit.register(_onExit, tmpDir, MiscUtils.commands_get_output)

    # sandbox
    if options.verbose:
        tmpLog.debug("=== making sandbox ===")
    archiveName = 'jobO.%s.tar' % MiscUtils.wrappedUuidGen()
    archiveFullName = os.path.join(tmpDir, archiveName)
    if not dry_mode:
        extensions = ['json', 'py', 'sh', 'yaml']
        find_opt = ' -o '.join(['-name "*.{0}"'.format(e) for e in extensions])
        tmpOut = MiscUtils.commands_get_output(
            'find . {0} | tar cvfz {1} --files-from - '.format(
                find_opt, archiveFullName))

        if options.verbose:
            print(tmpOut + '\n')
            tmpLog.debug("=== checking sandbox ===")
            tmpOut = MiscUtils.commands_get_output(
                'tar tvfz {0}'.format(archiveFullName))
            print(tmpOut + '\n')

        if not options.noSubmit:
            if options.verbose:
                tmpLog.debug("=== uploading sandbox===")
            os.chdir(tmpDir)
            status, out = Client.putFile(archiveName,
                                         options.verbose,
                                         useCacheSrv=True,
                                         reuseSandbox=True)
            os.chdir(curDir)
            if out.startswith('NewFileName:'):
                # found the same input sandbox to reuse
                archiveName = out.split(':')[-1]
            elif out != 'True':
                # failed
                print(out)
                tmpLog.error("Failed with %s" % status)
                sys.exit(1)

    matchURL = re.search("(http.*://[^/]+)/", Client.baseURLCSRVSSL)
    sourceURL = matchURL.group(1)

    # making task params
    taskParamMap = {}

    taskParamMap['noInput'] = True
    taskParamMap['nEventsPerJob'] = 1
    taskParamMap['nEvents'] = options.nParallelEvaluation
    taskParamMap['maxNumJobs'] = options.maxEvaluationJobs
    taskParamMap['totNumJobs'] = options.maxPoints
    taskParamMap['taskName'] = options.outDS
    taskParamMap['vo'] = 'atlas'
    taskParamMap['architecture'] = options.architecture
    taskParamMap['hpoWorkflow'] = True
    taskParamMap['transUses'] = ''
    taskParamMap['transHome'] = ''
    taskParamMap[
        'transPath'] = 'http://pandaserver.cern.ch:25080/trf/user/runHPO-00-00-01'
    taskParamMap['processingType'] = 'panda-client-{0}-jedi-hpo'.format(
        PandaToolsPkgInfo.release_version)
    taskParamMap['prodSourceLabel'] = 'user'
    taskParamMap['useLocalIO'] = 1
    taskParamMap['cliParams'] = fullExecString
    taskParamMap['skipScout'] = True
    if options.noEmail:
        taskParamMap['noEmail'] = True
    if options.workingGroup is not None:
        taskParamMap['workingGroup'] = options.workingGroup
    taskParamMap['coreCount'] = 1
    if options.site is not None:
        if ',' in options.site:
            taskParamMap[
                'includedSite'] = PsubUtils.splitCommaConcatenatedItems(
                    [options.site])
        else:
            taskParamMap['site'] = options.site
    if options.evaluationContainer is not None:
        taskParamMap['container_name'] = options.evaluationContainer

    taskParamMap['multiStepExec'] = {
        'preprocess': {
            'command': '${TRF}',
            'args': '--preprocess ${TRF_ARGS}'
        },
        'postprocess': {
            'command': '${TRF}',
            'args': '--postprocess ${TRF_ARGS}'
        },
        'containerOptions': {
            'containerExec':
            'while [ ! -f __payload_in_sync_file__ ]; do sleep 5; done; '
            'echo "=== cat exec script ==="; '
            'cat __run_main_exec.sh; '
            'echo; '
            'echo "=== exec script ==="; '
            '/bin/sh __run_main_exec.sh; '
            'REAL_MAIN_RET_CODE=$?; '
            'touch __payload_out_sync_file__; '
            'exit $REAL_MAIN_RET_CODE ',
            'containerImage':
            options.evaluationContainer
        }
    }
    if options.checkPointToSave is not None:
        taskParamMap['multiStepExec']['coprocess'] = {
            'command': '${TRF}',
            'args': '--coprocess ${TRF_ARGS}'
        }

    if options.alrbArgs is not None:
        taskParamMap['multiStepExec']['containerOptions'][
            'execArgs'] = options.alrbArgs

    logDatasetName = re.sub('/$', '.log/', options.outDS)

    taskParamMap['log'] = {
        'dataset': logDatasetName,
        'container': logDatasetName,
        'type': 'template',
        'param_type': 'log',
        'value': '{0}.$JEDITASKID.${{SN}}.log.tgz'.format(logDatasetName[:-1])
    }

    taskParamMap['hpoRequestData'] = {
        'sandbox': options.steeringContainer,
        'executable': 'docker',
        'arguments': options.steeringExec,
        'output_json': 'output.json',
        'max_points': options.maxPoints,
        'num_points_per_generation': options.nPointsPerIteration,
    }
    if options.minUnevaluatedPoints is not None:
        taskParamMap['hpoRequestData'][
            'min_unevaluated_points'] = options.minUnevaluatedPoints

    if options.searchSpaceFile is not None:
        with open(options.searchSpaceFile) as json_file:
            taskParamMap['hpoRequestData']['opt_space'] = json.load(json_file)

    taskParamMap['jobParameters'] = [
        {
            'type':
            'constant',
            'value':
            '-o {0} -j "" --inSampleFile {1}'.format(options.evaluationOutput,
                                                     options.evaluationInput)
        },
        {
            'type': 'constant',
            'value': '-a {0} --sourceURL {1}'.format(archiveName, sourceURL)
        },
    ]

    taskParamMap['jobParameters'] += [
        {
            'type': 'constant',
            'value': '-p "',
            'padding': False,
        },
    ]
    taskParamMap['jobParameters'] += PsubUtils.convertParamStrToJediParam(
        options.evaluationExec, {}, '', True, False, includeIO=False)

    taskParamMap['jobParameters'] += [
        {
            'type': 'constant',
            'value': '"',
        },
    ]

    if options.checkPointToSave is not None:
        taskParamMap['jobParameters'] += [
            {
                'type': 'constant',
                'value':
                '--checkPointToSave {0}'.format(options.checkPointToSave)
            },
        ]
        if options.checkPointInterval is not None:
            taskParamMap['jobParameters'] += [
                {
                    'type':
                    'constant',
                    'value':
                    '--checkPointInterval {0}'.format(
                        options.checkPointInterval)
                },
            ]

    if options.checkPointToLoad is not None:
        taskParamMap['jobParameters'] += [
            {
                'type': 'constant',
                'value':
                '--checkPointToLoad {0}'.format(options.checkPointToLoad)
            },
        ]

    if options.trainingDS is not None:
        taskParamMap['jobParameters'] += [
            {
                'type':
                'constant',
                'value':
                '--writeInputToTxt IN_DATA:{0}'.format(
                    options.evaluationTrainingData)
            },
            {
                'type': 'template',
                'param_type': 'input',
                'value': '-i "${IN_DATA/T}"',
                'dataset': options.trainingDS,
                'attributes': 'nosplit,repeat',
            },
            {
                'type': 'constant',
                'value': '--inMap "{\'IN_DATA\': ${IN_DATA/T}}"'
            },
        ]

    if options.evaluationMeta is not None:
        taskParamMap['jobParameters'] += [
            {
                'type': 'constant',
                'value': '--outMetaFile={0}'.format(options.evaluationMeta),
            },
        ]

    if options.segmentSpecFile is not None:
        taskParamMap['segmentedWork'] = True

        with open(options.segmentSpecFile) as f:
            # read segments
            segments = json.load(f)
            # search space
            if 'opt_space' in taskParamMap['hpoRequestData'] and \
                    isinstance(taskParamMap['hpoRequestData']['opt_space'], dict):
                space = taskParamMap['hpoRequestData']['opt_space']
                taskParamMap['hpoRequestData']['opt_space'] = []
            else:
                space = None
            # set model ID to each segment
            for i in range(len(segments)):
                segments[i].update({'id': i})
                # make clone of search space if needed
                if space is not None:
                    new_space = dict()
                    new_space['model_id'] = i
                    new_space['search_space'] = copy.deepcopy(space)
                    taskParamMap['hpoRequestData']['opt_space'].append(
                        new_space)
            taskParamMap['segmentSpecs'] = segments
            # multiply by num of segments
            taskParamMap['maxNumJobs'] *= len(segments)
            taskParamMap['totNumJobs'] *= len(segments)
            taskParamMap['hpoRequestData']['max_points'] *= len(segments)

        taskParamMap['jobParameters'] += [
            {
                'type': 'constant',
                'value': '--segmentID=${SEGMENT_ID}',
            },
        ]

    if options.evaluationMetrics is not None:
        lfn = '$JEDITASKID.metrics.${SN}.tgz'
        if options.segmentSpecFile is not None:
            lfn = '${MIDDLENAME}.' + lfn
        taskParamMap['jobParameters'] += [
            {
                'type': 'template',
                'param_type': 'output',
                'value': lfn,
                'dataset': options.outDS,
                'hidden': True,
                'allowNoOutput': True,
            },
            {
                'type':
                'constant',
                'value':
                '--outMetricsFile=${{OUTPUT0}}^{0}'.format(
                    options.evaluationMetrics),
            },
        ]

    if options.maxPointsPerEvaluationJob:
        taskParamMap['jobParameters'] += [
            {
                'type':
                'constant',
                'value':
                '--maxLoopCount={}'.format(options.maxPointsPerEvaluationJob),
            },
        ]

    if options.noSubmit:
        if options.verbose:
            tmpLog.debug("==== taskParams ====")
            tmpKeys = list(taskParamMap)
            tmpKeys.sort()
            for tmpKey in tmpKeys:
                print('%s : %s' % (tmpKey, taskParamMap[tmpKey]))
        sys.exit(0)

    if get_taskparams:
        return taskParamMap

    tmpLog.info("submit {0}".format(options.outDS))
    tmpStat, tmpOut = Client.insertTaskParams(taskParamMap, options.verbose,
                                              True)
    # result
    taskID = None
    exitCode = None
    if tmpStat != 0:
        tmpStr = "task submission failed with {0}".format(tmpStat)
        tmpLog.error(tmpStr)
        exitCode = 1
    else:
        if tmpOut[0] in [0, 3]:
            tmpStr = tmpOut[1]
            tmpLog.info(tmpStr)
            try:
                m = re.search(r'jediTaskID=(\d+)', tmpStr)
                taskID = int(m.group(1))
            except Exception:
                pass
        else:
            tmpStr = "task submission failed. {0}".format(tmpOut[1])
            tmpLog.error(tmpStr)
            exitCode = 1

    dumpItem = copy.deepcopy(vars(options))
    dumpItem['returnCode'] = exitCode
    dumpItem['returnOut'] = tmpStr
    dumpItem['jediTaskID'] = taskID

    # dump
    if options.dumpJson is not None:
        with open(os.path.expanduser(options.dumpJson), 'w') as f:
            json.dump(dumpItem, f)
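
Like PrunScript.main in Example #13, this entry point can be driven programmatically via get_taskparams/ext_args/dry_mode; a sketch with placeholder option values (the dataset, container, and exec strings are purely illustrative):

import shlex

hpo_args = ('--outDS user.someone.hpo_test/ '
            '--evaluationContainer docker://some/image:latest '
            '--evaluationExec "python evaluate.py" '
            '--steeringExec "python steer.py"')
task_params = main(get_taskparams=True,
                   ext_args=shlex.split(hpo_args),
                   dry_mode=True)
print(sorted(task_params))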
Example #13
def main():
    # tweak sys.argv
    sys.argv.pop(0)
    sys.argv.insert(0, 'pchain')

    usage = """pchain [options]
    """

    optP = GroupArgParser(usage=usage, conflict_handler="resolve")

    group_output = optP.add_group('output', 'output dataset/files')
    group_config = optP.add_group(
        'config', 'single configuration file to set multiple options')
    group_submit = optP.add_group('submit', 'job submission/site/retry')
    group_expert = optP.add_group('expert', 'for experts/developers only')
    group_build = optP.add_group('build',
                                 'build/compile the package and env setup')
    group_check = optP.add_group('check', 'check workflow description')

    optP.add_helpGroup()

    group_config.add_argument('--version',
                              action='store_const',
                              const=True,
                              dest='version',
                              default=False,
                              help='Displays version')
    group_config.add_argument('-v',
                              action='store_const',
                              const=True,
                              dest='verbose',
                              default=False,
                              help='Verbose')
    group_check.add_argument('--check',
                             action='store_const',
                             const=True,
                             dest='checkOnly',
                             default=False,
                             help='Check workflow description locally')
    group_check.add_argument(
        '--debug',
        action='store_const',
        const=True,
        dest='debugCheck',
        default=False,
        help='verbose mode when checking workflow description locally')

    group_output.add_argument(
        '--cwl',
        action='store',
        dest='cwl',
        default=None,
        help='Name of the main CWL file to describe the workflow')
    group_output.add_argument(
        '--yaml',
        action='store',
        dest='yaml',
        default=None,
        help='Name of the yaml file for workflow parameters')

    group_build.add_argument(
        '--useAthenaPackages',
        action='store_const',
        const=True,
        dest='useAthenaPackages',
        default=False,
        help=
        'One or more tasks in the workflow use locally-built Athena packages')
    group_build.add_argument('--vo',
                             action='store',
                             dest='vo',
                             default=None,
                             help="virtual organization name")
    group_build.add_argument(
        '--extFile',
        action='store',
        dest='extFile',
        default='',
        help='root or large files under WORKDIR are not sent to WNs by default. '
        'If you want to send some skipped files, specify their names, '
        'e.g., data.root,big.tgz,*.o')

    group_output.add_argument(
        '--outDS',
        action='store',
        dest='outDS',
        default=None,
        help='Name of the dataset for output and log files')
    group_output.add_argument('--official',
                              action='store_const',
                              const=True,
                              dest='official',
                              default=False,
                              help='Produce official dataset')

    group_submit.add_argument('--noSubmit',
                              action='store_const',
                              const=True,
                              dest='noSubmit',
                              default=False,
                              help="Dry-run")
    group_submit.add_argument("-3",
                              action="store_true",
                              dest="python3",
                              default=False,
                              help="Use python3")
    group_submit.add_argument(
        '--voms',
        action='store',
        dest='vomsRoles',
        default=None,
        type=str,
        help="generate proxy with paticular roles. "
        "e.g., atlas:/atlas/ca/Role=production,atlas:/atlas/fr/Role=pilot")
    group_submit.add_argument('--noEmail',
                              action='store_const',
                              const=True,
                              dest='noEmail',
                              default=False,
                              help='Suppress email notification')
    group_submit.add_argument('--prodSourceLabel',
                              action='store',
                              dest='prodSourceLabel',
                              default='',
                              help="set prodSourceLabel")
    group_submit.add_argument('--workingGroup',
                              action='store',
                              dest='workingGroup',
                              default=None,
                              help="set workingGroup")

    group_expert.add_argument(
        '--intrSrv',
        action='store_const',
        const=True,
        dest='intrSrv',
        default=False,
        help=
        "Please don't use this option. Only for developers to use the intr panda server"
    )
    group_expert.add_argument(
        '--relayHost',
        action='store',
        dest='relayHost',
        default=None,
        help=
        "Please don't use this option. Only for developers to use the relay host"
    )

    # get logger
    tmpLog = PLogger.getPandaLogger()

    # show version
    if '--version' in sys.argv:
        print("Version: %s" % PandaToolsPkgInfo.release_version)
        sys.exit(0)

    # parse args
    options = optP.parse_args()

    # check
    for arg_name in ['cwl', 'yaml', 'outDS']:
        if not getattr(options, arg_name):
            tmpStr = "argument --{0} is required".format(arg_name)
            tmpLog.error(tmpStr)
            sys.exit(1)

    # check grid-proxy
    PsubUtils.check_proxy(options.verbose, options.vomsRoles)

    # check output name
    nickName = PsubUtils.getNickname()
    if not PsubUtils.checkOutDsName(
            options.outDS, options.official, nickName,
            verbose=options.verbose):
        tmpStr = "invalid output dataset name: %s" % options.outDS
        tmpLog.error(tmpStr)
        sys.exit(1)

    # create tmp dir
    curDir = os.getcwd()
    tmpDir = os.path.join(curDir, MiscUtils.wrappedUuidGen())
    os.makedirs(tmpDir)

    # exit action
    def _onExit(dir, del_command):
        del_command('rm -rf %s' % dir)

    atexit.register(_onExit, tmpDir, MiscUtils.commands_get_output)

    # sandbox
    if options.verbose:
        tmpLog.debug("making sandbox")
    archiveName = 'jobO.%s.tar.gz' % MiscUtils.wrappedUuidGen()
    archiveFullName = os.path.join(tmpDir, archiveName)
    extensions = ['cwl', 'yaml', 'json']
    find_opt = ' -o '.join(['-name "*.{0}"'.format(e) for e in extensions])
    tmpOut = MiscUtils.commands_get_output(
        'find . {0} | tar cvfz {1} --files-from - '.format(
            find_opt, archiveFullName))

    if options.verbose:
        print(tmpOut + '\n')
        tmpLog.debug("checking sandbox")
        tmpOut = MiscUtils.commands_get_output(
            'tar tvfz {0}'.format(archiveFullName))
        print(tmpOut + '\n')

    if not options.noSubmit:
        tmpLog.info("uploading workflow sandbox")
        if options.vo:
            use_cache_srv = False
        else:
            use_cache_srv = True
        os.chdir(tmpDir)
        status, out = Client.putFile(archiveName,
                                     options.verbose,
                                     useCacheSrv=use_cache_srv,
                                     reuseSandbox=True)
        os.chdir(curDir)
        if out.startswith('NewFileName:'):
            # found the same input sandbox to reuse
            archiveName = out.split(':')[-1]
        elif out != 'True':
            # failed
            print(out)
            tmpLog.error("Failed with %s" % status)
            sys.exit(1)

    matchURL = re.search("(http.*://[^/]+)/", Client.baseURLCSRVSSL)
    sourceURL = matchURL.group(1)

    params = {
        'taskParams': {},
        'sourceURL': sourceURL,
        'sandbox': archiveName,
        'workflowSpecFile': options.cwl,
        'workflowInputFile': options.yaml,
        'language': 'cwl',
        'outDS': options.outDS,
        'base_platform': os.environ.get('ALRB_USER_PLATFORM', 'centos7')
    }

    # making task params with dummy exec
    task_type_args = {'container': '--containerImage __dummy_container__'}
    if options.useAthenaPackages:
        task_type_args['athena'] = '--useAthenaPackages'
    for task_type in task_type_args:
        prun_exec_str = '--exec __dummy_exec_str__ --outDS {0} {1}'.format(
            options.outDS, task_type_args[task_type])
        if options.noSubmit:
            prun_exec_str += ' --noSubmit'
        if options.verbose:
            prun_exec_str += ' -v'
        if options.vo:
            prun_exec_str += ' --vo {0}'.format(options.vo)
        if options.prodSourceLabel:
            prun_exec_str += ' --prodSourceLabel {0}'.format(
                options.prodSourceLabel)
        if options.workingGroup:
            prun_exec_str += ' --workingGroup {0}'.format(options.workingGroup)
        if options.extFile:
            prun_exec_str += ' --extFile {0}'.format(options.extFile)
        arg_dict = {
            'get_taskparams': True,
            'ext_args': shlex.split(prun_exec_str)
        }
        if options.checkOnly:
            arg_dict['dry_mode'] = True

        taskParamMap = PrunScript.main(**arg_dict)
        del taskParamMap['noInput']
        del taskParamMap['nEvents']
        del taskParamMap['nEventsPerJob']

        params['taskParams'][task_type] = taskParamMap

    if options.noSubmit:
        if options.verbose:
            tmpLog.debug("==== taskParams ====")
            tmpKeys = list(taskParamMap)
            tmpKeys.sort()
            for tmpKey in tmpKeys:
                if tmpKey in ['taskParams']:
                    continue
                print('%s : %s' % (tmpKey, taskParamMap[tmpKey]))
        sys.exit(0)

    data = {'relay_host': options.relayHost, 'verbose': options.verbose}
    if not options.checkOnly:
        action_type = 'submit'
    else:
        action_type = 'check'
        data['check'] = True

    # set to use INTR server just before taking action so that sandbox files go to the regular place
    if options.intrSrv:
        Client.useIntrServer()

    # action
    tmpLog.info("{0} workflow {1}".format(action_type, options.outDS))
    tmpStat, tmpOut = Client.send_workflow_request(params, **data)

    # result
    exitCode = None
    if tmpStat != 0:
        tmpStr = "workflow {0} failed with {1}".format(action_type, tmpStat)
        tmpLog.error(tmpStr)
        exitCode = 1
    if tmpOut[0]:
        if not options.checkOnly:
            tmpStr = tmpOut[1]
            tmpLog.info(tmpStr)
        else:
            check_stat = tmpOut[1]['status']
            check_log = 'messages from the server\n' + tmpOut[1]['log']
            tmpLog.info(check_log)
            if check_stat:
                tmpLog.info('successfully verified workflow description')
            else:
                tmpLog.error('workflow description is corrupted')
    else:
        tmpStr = "workflow {0} failed. {1}".format(action_type, tmpOut[1])
        tmpLog.error(tmpStr)
        exitCode = 1
    return exitCode
Example #14
def get_panda_task_paramsmap(panda_task_id):
    status, task_param_map = Client.getTaskParamsMap(panda_task_id)
    if status == 0:
        task_param_map = json.loads(task_param_map)
        return task_param_map
    return None
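
A short usage sketch (the task ID is a placeholder; 'taskName' is one of the keys that submission scripts such as Example #12 put into the parameter map):

params = get_panda_task_paramsmap(12345678)
if params:
    print(params.get('taskName'))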
Example #15
if os.path.exists(historyFile):  # guard assumed from the surrounding script so the snippet parses
    try:
        # reading the history file can fail, e.g. on macOS X
        readline.read_history_file(historyFile)
    except Exception:
        pass
readline.set_history_length(1024)

# set dummy CMTSITE
if 'CMTSITE' not in os.environ:
    os.environ['CMTSITE'] = ''

# make tmp dir
tmpDir = tempfile.mkdtemp()

# set tmp dir in Client
Client.setGlobalTmpDir(tmpDir)

# fork PID
fork_child_pid = None


# exit action
def _onExit(dirName, hFile):
    # save history only for master process
    if fork_child_pid == 0:
        readline.write_history_file(hFile)
    # remove tmp dir
    commands_get_output('rm -rf %s' % dirName)


atexit.register(_onExit, tmpDir, historyFile)
Example #16
def main():
    # parse option
    parser = argparse.ArgumentParser(conflict_handler="resolve")
    parser.add_argument("-v",
                        action="store_true",
                        dest="verbose",
                        default=False,
                        help="Verbose")
    parser.add_argument('-c',
                        action='store',
                        dest='comString',
                        default='',
                        type=str,
                        help='Execute a command in the batch mode')
    parser.add_argument("-3",
                        action="store_true",
                        dest="python3",
                        default=False,
                        help="Use python3")
    parser.add_argument('--version',
                        action='store_const',
                        const=True,
                        dest='version',
                        default=False,
                        help='Displays version')
    parser.add_argument('--devSrv',
                        action='store_const',
                        const=True,
                        dest='devSrv',
                        default=False,
                        help=argparse.SUPPRESS)
    parser.add_argument('--intrSrv',
                        action='store_const',
                        const=True,
                        dest='intrSrv',
                        default=False,
                        help=argparse.SUPPRESS)
    # option for jupyter notebook
    parser.add_argument('--prompt_with_newline',
                        action='store_const',
                        const=True,
                        dest='prompt_with_newline',
                        default=False,
                        help=argparse.SUPPRESS)

    options, args = parser.parse_known_args()

    # display version
    if options.version:
        print("Version: %s" % PandaToolsPkgInfo.release_version)
        sys.exit(0)

    # use dev server
    if options.devSrv:
        Client.useDevServer()

    # use INTR server
    if options.intrSrv:
        Client.useIntrServer()

    # fork for Ctrl-C
    global fork_child_pid
    fork_child_pid = os.fork()
    if fork_child_pid == -1:
        print("ERROR : Failed to fork")
        sys.exit(1)
    if fork_child_pid == 0:
        # main
        # instantiate core
        if options.verbose:
            print(options)
        if options.prompt_with_newline:
            sys.ps1 = ">>> \n"
        pbookCore = PBookCore.PBookCore(verbose=options.verbose)

        # CUI
        intmain(pbookCore, options.comString)
    else:
        # set handler
        signal.signal(signal.SIGINT, catch_sig)
        signal.signal(signal.SIGHUP, catch_sig)
        signal.signal(signal.SIGTERM, catch_sig)
        pid, status = os.wait()
        if os.WIFSIGNALED(status):
            sys.exit(-os.WTERMSIG(status))
        elif os.WIFEXITED(status):
            sys.exit(os.WEXITSTATUS(status))
        else:
            sys.exit(0)
Example #17
        print(job_info)
        print(job_info.attemptNr)
        print(job_info.maxAttempt)
        print(job_info.Files)
        print(job_info.Files[0])
        for f in job_info.Files:
            # print(dir(f))
            print(f._attributes)
            print(f.values())
            print(f.type)
"""

jediTaskID = 10517  # 10607
jediTaskID = 59725
ret = Client.getJediTaskDetails({'jediTaskID': jediTaskID},
                                True,
                                True,
                                verbose=False)
print(ret)

# ret = Client.getTaskStatus(jediTaskID, verbose=False)
# print(ret)

task_info = ret[1]
jobids = task_info['PandaID']
ret = Client.getJobStatus(ids=jobids, verbose=False)
print(ret)

if ret[0] == 0:
    jobs = ret[1]
    left_jobids = []
    ret_jobs = []