Ejemplo n.º 1
0
def test_bacct_done1():
    """Test parsing `bacct -l` output for a not-so-trivial job."""
    # Minimal LSF backend fixture; with `transport='local'` no real
    # connection is made and `auth` is ignored.
    lsf = LsfLrms(
        name='test',
        architecture=gc3libs.Run.Arch.X86_64,
        max_cores=1,
        max_cores_per_job=1,
        max_memory_per_core=1 * GB,
        max_walltime=1 * hours,
        auth=None,  # ignored if `transport` is `local`
        frontend='localhost',
        transport='local',
        bacct='bacct')
    acct = lsf._parse_acct_output("""
Accounting information about jobs that are:
  - submitted by all users.
  - accounted on all projects.
  - completed normally or exited
  - executed on all hosts.
  - submitted to all queues.
  - accounted on all service classes.
------------------------------------------------------------------------------

Job <3329618>, User <rmurri>, Project <default>, Status <DONE>, Queue <pub.1h>,
                     Command <md5sum lsf.o3224113 lsf.o3224132>, Share group ch
                     arged </lsf_biol_all/lsf_aeber/rmurri>
Mon Oct  8 17:08:54: Submitted from host <brutus4>, CWD <$HOME>, Output File <l
                     sf.o%J>;
Mon Oct  8 17:10:01: Dispatched to <a3041>;
Mon Oct  8 17:10:07: Completed <done>.

Accounting information about this job:
     Share group charged </lsf_biol_all/lsf_aeber/rmurri>
     CPU_T     WAIT     TURNAROUND   STATUS     HOG_FACTOR    MEM    SWAP
      0.04       67             73     done         0.0005     3M     34M
------------------------------------------------------------------------------

SUMMARY:      ( time unit: second )
 Total number of done jobs:       1      Total number of exited jobs:     0
 Total CPU time consumed:       0.0      Average CPU time consumed:     0.0
 Maximum CPU time of a job:     0.0      Minimum CPU time of a job:     0.0
 Total wait time in queues:    67.0
 Average wait time in queue:   67.0
 Maximum wait time in queue:   67.0      Minimum wait time in queue:   67.0
 Average turnaround time:        73 (seconds/job)
 Maximum turnaround time:        73      Minimum turnaround time:        73
 Average hog factor of a job:  0.00 ( cpu time / turnaround time )
 Maximum hog factor of a job:  0.00      Minimum hog factor of a job:  0.00
    """)
    # duration = `Completed` time minus `Dispatched` time
    # (17:10:07 - 17:10:01 = 6s)
    assert_equal(acct['duration'], Duration('6s'))
    # CPU_T column of the per-job resource usage line
    assert_equal(acct['used_cpu_time'], Duration('0.04s'))
    # MEM + SWAP columns summed: 3M + 34M = 37MB
    assert_equal(acct['max_used_memory'], Memory('37MB'))
    # timestamps
    # `bacct` timestamps carry no year, so the expected values use the
    # current one
    year = datetime.date.today().year
    assert_equal(acct['lsf_submission_time'],
                 datetime.datetime(year, 10, 8, 17, 8, 54))
    assert_equal(acct['lsf_start_time'],
                 datetime.datetime(year, 10, 8, 17, 10, 1))
    assert_equal(acct['lsf_completion_time'],
                 datetime.datetime(year, 10, 8, 17, 10, 7))
Ejemplo n.º 2
0
 def jwt_expiration_delta(self):
     """datetime.timedelta: time interval until the JSON web token
     expires (default: ``datetime.timedelta(hours=72)``)
     """
     raw_value = self._config.get(self._section, 'jwt_expiration_delta')
     ttl = Duration(raw_value)
     return datetime.timedelta(seconds=ttl.amount(Duration.second))
Ejemplo n.º 3
0
def test_bacct_killed():
    """Test parsing `bacct -l` output for a canceled job."""
    # Minimal LSF backend fixture; with `transport='local'` no real
    # connection is made and `auth` is ignored.
    lsf = LsfLrms(
        name='test',
        architecture=gc3libs.Run.Arch.X86_64,
        max_cores=1,
        max_cores_per_job=1,
        max_memory_per_core=1 * GB,
        max_walltime=1 * hours,
        auth=None,  # ignored if `transport` is `local`
        frontend='localhost',
        transport='local',
        bacct='bacct')
    acct = lsf._parse_acct_output("""
Accounting information about jobs that are:
  - submitted by all users.
  - accounted on all projects.
  - completed normally or exited
  - executed on all hosts.
  - submitted to all queues.
  - accounted on all service classes.
------------------------------------------------------------------------------

Job <3224113>, User <rmurri>, Project <default>, Status <EXIT>, Queue <pub.1h>,
                     Command <sleep 300>, Share group charged </lsf_biol_all/ls
                     f_aeber/rmurri>
Fri Oct  5 17:49:35: Submitted from host <brutus4>, CWD <$HOME>, Output File <l
                     sf.o%J>;
Fri Oct  5 17:50:35: Dispatched to <a3191>;
Fri Oct  5 17:51:30: Completed <exit>; TERM_OWNER: job killed by owner.

Accounting information about this job:
     Share group charged </lsf_biol_all/lsf_aeber/rmurri>
     CPU_T     WAIT     TURNAROUND   STATUS     HOG_FACTOR    MEM    SWAP
      0.04       60            115     exit         0.0003     1M     34M
------------------------------------------------------------------------------

SUMMARY:      ( time unit: second )
 Total number of done jobs:       0      Total number of exited jobs:     1
 Total CPU time consumed:       0.0      Average CPU time consumed:     0.0
 Maximum CPU time of a job:     0.0      Minimum CPU time of a job:     0.0
 Total wait time in queues:    60.0
 Average wait time in queue:   60.0
 Maximum wait time in queue:   60.0      Minimum wait time in queue:   60.0
 Average turnaround time:       115 (seconds/job)
 Maximum turnaround time:       115      Minimum turnaround time:       115
 Average hog factor of a job:  0.00 ( cpu time / turnaround time )
 Maximum hog factor of a job:  0.00      Minimum hog factor of a job:  0.00
""")
    # duration = `Completed` time minus `Dispatched` time
    # (17:51:30 - 17:50:35 = 55s)
    assert_equal(acct['duration'], Duration('55s'))
    # CPU_T column of the per-job resource usage line
    assert_equal(acct['used_cpu_time'], Duration('0.04s'))
    # MEM + SWAP columns summed: 1M + 34M = 35MB
    assert_equal(acct['max_used_memory'], Memory('35MB'))
    # timestamps
    # `bacct` timestamps carry no year, so the expected values use the
    # current one
    year = datetime.date.today().year
    assert_equal(acct['lsf_submission_time'],
                 datetime.datetime(year, 10, 5, 17, 49, 35))
    assert_equal(acct['lsf_start_time'],
                 datetime.datetime(year, 10, 5, 17, 50, 35))
    assert_equal(acct['lsf_completion_time'],
                 datetime.datetime(year, 10, 5, 17, 51, 30))
Ejemplo n.º 4
0
def test_divide_duration2():
    """Halving a `Duration` must invert doubling, for a random size."""
    half = randint(1, 100)
    whole = Duration(2 * half, unit=Duration.days)
    halved = whole / 2
    assert halved == Duration(half, unit=Duration.days)
    assert 2 * halved == whole
    assert halved * 2 == whole
Ejemplo n.º 5
0
    def __init__(self, path, default_walltime='3h'):
        """
        Load job configuration data from the CSV file at `path`.

        Each data row must have exactly 9 fields; rows whose numeric
        fields (1, 3, 4, 5) do not parse as integers -- e.g. a header
        line -- are skipped with a debug message.  A row whose first
        field is ``default`` (case-insensitive) provides the defaults;
        every other row is stored in `self.cfg` keyed by its first
        field.

        :param str path: path of the CSV configuration file to read.
        :param str default_walltime:
          walltime used when a row's own walltime field (field 6)
          cannot be parsed as a `Duration`.
        """
        self.path = path
        self.cfg = {}
        self.defaults = {}
        # NOTE: mode 'rU' was removed in Python 3.11; plain 'r' already
        # performs universal-newline translation in Python 3.
        with open(path, 'r') as fd:
            log.debug("Reading CSV configuration file %s", path)
            cr = csv.reader(fd)
            lineno = 0
            for line in cr:
                lineno += 1
                if len(line) != 9:
                    log.warning(
                        "Ignoring line '%d' in csv configuration file %s: wrong number of fields (%d != 9)",
                        lineno, path, len(line))
                    continue
                if line[0] in self.cfg:
                    # BUGFIX: the original log call referenced the
                    # undefined name `csvcfgfile`, raising `NameError`
                    # whenever a duplicate key was actually seen.
                    log.warning(
                        "Overwriting duplicate key in '%s' csv configuration file: '%s'",
                        path, line[0])

                try:
                    # Check if this is a header line. These values
                    # should always be integers in data rows.
                    int(line[1])
                    int(line[3])
                    int(line[4])
                    int(line[5])
                except ValueError:
                    log.debug(
                        "Ignoring line '%d' of file %s, some values do not convert to integer as expected.",
                        lineno, path)
                    continue
                data = {
                    'fps': line[1],
                    'pixel_to_scale': line[2],
                    'difference_lag': line[3],
                    'threshold1': line[4],
                    'threshold2': line[5],
                    # the video is optional only if field 7 literally
                    # says so (case-insensitive)
                    'video_is_needed': (line[7].lower() != 'optional'),
                    'email_to': line[8],
                }
                try:
                    data['requested_walltime'] = Duration(line[6])
                except ValueError as ex:
                    log.error(
                        "Unable to parse walltime '%s' for key %s in file"
                        " %s: %s. Using default value of %s", line[6], line[0],
                        path, ex, default_walltime)
                    data['requested_walltime'] = Duration(default_walltime)
                key = line[0]
                if key.lower() == "default":
                    self.defaults = data
                else:
                    self.cfg[key] = data
Ejemplo n.º 6
0
    def update_job_state(self, app):
        """
        Query the running status of the local process whose PID is
        stored into `app.execution.lrms_jobid`, and map the POSIX
        process status to GC3Libs `Run.State`.

        If the process is running and `app.requested_walltime` is set,
        also enforce both the task's own walltime request and the
        resource-level `max_walltime` as running-time limits,
        cancelling the task when either is exceeded.

        :return: the (possibly updated) ``app.execution.state``.
        """
        self.transport.connect()
        pid = app.execution.lrms_jobid
        # look for a `ps ax` line that starts with exactly this PID
        exit_code, stdout, stderr = self.transport.execute_command(
            "ps ax | grep -E '^ *%d '" % pid)
        if exit_code == 0:
            log.debug("Process with PID %s found."
                      " Checking its running status ...", pid)
            # Process exists. Check the status
            # NOTE(review): field index 2 is assumed to be the STAT
            # column of `ps ax` output (PID TTY STAT TIME CMD) --
            # confirm this holds on every supported platform.
            status = stdout.split()[2]
            if status[0] == 'T':
                # Job stopped
                app.execution.state = Run.State.STOPPED
            elif status[0] in ['R', 'I', 'U', 'S', 'D', 'W']:
                # Job is running. Check manpage of ps both on linux
                # and BSD to know the meaning of these statuses.
                app.execution.state = Run.State.RUNNING
                # if `requested_walltime` is set, enforce it as a
                # running time limit
                if app.requested_walltime is not None:
                    # `ps -o etimes=` prints the elapsed run time of the
                    # process in seconds, with no header line
                    exit_code2, stdout2, stderr2 = self.transport.execute_command(
                        "ps -p %d -o etimes=" % pid)
                    if exit_code2 != 0:
                        # job terminated already, do cleanup and return
                        self._cleanup_terminating_task(app, pid)
                        return app.execution.state
                    cancel = False
                    elapsed = Duration(stdout2.strip() + 'seconds')
                    if elapsed > self.max_walltime:
                        log.warning("Task %s ran for %s, exceeding max_walltime %s of resource %s: cancelling it.",
                                    app, elapsed.to_timedelta(), self.max_walltime, self.name)
                        cancel = True
                    if elapsed > app.requested_walltime:
                        log.warning("Task %s ran for %s, exceeding own `requested_walltime` %s: cancelling it.",
                                    app, elapsed.to_timedelta(), app.requested_walltime)
                        cancel = True
                    if cancel:
                        self.cancel_job(app)
                        # set signal to SIGTERM in termination status
                        self._cleanup_terminating_task(app, pid, termstatus=(15, -1))
                        return app.execution.state
        else:
            # no `ps` line matched: the process is gone
            log.debug(
                "Process with PID %d not found,"
                " assuming task %s has finished running.",
                pid, app)
            self._cleanup_terminating_task(app, pid)

        self._get_persisted_resource_state()
        return app.execution.state
Ejemplo n.º 7
0
def test_bjobs_output_for_accounting():
    """Test extracting accounting information from `bjobs -l` output."""
    # Minimal LSF backend fixture; NOTE: unlike the `bacct` fixtures in
    # this file, no `bacct` argument is passed here.
    lsf = LsfLrms(
        name='test',
        architecture=gc3libs.Run.Arch.X86_64,
        max_cores=1,
        max_cores_per_job=1,
        max_memory_per_core=1 * GB,
        max_walltime=1 * hours,
        auth=None,  # ignored if `transport` is `local`
        frontend='localhost',
        transport='local')
    bjobs_output = """
Job <131851>, Job Name <ChromaExtractShort>, User <wwolski>, Project <default>,
                     Status <DONE>, Queue <pub.8h>, Job Priority <50>, Command
                     <ChromatogramExtractor -in /cluster/scratch/malars/openswa
                     th/data/AQUA_fixed_water/split_napedro_L120224_001_SW-400A
                     QUA_no_background_2ul_dilution_10/split_napedro_L120224_00
                     1_SW-400AQUA_no_background_2ul_dilution_10_28.mzML.gz -tr
                     /cluster/scratch/malars/openswath/assays/iRT/DIA_iRT.TraML
                      -out split_napedro_L120224_001_SW-400AQUA_no_background_2
                     ul_dilution_10_28._rtnorm.chrom.mzML -is_swath -min_upper_
                     edge_dist 1 -threads 2>, Share group charged </lsf_biol_al
                     l/lsf_biol_other/wwolski>
Tue Jul 24 10:03:15: Submitted from host <brutus3>, CWD <$HOME/.gc3pie_jobs/lrm
                     s_job.YNZmU17755/.>, Output File <lsf.o%J>, Requested Reso
                     urces <select[mem<70000 && lustre] order[-ut] rusage[mem=1
                     000,m=1]>, Login Shell </bin/sh>, Specified Hosts <thin+9>
                     , <single+8>, <smp16+6>, <smp24+5>, <smp48+4>;

 RUNLIMIT
 480.0 min of a6122
Tue Jul 24 10:04:19: Started on <a6122>, Execution Home </cluster/home/biol/wwo
                     lski>, Execution CWD </cluster/home/biol/wwolski/.gc3pie_j
                     obs/lrms_job.YNZmU17755/.>;
Tue Jul 24 10:05:45: Done successfully. The CPU time used is 2.1 seconds.

 MEMORY USAGE:
 MAX MEM: 41 Mbytes;  AVG MEM: 41 Mbytes

 SCHEDULING PARAMETERS:
           r15s   r1m  r15m   ut      pg    io   ls    it    tmp    swp    mem
 loadSched   -     -     -     -       -     -    -     -  1000M     -      -
 loadStop    -     -     -     -       -     -    -     -     -      -      -

          scratch      xs       s       m       l      xl      sp
 loadSched 4000.0      -       -       -       -       -       -
 loadStop      -       -       -       -       -       -       -
"""

    # Also parse the output of jobs to get accounting information
    acct = lsf._parse_acct_output(bjobs_output)
    # duration = "Done" time minus "Started" time
    # (10:05:45 - 10:04:19 = 86s)
    assert_equal(acct['duration'], Duration('86s'))
    # from "The CPU time used is 2.1 seconds."
    assert_equal(acct['used_cpu_time'], Duration('2.1s'))
    # from "MAX MEM: 41 Mbytes"
    assert_equal(acct['max_used_memory'], Memory('41MB'))
Ejemplo n.º 8
0
    def __parse_acct_output_w_bjobs(stdout):
        """
        Extract accounting information from `bjobs -l` output.

        Returns a dictionary with (whichever could be found of) the
        keys ``used_cpu_time``, ``max_used_memory``,
        ``lsf_submission_time``, ``lsf_start_time``,
        ``lsf_completion_time`` and always ``duration``.
        """
        acct = dict()

        # used CPU time, if reported
        cpu_match = LsfLrms._cpu_time_re.search(stdout)
        if cpu_match is not None:
            acct['used_cpu_time'] = Duration(
                float(cpu_match.group('cputime')), unit=seconds)

        # peak memory usage, if reported (unit should always be Mbytes)
        mem_match = LsfLrms._mem_used_re.search(stdout)
        if mem_match is not None:
            acct['max_used_memory'] = Memory(
                float(mem_match.group('mem_used')), unit=MB)

        # scan for event lines to collect submission / start /
        # completion timestamps
        for line in stdout.split('\n'):
            event_match = LsfLrms._EVENT_RE.match(line)
            if event_match is None:
                continue
            when = line.split(': ')[0]
            event = event_match.group('event')
            if event == 'Submitted':
                acct['lsf_submission_time'] = \
                    LsfLrms._parse_timespec(when)
            elif event in ('Dispatched', 'Started'):
                acct['lsf_start_time'] = \
                    LsfLrms._parse_timespec(when)
            elif event in ('Completed', 'Done successfully'):
                acct['lsf_completion_time'] = \
                    LsfLrms._parse_timespec(when)

        if 'lsf_completion_time' in acct and 'lsf_start_time' in acct:
            acct['duration'] = Duration(
                acct['lsf_completion_time'] - acct['lsf_start_time'])
        else:
            # XXX: what should we use for jobs that did not run at all?
            acct['duration'] = Duration(0, unit=seconds)

        return acct
Ejemplo n.º 9
0
def _legacy_parse_duration(duration_str):
    """
    Convert a configuration-file duration spec into a duration value.

    Old-style configuration gives a bare integer meaning a number of
    hours (a deprecation warning is logged); anything else goes
    through the regular `Duration` parsing rules.
    """
    try:
        num_hours = int(duration_str)
    except ValueError:
        # apply `Duration` parsing rules; if this fails, users will
        # see the error message from the `Duration` parser.
        return Duration(duration_str)
    gc3libs.log.warning("'max_walltime' should always have a "
                        "valid unit format (e.g. '24 hours'). Using "
                        "default unit: hours")
    return num_hours * hours
Ejemplo n.º 10
0
 def __parse_acct_output_w_bacct(stdout):
     """
     Extract accounting information from `bacct -l` output.

     Returns a dictionary with (whichever could be found of) the keys
     ``lsf_submission_time``, ``lsf_start_time``,
     ``lsf_completion_time``, ``duration``, ``used_cpu_time`` and
     ``max_used_memory``.
     """
     acctinfo = {}
     lines = iter(
         stdout.split('\n'))  # need to lookup next line in the loop
     for line in lines:
         match = LsfLrms._EVENT_RE.match(line)
         if match:
             # event lines start with a `<timestamp>: ` prefix
             timestamp = line.split(': ')[0]
             event = match.group('event')
             if event == 'Submitted':
                 acctinfo['lsf_submission_time'] = \
                     LsfLrms._parse_timespec(timestamp)
             elif event == 'Dispatched':
                 acctinfo['lsf_start_time'] = \
                     LsfLrms._parse_timespec(timestamp)
             elif event == 'Completed':
                 acctinfo['lsf_completion_time'] = \
                     LsfLrms._parse_timespec(timestamp)
             continue
         match = LsfLrms._RESOURCE_USAGE_RE.match(line)
         if match:
             # actual resource usage is on next line
             # (consumed from the iterator so the `for` skips it)
             rusage = next(lines)
             cpu_t, wait, turnaround, status, hog_factor, mem, swap = \
                 rusage.split()
             # common backend attrs (see Issue 78)
             if 'lsf_completion_time' in acctinfo and 'lsf_start_time' in acctinfo:
                 acctinfo['duration'] = Duration(
                     acctinfo['lsf_completion_time'] -
                     acctinfo['lsf_start_time'])
             else:
                 # XXX: what should we use for jobs that did not run at all?
                 acctinfo['duration'] = Duration(0, unit=seconds)
             acctinfo['used_cpu_time'] = Duration(float(cpu_t),
                                                  unit=seconds)
             # peak memory is estimated as MEM + SWAP columns
             acctinfo['max_used_memory'] = LsfLrms._parse_memspec(mem)\
                 + LsfLrms._parse_memspec(swap)
             # the resource usage line is the last interesting line
             break
     return acctinfo
Ejemplo n.º 11
0
    def create_debug_run_jobs(self, user_name, job_collection, batches,
                              verbosity, duration, memory, cores):
        '''Creates debug jobs for the parallel "run" phase of the step.

        Parameters
        ----------
        user_name: str
            name of the submitting user
        job_collection: tmlib.workflow.job.RunPhase
            empty collection of *run* jobs that should be populated
        batches: List[dict]
            job descriptions
        verbosity: int
            logging verbosity for jobs
        duration: str
            computational time that should be allocated for a single job;
            in HH:MM:SS format
        memory: int
            amount of memory in Megabyte that should be allocated for a
            single job
        cores: int
            number of CPU cores that should be allocated for a single job

        Returns
        -------
        tmlib.workflow.jobs.RunPhase
            run jobs

        Raises
        ------
        TypeError
            when `cores` is not an integer
        ValueError
            when `cores` is not positive
        '''
        # Validate `cores` once, up front, so a bad value fails fast
        # instead of after jobs have already been created and added to
        # the collection (the original checked inside the loop, once
        # per batch).
        if not isinstance(cores, int):
            raise TypeError('Argument "cores" must have type int.')
        if not cores > 0:
            raise ValueError('The value of "cores" must be positive.')
        logger.info('create "debug" run jobs for submission %d',
                    job_collection.submission_id)
        logger.debug('allocated time for debug run jobs: %s', duration)
        logger.debug('allocated memory for debug run jobs: %s MB', memory)
        logger.debug('allocated cores for debug run jobs: %d', cores)

        for b in batches:
            job = DebugRunJob(step_name=self.step_name,
                              arguments=self._build_debug_run_command(
                                  b['site_id'], verbosity),
                              output_dir=self.log_location,
                              job_id=b['site_id'],
                              submission_id=job_collection.submission_id,
                              parent_id=job_collection.persistent_id,
                              user_name=user_name)
            job.requested_walltime = Duration(duration)
            job.requested_memory = Memory(memory, Memory.MB)
            job.requested_cores = cores
            job_collection.add(job)
        return job_collection
Ejemplo n.º 12
0
    def create_init_job(self,
                        user_name,
                        job_collection,
                        batch_args,
                        verbosity,
                        duration='12:00:00'):
        '''Creates job for the "init" phase of the step.

        Parameters
        ----------
        user_name: str
            name of the submitting user
        job_collection: tmlib.workflow.job.InitPhase
            empty collection of *init* jobs that should be populated
        batch_args: tmlib.workflow.args.BatchArguments
            step-specific implementation of
            :class:`BatchArguments <tmlib.workflow.args.BatchArguments>`
        verbosity: int
            logging verbosity for job
        duration: str, optional
            computational time that should be allocated for the job
            in HH:MM:SS format (default: ``"12:00:00"``)

        Returns
        -------
        tmlib.workflow.jobs.InitPhase
            init job
        '''
        logger.info('create "init" job for submission %d',
                    job_collection.submission_id)
        # FIXME: memory should depend on batch and total size and on
        # the program being run; although 2'500MB seems a lot, it has
        # shown to be barely enough for `illuminati init` jobs in
        # large experiments.  (But is plenty for most other uses.)
        mem_mb = 2500
        n_cores = 1
        init_job = InitJob(
            step_name=self.step_name,
            arguments=self._build_init_command(batch_args, verbosity),
            output_dir=self.log_location,
            submission_id=job_collection.submission_id,
            user_name=user_name,
            parent_id=job_collection.persistent_id)
        init_job.requested_walltime = Duration(duration)
        init_job.requested_memory = Memory(mem_mb, Memory.MB)
        init_job.requested_cores = n_cores
        job_collection.add(init_job)
        return job_collection
Ejemplo n.º 13
0
    def create_init_job(self,
                        user_name,
                        job_collection,
                        batch_args,
                        verbosity,
                        duration='12:00:00'):
        '''Creates job for the "init" phase of the step.

        Parameters
        ----------
        user_name: str
            name of the submitting user
        job_collection: tmlib.workflow.job.InitPhase
            empty collection of *init* jobs that should be populated
        batch_args: tmlib.workflow.args.BatchArguments
            step-specific implementation of
            :class:`BatchArguments <tmlib.workflow.args.BatchArguments>`
        verbosity: int
            logging verbosity for job
        duration: str, optional
            computational time that should be allocated for the job
            in HH:MM:SS format (default: ``"12:00:00"``)

        Returns
        -------
        tmlib.workflow.jobs.InitPhase
            init job
        '''
        logger.info('create "init" job for submission %d',
                    job_collection.submission_id)
        # NOTE(review): `max_memory_per_core` is passed to `Memory(...,
        # Memory.MB)` as a plain amount -- confirm the config attribute
        # is a number expressed in MB rather than a Memory quantity.
        mem = cfg.resource.max_memory_per_core
        n_cores = 1
        logger.debug('allocated time for "init" job: %s', duration)
        logger.debug('allocated memory for "init" job: %s', mem)
        logger.debug('allocated cores for "init" job: %d', n_cores)
        init_job = InitJob(
            step_name=self.step_name,
            arguments=self._build_init_command(batch_args, verbosity),
            output_dir=self.log_location,
            submission_id=job_collection.submission_id,
            user_name=user_name,
            parent_id=job_collection.persistent_id)
        init_job.requested_walltime = Duration(duration)
        init_job.requested_memory = Memory(mem, Memory.MB)
        init_job.requested_cores = n_cores
        job_collection.add(init_job)
        return job_collection
Ejemplo n.º 14
0
    def create_collect_job(self,
                           user_name,
                           job_collection,
                           verbosity,
                           duration='06:00:00'):
        '''Creates job for the "collect" phase of the step.

        Parameters
        ----------
        user_name: str
            name of the submitting user
        job_collection: tmlib.workflow.job.CollectPhase
            empty collection of *collect* jobs that should be populated
        verbosity: int
            logging verbosity for jobs
        duration: str, optional
            computational time that should be allocated for a single job;
            in HH:MM:SS format (default: ``"06:00:00"``)

        Returns
        -------
        tmlib.workflow.jobs.CollectJob
            collect job
        '''
        logger.info('create "collect" job for submission %d',
                    job_collection.submission_id)
        mem = cfg.resource.max_memory_per_core
        n_cores = 1
        logger.debug('allocated time for "collect" job: %s', duration)
        logger.debug('allocated memory for "collect" job: %s', mem)
        logger.debug('allocated cores for "collect" job: %d', n_cores)
        collect_job = CollectJob(
            step_name=self.step_name,
            arguments=self._build_collect_command(verbosity),
            output_dir=self.log_location,
            submission_id=job_collection.submission_id,
            user_name=user_name,
            parent_id=job_collection.persistent_id)
        collect_job.requested_walltime = Duration(duration)
        collect_job.requested_memory = Memory(mem, Memory.MB)
        collect_job.requested_cores = n_cores
        job_collection.add(collect_job)
        return job_collection
Ejemplo n.º 15
0
    def create_collect_job(self,
                           user_name,
                           job_collection,
                           verbosity,
                           duration='06:00:00'):
        '''Creates job for the "collect" phase of the step.

        Parameters
        ----------
        user_name: str
            name of the submitting user
        job_collection: tmlib.workflow.job.CollectPhase
            empty collection of *collect* jobs that should be populated
        verbosity: int
            logging verbosity for jobs
        duration: str, optional
            computational time that should be allocated for a single job;
            in HH:MM:SS format (default: ``"06:00:00"``)

        Returns
        -------
        tmlib.workflow.jobs.CollectJob
            collect job
        '''
        logger.info('create "collect" job for submission %d',
                    job_collection.submission_id)
        # FIXME: see the similar comment in `create_init_job` about the
        # amount of memory to allocate.
        mem_mb = 2500
        n_cores = 1
        collect_job = CollectJob(
            step_name=self.step_name,
            arguments=self._build_collect_command(verbosity),
            output_dir=self.log_location,
            submission_id=job_collection.submission_id,
            user_name=user_name,
            parent_id=job_collection.persistent_id)
        collect_job.requested_walltime = Duration(duration)
        collect_job.requested_memory = Memory(mem_mb, Memory.MB)
        collect_job.requested_cores = n_cores
        job_collection.add(collect_job)
        return job_collection
Ejemplo n.º 16
0
    def _parse_duration(d):
        """
        Parse a SLURM duration expression, in the form ``DD-HH:MM:SS.UUU``.

        The ``DD``, ``HH`` and ``.UUU`` parts are optional.
        """
        if '-' in d:
            # leading DD- part is present
            day_count, d = d.split('-')
            total = Duration(int(day_count), unit=days)
        else:
            total = Duration(0, unit=seconds)
        # after reversal: fields[0]=SS[.UUU], fields[1]=MM, fields[2]=HH
        fields = list(reversed(d.split(':')))
        assert len(fields) > 0
        sec_spec = fields[0]
        if '.' in sec_spec:
            # fractional seconds: SS.UUU
            total += Duration(float(sec_spec), unit=seconds)
        else:
            total += Duration(int(sec_spec), unit=seconds)
        if len(fields) > 1:
            total += Duration(int(fields[1]), unit=minutes)
        if len(fields) > 2:
            total += Duration(int(fields[2]), unit=hours)
        return total
Ejemplo n.º 17
0
    def create_job(self,
                   submission_id,
                   user_name,
                   duration='06:00:00',
                   memory=(cfg.resource.max_cores_per_job *
                           cfg.resource.max_memory_per_core.amount(Memory.MB)),
                   cores=cfg.resource.max_cores_per_job):
        '''Creates a job for asynchroneous processing of a client tool request.

        Parameters
        ----------
        submission_id: int
            ID of the corresponding submission
        user_name: str
            name of the submitting user
        duration: str, optional
            computational time that should be allocated for the job
            in HH:MM:SS format (default: ``"06:00:00"``)
        memory: int, optional
            amount of memory in Megabyte that should be allocated for the job
            (defaults to
            :attr:`resource.max_cores_per_job <tmlib.config.LibraryConfig.resource>` x
            :attr:`resource.max_memory_per_core <tmlib.config.LibraryConfig.resource>`)
        cores: int, optional
            number of CPU cores that should be allocated for the job
            (defaults to
            :attr:`resource.max_cores_per_job <tmlib.config.LibraryConfig.resource>`)

        Returns
        -------
        tmlib.tools.jobs.ToolJob
            tool job

        Raises
        ------
        TypeError
            when `cores` is not an integer
        ValueError
            when `cores` is not positive

        Note
        ----
        The defaults for `memory` and `cores` are computed from `cfg`
        once, when this function is defined; later changes to the
        configuration do not affect them.
        '''
        # BUGFIX: validate `cores` *before* it is used -- the original
        # only type/range-checked it at the very end, after it had
        # already been compared against resource limits.
        if not isinstance(cores, int):
            raise TypeError('Argument "cores" must have type int.')
        if not cores > 0:
            raise ValueError('The value of "cores" must be positive.')

        logger.info('create tool job for submission %d', submission_id)

        # clamp requested cores to what a single node offers
        if cores > cfg.resource.max_cores_per_job:
            logger.warn('requested cores exceed available cores per node:  %s',
                        cfg.resource.max_cores_per_job)
            logger.debug('setting number of cores to %d',
                         cfg.resource.max_cores_per_job)
            cores = cfg.resource.max_cores_per_job

        max_memory_per_node = (
            cfg.resource.max_cores_per_job *
            cfg.resource.max_memory_per_core.amount(Memory.MB))
        max_memory_per_core = cfg.resource.max_memory_per_core.amount(
            Memory.MB)
        if cores == 1:
            if memory > max_memory_per_core:
                # We just warn here, since this may still work.
                logger.warn(
                    'requested memory exceeds available memory per core: %d MB',
                    max_memory_per_core)
        else:
            if memory > max_memory_per_node:
                logger.warn(
                    'requested memory exceeds available memory per node: %d MB',
                    max_memory_per_node)
                logger.debug('setting memory to %d MB', max_memory_per_node)
                memory = max_memory_per_node

        logger.debug('allocated time for job: %s', duration)
        logger.debug('allocated memory for job: %d MB', memory)
        logger.debug('allocated cores for job: %d', cores)
        job = ToolJob(tool_name=self.tool_name,
                      arguments=self._build_command(submission_id),
                      output_dir=self._log_location,
                      submission_id=submission_id,
                      user_name=user_name)
        job.requested_walltime = Duration(duration)
        job.requested_memory = Memory(memory, Memory.MB)
        job.requested_cores = cores
        return job
Ejemplo n.º 18
0
    def _parse_acct_output(self, stdout, stderr):
        """
        Parse ``sacct`` output and return a dictionary of accounting data.

        :param str stdout: standard output of the ``sacct`` command; each
          non-empty line is a ``|``-separated record (the trailing ``|``
          produces one extra empty field per line).
        :param str stderr: standard error of the ``sacct`` command
          (currently ignored).
        :return: dictionary with keys ``cores``, ``duration``,
          ``used_cpu_time``, ``max_used_memory``, the SLURM-specific
          ``slurm_*`` timestamp/memory records, and -- when exit
          information was found in a master job record -- ``termstatus``.
        """
        # defaults, in case no records are found in `stdout`
        acct = {
            'cores': 0,
            'duration': Duration(0, unit=seconds),
            'used_cpu_time': Duration(0, unit=seconds),
            'max_used_memory': Memory(0, unit=bytes)
        }
        # collected from the master job record; stay `None` if absent
        exitcode = None
        signal = None
        for line in stdout.split('\n'):
            line = line.strip()
            if line == '':
                continue
            # because of the trailing `|` we have an extra empty field
            jobid, exit, state, ncpus, elapsed, totalcpu, submit,\
                start, end, maxrss, maxvmsize, _ = line.split('|')

            # In some case the state can contain a specification,
            # e.g. "CANCELLED by 1000"
            state = state.split()[0]

            # SLURM job IDs have the form `jobID[.step]`: only the
            # lines with the `step` part carry resource usage records,
            # whereas the total `jobID` line carries the exit codes
            # and overall duration/timing information.
            if '.' not in jobid:
                if state not in [
                        'BOOT_FAIL',
                        'CANCELLED',
                        'COMPLETED',
                        'FAILED',
                        'NODE_FAIL',
                        'PREEMPTED',
                        'TIMEOUT',
                ]:
                    raise gc3libs.exceptions.UnexpectedJobState(
                        "Unexpected SLURM job state '{state}'"
                        " encountered in parsing `sacct` output".format(
                            state=state))
                # master job record
                acct['duration'] = SlurmLrms._parse_duration(elapsed)
                acct['used_cpu_time'] = SlurmLrms._parse_duration(totalcpu)
                if state in ['CANCELLED', 'TIMEOUT']:
                    # In this case, the exit code of the master job is
                    # `0:0` or `0:1`, but we want to keep track of the
                    # fact that the job was killed by the system (or
                    # the user).
                    exitcode = os.EX_TEMPFAIL
                    signal = int(Run.Signals.RemoteKill)
                elif state == 'NODE_FAIL':
                    exitcode = os.EX_TEMPFAIL
                    signal = int(Run.Signals.RemoteError)
                else:
                    # compute POSIX exit status
                    exitcode_, signal_ = exit.split(':')
                    exitcode = int(exitcode_)
                    signal = int(signal_)
                # XXX: the master job record seems to report the
                # *requested* slots, whereas the step records report
                # the actual usage.  In our case these should be the
                # same, as the job script only runs one single step.
                # However, in the general case computing the *actual*
                # CPU usage is a mess, as we would have to check which
                # steps were executed simultaneously and which ones
                # were executed one after the other...
                acct['cores'] = int(ncpus)
                # provide starting point for resource usage records
                acct['max_used_memory'] = Memory(0, unit=MB)
                acct['slurm_max_used_ram'] = Memory(0, unit=MB)
                # XXX: apparently, Ubuntu's SLURM 2.3 has a bug
                # wherein `submit` == `end` in the master job record,
                # and the actual start time must be gathered from the
                # step records... try to work around
                submit = SlurmLrms._parse_timestamp(submit)
                start = SlurmLrms._parse_timestamp(start)
                end = SlurmLrms._parse_timestamp(end)
                acct['slurm_submission_time'] = min(submit, start)
                acct['slurm_start_time'] = end  # actually computed below
                acct['slurm_completion_time'] = max(submit, start, end)
            else:
                # common resource usage records (see Issue 78)
                vmem = SlurmLrms._parse_memspec(maxvmsize)
                if vmem is not None:
                    acct['max_used_memory'] = max(vmem,
                                                  acct['max_used_memory'])
                # SLURM-specific resource usage records
                mem = SlurmLrms._parse_memspec(maxrss)
                if mem is not None:
                    acct['slurm_max_used_ram'] = max(
                        mem, acct['slurm_max_used_ram'])
                # XXX: see above for timestamps
                # NOTE: these `min()` updates assume the master job
                # record was already seen (it precedes step records in
                # `sacct` output), so the keys exist by now
                submit = SlurmLrms._parse_timestamp(submit)
                start = SlurmLrms._parse_timestamp(start)
                acct['slurm_submission_time'] = min(
                    submit, acct['slurm_submission_time'])
                acct['slurm_start_time'] = min(start, acct['slurm_start_time'])
        # must compute termination status since it's not provided by `squeue`
        if signal is not None and exitcode is not None:
            acct['termstatus'] = (signal & 0x7f) + ((exitcode & 0xff) << 8)
        return acct
Ejemplo n.º 19
0
    def create_job(
            self,
            submission_id,
            user_name,
            duration='06:00:00',
            # if all cores are used, we should allocate all available memory as well
            memory=cfg.resource.max_memory_per_core.amount(Memory.MB),
            cores=cfg.resource.max_cores_per_job):
        '''Creates a job for asynchronous processing of a client tool request.

        Parameters
        ----------
        submission_id: int
            ID of the corresponding submission
        user_name: str
            name of the submitting user
        duration: str, optional
            computational time that should be allocated for the job
            in HH:MM:SS format (default: ``"06:00:00"``)
        memory: int, optional
            amount of memory in Megabyte that should be allocated for the job
            (defaults to
            :attr:`resource.max_memory_per_core <tmlib.config.LibraryConfig.resource>`)
        cores: int, optional
            number of CPU cores that should be allocated for the job
            (defaults to
            :attr:`resource.max_cores_per_job <tmlib.config.LibraryConfig.resource>`)

        Returns
        -------
        tmlib.tools.jobs.ToolJob
            tool job

        Raises
        ------
        TypeError
            when `cores` cannot be converted to type `int`
        ValueError
            when `cores` is not a positive number
        '''
        logger.info('create tool job for submission %d', submission_id)

        # validate `cores` before using it in any comparison below
        try:
            cores = int(cores)
        except (ValueError, TypeError) as err:
            raise TypeError(
                'Argument "cores" cannot be converted to type `int`: {err}'.
                format(err=err))
        if not cores > 0:
            raise ValueError('The value of "cores" must be positive.')

        # cap requested cores at what a single node can provide
        if cores > cfg.resource.max_cores_per_job:
            logger.warning(
                'requested cores exceed available cores per node:  %s',
                cfg.resource.max_cores_per_job)
            logger.warning('lowering number of cores to %d (max available)',
                           cfg.resource.max_cores_per_job)
            cores = cfg.resource.max_cores_per_job

        # FIXME: this needs to be revisited when GC3Pie issue #624 is fixed;
        # for the moment, see https://github.com/uzh/gc3pie/issues/624#issuecomment-328122862
        # as to why this is the right way to compute max memory
        max_memory_per_node = cfg.resource.max_memory_per_core.amount(
            Memory.MB)
        if memory > max_memory_per_node:
            logger.warning(
                'requested memory exceeds available memory per node: %d MB',
                max_memory_per_node)
            logger.warning('lowering memory to %d MB', max_memory_per_node)
            memory = max_memory_per_node

        logger.debug('allocated time for job: %s', duration)
        logger.debug('allocated memory for job: %s MB', memory)
        logger.debug('allocated cores for job: %d', cores)
        job = ToolJob(tool_name=self.tool_name,
                      arguments=self._build_command(submission_id),
                      output_dir=self._log_location,
                      submission_id=submission_id,
                      user_name=user_name)
        job.requested_walltime = Duration(duration)
        job.requested_memory = Memory(memory, Memory.MB)
        job.requested_cores = cores
        return job
Ejemplo n.º 20
0
    def _parse_acct_output(self, stdout):
        """
        Parse ``sacct`` output and return a dictionary of accounting data.

        :param str stdout: standard output of the ``sacct`` command; each
          non-empty line is a ``|``-separated record (the trailing ``|``
          produces one extra empty field per line).
        :return: dictionary with keys ``exitcode``, ``signal``, ``cores``,
          ``duration``, ``used_cpu_time``, ``max_used_memory`` and the
          SLURM-specific ``slurm_*`` timestamp/memory records.
        """
        acct = dict(exitcode=0,
                    cores=0,
                    duration=Duration(0, unit=seconds),
                    used_cpu_time=Duration(0, unit=seconds),
                    max_used_memory=Memory(0, unit=bytes))
        for line in stdout.split('\n'):
            line = line.strip()
            if line == '':
                continue
            # because of the trailing `|` we have an extra empty field
            jobid, exit, state, ncpus, elapsed, totalcpu, submit,\
                start, end, maxrss, maxvmsize, _ = line.split('|')

            # In some case the state can contain a specification, as
            # "CANCELLED by 1000"
            state = state.split()[0]

            # SLURM job IDs have the form `jobID[.step]`: only the
            # lines with the `step` part carry resource usage records,
            # whereas the total `jobID` line carries the exit codes
            # and overall duration/timing information.
            if '.' not in jobid:
                # explicit `raise` instead of `assert`, so the check is
                # not stripped when Python runs with `-O`
                if state not in [
                        'CANCELLED', 'COMPLETED', 'FAILED', 'NODE_FAIL',
                        'PREEMPTED', 'TIMEOUT'
                ]:
                    raise AssertionError(
                        "Unexpected SLURM job state '%s'"
                        " encountered in parsing `sacct` output" % state)
                # master job record
                acct['duration'] = SlurmLrms._parse_duration(elapsed)
                acct['used_cpu_time'] = SlurmLrms._parse_duration(totalcpu)
                if state in ['CANCELLED', 'TIMEOUT']:
                    # In this case, the exit code of the master job is
                    # `0:0` or `0:1`, but we want to keep track of the
                    # fact that the job was killed by the system (or
                    # the user).
                    acct['exitcode'] = os.EX_TEMPFAIL
                    acct['signal'] = int(Run.Signals.RemoteKill)
                elif state == 'NODE_FAIL':
                    acct['exitcode'] = os.EX_TEMPFAIL
                    acct['signal'] = int(Run.Signals.RemoteError)
                else:
                    # compute POSIX exit status; convert to `int` for
                    # consistency with the other branches above
                    exitcode_, signal_ = exit.split(':')
                    acct['exitcode'] = int(exitcode_)
                    acct['signal'] = int(signal_)
                # XXX: the master job record seems to report the
                # *requested* slots, whereas the step records report
                # the actual usage.  In our case these should be the
                # same, as the job script only runs one single step.
                # However, in the general case computing the *actual*
                # CPU usage is a mess, as we would have to check which
                # steps were executed simultaneously and which ones
                # were executed one after the other...
                acct['cores'] = int(ncpus)
                # provide starting point for resource usage records
                acct['max_used_memory'] = Memory(0, unit=MB)
                acct['slurm_max_used_ram'] = Memory(0, unit=MB)
                # XXX: apparently, Ubuntu's SLURM 2.3 has a bug
                # wherein `submit` == `end` in the master job record,
                # and the actual start time must be gathered from the
                # step records... try to work around
                submit = SlurmLrms._parse_timestamp(submit)
                start = SlurmLrms._parse_timestamp(start)
                end = SlurmLrms._parse_timestamp(end)
                acct['slurm_submission_time'] = min(submit, start)
                acct['slurm_start_time'] = end  # will be set when
                # looping on tasks,
                # see below
                acct['slurm_completion_time'] = max(submit, start, end)
            else:
                # common resource usage records (see Issue 78)
                # `_parse_memspec` may return ``None`` for an empty
                # field; skip such values instead of comparing them
                vmem = SlurmLrms._parse_memspec(maxvmsize)
                if vmem is not None:
                    acct['max_used_memory'] = max(vmem,
                                                  acct['max_used_memory'])
                # SLURM-specific resource usage records
                mem = SlurmLrms._parse_memspec(maxrss)
                if mem is not None:
                    acct['slurm_max_used_ram'] = max(
                        mem, acct['slurm_max_used_ram'])
                # XXX: see above for timestamps
                submit = SlurmLrms._parse_timestamp(submit)
                start = SlurmLrms._parse_timestamp(start)
                acct['slurm_submission_time'] = min(
                    submit, acct['slurm_submission_time'])
                acct['slurm_start_time'] = min(start, acct['slurm_start_time'])
        return acct
Ejemplo n.º 21
0
def test_bacct_done0():
    """Test parsing accounting information of a <sleep 60> job."""
    # gotten with `bacct -l "jobid"`
    lsf = LsfLrms(name='test',
                  architecture=gc3libs.Run.Arch.X86_64,
                  max_cores=1,
                  max_cores_per_job=1,
                  max_memory_per_core=1 * GB,
                  max_walltime=1 * hours,
                  auth=None,  # ignored if `transport` is `local`
                  frontend='localhost',
                  transport='local',
                  bacct='bacct')
    acct = lsf._parse_acct_output("""
Accounting information about jobs that are:
  - submitted by all users.
  - accounted on all projects.
  - completed normally or exited
  - executed on all hosts.
  - submitted to all queues.
  - accounted on all service classes.
------------------------------------------------------------------------------

Job <3329613>, User <rmurri>, Project <default>, Status <DONE>, Queue <pub.1h>,
                     Command <sleep 60>, Share group charged </lsf_biol_all/lsf
                     _aeber/rmurri>
Mon Oct  8 17:07:54: Submitted from host <brutus4>, CWD <$HOME>, Output File <l
                     sf.o%J>;
Mon Oct  8 17:08:44: Dispatched to <a3201>;
Mon Oct  8 17:09:51: Completed <done>.

Accounting information about this job:
     Share group charged </lsf_biol_all/lsf_aeber/rmurri>
     CPU_T     WAIT     TURNAROUND   STATUS     HOG_FACTOR    MEM    SWAP
      0.08       50            117     done         0.0007     5M    222M
------------------------------------------------------------------------------

SUMMARY:      ( time unit: second )
 Total number of done jobs:       1      Total number of exited jobs:     0
 Total CPU time consumed:       0.1      Average CPU time consumed:     0.1
 Maximum CPU time of a job:     0.1      Minimum CPU time of a job:     0.1
 Total wait time in queues:    50.0
 Average wait time in queue:   50.0
 Maximum wait time in queue:   50.0      Minimum wait time in queue:   50.0
 Average turnaround time:       117 (seconds/job)
 Maximum turnaround time:       117      Minimum turnaround time:       117
 Average hog factor of a job:  0.00 ( cpu time / turnaround time )
 Maximum hog factor of a job:  0.00      Minimum hog factor of a job:  0.00

    """,
    # STDERR
    '')
    # duration is dispatch-to-completion: 17:08:44 -> 17:09:51 is 67s
    assert acct['duration'] == Duration('67s')
    # matches the CPU_T column in the accounting table
    assert acct['used_cpu_time'] == Duration('0.08s')
    # MEM (5M) plus SWAP (222M) from the accounting table
    assert acct['max_used_memory'] == Memory('227MB')
    # timestamps
    # `bacct` output carries no year, so the parser assumes the current one
    year = datetime.date.today().year
    assert (acct['lsf_submission_time'] ==
                 datetime.datetime(year, 10, 8, 17, 7, 54))
    assert (acct['lsf_start_time'] ==
                 datetime.datetime(year, 10, 8, 17, 8, 44))
    assert (acct['lsf_completion_time'] ==
                 datetime.datetime(year, 10, 8, 17, 9, 51))
Ejemplo n.º 22
0
    def __init__(self, simulation_dir, executable=None, **extra_args):
        """
        Set up a GEOtop simulation run through the `geotop_wrap.sh` wrapper.

        :param str simulation_dir: directory holding the simulation input
          files; output is collected in its ``tmp`` subdirectory.
        :param str executable: path to the GEOtop binary to run.  Although
          the parameter defaults to ``None`` for backward compatibility,
          an executable is required in both the shared- and the
          non-shared-filesystem case; a `ValueError` is raised if missing.
        :param extra_args: forwarded to `Application.__init__`; must
          contain key ``shared_FS`` (whether compute nodes share a
          filesystem with the submission host).
        """
        # remember for later
        self.simulation_dir = simulation_dir
        self.shared_FS = extra_args['shared_FS']

        # Both branches below dereference `executable`; without this
        # check a missing executable used to surface as a confusing
        # `NameError`/`TypeError` further down.
        if executable is None:
            raise ValueError(
                "Missing required argument `executable`:"
                " path to the GEOtop binary to run")

        inputs = dict()

        # execution wrapper needs to be added anyway
        geotop_wrapper_sh = resource_filename(Requirement.parse("gc3pie"),
                                              "gc3libs/etc/geotop_wrap.sh")
        inputs[geotop_wrapper_sh] = os.path.basename(geotop_wrapper_sh)

        _command = "./%s " % os.path.basename(geotop_wrapper_sh)

        # If shared_FS, no inputs are defined
        # as they are already available on the computational nodes
        if not self.shared_FS:
            # compress input folder and stage it to the remote end
            inputs.update(dict(self._scan_and_tar(simulation_dir)))

            # retrieve every file the job produces
            outputs = gc3libs.ANY_OUTPUT

            # stage the executable and run it from the job's CWD
            executable_name = './' + os.path.basename(executable)
            inputs[executable] = os.path.basename(executable)

            _command += "input.tgz "
        else:
            # sharedFS: everything is local
            executable_name = os.path.abspath(executable)
            _command += " %s " % os.path.abspath(self.simulation_dir)
            outputs = []

        _command += "%s" % executable_name

        # set some execution defaults...
        extra_args.setdefault('requested_cores', 1)
        extra_args.setdefault('requested_architecture', Run.Arch.X86_64)
        extra_args.setdefault('requested_walltime', Duration(8, hours))
        # ...and remove excess ones
        extra_args.pop('output_dir', None)
        Application.__init__(
            self,
            # the wrapper script receives the (staged or local) input
            # location followed by the executable to run
            arguments=_command,
            inputs=inputs,
            outputs=outputs,
            output_dir=os.path.join(simulation_dir, 'tmp'),
            stdout='ggeotop.log',
            join=True,
            tags=['APPS/EARTH/GEOTOP-1.224'],
            **extra_args)
Ejemplo n.º 23
0
    def create_run_jobs(self, user_name, job_collection, verbosity, duration,
                        memory, cores):
        '''Creates jobs for the parallel "run" phase of the step.

        Parameters
        ----------
        user_name: str
            name of the submitting user
        job_collection: tmlib.workflow.job.RunPhase
            empty collection of *run* jobs that should be populated
        verbosity: int
            logging verbosity for jobs
        duration: str
            computational time that should be allocated for a single job;
            in HH:MM:SS format
        memory: int
            amount of memory in Megabyte that should be allocated for a single
            job
        cores: int
            number of CPU cores that should be allocated for a single job

        Returns
        -------
        tmlib.workflow.jobs.RunPhase
            collection of jobs
        '''
        logger.info('create "run" jobs for submission %d',
                    job_collection.submission_id)

        # cap requested cores at what a single node can provide
        if cores > cfg.resource.max_cores_per_job:
            logger.warning(
                'requested cores exceed available cores per job:  %s',
                cfg.resource.max_cores_per_job)
            logger.debug('lowering number of requested cores to %d',
                         cfg.resource.max_cores_per_job)
            cores = cfg.resource.max_cores_per_job

        # Until issue gc3pie#624 is fixed, `max_memory_per_core`
        # doubles up as "total memory per node"
        max_memory_per_node = cfg.resource.max_memory_per_core.amount(
            Memory.MB)
        if memory > max_memory_per_node:
            logger.warning(
                'requested memory exceeds available memory per node: %d MB',
                max_memory_per_node)
            logger.debug('lowering requested memory to %d MB',
                         max_memory_per_node)
            memory = max_memory_per_node

        logger.debug('allocated time for run jobs: %s', duration)
        logger.debug('allocated memory for run jobs: %s', memory)
        logger.debug('allocated cores for run jobs: %d', cores)

        # one `RunJob` per pre-computed job ID, all sharing the same
        # resource requirements
        job_ids = self.get_run_job_ids()
        for j in job_ids:
            job_collection.add(
                RunJob(**self._get_run_job_args(
                    step_name=self.step_name,
                    arguments=self._build_run_command(j, verbosity),
                    output_dir=self.log_location,
                    job_id=j,
                    submission_id=job_collection.submission_id,
                    user_name=user_name,
                    parent_id=job_collection.persistent_id,
                    requested_walltime=Duration(duration),
                    requested_memory=Memory(memory, Memory.MB),
                    requested_cores=cores,
                )))
        return job_collection
Ejemplo n.º 24
0
def _parse_time_duration(val):
    """
    Convert the output of common Linux/UNIX system utilities into a GC3Pie `Duration` object.

    Any of the time formats *DD-HH:MM:SS* (days, hours, minutes, seconds),
    *HH:MM:SS* (hours, minutes, seconds), or *MM:SS* (minutes, seconds), or
    even just the number of seconds are acceptable::

      >>> _parse_time_duration('25-00:31:05') == Duration('25d') + Duration('31m') + Duration('5s')
      True

      >>> _parse_time_duration('1:02:03') == Duration('1h') + Duration('2m') + Duration('3s')
      True

      >>> _parse_time_duration('01:02') == Duration('1m') + Duration('2s')
      True

      >>> _parse_time_duration('42') == Duration(42, unit=Duration.s)
      True

    The *seconds* portion of the time string can be followed by
    decimal digits for greater precision::

      >>> _parse_time_duration('0:00.00') == Duration(0, unit=Duration.s)
      True

      >>> _parse_time_duration('4.20') == Duration(4.20, unit=Duration.s)
      True

    When only the number of seconds is given, an optional trailing
    unit specified `s` is allowed::

      >>> _parse_time_duration('4.20s') == Duration(4.20, unit=Duration.s)
      True

    Among the programs whose output can be parsed by this function, there are:

    - GNU time's `%e` format specifier;
    - output of `ps -o etime=` (on both GNU/Linux and MacOSX)
    """
    colons = val.count(':')
    if colons == 2:
        # either `DD-HH:MM:SS` or plain `HH:MM:SS`
        if '-' in val:
            days, timespan = val.split('-')
            return (Duration(days + 'd') + Duration(timespan))
        # `Duration`'s ctor can natively parse `HH:MM:SS`
        return Duration(val)
    if colons == 1:
        # `AA:BB` is rejected as ambiguous by `Duration`'s built-in
        # parser, so split the fields and combine them ourselves
        minutes, secs = val.split(':')
        return (Duration(int(minutes, 10), unit=Duration.m) +
                Duration(float(secs), unit=Duration.s))
    if colons == 0:
        # plain seconds count (up to 2 decimal digits of precision),
        # possibly with a trailing `s` unit specifier to drop
        seconds_str = val[:-1] if val.endswith('s') else val
        return Duration(float(seconds_str), unit=Duration.s)
    raise ValueError("Expecting duration in the form HH:MM:SS, MM:SS,"
                     " or just number of seconds,"
                     " got {val} instead".format(val=val))
Ejemplo n.º 25
0
import shutil
# import csv

from pkg_resources import Requirement, resource_filename

import gc3libs
import gc3libs.exceptions
from gc3libs import Application, Run, Task
from gc3libs.cmdline import SessionBasedScript, executable_file
import gc3libs.utils
from gc3libs.quantity import Memory, kB, MB, MiB, GB, Duration, hours, minutes, seconds
from gc3libs.workflow import RetryableTask

DEFAULT_CORES = 1
DEFAULT_MEMORY = Memory(1500, MB)
DEFAULT_WALLTIME = Duration(300, hours)


## custom application class
class GrdockApplication(Application):
    """
    Custom class to wrap the execution of rdock.
    The wrapper script that will be executed on the remote end is
    organised in two steps:
    step 1: cavity creattion (will use rbcavity)
    step 2: docking (will use rbdock)

    Application will take the input ligand file and a ligand index
    ligand index is used to create an output file that maintains the
    same ligand index as a suffix:
    Es: input ligand: Docking1.sd -> output: Docked1.sd
Ejemplo n.º 26
0
    def create_run_jobs(self, user_name, job_collection, verbosity, duration,
                        memory, cores):
        '''Creates jobs for the parallel "run" phase of the step.
        The `illuminati` step is special in the sense that it implements
        multiple sequential runs within the "run" phase to build one pyramid
        zoom level after another.

        Parameters
        ----------
        user_name: str
            name of the submitting user
        job_collection: tmlib.workflow.jobs.RunPhase
            empty collection for "run" jobs
        verbosity: int
            logging verbosity for jobs
        duration: str
            computational time that should be allocated for a single job;
            in HH:MM:SS format
        memory: int
            amount of memory in Megabyte that should be allocated for a single
            job
        cores: int
            number of CPU cores that should be allocated for a single job

        Returns
        -------
        tmlib.workflow.jobs.RunPhase
            collection of jobs
        '''
        logger.info('create "run" jobs for submission %d',
                    job_collection.submission_id)
        # NOTE: `duration`, `memory` and `cores` are optional here (see
        # the `if` guards below), so use `%s` rather than `%d` to avoid
        # logging errors when a value is `None`
        logger.debug('allocated time for "run" jobs: %s', duration)
        logger.debug('allocated memory for "run" jobs: %s MB', memory)
        logger.debug('allocated cores for "run" jobs: %s', cores)

        # group job IDs by pyramid zoom level ("batch index"); each
        # group becomes one sequential sub-phase
        multi_run_jobs = collections.defaultdict(list)
        job_ids = self.get_run_job_ids()
        for j in job_ids:
            batch = self.get_run_batch(j)
            multi_run_jobs[batch['index']].append(j)

        for index, job_ids in multi_run_jobs.iteritems():
            subjob_collection = SingleRunPhase(
                step_name=self.step_name,
                index=index,
                submission_id=job_collection.submission_id,
                parent_id=job_collection.persistent_id)

            for j in job_ids:
                job = RunJob(step_name=self.step_name,
                             arguments=self._build_run_command(j, verbosity),
                             output_dir=self.log_location,
                             job_id=j,
                             index=index,
                             submission_id=subjob_collection.submission_id,
                             parent_id=subjob_collection.persistent_id,
                             user_name=user_name)
                if duration:
                    job.requested_walltime = Duration(duration)
                if memory:
                    job.requested_memory = Memory(memory, Memory.MB)
                if cores:
                    if not isinstance(cores, int):
                        raise TypeError('Argument "cores" must have type int.')
                    if not cores > 0:
                        raise ValueError(
                            'The value of "cores" must be positive.')
                    job.requested_cores = cores
                subjob_collection.add(job)
            job_collection.add(subjob_collection)

        return job_collection