Example #1
0
def submitlaunch(args):
    """Create a QueuedLaunch from CLI arguments and hand it to the service.

    Reads ``project``, ``queue``, ``nodes``, ``time_minutes``, ``job_mode``,
    ``wf_filter`` and ``sched_flags`` from ``args``.

    Raises:
        RuntimeError: if ``args.wf_filter`` is set but no BalsamJob in the
            local DB has that workflow.
    """
    from balsam import setup
    setup()
    from balsam.service import service
    from balsam.core import models
    from django.db import connection, transaction

    workflow = args.wf_filter
    if workflow:
        has_jobs = models.BalsamJob.objects.filter(workflow=workflow).exists()
        if not has_jobs:
            raise RuntimeError(
                f"No job with wf_filter={workflow} registered in local DB")

    # The table lock is taken inside the atomic block, before the new row is
    # saved and submitted.
    with transaction.atomic(), connection.cursor() as cursor:
        cursor.execute(
            'LOCK TABLE core_queuedlaunch IN ACCESS EXCLUSIVE MODE;')
        launch_fields = dict(
            project=args.project,
            queue=args.queue,
            nodes=args.nodes,
            wall_minutes=args.time_minutes,
            job_mode=args.job_mode,
            wf_filter=args.wf_filter,
            sched_flags=args.sched_flags,
            prescheduled_only=False,
        )
        qlaunch = models.QueuedLaunch(**launch_fields)
        qlaunch.save()
        service.submit_qlaunch(qlaunch, verbose=True)
Example #2
0
def ls(args):
    """Dispatch a listing request to the matching ``ls_commands`` function.

    ``args.objects`` selects the lister by prefix: ``job`` -> ``ls_jobs``,
    ``app`` -> ``ls_apps``, ``work``/``wf`` -> ``ls_wf``, ``queues`` ->
    ``ls_queues``. Any other value does nothing. KeyboardInterrupt and
    BrokenPipeError raised while listing are swallowed.
    """
    from balsam import setup
    setup()
    import balsam.scripts.ls_commands as lscmd

    # All options are read up front, whether or not the chosen lister uses
    # them.
    kind = args.objects
    name = args.name
    history = args.history
    verbose = args.verbose
    state = args.state
    obj_id = args.id
    tree = args.tree
    wf = args.wf
    by_states = args.by_states

    try:
        if kind.startswith('job'):
            lscmd.ls_jobs(name, history, obj_id, verbose, tree, wf, state,
                          by_states)
        elif kind.startswith('app'):
            lscmd.ls_apps(name, obj_id, verbose)
        elif kind.startswith(('work', 'wf')):
            lscmd.ls_wf(name, verbose, tree, wf)
        elif kind.startswith('queues'):
            lscmd.ls_queues(verbose)
    except (KeyboardInterrupt, BrokenPipeError):
        # Interrupted or piped into a closed reader: exit quietly.
        pass
Example #3
0
def newapp(args):
    """Register a new ApplicationDefinition built from CLI arguments.

    Reads ``name``, ``description``, ``executable``, ``preprocess`` and
    ``postprocess`` from ``args``, saves the app and prints it.

    Raises:
        RuntimeError: if an application with ``args.name`` already exists.
    """
    from balsam import setup
    setup()
    from balsam.core.models import ApplicationDefinition as AppDef

    def py_app_path(path):
        """Expand a leading ``*.py`` token into ``<python> <abs script> <args>``.

        Empty input becomes ''; a command whose first token does not end in
        ``.py`` is returned untouched.
        """
        if not path:
            return ''
        tokens = path.split()
        script = tokens[0]
        if not script.endswith('.py'):
            return path
        interpreter = sys.executable
        script_abs = os.path.abspath(script)
        trailing = ' '.join(tokens[1:])
        return ' '.join((interpreter, script_abs, trailing))

    already_registered = AppDef.objects.filter(name=args.name).exists()
    if already_registered:
        raise RuntimeError(f"An application named {args.name} already exists")

    app = AppDef()
    app.name = args.name
    if args.description:
        app.description = ' '.join(args.description)
    else:
        app.description = ''
    app.executable = py_app_path(args.executable)
    app.preprocess = py_app_path(args.preprocess)
    app.postprocess = py_app_path(args.postprocess)
    app.save()
    print(app)
    print("Added app to database")
Example #4
0
def modify(args):
    """Change one attribute of a single job or app selected by partial ID.

    Args:
        args: parsed CLI namespace providing ``type`` ('jobs' or 'apps'),
            ``id`` (substring of the primary key), ``attr`` (attribute name)
            and ``value`` (new value; converted to the type of the
            attribute's current value).

    Raises:
        RuntimeError: if ``args.type`` is neither 'jobs' nor 'apps', or if
            zero or more than one object matches ``args.id``.
        ValueError: if a boolean attribute is given text that cannot be read
            as true/false.
    """
    from balsam import setup
    setup()
    from balsam.core import models

    model_by_type = {
        'jobs': models.BalsamJob,
        'apps': models.ApplicationDefinition,
    }
    if args.type not in model_by_type:
        # Previously this fell through and crashed on an unbound ``cls``.
        raise RuntimeError(
            f"cannot modify {args.type!r}: expected 'jobs' or 'apps'")
    cls = model_by_type[args.type]

    matches = cls.objects.filter(pk__contains=args.id)
    num_matches = matches.count()  # single COUNT query, reused below
    if num_matches == 0:
        raise RuntimeError(f"no matching {args.type}")
    if num_matches > 1:
        raise RuntimeError(f"more than one matching {args.type}")
    item = matches.first()

    # Coerce the command-line text to whatever type the attribute holds now.
    target_type = type(getattr(item, args.attr))
    if target_type is bool and isinstance(args.value, str):
        # bool("False") is True, so boolean text has to be parsed explicitly.
        text = args.value.strip().lower()
        if text in ('true', 't', 'yes', 'y', '1', 'on'):
            new_value = True
        elif text in ('false', 'f', 'no', 'n', '0', 'off', ''):
            new_value = False
        else:
            raise ValueError(
                f"cannot interpret {args.value!r} as a boolean")
    else:
        new_value = target_type(args.value)

    if args.attr == 'state':
        if item.state == 'USER_KILLED':
            print("Cannot mutate state of a killed job")
            return
        # State changes go through update_state rather than a plain setattr.
        item.update_state(new_value, 'User mutated state from command line')
    else:
        setattr(item, args.attr, new_value)
        item.save()
    print(f'{args.type[:-1]} {args.attr} changed to:  {new_value}')
Example #5
0
def rm(args):
    """Delete jobs or apps selected by --all, by name match, or by partial ID.

    Args:
        args: parsed CLI namespace providing ``objects`` (kind, matched by
            'job'/'app' prefix), ``name``, ``id``, ``all``, ``force`` and
            ``wf_filter``.

    Selection priority is ``all``, then ``name`` (case-insensitive substring,
    optionally restricted to an exact workflow for jobs), then ``id`` (must
    match exactly one object). Unless ``force`` is set the user is asked to
    confirm before anything is deleted.

    Raises:
        RuntimeError: for an unknown object kind, for ``wf_filter`` combined
            with apps, when an ID matches zero or several objects, or when no
            selector was given at all.
    """
    from balsam import setup
    setup()
    from balsam.core import models

    objects_name = args.objects
    name = args.name
    objid = args.id
    deleteall = args.all
    force = args.force
    wf_filter = args.wf_filter

    # Are we removing jobs or apps?
    if objects_name.startswith('job'):
        cls = models.BalsamJob
    elif objects_name.startswith('app'):
        cls = models.ApplicationDefinition
    else:
        # Previously fell through to an unbound ``cls``.
        raise RuntimeError(
            f"Cannot remove {objects_name}: expected jobs or apps")
    objects = cls.objects

    # Filter: all objects, by name-match (multiple), or by ID (unique)?
    if deleteall:
        # NOTE(review): wf_filter is not consulted on this path, so
        # "--all" removes every object even if a workflow filter was given.
        deletion_objs = objects.all()
        message = f"ALL {objects_name}"
    elif name is not None:
        # "is not None" rather than truthiness: an empty --name= is a valid
        # way to match every name, and only None means the flag was absent.
        if wf_filter is None:
            deletion_objs = objects.filter(name__icontains=name)
        elif objects_name.startswith('app'):
            raise RuntimeError(
                "--wf-filter flag incompatible with balsam rm app")
        else:
            # Workflow must match exactly; a loose match here would be
            # very dangerous.
            deletion_objs = objects.filter(name__icontains=name,
                                           workflow=wf_filter)
        # COUNT in the database instead of loading every row via len().
        num_matches = deletion_objs.count()
        if num_matches == 0:
            print(f"No {objects_name} matching query")
            return
        message = f"{num_matches} {objects_name} matching name {name}"
    elif objid is not None:
        deletion_objs = objects.filter(pk__icontains=objid)
        num_matches = deletion_objs.count()
        if num_matches > 1:
            raise RuntimeError(f"Multiple {objects_name} match ID")
        if num_matches == 0:
            raise RuntimeError(f"No {objects_name} match ID")
        message = f"{objects_name[:-1]} with ID matching {objid}"
    else:
        # Previously crashed later on unbound ``message``/``deletion_objs``.
        raise RuntimeError(
            f"No {objects_name} selected: give all, a name, or an ID")

    # User confirmation
    if not force:
        if not cmd_confirmation(f"PERMANENTLY remove {message}?"):
            print("Delete aborted")
            return

    # Actually delete things here
    deletion_objs.delete()
    print("Deleted.")
Example #6
0
def log(args):
    """Follow (``tail -f``) every ``*.log`` file in the logging directory.

    The directory comes from ``settings.LOGGING_DIRECTORY``. If it holds no
    ``*.log`` files a message is printed and nothing is followed.
    KeyboardInterrupt, BrokenPipeError and ProcessLookupError raised while
    tailing are swallowed so that Ctrl-C exits quietly.

    Args:
        args: parsed CLI namespace (unused).
    """
    import glob
    from balsam import settings, setup
    setup()
    # Expand the wildcard here and pass an argument list, so a directory
    # containing spaces or shell metacharacters is handled correctly.
    pattern = os.path.join(glob.escape(settings.LOGGING_DIRECTORY), '*.log')
    log_files = sorted(glob.glob(pattern))
    if not log_files:
        print(f"No log files found matching {pattern}")
        return
    try:
        subprocess.run(["tail", "-f", *log_files])
    except (KeyboardInterrupt, BrokenPipeError, ProcessLookupError):
        pass
Example #7
0
def newdep(args):
    """Add a parent -> child dependency between two existing jobs.

    ``args.parent`` and ``args.child`` are each resolved with
    ``match_uniq_job``, which raises if the ID is not unique.
    """
    from balsam import setup
    setup()
    from balsam.launcher import dag

    parent_job = match_uniq_job(args.parent)
    child_job = match_uniq_job(args.child)
    dag.add_dependency(parent_job, child_job)

    link = f"{parent_job.cute_id} --> {child_job.cute_id}"
    print(f"Created link {link}")
Example #8
0
def mkchild(args):
    """Create a new job and make it a child of the currently running job.

    The job is built by ``newjob(args)``. If that returns None (the user
    declined the confirmation prompt) no dependency is created.

    Raises:
        RuntimeError: if ``dag.current_job`` is not set.
    """
    from balsam import setup
    setup()
    from balsam.launcher import dag

    if not dag.current_job:
        raise RuntimeError("mkchild requires that BALSAM_JOB_ID is in the environment")
    child_job = newjob(args)
    if child_job is None:
        # newjob was aborted at its prompt; there is nothing to link.
        return
    dag.add_dependency(dag.current_job, child_job)
    print(f"Created link {dag.current_job.cute_id} --> {child_job.cute_id}")
Example #9
0
def match_uniq_job(s):
    """Return the single BalsamJob whose ``job_id`` contains ``s``.

    The match is a case-insensitive substring test.

    Raises:
        ValueError: if no job, or more than one job, matches.
    """
    from balsam import setup
    setup()
    from balsam.core.models import BalsamJob

    candidates = BalsamJob.objects.filter(job_id__icontains=s)
    num_found = candidates.count()
    if num_found == 1:
        return candidates.first()
    if num_found > 1:
        raise ValueError(f"More than one ID matched {s}")
    raise ValueError(f"No job in local DB matched {s}")
Example #10
0
def newjob(args):
    """Build a BalsamJob from CLI arguments and save it to the database.

    Unless ``args.yes`` is set, the number of jobs already in the workflow is
    shown and the user is asked to confirm before the job is saved.

    Args:
        args: parsed CLI namespace carrying the job fields read below.

    Returns:
        The saved job, or None if the user declined the confirmation.

    Raises:
        RuntimeError: if ``args.application`` is not a registered app.
    """
    from balsam import setup
    setup()
    from balsam.core import models
    Job = models.BalsamJob
    AppDef = models.ApplicationDefinition

    if not AppDef.objects.filter(name=args.application).exists():
        raise RuntimeError(
            f"App {args.application} not registered in local DB")

    job = Job()
    job.name = args.name
    job.description = ' '.join(args.description)
    job.workflow = args.workflow

    job.wall_time_minutes = args.wall_time_minutes
    job.num_nodes = args.num_nodes
    job.coschedule_num_nodes = args.coschedule_num_nodes
    job.node_packing_count = args.node_packing_count
    job.ranks_per_node = args.ranks_per_node
    job.threads_per_rank = args.threads_per_rank
    job.threads_per_core = args.threads_per_core

    job.application = args.application
    job.args = ' '.join(args.args)
    job.mpi_flags = ' '.join(args.mpi_flags)
    job.post_error_handler = args.post_handle_error
    job.post_timeout_handler = args.post_handle_timeout
    job.auto_timeout_retry = not args.disable_auto_timeout_retry
    job.input_files = ' '.join(args.input_files)

    job.stage_in_url = args.url_in
    job.stage_out_url = args.url_out
    job.stage_out_files = ' '.join(args.stage_out_files)
    # Environment entries are stored as one colon-separated string.
    job.environ_vars = ":".join(args.env)

    print(job)
    if not args.yes:
        num_wf_jobs = Job.objects.filter(workflow=job.workflow).count()
        print(
            f"[INFO]: The workflow \"{job.workflow}\" currently has {num_wf_jobs} jobs in the database"
        )
        if not cmd_confirmation('Confirm adding job to DB'):
            print("Add job aborted")
            return
    job.save()
    # Report success before returning; this line used to sit after the
    # return statement and never ran.
    print("Added job to database")
    return job
Example #11
0
def submit_jobs(project='',
                queue='debug-cache-quad',
                nodes=1,
                wall_minutes=30,
                job_mode='mpi',
                wf_filter='',
                save=False,
                submit=False):
    """
    Build a QueuedLaunch, check it against queue limits, optionally save/submit.

    Parameters
    ----------
    project: str, name of the project to be charged
    queue: str, queue name, can be: 'default', 'debug-cache-quad', 'debug-flat-quad', 'backfill'
    nodes: int, Number of nodes, can be an integer from 1 to 4096 depending on the queue.
    wall_minutes: int, max wall time in minutes, depends on the queue and the number of nodes, max 1440 minutes
    job_mode: str, Balsam job mode, can be 'mpi', 'serial'
    wf_filter: str, Selects Balsam jobs that matches the given workflow filter.
    save: bool, save the launch to the database when it passes the checks.
    submit: bool, also submit the launch; only honoured when ``save`` is True
        and the checks pass.

    Checks applied: queues whose name starts with 'debug' allow at most 60
    minutes and 8 nodes; every other queue needs at least 128 nodes. Each
    violation is printed and nothing is saved or submitted.
    """
    from balsam import setup
    setup()
    from balsam.service import service
    from balsam.core import models

    launch = models.QueuedLaunch(
        project=project,
        queue=queue,
        nodes=nodes,
        wall_minutes=wall_minutes,
        job_mode=job_mode,
        wf_filter=wf_filter,
        prescheduled_only=False,
    )

    is_valid = True
    is_debug_queue = queue.startswith('debug')
    if is_debug_queue:
        if wall_minutes > 60:
            print(f'Max wall time for {queue} queue is 60 minutes')
            is_valid = False
        if nodes > 8:
            print(f'Max number of nodes for {queue} queue is 8')
            is_valid = False
    elif nodes < 128:
        print(f'Min number of nodes for {queue} queue is 128')
        is_valid = False

    if not (save and is_valid):
        return

    launch.save()
    print('Ready to submit')
    if submit:
        service.submit_qlaunch(launch, verbose=True)
Example #12
0
def rm(args):
    """Delete jobs or apps selected by --all, by name match, or by partial ID.

    Args:
        args: parsed CLI namespace providing ``objects`` (kind, matched by
            'job'/'app' prefix), ``name``, ``id``, ``all`` and ``force``.

    Selection priority is ``all``, then a non-empty ``name``
    (case-insensitive substring), then a non-empty ``id`` (must match exactly
    one object). Unless ``force`` is set the user is asked to confirm before
    anything is deleted.

    Raises:
        RuntimeError: for an unknown object kind, when an ID matches zero or
            several objects, or when no selector was given at all.
    """
    from balsam import setup
    setup()
    from balsam.core import models

    objects_name = args.objects
    name = args.name
    objid = args.id
    deleteall = args.all
    force = args.force

    # Are we removing jobs or apps?
    if objects_name.startswith('job'):
        cls = models.BalsamJob
    elif objects_name.startswith('app'):
        cls = models.ApplicationDefinition
    else:
        # Previously fell through to an unbound ``cls``.
        raise RuntimeError(
            f"Cannot remove {objects_name}: expected jobs or apps")
    objects = cls.objects

    # Filter: all objects, by name-match (multiple), or by ID (unique)?
    if deleteall:
        deletion_objs = objects.all()
        message = f"ALL {objects_name}"
    elif name:
        deletion_objs = objects.filter(name__icontains=name)
        # COUNT in the database instead of loading every row via len().
        num_matches = deletion_objs.count()
        if num_matches == 0:
            # The f-prefix was missing, so the placeholder was printed
            # literally.
            print(f"No {objects_name} matching query")
            return
        message = f"{num_matches} {objects_name} matching name {name}"
    elif objid:
        deletion_objs = objects.filter(pk__icontains=objid)
        num_matches = deletion_objs.count()
        if num_matches > 1:
            raise RuntimeError(f"Multiple {objects_name} match ID")
        if num_matches == 0:
            raise RuntimeError(f"No {objects_name} match ID")
        message = f"{objects_name[:-1]} with ID matching {objid}"
    else:
        # Previously crashed later on unbound ``message``/``deletion_objs``.
        raise RuntimeError(
            f"No {objects_name} selected: give all, a name, or an ID")

    # User confirmation
    if not force:
        if not cmd_confirmation(f"PERMANENTLY remove {message}?"):
            print("Delete aborted")
            return

    # Actually delete things here
    deletion_objs.delete()
    print("Deleted.")
Example #13
0
def run_migrations():
    """Run Django migrations, refresh the DB index and smoke-test BalsamJob.

    Prints the default database settings, runs ``makemigrations`` followed by
    ``migrate``, calls ``refresh_db_index``, then saves and deletes a
    throwaway BalsamJob. Any failure during that probe is reported and
    re-raised.
    """
    from django.core.management import call_command
    from balsam.django_config.db_index import refresh_db_index
    setup()
    print("DB settings:", settings.DATABASES['default'])
    for command in ('makemigrations', 'migrate'):
        call_command(command, interactive=True, verbosity=2)
    refresh_db_index()
    try:
        from balsam.core.models import BalsamJob
        probe = BalsamJob()
        probe.save()
        probe.delete()
    except BaseException:
        # Report, then let the original exception propagate unchanged.
        print("BalsamJob table not properly created")
        raise
    print("BalsamJob table created successfully")
Example #14
0
def make_dummies(args):
    """Bulk-insert ``args.num`` placeholder jobs that run the 'dummy' app.

    The 'dummy' application (executable ``echo``) is created first if it is
    not registered yet.
    """
    from balsam import setup
    setup()
    from balsam.core.models import ApplicationDefinition, BalsamJob

    dummy_registered = ApplicationDefinition.objects.filter(
        name='dummy').exists()
    if not dummy_registered:
        ApplicationDefinition(name="dummy", executable="echo").save()

    dummy_jobs = []
    for index in range(args.num):
        dummy_jobs.append(
            BalsamJob(
                name=f'dummy{index}',
                description='Added by balsam make_dummies',
                node_packing_count=64,
                workflow='dummy',
                application='dummy',
                args='hello',
            ))
    BalsamJob.objects.bulk_create(dummy_jobs)
    print(f"Added {args.num} dummy jobs to the DB")
Example #15
0
def kill(args):
    """Kill the single job whose ``job_id`` starts with ``args.id``.

    The user is asked to confirm first. ``args.recursive`` is forwarded to
    ``dag.kill``. If no job matches, a message is printed and nothing else
    happens.

    Raises:
        RuntimeError: if more than one job matches the ID prefix.
    """
    from balsam import setup
    setup()
    from balsam.core import models
    from balsam.launcher import dag
    Job = models.BalsamJob

    job_id = args.id

    matches = Job.objects.filter(job_id__startswith=job_id)
    num_matches = matches.count()  # single COUNT query, reused below
    if num_matches > 1:
        raise RuntimeError(f"More than one job matches {job_id}")
    if num_matches == 0:
        print(f"No jobs match the given ID {job_id}")
        # Stop here: first() would return None and the prompt below would
        # fail on ``job.name``.
        return

    job = matches.first()

    if cmd_confirmation(f'Really kill job {job.name} {job.cute_id} ??'):
        dag.kill(job, recursive=args.recursive)
        print("Job killed")
Example #16
0
def submitlaunch(args):
    """Create a QueuedLaunch from CLI arguments and hand it to the service.

    Reads ``project``, ``queue``, ``nodes``, ``time_minutes``, ``job_mode``,
    ``wf_filter`` and ``sched_flags`` from ``args``.
    """
    from balsam import setup
    setup()
    from balsam.service import service
    from balsam.core import models
    from django.db import connection, transaction

    # The table lock is taken inside the atomic block, before the new row is
    # saved and submitted.
    with transaction.atomic(), connection.cursor() as cursor:
        cursor.execute('LOCK TABLE core_queuedlaunch IN ACCESS EXCLUSIVE MODE;')
        launch_fields = {
            'project': args.project,
            'queue': args.queue,
            'nodes': args.nodes,
            'wall_minutes': args.time_minutes,
            'job_mode': args.job_mode,
            'wf_filter': args.wf_filter,
            'sched_flags': args.sched_flags,
            'prescheduled_only': False,
        }
        qlaunch = models.QueuedLaunch(**launch_fields)
        qlaunch.save()
        service.submit_qlaunch(qlaunch, verbose=True)
Example #17
0
from collections import namedtuple, defaultdict
import time
from balsam import setup
setup()
from balsam.core.models import BalsamJob
from balsam.core.models import END_STATES
from balsam.launcher import dag


class TaskFailed(Exception):
    """Task-failure exception; adds nothing beyond ``Exception`` itself."""


# Record with four positional fields: active, done, failed, cancelled.
WaitResult = namedtuple('WaitResult', 'active done failed cancelled')


def _timer(timeout):
    """Return a zero-argument callable that reports whether time remains.

    Args:
        timeout: seconds until expiry (anything ``float()`` accepts), or
            None for no limit. Values below 0.01 are raised to 0.01.

    Returns:
        A callable returning True while the timeout has not elapsed; it
        always returns True when ``timeout`` is None.
    """
    if timeout is None:
        return lambda: True
    limit = max(float(timeout), 0.01)
    # Monotonic clock: elapsed time is unaffected by system clock changes.
    started = time.monotonic()
    return lambda: (time.monotonic() - started) < limit


def _to_state(state):
    if state not in END_STATES:
        return 'active'
    elif state == 'JOB_FINISHED':
        return 'done'