def submitlaunch(args):
    """Validate the workflow filter, then queue a launcher job for submission."""
    from balsam import setup
    setup()
    from balsam.service import service
    from balsam.core import models
    from django.db import connection, transaction

    Job = models.BalsamJob
    # Refuse to submit a launch whose filter cannot match any job in the DB.
    if args.wf_filter and not Job.objects.filter(
            workflow=args.wf_filter).exists():
        raise RuntimeError(
            f"No job with wf_filter={args.wf_filter} registered in local DB")

    launch_fields = dict(
        project=args.project,
        queue=args.queue,
        nodes=args.nodes,
        wall_minutes=args.time_minutes,
        job_mode=args.job_mode,
        wf_filter=args.wf_filter,
        sched_flags=args.sched_flags,
        prescheduled_only=False,
    )
    # Exclusive Lock on core_queuedlaunch: hold it for the whole transaction
    # so concurrent submissions cannot race while the record is created.
    with transaction.atomic():
        with connection.cursor() as cursor:
            cursor.execute(
                'LOCK TABLE core_queuedlaunch IN ACCESS EXCLUSIVE MODE;')
        qlaunch = models.QueuedLaunch(**launch_fields)
        qlaunch.save()
        service.submit_qlaunch(qlaunch, verbose=True)
def ls(args):
    """List Balsam objects (jobs, apps, workflows, or queues) from the local DB.

    Dispatches on a prefix of args.objects so abbreviated forms work.
    KeyboardInterrupt/BrokenPipeError are suppressed so output can be piped
    to `head` etc. without a traceback.
    """
    from balsam import setup
    setup()
    import balsam.scripts.ls_commands as lscmd

    objects = args.objects
    name = args.name
    history = args.history
    verbose = args.verbose
    state = args.state
    obj_id = args.id  # renamed local: don't shadow the builtin `id`
    tree = args.tree
    wf = args.wf
    by_states = args.by_states

    try:
        if objects.startswith('job'):
            lscmd.ls_jobs(name, history, obj_id, verbose, tree, wf,
                          state, by_states)
        elif objects.startswith('app'):
            lscmd.ls_apps(name, obj_id, verbose)
        elif objects.startswith('work') or objects.startswith('wf'):
            lscmd.ls_wf(name, verbose, tree, wf)
        elif objects.startswith('queue'):
            # Was startswith('queues'): every other branch accepts a prefix,
            # so "queue" silently matched nothing. 'queue' is backward
            # compatible ("queues" still matches).
            lscmd.ls_queues(verbose)
    except (KeyboardInterrupt, BrokenPipeError):
        pass
def newapp(args):
    """Register a new ApplicationDefinition in the local Balsam DB."""
    from balsam import setup
    setup()
    from balsam.core.models import ApplicationDefinition as AppDef

    def py_app_path(path):
        # If the command runs a .py script, rewrite it to invoke the current
        # interpreter with the script's absolute path.
        if not path:
            return ''
        tokens = path.split()
        command = tokens[0]
        if command.endswith('.py'):
            interpreter = sys.executable
            script = os.path.abspath(command)
            trailing = ' '.join(tokens[1:])
            path = ' '.join((interpreter, script, trailing))
        return path

    if AppDef.objects.filter(name=args.name).exists():
        raise RuntimeError(f"An application named {args.name} already exists")

    app = AppDef()
    app.name = args.name
    app.description = ' '.join(args.description) if args.description else ''
    app.executable = py_app_path(args.executable)
    app.preprocess = py_app_path(args.preprocess)
    app.postprocess = py_app_path(args.postprocess)
    app.save()
    print(app)
    print("Added app to database")
def modify(args):
    """Modify one attribute of a single job or app, matched by partial ID.

    State changes go through update_state() so job history stays consistent;
    all other attributes are assigned directly and saved.
    """
    from balsam import setup
    setup()
    from balsam.core import models
    Job = models.BalsamJob
    AppDef = models.ApplicationDefinition

    if args.type == 'jobs':
        cls = Job
    elif args.type == 'apps':
        cls = AppDef
    else:
        # Previously an unrecognized type fell through with `cls` unbound,
        # raising a confusing NameError.
        raise ValueError(f"Unknown object type: {args.type}")

    qs = cls.objects.filter(pk__contains=args.id)
    if qs.count() == 0:
        raise RuntimeError(f"no matching {args.type}")
    elif qs.count() > 1:
        raise RuntimeError(f"more than one matching {args.type}")

    item = qs.first()
    # Coerce the CLI string to the attribute's current type.
    target_type = type(getattr(item, args.attr))
    if target_type is bool:
        # bool('False') is True, so boolean fields need explicit parsing.
        new_value = args.value.strip().lower() in ('1', 'true', 'yes')
    else:
        new_value = target_type(args.value)

    if args.attr == 'state':
        if item.state == 'USER_KILLED':
            print("Cannot mutate state of a killed job")
            return
        item.update_state(new_value, 'User mutated state from command line')
    else:
        setattr(item, args.attr, new_value)
        item.save()
    print(f'{args.type[:-1]} {args.attr} changed to: {new_value}')
def rm(args):
    """Remove jobs or apps from the local DB, selected by --all, --name, or --id.

    Prompts for confirmation unless --force is given. --wf-filter restricts a
    --name match to one workflow and is only valid for jobs.
    """
    from balsam import setup
    setup()
    from balsam.core import models
    Job = models.BalsamJob
    AppDef = models.ApplicationDefinition

    objects_name = args.objects
    name = args.name
    objid = args.id
    deleteall = args.all
    force = args.force
    wf_filter = args.wf_filter

    # Are we removing jobs or apps?
    if objects_name.startswith('job'):
        cls = Job
    elif objects_name.startswith('app'):
        cls = AppDef
    else:
        # Previously fell through with `cls` unbound (NameError).
        raise ValueError(f"Cannot remove unknown object type: {objects_name}")
    objects = cls.objects

    # Filter: all objects, by name-match (multiple), or by ID (unique)?
    if deleteall:
        deletion_objs = objects.all()
        message = f"ALL {objects_name}"
    elif name is not None:
        # preferable to "elif name:", since "balsam rm jobs --name=" (empty string)
        # can be a valid choice to match all job names. "is not None" distinguishes
        # if flag is present
        if wf_filter is not None:
            if objects_name.startswith('app'):
                raise RuntimeError(
                    "--wf-filter flag incompatible with balsam rm app")
            # use strict name checking for workflow, otherwise very dangerous
            deletion_objs = objects.filter(name__icontains=name,
                                           workflow=wf_filter)
        else:
            deletion_objs = objects.filter(name__icontains=name)
        message = f"{len(deletion_objs)} {objects_name} matching name {name}"
        if not deletion_objs.exists():
            print(f"No {objects_name} matching query")
            return
    elif objid is not None:
        deletion_objs = objects.filter(pk__icontains=objid)
        if deletion_objs.count() > 1:
            raise RuntimeError(f"Multiple {objects_name} match ID")
        elif deletion_objs.count() == 0:
            raise RuntimeError(f"No {objects_name} match ID")
        else:
            message = f"{objects_name[:-1]} with ID matching {objid}"
    else:
        # Previously fell through with `deletion_objs` unbound (NameError).
        raise RuntimeError("Specify --all, --name, or --id to select objects")

    # User confirmation
    if not force:
        if not cmd_confirmation(f"PERMANENTLY remove {message}?"):
            print("Delete aborted")
            return

    # Actually delete things here
    deletion_objs.delete()
    print("Deleted.")
def log(args):
    """Tail every Balsam log file until interrupted."""
    from balsam import settings, setup
    setup()

    log_glob = os.path.join(settings.LOGGING_DIRECTORY, '*.log')
    # shell=True lets the shell expand the *.log glob for tail.
    command = f"tail -f {log_glob}"
    try:
        subprocess.run(command, shell=True)
    except (KeyboardInterrupt, BrokenPipeError, ProcessLookupError):
        # Ctrl-C or a closed pipe simply ends the tail.
        pass
def newdep(args):
    """Create a parent --> child dependency between two existing jobs."""
    from balsam import setup
    setup()
    from balsam.launcher import dag

    parent_job = match_uniq_job(args.parent)
    child_job = match_uniq_job(args.child)
    dag.add_dependency(parent_job, child_job)
    print(f"Created link {parent_job.cute_id} --> {child_job.cute_id}")
def mkchild(args):
    """Add a new job as a child of the current job (from BALSAM_JOB_ID)."""
    from balsam import setup
    setup()
    from balsam.launcher import dag

    current = dag.current_job
    if not current:
        raise RuntimeError(f"mkchild requires that BALSAM_JOB_ID is in the environment")
    child_job = newjob(args)
    dag.add_dependency(current, child_job)
    print(f"Created link {current.cute_id} --> {child_job.cute_id}")
def match_uniq_job(s):
    """Return the unique BalsamJob whose ID contains the substring `s`.

    Raises ValueError when zero or more than one job matches.
    """
    from balsam import setup
    setup()
    from balsam.core import models

    matches = models.BalsamJob.objects.filter(job_id__icontains=s)
    num_matched = matches.count()
    if num_matched == 1:
        return matches.first()
    if num_matched > 1:
        raise ValueError(f"More than one ID matched {s}")
    raise ValueError(f"No job in local DB matched {s}")
def newjob(args):
    """Create a BalsamJob from CLI arguments and save it to the local DB.

    Returns the saved job, or None if the user aborts at the confirmation
    prompt. Raises RuntimeError if the named application is not registered.
    """
    from balsam import setup
    setup()
    from balsam.core import models
    Job = models.BalsamJob
    AppDef = models.ApplicationDefinition

    if not AppDef.objects.filter(name=args.application).exists():
        raise RuntimeError(
            f"App {args.application} not registered in local DB")

    job = Job()
    job.name = args.name
    job.description = ' '.join(args.description)
    job.workflow = args.workflow
    job.wall_time_minutes = args.wall_time_minutes
    job.num_nodes = args.num_nodes
    job.coschedule_num_nodes = args.coschedule_num_nodes
    job.node_packing_count = args.node_packing_count
    job.ranks_per_node = args.ranks_per_node
    job.threads_per_rank = args.threads_per_rank
    job.threads_per_core = args.threads_per_core
    job.application = args.application
    job.args = ' '.join(args.args)
    job.mpi_flags = ' '.join(args.mpi_flags)
    job.post_error_handler = args.post_handle_error
    job.post_timeout_handler = args.post_handle_timeout
    job.auto_timeout_retry = not args.disable_auto_timeout_retry
    job.input_files = ' '.join(args.input_files)
    job.stage_in_url = args.url_in
    job.stage_out_url = args.url_out
    job.stage_out_files = ' '.join(args.stage_out_files)
    job.environ_vars = ":".join(args.env)

    print(job)
    if not args.yes:
        num_wf_jobs = Job.objects.filter(workflow=job.workflow).count()
        print(
            f"[INFO]: The workflow \"{job.workflow}\" currently has {num_wf_jobs} jobs in the database"
        )
        if not cmd_confirmation('Confirm adding job to DB'):
            print("Add job aborted")
            return
    job.save()
    # BUG FIX: this message previously followed `return job` and was
    # unreachable dead code; print it before returning.
    print("Added job to database")
    return job
def submit_jobs(project='', queue='debug-cache-quad', nodes=1,
                wall_minutes=30, job_mode='mpi', wf_filter='',
                save=False, submit=False):
    """
    Submits a job to the queue with the given parameters.

    Parameters
    ----------
    project: str, name of the project to be charged
    queue: str, queue name, can be: 'default', 'debug-cache-quad',
        'debug-flat-quad', 'backfill'
    nodes: int, Number of nodes, can be an integer from 1 to 4096 depending
        on the queue.
    wall_minutes: int, max wall time in minutes, depends on the queue and the
        number of nodes, max 1440 minutes
    job_mode: str, Balsam job mode, can be 'mpi', 'serial'
    wf_filter: str, Selects Balsam jobs that matches the given workflow filter.
    save: bool, save the QueuedLaunch record when the parameters validate.
    submit: bool, also submit the saved launch to the scheduler.
    """
    from balsam import setup
    setup()
    from balsam.service import service
    from balsam.core import models

    validjob = True
    QueuedLaunch = models.QueuedLaunch
    mylaunch = QueuedLaunch()
    mylaunch.project = project
    mylaunch.queue = queue
    mylaunch.nodes = nodes
    mylaunch.wall_minutes = wall_minutes
    mylaunch.job_mode = job_mode
    mylaunch.wf_filter = wf_filter
    mylaunch.prescheduled_only = False

    # Site-specific queue limits (debug vs. production queues).
    if queue.startswith('debug'):
        if wall_minutes > 60:
            validjob = False
            print(f'Max wall time for {queue} queue is 60 minutes')
        if nodes > 8:
            validjob = False
            print(f'Max number of nodes for {queue} queue is 8')
    else:
        if nodes < 128:
            validjob = False
            print(f'Min number of nodes for {queue} queue is 128')

    if save and validjob:
        mylaunch.save()
        print('Ready to submit')
        # BUG FIX: submission is now contingent on the launch having been
        # validated and saved; previously an invalid/unsaved launch could
        # still be submitted when submit=True.
        if submit:
            service.submit_qlaunch(mylaunch, verbose=True)
def rm(args):
    """Remove jobs or apps from the local DB, selected by --all, --name, or --id.

    Prompts for confirmation unless --force is given.
    """
    from balsam import setup
    setup()
    from balsam.core import models
    Job = models.BalsamJob
    AppDef = models.ApplicationDefinition

    objects_name = args.objects
    name = args.name
    objid = args.id
    deleteall = args.all
    force = args.force

    # Are we removing jobs or apps?
    if objects_name.startswith('job'):
        cls = Job
    elif objects_name.startswith('app'):
        cls = AppDef
    else:
        # Previously fell through with `cls` unbound (NameError).
        raise ValueError(f"Cannot remove unknown object type: {objects_name}")
    objects = cls.objects

    # Filter: all objects, by name-match (multiple), or by ID (unique)?
    if deleteall:
        deletion_objs = objects.all()
        message = f"ALL {objects_name}"
    elif name:
        deletion_objs = objects.filter(name__icontains=name)
        message = f"{len(deletion_objs)} {objects_name} matching name {name}"
        if not deletion_objs.exists():
            # BUG FIX: was a plain string missing the f-prefix, so the
            # literal text "{objects_name}" was printed.
            print(f"No {objects_name} matching query")
            return
    elif objid:
        deletion_objs = objects.filter(pk__icontains=objid)
        if deletion_objs.count() > 1:
            raise RuntimeError(f"Multiple {objects_name} match ID")
        elif deletion_objs.count() == 0:
            raise RuntimeError(f"No {objects_name} match ID")
        else:
            message = f"{objects_name[:-1]} with ID matching {objid}"
    else:
        # Previously fell through with `deletion_objs` unbound (NameError).
        raise RuntimeError("Specify --all, --name, or --id to select objects")

    # User confirmation
    if not force:
        if not cmd_confirmation(f"PERMANENTLY remove {message}?"):
            print("Delete aborted")
            return

    # Actually delete things here
    deletion_objs.delete()
    print("Deleted.")
def run_migrations():
    """Run Django makemigrations/migrate for the Balsam DB, refresh the DB
    index, and smoke-test that the BalsamJob table is usable.

    NOTE(review): relies on `setup` and `settings` being available at module
    scope — confirm the file's top-level imports provide them.
    """
    from django.core.management import call_command
    from balsam.django_config.db_index import refresh_db_index
    setup()
    print("DB settings:", settings.DATABASES['default'])
    call_command('makemigrations', interactive=True, verbosity=2)
    call_command('migrate', interactive=True, verbosity=2)
    refresh_db_index()

    # Smoke-test: a BalsamJob row can be created and deleted.
    try:
        from balsam.core.models import BalsamJob
        j = BalsamJob()
        j.save()
        j.delete()
    except Exception:
        # Narrowed from a bare `except:`, which also caught SystemExit and
        # KeyboardInterrupt. Still re-raised after the diagnostic message.
        print("BalsamJob table not properly created")
        raise
    else:
        print("BalsamJob table created successfully")
def make_dummies(args):
    """Bulk-insert `args.num` trivial echo jobs (workflow 'dummy') for testing."""
    from balsam import setup
    setup()
    from balsam.core import models
    Job = models.BalsamJob
    App = models.ApplicationDefinition

    # Ensure the one-off "dummy" echo app exists before creating jobs for it.
    if not App.objects.filter(name='dummy').exists():
        App(name="dummy", executable="echo").save()

    new_jobs = []
    for i in range(args.num):
        new_jobs.append(Job(
            name=f'dummy{i}',
            description='Added by balsam make_dummies',
            node_packing_count=64,
            workflow='dummy',
            application='dummy',
            args='hello',
        ))
    # Single bulk INSERT instead of one query per job.
    Job.objects.bulk_create(new_jobs)
    print(f"Added {args.num} dummy jobs to the DB")
def kill(args):
    """Kill the unique job whose ID starts with args.id, after confirmation.

    With args.recursive, descendants are killed too (delegated to dag.kill).
    """
    from balsam import setup
    setup()
    from balsam.core import models
    from balsam.launcher import dag
    Job = models.BalsamJob

    job_id = args.id
    matches = Job.objects.filter(job_id__startswith=job_id)
    if matches.count() > 1:
        raise RuntimeError(f"More than one job matches {job_id}")
    if matches.count() == 0:
        print(f"No jobs match the given ID {job_id}")
        # BUG FIX: previously fell through to `first()` on an empty queryset,
        # crashing with AttributeError on None.
        return

    job = matches.first()
    if cmd_confirmation(f'Really kill job {job.name} {job.cute_id} ??'):
        dag.kill(job, recursive=args.recursive)
        print("Job killed")
def submitlaunch(args):
    """Create a QueuedLaunch record under an exclusive table lock and submit it."""
    from balsam import setup
    setup()
    from balsam.service import service
    from balsam.core import models
    from django.db import connection, transaction

    # Exclusive Lock on core_queuedlaunch: held for the whole transaction so
    # concurrent submissions cannot race while the record is created.
    with transaction.atomic():
        with connection.cursor() as cursor:
            cursor.execute(
                'LOCK TABLE core_queuedlaunch IN ACCESS EXCLUSIVE MODE;')
        qlaunch = models.QueuedLaunch(
            project=args.project,
            queue=args.queue,
            nodes=args.nodes,
            wall_minutes=args.time_minutes,
            job_mode=args.job_mode,
            wf_filter=args.wf_filter,
            sched_flags=args.sched_flags,
            prescheduled_only=False,
        )
        qlaunch.save()
        service.submit_qlaunch(qlaunch, verbose=True)
from collections import namedtuple, defaultdict import time from balsam import setup setup() from balsam.core.models import BalsamJob from balsam.core.models import END_STATES from balsam.launcher import dag class TaskFailed(Exception): pass WaitResult = namedtuple('WaitResult', ['active', 'done', 'failed', 'cancelled']) def _timer(timeout): if timeout is None: return lambda: True else: timeout = max(float(timeout), 0.01) start = time.time() return lambda: (time.time() - start) < timeout def _to_state(state): if state not in END_STATES: return 'active' elif state == 'JOB_FINISHED': return 'done'