def setup_position_scan_files(job, pos_scan_str=None):
    """
    Takes a model.Job entity and creates appropriate files on each
    CRUNCHING_HOSTS server, ready to run a FoldX PositionScan job.

    Optionally, pos_scan_str can be provided (eg a <PositionScan> string
    generated using chain2pos_scan_str).
    """
    if not pos_scan_str:
        pos_scan_str = chain2pos_scan_str(job.params.chain_pssm,
                                          job.params.pdbfile,
                                          mutation_set=job.params.mutation_set)

    job_sub_path = pjoin(job.path, pos_scan_str.replace(',', '-'))

    if job.params.predict_water:
        water = "-PREDICT"
    else:
        water = "-CRYSTAL"

    run_txt = POSITION_SCAN_TEMPLATE % {'position_scan': pos_scan_str,
                                        'water': water}

    # fill out the qsub script with variables
    callback_url = get_callback_url(job, part=pos_scan_str)
    qsub_script = FOLDX_SCRIPT_TEMPLATE % {'foldx': FOLDX_BIN,
                                           'callback_url': callback_url,
                                           'job_uuid': job.uuid,
                                           'part': pos_scan_str}

    for host in CRUNCHING_HOSTS:
        with fabsettings(hide('stderr', 'stdout'),
                         host_string=host,
                         key_filename=SSH_KEYS):
            # warn_only, since mkdir fails harmlessly if the directory exists
            with fabsettings(warn_only=True):
                if not exists(job.path):
                    run('mkdir %s' % job.path)
                run('mkdir %s' % job_sub_path)

            remote_pdb_path = pjoin(job.path, "TEMPLATE.pdb")
            if not exists(remote_pdb_path):
                put(job.params.pdbfile.fullpath(), remote_pdb_path)

            run("ln -s %s %s" % (remote_pdb_path,
                                 pjoin(job_sub_path, "TEMPLATE.pdb")))
            run("ln -s %s %s" % (ROTABASE_PATH,
                                 pjoin(job_sub_path, "rotabase.txt")))

            put(StringIO(run_txt), pjoin(job_sub_path, 'run.txt'))
            put(StringIO(LIST_TXT), pjoin(job_sub_path, 'list.txt'))
            put(StringIO(qsub_script), pjoin(job_sub_path, 'foldx.qsub'))

    fabric.network.disconnect_all()

    return job_sub_path

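# Illustrative note (not in the original code): the sub-job directory name
# is the PositionScan string with commas replaced by dashes, so a
# hypothetical pos_scan_str of "XA1a,XA2a" would stage run.txt, list.txt
# and foldx.qsub under <job.path>/XA1a-XA2a/ on each crunching host.
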
def wfdeploy(branch='master'):
    local('git add -p')
    with fabsettings(warn_only=True):
        result = local('git commit')
    local("git push origin %s" % branch)
    code_dir = '~/webapps/ckwilcox_com'
    with cd(code_dir):
        run("git pull origin %s" % branch)

def wfdeploy(branch='master'):
    local('git add -p')
    with fabsettings(warn_only=True):
        result = local('git commit')
    local("git push origin %s" % branch)
    code_dir = '~/webapps/piles_app/piles_io/'
    with cd(code_dir):
        run("git pull origin %s" % branch)
        run("../apache2/bin/restart")

def kill_remote_job(self, task_data=None, **kwargs):
    from ..models import Job

    _init_fabric_env()

    environment = task_data.get('environment', {})
    job_id = task_data.get('job_id')
    job = Job.objects.get(id=job_id)
    master_ip = job.compute_resource.host
    gateway = job.compute_resource.gateway_server
    queue_type = job.compute_resource.queue_type
    private_key = job.compute_resource.private_key
    remote_username = job.compute_resource.extra.get('username', None)

    working_dir = job.abs_path_on_compute
    kill_script_path = join(working_dir, 'kill_job.sh')

    message = "No message."
    try:
        with fabsettings(gateway=gateway,
                         host_string=master_ip,
                         user=remote_username,
                         key=private_key):
            with cd(working_dir):
                with shell_env(**environment):
                    # if queue_type == 'slurm':
                    #     result = run(f"scancel {job.remote_id}")
                    # else:
                    #     result = run(f"kill {job.remote_id}")
                    result = run(f"{kill_script_path} kill")
                    job_killed = result.succeeded
    except BaseException as e:
        if hasattr(e, 'message'):
            message = e.message
        self.update_state(state=states.FAILURE, meta=message)
        raise e

    task_data.update(result=result)

    return task_data

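# A sketch of how this Celery task might be dispatched, mirroring the
# apply_async(args=(dict(job_id=...),)) pattern used for
# index_remote_files elsewhere in this module; the exact call site is an
# assumption:
#
#   kill_remote_job.apply_async(args=(dict(job_id=job.id),))
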
def poll_job_ps(self, task_data=None, **kwargs):
    from ..models import Job

    job_id = task_data.get('job_id')
    job = Job.objects.get(id=job_id)
    master_ip = job.compute_resource.host
    gateway = job.compute_resource.gateway_server

    _init_fabric_env()
    private_key = job.compute_resource.private_key
    remote_username = job.compute_resource.extra.get('username', None)

    message = "No message."
    try:
        with fabsettings(gateway=gateway,
                         host_string=master_ip,
                         user=remote_username,
                         key=private_key):
            with shell_env():
                result = run(f"ps -u {remote_username} -o pid | "
                             f"tr -d ' ' | "
                             f"grep '^{job.remote_id}$'")
                job_is_not_running = not result.succeeded

    except BaseException as e:
        if hasattr(e, 'message'):
            message = e.message
        self.update_state(state=states.FAILURE, meta=message)
        raise e

    # Grab the Job from the database again, to minimise the race condition
    # where the status was updated while we were ssh'ing in and running 'ps'
    job = Job.objects.get(id=job_id)
    if not job.done and job_is_not_running:
        job.status = Job.STATUS_FAILED
        job.save()
        index_remote_files.apply_async(args=(dict(job_id=job_id),))

    task_data.update(result=result)

    return task_data

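# For clarity (hypothetical PID and username): the pipeline above succeeds
# only when a process whose PID exactly matches job.remote_id is running,
# e.g.
#
#   $ ps -u laxyuser -o pid | tr -d ' ' | grep '^12345$'
#   12345    # exit status 0 -> still running; no match -> non-zero
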
def get_results(job):
    """
    Pulls raw results files back from the compute node(s).
    """
    for host in CRUNCHING_HOSTS:
        with fabsettings(hide('stderr', 'stdout'), host_string=host):
            result = get(job.path, settings.RESULTS_ROOT)
            """
            with cd(job.path):
                # equivalent to "../"+{uuid}, but safer not to assume
                #reljobdir = os.path.join("..", \
                #                os.path.basename( \
                #                    os.path.dirname(job.path)))
                reljobdir = "../"+job.uuid
                tarball = uuid+".tar.bz2"
                result = run("%s %s %s" % (TAR_COMMAND, tarball, reljobdir))
            """

    if result:
        return os.path.join(settings.RESULTS_ROOT, job.uuid)
    else:
        return None

def check_job_part_done(job, part, filename="FINISHED"):
    """
    Looks for the existence of the file "FINISHED" (or whatever filename
    is specified) in the job/part directory.

    Returns True if it exists, else False.

    Used as a secondary check, in case the job_complete webhook fails to
    notify the server that the job has finished.
    """
    # TODO: rather than just checking for the finish flag, also periodically
    #       call another function that looks at the number of lines in each
    #       interface/<job_uuid>/*/energies_*_TEMPLATE.txt
    #       file and presents a progress bar to the user
    for host in CRUNCHING_HOSTS:
        with fabsettings(hide('stderr', 'stdout'), host_string=host):
            finpath = os.path.join(job.path, part, filename)
            log.debug("Checking for file: " + finpath)
            if exists(finpath):
                log.debug(finpath + " found. Job done!")
                return True
            else:
                log.debug(finpath + " not found. Job isn't finished.")

    return False

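# A minimal sketch of the progress check described in the TODO above, not
# part of the original code. It assumes each energies_*_TEMPLATE.txt file
# accumulates one line per completed mutation; the helper name, the
# expected_lines parameter and the `cat | wc -l` approach are all
# assumptions for illustration.
def estimate_job_part_progress(job, part, expected_lines):
    """
    Hypothetical helper: returns a 0.0 - 1.0 completion estimate for a
    job part by counting lines in its remote energies output files.
    """
    for host in CRUNCHING_HOSTS:
        with fabsettings(hide('stderr', 'stdout'), host_string=host):
            # warn_only, since the energies files may not exist yet
            with fabsettings(warn_only=True):
                result = run("cat %s/energies_*_TEMPLATE.txt | wc -l" %
                             os.path.join(job.path, part))
            if result.succeeded:
                return min(float(result.strip()) / expected_lines, 1.0)
    return 0.0
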
def start_job(self, task_data=None, **kwargs):
    from ..models import Job

    if task_data is None:
        raise InvalidTaskError("task_data is None")

    job_id = task_data.get('job_id')
    job = Job.objects.get(id=job_id)
    result = task_data.get('result')
    master_ip = job.compute_resource.host
    gateway = job.compute_resource.gateway_server
    webhook_notify_url = ''
    # secret = None
    environment = task_data.get('environment', {})
    job_auth_header = task_data.get('job_auth_header', '')
    # environment.update(JOB_ID=job_id)
    _init_fabric_env()
    private_key = job.compute_resource.private_key
    remote_username = job.compute_resource.extra.get('username', None)

    job_script_template_vars = dict(environment)
    job_script_template_vars['JOB_AUTH_HEADER'] = job_auth_header
    job_script = BytesIO(
        render_to_string('job_scripts/run_job.sh',
                         context=job_script_template_vars).encode('utf-8'))
    kill_script = BytesIO(
        render_to_string('job_scripts/kill_job.sh',
                         context=job_script_template_vars).encode('utf-8'))
    curl_headers = BytesIO(b"%s\n" % job_auth_header.encode('utf-8'))
    config_json = BytesIO(json.dumps(job.params).encode('utf-8'))

    remote_id = None
    message = "Failure, without exception."
    try:
        with fabsettings(gateway=gateway,
                         host_string=master_ip,
                         user=remote_username,
                         key=private_key,
                         # key_filename=expanduser("~/.ssh/id_rsa"),
                         ):
            working_dir = job.abs_path_on_compute
            input_dir = join(working_dir, 'input')
            output_dir = join(working_dir, 'output')
            job_script_path = join(input_dir, 'run_job.sh')
            kill_script_path = join(working_dir, 'kill_job.sh')
            for d in [working_dir, input_dir, output_dir]:
                result = run(f'mkdir -p {d} && chmod 700 {d}')
            result = put(job_script,
                         job_script_path,
                         mode=0o700)
            result = put(kill_script,
                         kill_script_path,
                         mode=0o700)
            result = put(curl_headers,
                         join(working_dir, '.private_request_headers'),
                         mode=0o600)
            result = put(config_json,
                         join(input_dir, 'pipeline_config.json'),
                         mode=0o600)
            with cd(working_dir):
                with shell_env(**environment):
                    # NOTE: We can't sbatch the run_job.sh script due to
                    # the local aria2c RPC daemon launched by laxydl.
                    # In the future, we may have a DataTransferHost where
                    # the data staging steps run, then we could launch
                    # run_job.sh via sbatch.
                    # if job.compute_resource.queue_type == 'slurm':
                    #     result = run(f"sbatch --parsable "
                    #                  f'--job-name="laxy:{job_id}" '
                    #                  f"--output output/run_job.out "
                    #                  f"{job_script_path} "
                    #                  f" >>slurm.jids")
                    #     remote_id = run(str("head -1 slurm.jids"))

                    # The job script is always run locally on the compute
                    # node (not sbatched), but will itself send jobs
                    # to the queue.
                    result = run(f"nohup bash -l -c '"
                                 f"{job_script_path} & "
                                 f"echo $! >>job.pids"
                                 f"' >output/run_job.out")
                    remote_id = run(str("head -1 job.pids"))

        succeeded = result.succeeded
    except BaseException as e:
        succeeded = False
        if hasattr(e, 'message'):
            message = e.message
        if hasattr(e, '__traceback__'):
            tb = e.__traceback__
            message = '%s - Traceback: %s' % (
                message,
                ''.join(traceback.format_list(traceback.extract_tb(tb))))
        else:
            message = repr(e)

    if not succeeded and job.compute_resource.disposable:
        job.compute_resource.dispose()

    job_status = Job.STATUS_RUNNING if succeeded else Job.STATUS_FAILED
    job = Job.objects.get(id=job_id)
    job.status = job_status
    job.remote_id = remote_id
    job.save()

    # if webhook_notify_url:
    #     job_status = Job.STATUS_STARTING if succeeded else Job.STATUS_FAILED
    #     resp = request_with_retries(
    #         'PATCH', callback_url,
    #         json={'status': job_status},
    #         headers={'Authorization': secret},
    #     )

    if not succeeded:
        self.update_state(state=states.FAILURE, meta=message)
        raise Exception(message)
        # raise Ignore()

    task_data.update(result=result)

    return task_data

def estimate_job_tarball_size(self, task_data=None, **kwargs):
    if task_data is None:
        raise InvalidTaskError("task_data is None")

    from ..models import Job

    _init_fabric_env()

    environment = task_data.get('environment', {})
    job_id = task_data.get('job_id')
    job = Job.objects.get(id=job_id)
    master_ip = job.compute_resource.host
    gateway = job.compute_resource.gateway_server
    queue_type = job.compute_resource.queue_type
    private_key = job.compute_resource.private_key
    remote_username = job.compute_resource.extra.get('username', None)
    job_path = job.abs_path_on_compute

    message = "No message."
    task_result = dict()
    try:
        with fabsettings(gateway=gateway,
                         host_string=master_ip,
                         user=remote_username,
                         key=private_key):
            with cd(job_path):
                with shell_env(**environment):
                    # NOTE: If running tar -czf is too slow / too much
                    # extra I/O load, we could use the placeholder heuristic
                    # of `du -bc --max-depth=0 "{job_path}"` * 0.66 for
                    # RNAsik runs, stored in job metadata. Or add proper
                    # sizes to every File.metadata and derive it from a query.
                    result = run(
                        f'tar -czf - --directory "{job_path}" . | wc --bytes')
                    if result.succeeded:
                        tarball_size = int(result.stdout.strip())
                        with transaction.atomic():
                            job = Job.objects.get(id=job_id)
                            job.params['tarball_size'] = tarball_size
                            job.save()
                        task_result['tarball_size'] = tarball_size
                    else:
                        task_result['stdout'] = result.stdout.strip()
                        task_result['stderr'] = result.stderr.strip()

    except BaseException as e:
        if hasattr(e, 'message'):
            message = e.message
        self.update_state(state=states.FAILURE, meta=message)
        raise e

    task_data.update(result=task_result)

    return task_data

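# A sketch of the cheaper `du`-based heuristic mentioned in the NOTE above,
# not part of the original module. The helper name is hypothetical, the
# 0.66 compression ratio is the placeholder figure from the comment (not a
# measured value), and it assumes it is called within an active fabsettings
# session on the compute node.
def _estimate_tarball_size_via_du(job_path, compression_ratio=0.66):
    # `du -bc --max-depth=0` prints per-path byte counts plus a final
    # "total" line; take that total and scale by the assumed compression
    # ratio, avoiding the I/O cost of a full `tar | wc` pass.
    result = run(f'du -bc --max-depth=0 "{job_path}" | tail -1 | cut -f1')
    if result.succeeded:
        return int(int(result.stdout.strip()) * compression_ratio)
    return None
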
def index_remote_files(self, task_data=None, **kwargs):
    from ..models import Job

    if task_data is None:
        raise InvalidTaskError("task_data is None")

    job_id = task_data.get('job_id')
    job = Job.objects.get(id=job_id)
    clobber = task_data.get('clobber', False)

    compute_resource = job.compute_resource
    if compute_resource is not None:
        master_ip = compute_resource.host
        gateway = compute_resource.gateway_server
    else:
        logger.info(f"Not indexing files for {job_id}, no compute_resource.")
        return task_data

    job.log_event('JOB_INFO', 'Indexing all files (backend task)')

    environment = task_data.get('environment', {})
    # environment.update(JOB_ID=job_id)
    _init_fabric_env()
    private_key = job.compute_resource.private_key
    remote_username = job.compute_resource.extra.get('username', None)

    compute_id = job.compute_resource.id
    message = "No message."

    def create_update_file_objects(remote_path,
                                   fileset=None,
                                   prefix_path='',
                                   location_base=''):
        """
        Returns a list of (unsaved) File objects from a recursive 'find'
        of a remote directory. If a file of the same path exists in the
        FileSet, update the File object location (if unset) rather than
        creating a new one.

        :param remote_path: Path on the remote server.
        :type remote_path: str
        :param fileset: The FileSet to update against (optional).
        :type fileset: FileSet
        :param prefix_path: Path prepended to each file path (eg 'output').
        :type prefix_path: str
        :param location_base: Prefix of location URL (eg sftp://127.0.0.1/XxX/)
        :type location_base: str
        :return: A list of File objects
        :rtype: List[File]
        """
        with cd(remote_path):
            filepaths = remote_list_files('.')
            urls = [(f'{location_base}/{fpath}', fpath)
                    for fpath in filepaths]

            file_objs = []
            for location, filepath in urls:
                fname = Path(filepath).name
                fpath = Path(prefix_path) / Path(filepath).parent

                f = None
                if fileset:
                    f = fileset.get_file_by_path(Path(fpath) / Path(fname))

                if not f:
                    f = File(location=location,
                             owner=job.owner,
                             name=fname,
                             path=fpath)
                elif not f.location:
                    f.location = location
                    f.owner = job.owner

                file_objs.append(f)

        return file_objs

    try:
        with fabsettings(gateway=gateway,
                         host_string=master_ip,
                         user=remote_username,
                         key=private_key,
                         # key_filename=expanduser("~/.ssh/id_rsa"),
                         ):
            working_dir = job.abs_path_on_compute
            input_dir = os.path.join(working_dir, 'input')
            output_dir = os.path.join(working_dir, 'output')

            output_files = create_update_file_objects(
                output_dir,
                fileset=job.output_files,
                prefix_path='output',
                location_base=laxy_sftp_url(job, 'output'),
            )
            job.output_files.path = 'output'
            job.output_files.owner = job.owner

            if clobber:
                job.output_files.remove(job.output_files, delete=True)

            job.output_files.add(output_files)

            # TODO: This should really be done at job start, or once input
            #       data has been staged on the compute node.
            input_files = create_update_file_objects(
                input_dir,
                fileset=job.input_files,
                prefix_path='input',
                location_base=laxy_sftp_url(job, 'input'))
            job.input_files.path = 'input'
            job.input_files.owner = job.owner

            if clobber:
                job.input_files.remove(job.input_files, delete=True)

            job.input_files.add(input_files)

            succeeded = True
    except BaseException as e:
        succeeded = False
        if hasattr(e, 'message'):
            message = e.message
        self.update_state(state=states.FAILURE, meta=message)
        raise e

    # job_status = Job.STATUS_RUNNING if succeeded else Job.STATUS_FAILED
    # job = Job.objects.get(id=job_id)
    # job.status = job_status
    # job.save()

    # if not succeeded:
    #     self.update_state(state=states.FAILURE, meta=message)
    #     raise Exception(message)
    #     # raise Ignore()

    return task_data

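# Worked example (hypothetical values): if remote_list_files('.') returned
# 'sikRun/counts.txt' under output_dir, the File object built above would
# get name='counts.txt', path='output/sikRun', and
# location='<laxy_sftp_url(job, "output")>/sikRun/counts.txt'.
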
def run_position_scan_parallel(job, threads=None):
    pos_scan_str = chain2pos_scan_str(job.params.chain_pssm,
                                      job.params.pdbfile,
                                      mutation_set=job.params.mutation_set)
    positions = pos_scan_str.split(',')

    # this re-globs positions to match the number of 'threads'
    globbed_ps = []
    if threads and not (threads > len(positions)):
        c = 0
        ps = ""
        globsize = len(positions) // threads  # integer chunk size
        while 1:
            if positions:
                ps += positions.pop() + ","
            else:
                break
            if (c % globsize == 0):
                ps = ps[:-1]
                globbed_ps.append(ps)
                ps = ""
            c += 1
        if ps:
            globbed_ps.append(ps)
        positions = globbed_ps
    ####

    for pos in positions:
        # setup remote files on cluster
        job_sub_path = setup_position_scan_files(job, pos_scan_str=pos)

        # update the job record so it knows about this 'sub-job'
        job.running_parts.append(pos)

        for host in CRUNCHING_HOSTS:
            with fabsettings(hide('stderr', 'stdout'), host_string=host):
                with cd(job_sub_path):
                    # stdin, stdout and stderr must all be redirected
                    # somewhere (/dev/null or a file) so that the shell can
                    # be properly backgrounded and detached .. otherwise
                    # run() becomes blocking
                    """
                    cmd = 'nohup %(foldx)s -runfile run.txt >foldx.out 2>foldx.err </dev/null; \
                           curl "%(url)s" &>callback.out </dev/null &' % \
                           {'foldx':FOLDX_BIN, 'url':callback_url}
                    """
                    # submit a job to the remote queuing system
                    # note: the sleep seems to be required, otherwise some
                    # jobs fail to queue - presumably fabric kills the
                    # connection too quickly
                    cmd = "qsub foldx.qsub & sleep %i" % QSUB_SLEEP
                    result = run(cmd)
                    log.debug(result)
                    # test: this detaches and is non-blocking
                    #result = run("nohup yes >& /dev/null < /dev/null &")

    fabric.network.disconnect_all()

    job.start_date = datetime.datetime.now()
    job.save()

    return job.uuid

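# For illustration (values are made-up placeholders, not real FoldX
# PositionScan tokens): with seven positions p1..p7 and threads=3,
# globsize is 7 // 3 == 2 and the re-globbing in run_position_scan_parallel
# yields
#
#   ['p7', 'p6,p5', 'p4,p3', 'p2,p1']
#
# i.e. positions are popped from the tail of the list, the first group is
# flushed immediately (c == 0 satisfies c % globsize == 0), and the
# remainder arrive in groups of globsize.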