def run_probe( pdb_filename , variants , probe_output_filename = '' , run = True ):
    """
    Runs PROBE on <pdb_filename> on the positions found among <variants>
    using the default options in PROBE_OPTIONS and writes the output to
    <probe_output_filename> (also returns this output filename)

    When <run> is False, returns the command string (plus the output
    filename and positions) instead of executing it
    """
    if not probe_output_filename:
        # bugfix: str.rstrip( '.pdb' ) strips a trailing run of the
        # characters '.', 'p', 'd', 'b' (e.g. "my_pdb.pdb" -> "my_"),
        # NOT the extension; strip the literal suffix instead
        root_filename = os.path.abspath( pdb_filename )
        if root_filename.endswith( '.pdb' ):
            root_filename = root_filename[:-len( '.pdb' )]
        probe_output_filename = root_filename + '.probe_out'

    # get the unique variant positions (variant strings look like "A123T")
    positions = list( set( [i[1:-1] for i in variants] ) )
    positions.sort()

    # generate the commands to run, one PROBE call per position,
    # all appending to the same output file
    # command = '#!/bin/sh\nrm ' + probe_output_filename + '\ntouch ' + probe_output_filename + '\n'
    command = 'rm ' + probe_output_filename + ';touch ' + probe_output_filename + ';'    # delete any prior copy since we will append to it
    for i in positions:
        probe_options = {}
        probe_options.update( PROBE_OPTIONS )
        probe_options['out'] = pdb_filename
        probe_options['Q'] = str( i )
        command += create_executable_str( PATH_TO_PROBE , [] , probe_options , probe_output_filename , append = True ) + ';'

    # run PROBE, store the output
    if run:
        run_local_commandline( command )
        return probe_output_filename , positions
    else:
        # the command, well, get positions etc. too
        return command , probe_output_filename , positions
def run_psiblast(sequence_filename, run=True):
    """
    Runs PSIBLAST on <sequence_filename> using the default options in
    PSIBLAST_OPTIONS and returns the relevant output file: "out_ascii_pssm"

    When <run> is False, returns the command string and the output filename
    """
    # bugfix: str.rstrip('.fa') strips trailing '.', 'f', 'a' characters,
    # NOT the extension (e.g. "alpha.fa" -> "alph"); strip the suffix properly
    root_filename = os.path.abspath(sequence_filename)
    if root_filename.endswith('.fa'):
        root_filename = root_filename[:-len('.fa')]

    # collect the options, set the input, derive the output filenames
    psiblast_options = {}
    psiblast_options.update(PSIBLAST_OPTIONS)
    psiblast_options['query'] = sequence_filename
    # any callable option values are filename generators; feed them the root
    for i in psiblast_options.keys():
        if '__call__' in dir(psiblast_options[i]):
            psiblast_options[i] = psiblast_options[i](root_filename)
    # make any filename option values absolute paths
    for i in psiblast_options.keys():
        if isinstance(psiblast_options[i], str) and os.path.isfile(psiblast_options[i]):
            psiblast_options[i] = os.path.abspath(psiblast_options[i])

    command = create_executable_str(PATH_TO_PSIBLAST, args=[], options=psiblast_options)

    if run:
        run_local_commandline(command)
        # the only output we need
        return psiblast_options['out_ascii_pssm']
    else:
        # just send the command
        return command, psiblast_options['out_ascii_pssm']
def run_rosetta_relax_local( pdb_filename , extra_options = None , run = True ):
    """
    Runs Rosetta relax locally on <pdb_filename> using ROSETTA_RELAX_OPTIONS
    (optionally overridden/extended by <extra_options>) and returns the
    silent-file output filename

    When <run> is False, returns the command string and the output filename
    """
    # bugfix: a mutable default argument ({}) is shared across calls;
    # use None as the sentinel instead (backward compatible)
    if extra_options is None:
        extra_options = {}
    # bugfix: replace( '.pdb' , '' ) removes '.pdb' anywhere in the path
    # (e.g. inside a directory name); only strip a trailing extension
    root_filename = os.path.abspath( pdb_filename )
    if root_filename.endswith( '.pdb' ):
        root_filename = root_filename[:-len( '.pdb' )]

    # collect the options, set the input, derive the output filenames
    relax_options = {}
    relax_options.update( ROSETTA_RELAX_OPTIONS )
    relax_options.update( extra_options )
    relax_options['s'] = pdb_filename
    relax_options['native'] = pdb_filename    # required to get gdtmm scores
    # any callable option values are filename generators; feed them the root
    for i in relax_options.keys():
        if '__call__' in dir( relax_options[i] ):
            relax_options[i] = relax_options[i]( root_filename )
    # make any filename option values absolute paths
    for i in relax_options.keys():
        if isinstance( relax_options[i] , str ) and os.path.isfile( relax_options[i] ):
            relax_options[i] = os.path.abspath( relax_options[i] )

    # ...weird Rosetta append behavior...
    # remove stale outputs or Rosetta will append to them
    if os.path.isfile( relax_options['out:file:silent'] ):
        os.remove( relax_options['out:file:silent'] )
    if os.path.isfile( relax_options['out:file:scorefile'] ):
        os.remove( relax_options['out:file:scorefile'] )

    command = create_executable_str( PATH_TO_ROSETTA_RELAX , args = [] , options = relax_options )

    if run:
        run_local_commandline( command )
        # the only output we need
        return relax_options['out:file:silent']
    else:
        return command , relax_options['out:file:silent']
def run_rosetta_relax_local(pdb_filename, extra_options=None, run=True):
    """
    Runs Rosetta relax locally on <pdb_filename> using ROSETTA_RELAX_OPTIONS
    (optionally overridden/extended by <extra_options>) and returns the
    silent-file output filename

    When <run> is False, returns the command string and the output filename
    """
    # bugfix: a mutable default argument ({}) is shared across calls;
    # use None as the sentinel instead (backward compatible)
    if extra_options is None:
        extra_options = {}
    # bugfix: replace('.pdb', '') removes '.pdb' anywhere in the path
    # (e.g. inside a directory name); only strip a trailing extension
    root_filename = os.path.abspath(pdb_filename)
    if root_filename.endswith('.pdb'):
        root_filename = root_filename[:-len('.pdb')]

    # collect the options, set the input, derive the output filenames
    relax_options = {}
    relax_options.update(ROSETTA_RELAX_OPTIONS)
    relax_options.update(extra_options)
    relax_options['s'] = pdb_filename
    relax_options['native'] = pdb_filename  # required to get gdtmm scores
    # any callable option values are filename generators; feed them the root
    for i in relax_options.keys():
        if '__call__' in dir(relax_options[i]):
            relax_options[i] = relax_options[i](root_filename)
    # make any filename option values absolute paths
    for i in relax_options.keys():
        if isinstance(relax_options[i], str) and os.path.isfile(relax_options[i]):
            relax_options[i] = os.path.abspath(relax_options[i])

    # ...weird Rosetta append behavior...
    # remove stale outputs or Rosetta will append to them
    if os.path.isfile(relax_options['out:file:silent']):
        os.remove(relax_options['out:file:silent'])
    if os.path.isfile(relax_options['out:file:scorefile']):
        os.remove(relax_options['out:file:scorefile'])

    command = create_executable_str(PATH_TO_ROSETTA_RELAX, args=[], options=relax_options)

    if run:
        run_local_commandline(command)
        # the only output we need
        return relax_options['out:file:silent']
    else:
        return command, relax_options['out:file:silent']
def run_psiblast( sequence_filename , run = True ):
    """
    Runs PSIBLAST on <sequence_filename> using the default options in
    PSIBLAST_OPTIONS and returns the relevant output file: "out_ascii_pssm"

    When <run> is False, returns the command string and the output filename
    """
    # bugfix: str.rstrip( '.fa' ) strips trailing '.', 'f', 'a' characters,
    # NOT the extension (e.g. "alpha.fa" -> "alph"); strip the suffix properly
    root_filename = os.path.abspath( sequence_filename )
    if root_filename.endswith( '.fa' ):
        root_filename = root_filename[:-len( '.fa' )]

    # collect the options, set the input, derive the output filenames
    psiblast_options = {}
    psiblast_options.update( PSIBLAST_OPTIONS )
    psiblast_options['query'] = sequence_filename
    # any callable option values are filename generators; feed them the root
    for i in psiblast_options.keys():
        if '__call__' in dir( psiblast_options[i] ):
            psiblast_options[i] = psiblast_options[i]( root_filename )
    # make any filename option values absolute paths
    for i in psiblast_options.keys():
        if isinstance( psiblast_options[i] , str ) and os.path.isfile( psiblast_options[i] ):
            psiblast_options[i] = os.path.abspath( psiblast_options[i] )

    command = create_executable_str( PATH_TO_PSIBLAST , args = [] , options = psiblast_options )

    if run:
        run_local_commandline( command )
        # the only output we need
        return psiblast_options['out_ascii_pssm']
    else:
        # just send the command
        return command , psiblast_options['out_ascii_pssm']
def run_rosetta_ddg_monomer(pdb_filename, mut_filename, out_filename='', out_path='', cleanup=True, run=True): root_filename = os.path.abspath(pdb_filename).rstrip('.pdb') # hardcoded...ddg_monomer is such a painful protocol... out_filename = '' if '/' in root_filename: out_filename += '/'.join(root_filename.split('/')[:-1]) + '/' out_filename += 'ddg_predictions.out' # clear it out if it exists, otherwise it will be appended to... if os.path.exists(out_filename): os.remove(out_filename) # collect the options, set the input, derive the output filenames ddg_monomer_options = {} ddg_monomer_options.update(ROSETTA_DDG_MONOMER_OPTIONS) ddg_monomer_options['in:file:s'] = pdb_filename ddg_monomer_options['ddg::mut_file'] = mut_filename for i in ddg_monomer_options.keys(): if '__call__' in dir(ddg_monomer_options[i]): ddg_monomer_options[i] = ddg_monomer_options[i](root_filename) for i in ddg_monomer_options.keys(): if isinstance(ddg_monomer_options[i], str) and os.path.isfile( ddg_monomer_options[i]): ddg_monomer_options[i] = os.path.abspath(ddg_monomer_options[i]) command = '' # optionally move into the specific directory... if out_path: command += 'cd ' + out_path + '; ' #\n\n' command += create_executable_str(PATH_TO_ROSETTA_DDG_MONOMER, args=[], options=ddg_monomer_options) if run: run_local_commandline(command) # optionally cleanup if cleanup: print 'ddg_monomer writes useless output files, deleting these now...' remove_intermediate_ddg_monomer_files() # the only output we need return out_filename else: return command, out_filename
def run_rosetta_rescore(silent_filename, native_filename, score_filename='', run=True):
    """
    Rescores the decoys in <silent_filename> against <native_filename> using
    the "score" protocol of Rosetta (built against 3.5) with options from
    ROSETTA_SCORE_OPTIONS; returns the scorefile filename

    note: <score_filename> is currently unused; the output scorefile name is
    derived from ROSETTA_SCORE_OPTIONS (tagged "_rescore.sc")
    When <run> is False, returns the command string and the scorefile name
    """
    # bugfix: str.rstrip('.silent') strips trailing characters from the set
    # {'.','s','i','l','e','n','t'} (badly mangling many filenames), NOT the
    # extension; strip the literal suffix instead
    root_filename = os.path.abspath(silent_filename)
    if root_filename.endswith('.silent'):
        root_filename = root_filename[:-len('.silent')]

    score_options = {}
    score_options.update(ROSETTA_SCORE_OPTIONS)
    score_options['in:file:silent'] = silent_filename
    score_options['in:file:native'] = native_filename  # required to get gdtmm scores
    # any callable option values are filename generators; feed them the root
    for i in score_options.keys():
        if '__call__' in dir(score_options[i]):
            score_options[i] = score_options[i](root_filename)
    # necessary...tag the scorefile so it does not collide with relax output
    if 'out:file:scorefile' in score_options.keys() and not 'rescore.sc' in score_options['out:file:scorefile']:
        score_options['out:file:scorefile'] = score_options['out:file:scorefile'].replace('.sc', '_rescore.sc')
    # make any filename option values absolute paths
    for i in score_options.keys():
        if isinstance(score_options[i], str) and os.path.isfile(score_options[i]):
            score_options[i] = os.path.abspath(score_options[i])

    # ...weird Rosetta append behavior...remove any stale scorefile
    if os.path.isfile(score_options['out:file:scorefile']):
        os.remove(score_options['out:file:scorefile'])

    # default options
    command = create_executable_str(PATH_TO_ROSETTA_SCORE, args=[], options=score_options)

    if run:
        run_local_commandline(command)
        return score_options['out:file:scorefile']
    else:
        return command, score_options['out:file:scorefile']
def get_slurm_queue_status( user = SLURM_USER , header_lines = 1 , trailer_lines = 0 , only_job_status = True ):
    """
    Queries SLURM with "squeue" (optionally restricted to <user>) and parses
    the tabular output, dropping <header_lines> and <trailer_lines>
    # header_lines = 2 for FULL queue, = 5 for USER queue ("-u")

    By default returns a dict mapping job id -> job state (5th column of
    squeue); with <only_job_status> False, returns the raw parsed rows
    """
    command = 'squeue'
    if user:
        command += ' -u ' + user
    raw_output = run_local_commandline( command , collect_stdout = True )

    # trim the header/trailer lines; note a plain [x:-0] slice would be empty
    lines = raw_output.split( '\n' )
    end = len( lines ) - trailer_lines if trailer_lines else len( lines )
    lines = lines[header_lines:end]

    # whitespace-split each non-blank line into its columns
    rows = []
    for line in lines:
        if not line.strip():
            continue
        rows.append( [field for field in line.split( ' ' ) if field.strip()] )

    if not only_job_status:
        return rows
    # map job id (column 1) -> state (column 5); job ids should be unique...
    return dict( [(row[0] , row[4]) for row in rows] )
def run_rosetta_ddg_monomer( pdb_filename , mut_filename , out_filename = '' , out_path = '' , cleanup = True , run = True ): root_filename = os.path.abspath( pdb_filename ).rstrip( '.pdb' ) # hardcoded...ddg_monomer is such a painful protocol... out_filename = '' if '/' in root_filename: out_filename += '/'.join( root_filename.split( '/' )[:-1] ) +'/' out_filename += 'ddg_predictions.out' # clear it out if it exists, otherwise it will be appended to... if os.path.exists( out_filename ): os.remove( out_filename ) # collect the options, set the input, derive the output filenames ddg_monomer_options = {} ddg_monomer_options.update( ROSETTA_DDG_MONOMER_OPTIONS ) ddg_monomer_options['in:file:s'] = pdb_filename ddg_monomer_options['ddg::mut_file'] = mut_filename for i in ddg_monomer_options.keys(): if '__call__' in dir( ddg_monomer_options[i] ): ddg_monomer_options[i] = ddg_monomer_options[i]( root_filename ) for i in ddg_monomer_options.keys(): if isinstance( ddg_monomer_options[i] , str ) and os.path.isfile( ddg_monomer_options[i] ): ddg_monomer_options[i] = os.path.abspath( ddg_monomer_options[i] ) command = '' # optionally move into the specific directory... if out_path: command += 'cd '+ out_path +'; ' command += create_executable_str( PATH_TO_ROSETTA_DDG_MONOMER , args = [] , options = ddg_monomer_options ) if run: run_local_commandline( command ) # optionally cleanup if cleanup: print 'ddg_monomer writes useless output files, deleting these now...' remove_intermediate_ddg_monomer_files() # the only output we need return out_filename else: return command , out_filename
def run_probe(pdb_filename, variants, probe_output_filename='', run=True):
    """
    Runs PROBE on <pdb_filename> on the positions found among <variants>
    using the default options in PROBE_OPTIONS and writes the output to
    <probe_output_filename> (also returns this output filename)

    When <run> is False, returns the command string (plus the output
    filename and positions) instead of executing it
    """
    if not probe_output_filename:
        # bugfix: str.rstrip('.pdb') strips a trailing run of the characters
        # '.', 'p', 'd', 'b' (e.g. "my_pdb.pdb" -> "my_"), NOT the
        # extension; strip the literal suffix instead
        root_filename = os.path.abspath(pdb_filename)
        if root_filename.endswith('.pdb'):
            root_filename = root_filename[:-len('.pdb')]
        probe_output_filename = root_filename + '.probe_out'

    # get the unique variant positions (variant strings look like "A123T")
    positions = list(set([i[1:-1] for i in variants]))
    positions.sort()

    # generate the commands to run, one PROBE call per position,
    # all appending to the same output file
    # command = '#!/bin/sh\nrm ' + probe_output_filename + '\ntouch ' + probe_output_filename + '\n'
    command = 'rm ' + probe_output_filename + ';touch ' + probe_output_filename + ';'  # delete any prior copy since we will append to it
    for i in positions:
        probe_options = {}
        probe_options.update(PROBE_OPTIONS)
        probe_options['out'] = pdb_filename
        probe_options['Q'] = str(i)
        command += create_executable_str(PATH_TO_PROBE, [], probe_options, probe_output_filename, append=True) + ';'

    # run PROBE, store the output
    if run:
        run_local_commandline(command)
        return probe_output_filename, positions
    else:
        # the command, well, get positions etc. too
        return command, probe_output_filename, positions
def run_rosetta_rescore( silent_filename , native_filename , score_filename = '' , run = True ):
    """
    Rescores the decoys in <silent_filename> against <native_filename> using
    the "score" protocol of Rosetta (built against 3.5) with options from
    ROSETTA_SCORE_OPTIONS; returns the scorefile filename

    note: <score_filename> is currently unused; the output scorefile name is
    derived from ROSETTA_SCORE_OPTIONS (tagged "_rescore.sc")
    When <run> is False, returns the command string and the scorefile name
    """
    # bugfix: str.rstrip( '.silent' ) strips trailing characters from the set
    # {'.','s','i','l','e','n','t'} (badly mangling many filenames), NOT the
    # extension; strip the literal suffix instead
    root_filename = os.path.abspath( silent_filename )
    if root_filename.endswith( '.silent' ):
        root_filename = root_filename[:-len( '.silent' )]

    score_options = {}
    score_options.update( ROSETTA_SCORE_OPTIONS )
    score_options['in:file:silent'] = silent_filename
    score_options['in:file:native'] = native_filename    # required to get gdtmm scores
    # any callable option values are filename generators; feed them the root
    for i in score_options.keys():
        if '__call__' in dir( score_options[i] ):
            score_options[i] = score_options[i]( root_filename )
    # necessary...tag the scorefile so it does not collide with relax output
    if 'out:file:scorefile' in score_options.keys() and not 'rescore.sc' in score_options['out:file:scorefile']:
        score_options['out:file:scorefile'] = score_options['out:file:scorefile'].replace( '.sc' , '_rescore.sc' )
    # make any filename option values absolute paths
    for i in score_options.keys():
        if isinstance( score_options[i] , str ) and os.path.isfile( score_options[i] ):
            score_options[i] = os.path.abspath( score_options[i] )

    # ...weird Rosetta append behavior...remove any stale scorefile
    if os.path.isfile( score_options['out:file:scorefile'] ):
        os.remove( score_options['out:file:scorefile'] )

    # default options
    command = create_executable_str( PATH_TO_ROSETTA_SCORE , args = [] , options = score_options )

    if run:
        run_local_commandline( command )
        return score_options['out:file:scorefile']
    else:
        return command , score_options['out:file:scorefile']
def run_slurm_job( script_filename , slurm_run_command = 'sbatch' , output_filename = 'temp_slurm.sh' ):
    """
    Submits <script_filename> to SLURM using <slurm_run_command> and returns
    the job id parsed from the submission output

    If <script_filename> is not an existing file, it is treated as the text
    of the script and written to <output_filename> first
    """
    # optionally write a file
    if not os.path.isfile( script_filename ) and isinstance( script_filename , str ):
        # note: silently overwrites any existing <output_filename>
        f = open( output_filename , 'w' )
        f.write( script_filename )
        f.close()
        script_filename = output_filename

    # submit it, grab the job id from stdout :)
    job_id = run_local_commandline( slurm_run_command + ' ' + script_filename , collect_stdout = True )
    # bugfix: the collected stdout is a single string ("Submitted batch job
    # <id>"); the old job_id[0] grabbed only the FIRST CHARACTER, so the job
    # id was never parsed - take the last whitespace field of the whole line
    job_id = job_id.strip().split( ' ' )[-1]

    return job_id
def get_pbs_queue_status( user = PBS_USER , header_lines = 5 , trailer_lines = 1 , only_job_status = True ):
    """
    Queries PBS/TORQUE with "qstat" (optionally restricted to <user>) and
    parses the tabular output, dropping <header_lines> and <trailer_lines>
    # header_lines = 2 for FULL queue, = 5 for USER queue ("-u")

    By default returns a dict mapping job id (text before the first ".") ->
    job state (second-to-last column); with <only_job_status> False,
    returns the raw parsed rows
    """
    command = 'qstat'
    if user:
        command += ' -u ' + user
    queue_info = run_local_commandline( command , collect_stdout = True )

    # simple parsing
    queue_info = queue_info.split( '\n' )
    # bugfix: with trailer_lines == 0 the old slice [header_lines:-0] was
    # [header_lines:0] i.e. always empty; guard as the SLURM version does
    if trailer_lines:
        queue_info = queue_info[header_lines:-1*trailer_lines]
    elif header_lines:
        queue_info = queue_info[header_lines:]
    # skip blank lines (they would crash the column lookups below)
    queue_info = [[j for j in i.split( ' ' ) if j.strip()] for i in queue_info if i.strip()]

    # optionally only report the job statuses
    if only_job_status:
        queue_info = [(i[0][:i[0].find( '.' )] , i[-2]) if '.' in i[0] else (i[0] , i[-2]) for i in queue_info]
        # make into a dict? job ids should be unique...
        queue_info = dict( queue_info )

    return queue_info
def get_pbs_queue_status(user=PBS_USER, header_lines=5, trailer_lines=1, only_job_status=True):
    """
    Queries PBS/TORQUE with "qstat" (optionally restricted to <user>) and
    parses the tabular output, dropping <header_lines> and <trailer_lines>
    # header_lines = 2 for FULL queue, = 5 for USER queue ("-u")

    By default returns a dict mapping job id (text before the first ".") ->
    job state (second-to-last column); with <only_job_status> False,
    returns the raw parsed rows
    """
    command = 'qstat'
    if user:
        command += ' -u ' + user
    queue_info = run_local_commandline(command, collect_stdout=True)

    # simple parsing
    queue_info = queue_info.split('\n')
    # bugfix: with trailer_lines == 0 the old slice [header_lines:-0] was
    # [header_lines:0] i.e. always empty; guard as the SLURM version does
    if trailer_lines:
        queue_info = queue_info[header_lines:-1 * trailer_lines]
    elif header_lines:
        queue_info = queue_info[header_lines:]
    # skip blank lines (they would crash the column lookups below)
    queue_info = [[j for j in i.split(' ') if j.strip()] for i in queue_info if i.strip()]

    # optionally only report the job statuses
    if only_job_status:
        queue_info = [(i[0][:i[0].find('.')], i[-2]) if '.' in i[0] else (i[0], i[-2]) for i in queue_info]
        # make into a dict? job ids should be unique...
        queue_info = dict(queue_info)

    return queue_info
def run_VIPUR_PBS(pdb_filename='', variants_filename='', out_path='', write_numbering_map=True, single_relax=True, delete_intermediate_relax_files=True, demo=False, rerun_preprocessing=False):
    """
    Runs the full VIPUR pipeline on a PBS/TORQUE cluster

    <pdb_filename>/<variants_filename> may name a single file pair, or a
    directory (with <variants_filename> as an extension such as ".txt") to
    scan for (.pdb , variants) pairs; with no input, the current working
    directory is scanned; .fa files without a matching .pdb run in
    sequence-only mode
    Preprocessing is run (or reused, unless <rerun_preprocessing>), every
    generated command is rewritten into a PBS shell script (relax jobs get
    the parallel Rosetta executable/options), the jobs are submitted and
    monitored by run_VIPUR_task_summaries_PBS, and postprocessing is applied

    Returns the list of (updated) task summary dicts
    """
    # for the example input
    if demo:
        pdb_filename = PATH_TO_VIPUR + '/example_input/2C35.pdb'
        variants_filename = PATH_TO_VIPUR + '/example_input/2C35.txt'
        out_path = PATH_TO_VIPUR + '/example_output'

    # alternatively, run on an entire directory
    if not pdb_filename and not variants_filename:
        # current directory
        print 'no input provided, assuming you want to run on every (.pdb,.txt) file pair found in the current directory'
        pdb_filename = os.getcwd()

    if os.path.isdir(pdb_filename) and not variants_filename:
        # assume variants_filename from pdb_filename
        variants_filename = '.txt'

    if os.path.isdir(pdb_filename) and variants_filename[0] == '.':
        # look for file extension
        # instead, run on the directory
        if not out_path:
            out_path = os.path.abspath(pdb_filename)

        # sequence-only candidates: .fa files with a variants file but NO .pdb
        fa_filenames = [(out_path + '/') * bool(out_path) + i for i in os.listdir(pdb_filename) if get_file_extension(i) == 'fa']
        fa_filenames = [[i, get_root_filename(i) + variants_filename] for i in fa_filenames if os.path.isfile(get_root_filename(i) + variants_filename) and not os.path.isfile(get_root_filename(i) + '.pdb')]

        print 'running VIPUR on all (.pdb,' + variants_filename + ') file pairs found in ' + pdb_filename
        # find .pdb files
        pdb_filenames = [(out_path + '/') * bool(out_path) + i for i in os.listdir(pdb_filename) if get_file_extension(i) == 'pdb']
        # look for pairs
        pdb_filenames = [[i, get_root_filename(i) + variants_filename] for i in pdb_filenames if os.path.isfile(get_root_filename(i) + variants_filename)]

        print str(len(pdb_filenames)) + ' pairs found'
        print str(len(fa_filenames)) + ' pairs found for sequence only mode'

        # go there...
        # os.chdir( pdb_filename )

        if not pdb_filenames:
            if not fa_filenames:
                raise IOError('!!! no (.pdb,' + variants_filename + ') file pairs found in ' + pdb_filename + '!!?!\nAND no (.fa,' + variants_filename + ') file pairs were found...')
            else:
                print '...only (.fa,' + variants_filename + ') file pairs were found, running in sequence only mode'
    else:
        # file extension etc.
        file_extension = get_file_extension(pdb_filename)
        root_filename = get_root_filename(pdb_filename)

        # normal execution, generalize by turning into list
        pdb_filenames = []
        fa_filenames = []
        if file_extension == 'pdb':
            pdb_filenames = [[(out_path + '/') * bool(out_path) + pdb_filename, (out_path + '/') * bool(out_path) + variants_filename]]
        else:
            fa_filenames = [[]]

    # combine all "filenames" to run into unified framework
    # each entry: [structure/sequence filename , variants filename ,
    #              sequence_only flag , per-target output directory]
    target_proteins = []  #None]*(len( pdb_filenames ) + len( fa_filenames ))
    for i in pdb_filenames:
        this_out_path = get_root_filename(i[0]) + '_VIPUR'  # directory to create
        target_proteins.append(i + [False, this_out_path])
    for i in fa_filenames:
        this_out_path = get_root_filename(i[0]) + '_VIPUR'  # directory to create
        target_proteins.append(i + [True, this_out_path])

    # setup environment variables BEFORE pre processing
    # no need to setup a command, just run it
    if PBS_ENVIRONMENT_SETUP:
        print 'setting up environment variables'
        run_local_commandline(PBS_ENVIRONMENT_SETUP)

    # pre processing
    task_summaries = []
    for i in target_proteins:
        # guess what the task summary filename 'would' be, if it exists, keep going...
        task_summary_filename = i[3] * bool(i[3]) + '/' + get_root_filename(i[0]).split('/')[-1] + '.task_summary'
        if os.path.isfile(task_summary_filename) and not rerun_preprocessing:
            print 'hmmm, ' + i[0] + ' seems to have run preprocessing already, skipping now'
            #continue  # skip this one, do not add to list of tasks...?
            # actually, skip running pre-processing BUT DO add it to the list of tasks
        else:
            task_summary_filename = run_preprocessing(i[0], i[1], sequence_only=i[2], out_path=i[3], task_summary_filename=task_summary_filename, write_numbering_map=write_numbering_map, single_relax=single_relax, pymol_environment_setup=PBS_ENVIRONMENT_SETUP)

        # modify for PBS script: rewrite each command into a shell script
        # and record which queue ("parallel"/"serial") it should use
        task_summary = load_task_summary(task_summary_filename)
        for j in xrange(len(task_summary['commands'])):
            command = task_summary['commands'][j]['command']

            # add for relax: swap in the MPI/parallel Rosetta executable
            # and the parallel execution wrapper (rescore stays serial)
            if task_summary['commands'][j]['feature'].replace('_native', '') == 'relax' and not 'rescore' in task_summary['commands'][j]['feature']:
                if not PBS_PARALLEL_ROSETTA_ENDING in command:
                    command = command.replace(ROSETTA_ENDING, PBS_PARALLEL_ROSETTA_ENDING)
                command = PBS_PARALLEL_ROSETTA_EXECUTION_COMMAND + ' ' * bool(PBS_PARALLEL_ROSETTA_EXECUTION_COMMAND) + command

                if ROSETTA_RELAX_PARALLEL_OPTIONS:
                    command += ' ' + ' '.join(['-' + k + (' ' + ROSETTA_RELAX_PARALLEL_OPTIONS[k]) * bool(ROSETTA_RELAX_PARALLEL_OPTIONS[k]) for k in ROSETTA_RELAX_PARALLEL_OPTIONS])
                # command += ' -jd2:mpi_file_buf_job_distributor false'
                # command += ' -run:multiple_processes_writing_to_one_directory'

                # also use the parallel options
                pbs_options = 'parallel'  #.update( PBS_PARALLEL_JOB_OPTIONS )
            else:
                pbs_options = 'serial'  #.update( PBS_SERIAL_JOB_OPTIONS )

            # put "cd" in front so the job runs in the target's directory
            command = ('cd ' + i[3] + ';') * bool(i[3]) + command

            # modify the task summary
            task_summary['commands'][j]['command'] = command

            # actually write the script...
            # don't worry about optional #PBS header info
            # need to add the variant? no, just use the output_filename for this
            script_filename = i[3] + '/' * bool(i[3]) + get_root_filename(task_summary['commands'][j]['output_filename'].split('/')[-1]) + '.' + task_summary['commands'][j]['feature'] + '.pbs_script.sh'
            task_summary['commands'][j]['script_filename'] = script_filename
            f = open(script_filename, 'w')
            f.write(PBS_BASH_SCRIPT(command))
            f.close()

            # use the script filename as the source for any log files
            # control the output and error paths
            # also generate the pbs call? might as well, keep it simple
            # no, uses ":" and "," characters...
            task_summary['commands'][j]['queue'] = pbs_options

        # rewrite the task summary
        write_task_summary(task_summary, task_summary_filename)
        task_summaries.append(task_summary)  #_filename )

    # run them all
    run_VIPUR_task_summaries_PBS(task_summaries, single_relax=single_relax, delete_intermediate_relax_files=delete_intermediate_relax_files)

    # post processing
    # this look identical!!! :)
    for i in xrange(len(task_summaries)):
        # always okay to rerun post processing...should not make any difference
        sequence_only = target_proteins[i][2]
        print '\n\n\nExtracting and Analyzing the Results:\n\n'
        task_summaries[i] = run_postprocessing(task_summaries[i], sequence_only=sequence_only)

    return task_summaries
def run_VIPUR_tasks_PBS(task_summaries, task_list, max_pbs_tries=2, ddg_monomer_cleanup=True, single_relax=True, delete_intermediate_relax_files=False):
    """
    Submits the commands in <task_summaries> selected by <task_list> (pairs
    of (task summary index , command index)) to the PBS queue with "qsub",
    then monitors the queue until every task has finished, resubmitting
    each failed job up to <max_pbs_tries> times

    Side effects: writes/updates each task summary file as job states
    change; merges separated relax outputs before their rescore job runs;
    optionally deletes ddg_monomer junk files
    # NOTE(review): assumes get_pbs_queue_status reports 'C' for completed
    # jobs, 'R' for running and 'Q' for queued - confirm for your scheduler
    """
    # run the non_rescore tasks
    # seed "completed" with any tasks already marked successful
    completed = [i for i in task_list if 'run' in task_summaries[i[0]]['commands'][i[1]] and 'success' in task_summaries[i[0]]['commands'][i[1]]['run']]
    # should running_or_queued be saved? written to file?
    # maps PBS job id -> (task summary index , command index)
    running_or_queued = {}
    rounds = 0
    all_completed_jobs = []  # prevents annoying bulk output, only see it the first time it completes
    while not len(completed) == len(task_list):
        rounds += 1
        print '\n\nQUEUE MONITOR ROUND ' + str(rounds)
        # debug
        # print running_or_queued

        # check queue status
        queue_status = get_pbs_queue_status()

        # update "running_or_queued" list (?)
        # err, no, does not have information on which job it is...:(
        #for i in queue_status.keys():
        #    if queue_status[i] in ['R' , 'Q']:
        queue_space_occupied = len([i for i in queue_status.values() if not i in ['C', 'R']])
        # ignore "C"ompleted jobs, "R"unning job quota are not set by us...
        # if your queue system does not have a separate "R"un quota, remove 'R' from the above!
        available_space = PBS_QUEUE_QUOTA - queue_space_occupied

        # launch next jobs in available slots
        if available_space:
            print str(queue_space_occupied) + ' jobs queued or running, could submit up to ' + str(available_space) + ' more'
            # choose the next job: not completed, not already submitted,
            # and not already recorded as a success or a final failure
            jobs_to_run = [i for i in task_list if not i in completed and not i in running_or_queued.values() and not ('run' in task_summaries[i[0]]['commands'][i[1]] and ('success' in task_summaries[i[0]]['commands'][i[1]]['run'] or 'failure' in task_summaries[i[0]]['commands'][i[1]]['run']))]
            print str(len(jobs_to_run)) + ' jobs still need to finish (after the currently running jobs complete)'

            # only the next few
            for i in jobs_to_run[:available_space]:
                command_dict = task_summaries[i[0]]['commands'][i[1]]

                # write scripts as part of pre processing?...yeah...
                # write the command to a script
                #script_filename = command_dict['out_path'] +'/'*bool( command_dict['out_path'] )+
                #script_filename = command_dict['script_filename']

                # if its a rescore and relax jobs were separated, need to recombine them!
                if 'rescore' in command_dict['feature']:
                    # combine the individual relax runs
                    #relax_commands = [i for i in task_summary['commands'] if i['feature'].replace( '_native' , '' ) == 'relax']
                    #silent_filenames = [j['output_filename'] for j in relax_commands if j['variant'] == i['variant'] and 'run' in j.keys() and j['run'] == 'success']
                    silent_filenames = [j['output_filename'] for j in task_summaries[i[0]]['commands'] if j['feature'].replace('_native', '') == 'relax' and j['variant'] == command_dict['variant'] and 'run' in j.keys() and 'success' in j['run']]

                    # actually need to identify the combined_silent_filename, be sure the relax files have not already been merged
                    # which variant
                    target_variant = [j for j in task_summaries[i[0]]['variants'].keys() if j.split('_')[-1] == command_dict['variant'] and j.split('_')[0] in command_dict['command']]
                    if not target_variant:
                        # its native
                        combined_silent_filename = task_summaries[i[0]]['other']['combined_native_silent_filename']
                        combined_score_filename = task_summaries[i[0]]['other']['combined_native_score_filename']
                    elif len(target_variant) > 1:
                        raise Exception('??? found more than on matching variant ???\n' + ', '.join(target_variant))
                    else:
                        # found it
                        combined_silent_filename = task_summaries[i[0]]['variants'][target_variant[0]]['combined_silent_filename']
                        combined_score_filename = task_summaries[i[0]]['variants'][target_variant[0]]['combined_score_filename']

                    #if not single_relax:
                    # AND post processing has not already be run...scan for the combined silent file
                    if not single_relax and not os.path.isfile(combined_silent_filename):
                        # all nstruct relax runs must have succeeded first
                        if not len(silent_filenames) == ROSETTA_RELAX_OPTIONS['nstruct']:
                            raise Exception('??? somehow the matching relax run(s) has failed ???\n' + str(i))
                        score_filenames = [j.replace('.silent', '.sc') for j in silent_filenames]
                        merge_rosetta_relax_output(silent_filenames, combined_silent_filename, score_filenames, combined_score_filename, delete_old_files=delete_intermediate_relax_files)
                        # rescore already knows the proper filename
                    else:
                        # just a single match for each
                        # output filename should be correct as is :)
                        None

                # submit this script using a queue command
                # generate it here instead
                pbs_options = {}
                if command_dict['queue'] == 'parallel':
                    pbs_options.update(PBS_PARALLEL_JOB_OPTIONS)
                elif command_dict['queue'] == 'serial':
                    pbs_options.update(PBS_SERIAL_JOB_OPTIONS)

                # make sure they are satisfied; callable option values are
                # filename generators keyed off the script filename
                script_filename = command_dict['script_filename']
                for k in pbs_options.keys():
                    if '__call__' in dir(pbs_options[k]):
                        pbs_options[k] = pbs_options[k](script_filename)

                pbs_command = create_executable_str('qsub', [script_filename], pbs_options)
                new_job_id = run_local_commandline(pbs_command, collect_stdout=True)
                # qsub prints "<id>.<server>"; keep only the numeric id part
                new_job_id = new_job_id.strip()
                if '.' in new_job_id:
                    new_job_id = new_job_id[:new_job_id.find('.')]
                print 'submitted ' + new_job_id

                # save the job id
                # assume its queue
                running_or_queued[new_job_id] = i
        else:
            print 'no new \"positions\" are available'

        # debug, need to know
        running_jobs = len([i for i in queue_status.values() if i in ['R']])
        if running_jobs:
            print str(running_jobs) + ' are still running...(excluding the jobs just submitted and including your other jobs)'

        # assess outcome of completed jobs
        for job_id in sorted(queue_status.keys()):  # sort in numerical order, right?
            # debug
            if not job_id in all_completed_jobs:
                print '\t' + job_id, queue_status[job_id]  # , job_id in running_or_queued.keys()

            # could just skip it all now?
            if queue_status[job_id] == 'C' and job_id in running_or_queued.keys():
                task_id = running_or_queued[job_id][0]
                command_index = running_or_queued[job_id][1]
                command_dict = task_summaries[task_id]['commands'][command_index]

                # check_successful returns either a bool, or a tuple whose
                # first element is a bool followed by failure details
                check_successful = determine_check_successful_function(command_dict, single_relax=single_relax)
                success = check_successful(command_dict)
                failure_summary = ''
                if isinstance(success, bool):
                    complete = success
                elif len(success) > 1 and isinstance(success[0], bool):
                    complete = success[0]
                    failure_summary += ' ' + ';'.join([str(j) for j in success[1:]]) + ' '
                print complete, failure_summary, 'try again?' * bool(not complete)  # debug

                # track the number of attempts?
                # try until failure - how many times?
                # a bare integer in 'run' records the number of prior tries
                tries = 0
                if 'run' in command_dict.keys() and command_dict['run'] and not 'success' in command_dict['run'] and not 'failure' in command_dict['run']:
                    tries = int(command_dict['run'])
                tries += 1
                print tries, 'attempts so far'  # debug
                if tries >= max_pbs_tries:
                    # its a failure
                    print job_id + ' completed successfully' * complete + (' failed with ' + str(tries) + ' attempts') * (not complete)
                    failure_summary = 'success' * complete + (str(tries) + ' tries;failure ' + failure_summary) * (not complete)
                elif complete:
                    print job_id + ' completed successfully'
                    failure_summary = 'success'  #+ str( tries ) + ' tries'
                else:
                    # record the number of tries
                    print job_id + ' completed' + ' successfully' * complete
                    failure_summary = str(tries)

                # update the record
                print 'updating with: ' + failure_summary  # debug
                task_summaries[task_id]['commands'][command_index]['run'] = failure_summary

                # optionally cleanup
                if ddg_monomer_cleanup and command_dict['feature'] == 'ddg_monomer':  #'ddg' in i['output_filename']:
                    print 'ddg_monomer writes useless output files, deleting these now...'
                    remove_intermediate_ddg_monomer_files()

                # jobs that have since been completed - consider them complete?
                completed.append(running_or_queued[job_id])  # good, so this grows
                del running_or_queued[job_id]

                # remove jobs to run?
                # print 'updating the status...'  # debug
                # write out "completed"? or "running_or_queued"?
            if queue_status[job_id] == 'C' and not job_id in all_completed_jobs:
                all_completed_jobs.append(job_id)  # prevent redundant update info

        # update task_summaries e.g. write them!
        # modified: so the task summary records its own name...bah!
        for i in task_summaries:
            if not 'task_summary_filename' in i['filenames'].keys():
                raise NotImplementedError('should input the task summary filename (not the summary itself)...')
            else:
                # write it out
                print 'updating: ' + i['filenames']['task_summary_filename']
                write_task_summary(i, i['filenames']['task_summary_filename'])

        # pause...
        print '\n', len(completed), 'completed', len(task_list), 'tasks remaining'  # debug
        if len(completed) <= len(task_list):  # no need for edge-case end wait
            print 'waiting ' + str(PBS_QUEUE_MONITOR_DELAY) + 's...'
            time.sleep(PBS_QUEUE_MONITOR_DELAY)

    # return anything?
    # write one last time?
    for i in task_summaries:
        if not 'task_summary_filename' in i['filenames'].keys():
            raise NotImplementedError('should input the task summary filename (not the summary itself)...')
        else:
            # write it out
            write_task_summary(i, i['filenames']['task_summary_filename'])
def run_rosetta_relax(pdb_filename, extra_options={}, run=True, parallel=ROSETTA_RELAX_PARALLEL): root_filename = pdb_filename.rstrip('.pdb') # collect the options, set the input, derive the output filenames relax_options = {} relax_options.update(ROSETTA_RELAX_OPTIONS) relax_options.update(extra_options) relax_options['s'] = pdb_filename relax_options['native'] = pdb_filename # required to get gdtmm scores for i in relax_options.keys(): if '__call__' in dir(relax_options[i]): relax_options[i] = relax_options[i](root_filename) # ...weird Rosetta append behavior... # if os.path.isfile( relax_options['out:file:silent'] ): # os.remove( relax_options['out:file:silent'] ) # if os.path.isfile( relax_options['out:file:scorefile'] ): # os.remove( relax_options['out:file:scorefile'] ) # for njc parallelization nstruct = int(relax_options.get('nstruct', '0')) parallel = int(parallel) tmp_file = None if nstruct > 1 and parallel > 1: relax_options['nstruct'] = 1 #TODO: Add chunking option? score_filename = relax_options['out:file:scorefile'] silent_filename = relax_options['out:file:silent'] if 'run:jran' in relax_options: restoreJran = True jran = int(relax_options['run:jran']) else: restoreJran = False jran = 123 tmp_file = tempfile.NamedTemporaryFile(delete=False) print 'Parallel relax commands are in ' + tmp_file.name for s in xrange(nstruct): tag = '_%05d' % s relax_options['run:jran'] = jran * nstruct + s relax_options['out:file:scorefile'] = score_filename + tag relax_options['out:file:silent'] = silent_filename + tag print >> tmp_file, create_executable_str( PATH_TO_ROSETTA_RELAX, args=[], options=relax_options ) + " > %s 2>&1; echo '[[VIPURLOG]]' %s %d" % ( (silent_filename + tag).replace('silent_', 'log_'), pdb_filename, s + 1) tmp_file.close() # the "find ... | xargs ..." idiom is used just in case nstruct is ever a *very* large number. command = '''\ parallel -j %d -a %s find . 
-name '%s_[0-9]*[0-9]' | xargs cat | awk 'NR == 1 || $2 != "score" {print $0}' > %s find . -name '%s_[0-9]*[0-9]' | xargs rm find . -name '%s_[0-9]*[0-9]' | xargs cat | awk 'NR <= 2 || !($2 == "score" || $1 == "SEQUENCE:") {print $0}' > %s find . -name '%s_[0-9]*[0-9]' | xargs rm ''' % (parallel, tmp_file.name, score_filename, score_filename, score_filename, silent_filename, silent_filename, silent_filename) print 'Parallel relax driver command:', command # restore option values relax_options['nstruct'] = str(nstruct) relax_options['out:file:scorefile'] = score_filename relax_options['out:file:silent'] = silent_filename if restoreJran: relax_options['run:jran'] = jran if run: return (command, tmp_file.name, score_filename, silent_filename) if tmp_file: os.unlink(tmp_file.name) else: command = create_executable_str(PATH_TO_ROSETTA_RELAX, args=[], options=relax_options) if run: run_local_commandline(command) # the only output we need # return relax_options['out:file:scorefile'] return relax_options['out:file:silent']
def run_VIPUR_tasks_PBS( task_summaries , task_list , max_pbs_tries = 2 , ddg_monomer_cleanup = True , single_relax = True , delete_intermediate_relax_files = False ):
    """
    Monitors the PBS queue and drives the VIPUR tasks in  <task_list>  to
    completion

    <task_list>  holds [task summary index , command index] pairs into
    <task_summaries>  ; each monitoring round this:
        submits pending jobs into free queue slots (up to PBS_QUEUE_QUOTA),
        merges separated relax output before a dependent "rescore" job is
            submitted,
        inspects "C"ompleted jobs and records per-command success/failure in
            the "run" field (retrying failed jobs up to  <max_pbs_tries>  ),
        rewrites every task summary to disk,
        then sleeps PBS_QUEUE_MONITOR_DELAY seconds

    Optionally deletes ddg_monomer intermediate files (  <ddg_monomer_cleanup>  )
    and old per-trajectory relax files (  <delete_intermediate_relax_files>  )

    Prints verbose progress/debug output every round
    """
    # run the non_rescore tasks
    # anything already recorded as "success" counts as done up front
    completed = [i for i in task_list if 'run' in task_summaries[i[0]]['commands'][i[1]] and 'success' in task_summaries[i[0]]['commands'][i[1]]['run']]

    # should running_or_queued be saved? written to file?
    running_or_queued = {}    # maps PBS job id -> (task index , command index) pair
    rounds = 0
    all_completed_jobs = []    # prevents annoying bulk output, only see it the first time it completes
    while not len( completed ) == len( task_list ):
        rounds += 1
        print '\n\nQUEUE MONITOR ROUND ' + str( rounds )    # debug
        #        print running_or_queued

        # check queue status
        queue_status = get_pbs_queue_status()

        # update "running_or_queued" list (?)
        # err, no, does not have information on which job it is...:(
        #for i in queue_status.keys():
        #    if queue_status[i] in ['R' , 'Q']:

        queue_space_occupied = len( [i for i in queue_status.values() if not i in ['C' , 'R']] )
        # ignore "C"ompleted jobs, "R"unning job quota are not set by us...
        # if your queue system does not have a separate "R"un quota, remove 'R' from the above!
        available_space = PBS_QUEUE_QUOTA - queue_space_occupied
        # launch next jobs in available slots
        if available_space:
            print str( queue_space_occupied ) + ' jobs queued or running, could submit up to ' + str( available_space ) + ' more'
            # choose the next job: not completed, not in flight, and not
            # already marked success or failure
            jobs_to_run = [i for i in task_list if
                not i in completed and
                not i in running_or_queued.values() and
                not ( 'run' in task_summaries[i[0]]['commands'][i[1]] and
                    ('success' in task_summaries[i[0]]['commands'][i[1]]['run'] or
                    'failure' in task_summaries[i[0]]['commands'][i[1]]['run']) ) ]
            print str( len( jobs_to_run ) ) + ' jobs still need to finish (after the currently running jobs complete)'

            # only the next few
            for i in jobs_to_run[:available_space]:
                command_dict = task_summaries[i[0]]['commands'][i[1]]
                # write scripts as part of pre processing?...yeah...
                # write the command to a script
                #script_filename = command_dict['out_path'] +'/'*bool( command_dict['out_path'] )+
                #script_filename = command_dict['script_filename']

                # if its a rescore and relax jobs were separated, need to recombine them!
                if 'rescore' in command_dict['feature']:
                    # combine the individual relax runs
                    #relax_commands = [i for i in task_summary['commands'] if i['feature'].replace( '_native' , '' ) == 'relax']
                    #silent_filenames = [j['output_filename'] for j in relax_commands if j['variant'] == i['variant'] and 'run' in j.keys() and j['run'] == 'success']
                    silent_filenames = [j['output_filename'] for j in task_summaries[i[0]]['commands'] if
                        j['feature'].replace( '_native' , '' ) == 'relax' and
                        j['variant'] == command_dict['variant'] and
                        'run' in j.keys() and
                        'success' in j['run']
                        ]
                    # actually need to identify the combined_silent_filename, be sure the relax files have not already been merged
                    # which variant
                    target_variant = [j for j in task_summaries[i[0]]['variants'].keys() if j.split( '_' )[-1] == command_dict['variant'] and j.split( '_' )[0] in command_dict['command']]
                    if not target_variant:
                        # its native
                        combined_silent_filename = task_summaries[i[0]]['other']['combined_native_silent_filename']
                        combined_score_filename = task_summaries[i[0]]['other']['combined_native_score_filename']
                    elif len( target_variant ) > 1:
                        raise Exception( '??? found more than on matching variant ???\n' + ', '.join( target_variant ) )
                    else:
                        # found it
                        combined_silent_filename = task_summaries[i[0]]['variants'][target_variant[0]]['combined_silent_filename']
                        combined_score_filename = task_summaries[i[0]]['variants'][target_variant[0]]['combined_score_filename']

                    #if not single_relax:    # AND post processing has not already be run...scan for the combined silent file
                    # merge only once: skip if the combined silent file already exists
                    if not single_relax and not os.path.isfile( combined_silent_filename ):
                        if not len( silent_filenames ) == ROSETTA_RELAX_OPTIONS['nstruct']:
                            raise Exception( '??? somehow the matching relax run(s) has failed ???\n' + str( i ) )
                        score_filenames = [j.replace( '.silent' , '.sc' ) for j in silent_filenames]
                        merge_rosetta_relax_output( silent_filenames , combined_silent_filename , score_filenames , combined_score_filename , delete_old_files = delete_intermediate_relax_files )
                        # rescore already knows the proper filename
                    else:
                        # just a single match for each
                        # output filename should be correct as is :)
                        None

                # submit this script using a queue command
                # generate it here instead
                pbs_options = {}
                if command_dict['queue'] == 'parallel':
                    pbs_options.update( PBS_PARALLEL_JOB_OPTIONS )
                elif command_dict['queue'] == 'serial':
                    pbs_options.update( PBS_SERIAL_JOB_OPTIONS )
                # make sure they are satisfied
                # callable option values are factories keyed on the script filename
                script_filename = command_dict['script_filename']
                for k in pbs_options.keys():
                    if '__call__' in dir( pbs_options[k] ):
                        pbs_options[k] = pbs_options[k]( script_filename )
                pbs_command = create_executable_str( 'qsub' , [script_filename] , pbs_options )

                new_job_id = run_local_commandline( pbs_command , collect_stdout = True )
                new_job_id = new_job_id.strip()
                if '.' in new_job_id:
                    # qsub typically reports "12345.hostname"; keep the leading id
                    new_job_id = new_job_id[:new_job_id.find( '.' )]
                print 'submitted ' + new_job_id

                # save the job id
                # assume its queue
                running_or_queued[new_job_id] = i
        else:
            print 'no new \"positions\" are available'

        # debug, need to know
        running_jobs = len( [i for i in queue_status.values() if i in ['R']] )
        if running_jobs:
            print str( running_jobs ) + ' are still running...(excluding the jobs just submitted and including your other jobs)'

        # assess outcome of completed jobs
        for job_id in sorted( queue_status.keys() ):    # sort in numerical order, right?
            # debug
            if not job_id in all_completed_jobs:
                print '\t'+ job_id , queue_status[job_id]    # , job_id in running_or_queued.keys()
            # could just skip it all now?
            if queue_status[ job_id] == 'C' and job_id in running_or_queued.keys():
                task_id = running_or_queued[job_id][0]
                command_index = running_or_queued[job_id][1]
                command_dict = task_summaries[task_id]['commands'][command_index]
                check_successful = determine_check_successful_function( command_dict , single_relax = single_relax )
                success = check_successful( command_dict )

                # success is either a bare bool or a (bool , details...) tuple
                failure_summary = ''
                if isinstance( success , bool ):
                    complete = success
                elif len( success ) > 1 and isinstance( success[0] , bool ):
                    complete = success[0]
                    failure_summary += ' '+ ';'.join( [str( j ) for j in success[1:]] ) +' '

                print complete , failure_summary , 'try again?'*bool( not complete )    # debug

                # track the number of attempts?
                # try until failure - how many times?
                # the "run" field holds the retry count while the job is
                # neither success nor failure
                tries = 0
                if 'run' in command_dict.keys() and command_dict['run'] and not 'success' in command_dict['run'] and not 'failure' in command_dict['run']:
                    tries = int( command_dict['run'] )
                tries += 1
                print tries , 'attempts so far'    # debug

                if tries >= max_pbs_tries:
                    # its a failure
                    print job_id + ' completed successfully'*complete + (' failed with ' + str( tries ) + ' attempts')*(not complete)
                    failure_summary = 'success'*complete + (str( tries ) +' tries;failure ' + failure_summary)*(not complete)
                elif complete:
                    print job_id + ' completed successfully'
                    failure_summary = 'success'    #+ str( tries ) + ' tries'
                else:
                    # record the number of tries
                    print job_id + ' completed' + ' successfully'*complete
                    failure_summary = str( tries )

                # update the record
                print 'updating with: ' + failure_summary    # debug
                task_summaries[task_id]['commands'][command_index]['run'] = failure_summary

                # optionally cleanup
                if ddg_monomer_cleanup and command_dict['feature'] == 'ddg_monomer':    #'ddg' in i['output_filename']:
                    print 'ddg_monomer writes useless output files, deleting these now...'
                    remove_intermediate_ddg_monomer_files()

                # jobs that have since been completed - consider them complete?
                completed.append( running_or_queued[job_id] )    # good, so this grows
                del running_or_queued[job_id]

                # remove jobs to run?
            # print 'updating the status...'
            # debug
            # write out "completed"? or "running_or_queued"?
            if queue_status[job_id] == 'C' and not job_id in all_completed_jobs:
                all_completed_jobs.append( job_id )    # prevent redundant update info

        # update task_summaries e.g. write them!
        # modified: so the task summary records its own name...bah!
        for i in task_summaries:
            if not 'task_summary_filename' in i['filenames'].keys():
                raise NotImplementedError( 'should input the task summary filename (not the summary itself)...' )
            else:
                # write it out
                print 'updating: ' + i['filenames']['task_summary_filename']
                write_task_summary( i , i['filenames']['task_summary_filename'] )

        # pause...
        print '\n' , len( completed ) , 'completed' , len( task_list ) , 'tasks remaining'    # debug
        # NOTE(review): "completed" can never exceed "task_list", so this
        # condition is always True here; "<" was presumably intended to skip
        # the final wait - TODO confirm
        if len( completed ) <= len( task_list ):    # no need for edge-case end wait
            print 'waiting ' + str( PBS_QUEUE_MONITOR_DELAY ) +'s...'
            time.sleep( PBS_QUEUE_MONITOR_DELAY )

    # return anything?
    # write one last time?
    for i in task_summaries:
        if not 'task_summary_filename' in i['filenames'].keys():
            raise NotImplementedError( 'should input the task summary filename (not the summary itself)...' )
        else:
            # write it out
            write_task_summary( i , i['filenames']['task_summary_filename'] )
def create_variant_protein_structures( pdb_filename , variants , chain , use_pyrosetta = USE_PYROSETTA ): # make sure the variants have been filtered if use_pyrosetta: # load the PDB as a pose pose = pose_from_pdb( pdb_filename ) failed = {} root_filename = pdb_filename.rstrip( 'pdb' ) # currently cannot handle multi-chain input # handle this before VIPUR if pose.chain( pose.total_residue() ) > 1: print 'CANNOT currently handle multi-chain PDBs (as pose), using PyMOL instead!' if PATH_TO_PYMOL: create_variant_protein_structures( pdb_filename , variants , chain , use_pyrosetta = False ) return else: faulty = 'clean before VIPUR, cannot handle multi-chain PDBs' failed[faulty] = variants elif not pose.pdb_info().chain( 1 ) == chain: print '...not sure what it happening, you wanted chain ' + chain + ' but VIPUR found chain ' + pose.chain( 1 ) + ', skipping this entire sample!' faulty = 'clean before VIPUR, improper chain ID' failed[faulty] = variants # in case this condition is found: faulty = 'could not load position from PDB' variant_structures = [] for variation in variants: # make a copy test_pose = Pose() test_pose.assign( pose ) native = variation[0] position = variation[1:-1] mutant = variation[-1] # make sure the position was loaded icode = ' ' # default...this could cause problems... 
if not position[-1].isdigit(): icode = position[-1] position = position[:-1] position = int( position ) if not test_pose.pdb_info().pdb2pose( chain , position , icode ): if faulty in failed.keys(): failed[faulty].append( variation ) else: failed[faulty] = [variation] break # stop the loop position = test_pose.pdb_info().pdb2pose( chain , position , icode ) # simple, use a mover to make the change # appears to have trouble with N terminal variants since it uses "replace_residue" # code is available that does not have this problem, however reloading into Rosetta with accurately determine the position of these atoms make_variant = MutateResidue( position , mutant ) make_variant.apply( test_pose ) # write out out_filename = self.root_filename +'.chain_'+ chain +'_'+ variation +'.pdb' variant_structures.append( out_filename ) test_pose.dump_pdb( out_filename ) print 'generated ' + variation + ' variant structure and wrote to ' + out_filename return variant_structures else: # use the pymol script #for variants in self.variants['permissible']: # use default output naming # create command explicitly here, slightly different root_filename = pdb_filename.rstrip( '.pdb' ) command = PATH_TO_PYMOL + ' -qcr ' + PATH_TO_VIPUR + '/pymol_make_variant_structure.py -- -p ' + pdb_filename + ' -m ' + ','.join( variants ) + ' -c ' + chain + ' -r ' + root_filename # print command run_local_commandline( command ) # reconstruct the names variant_structures = [root_filename + '.chain_' + chain +'_'+ i +'.pdb' for i in variants] # verify they have been made if [None for i in variant_structures if not os.path.isfile( i )]: raise IOError( 'could not make variant protein structures,\ntry checking the input PDB file or the pymol script pymol_make_variant_structure.py' ) return variant_structures
def run_rosetta_relax( pdb_filename , extra_options = {} , run = True , parallel = ROSETTA_RELAX_PARALLEL ): root_filename = pdb_filename.rstrip( '.pdb' ) # collect the options, set the input, derive the output filenames relax_options = {} relax_options.update( ROSETTA_RELAX_OPTIONS ) relax_options.update( extra_options ) relax_options['s'] = pdb_filename relax_options['native'] = pdb_filename # required to get gdtmm scores for i in relax_options.keys(): if '__call__' in dir( relax_options[i] ): relax_options[i] = relax_options[i]( root_filename ) # ...weird Rosetta append behavior... # if os.path.isfile( relax_options['out:file:silent'] ): # os.remove( relax_options['out:file:silent'] ) # if os.path.isfile( relax_options['out:file:scorefile'] ): # os.remove( relax_options['out:file:scorefile'] ) # for njc parallelization nstruct = int( relax_options.get( 'nstruct' , '0' ) ) parallel = int( parallel ) tmp_file = None if nstruct > 1 and parallel > 1: relax_options['nstruct'] = 1 #TODO: Add chunking option? score_filename = relax_options['out:file:scorefile'] silent_filename = relax_options['out:file:silent'] if 'run:jran' in relax_options: restoreJran = True jran = int( relax_options['run:jran'] ) else: restoreJran = False jran = 123 tmp_file = tempfile.NamedTemporaryFile( delete = False ) print 'Parallel relax commands are in ' + tmp_file.name for s in xrange( nstruct ): tag = '_%05d' % s relax_options['run:jran'] = jran*nstruct + s relax_options['out:file:scorefile'] = score_filename + tag relax_options['out:file:silent'] = silent_filename + tag print >>tmp_file , create_executable_str( PATH_TO_ROSETTA_RELAX , args = [] , options = relax_options ) + " > %s 2>&1; echo '[[VIPURLOG]]' %s %d" % ((silent_filename + tag).replace( 'silent_' , 'log_' ) , pdb_filename , s + 1 ) tmp_file.close() # the "find ... | xargs ..." idiom is used just in case nstruct is ever a *very* large number. command = '''\ parallel -j %d -a %s find . 
-name '%s_[0-9]*[0-9]' | xargs cat | awk 'NR == 1 || $2 != "score" {print $0}' > %s find . -name '%s_[0-9]*[0-9]' | xargs rm find . -name '%s_[0-9]*[0-9]' | xargs cat | awk 'NR <= 2 || !($2 == "score" || $1 == "SEQUENCE:") {print $0}' > %s find . -name '%s_[0-9]*[0-9]' | xargs rm ''' % (parallel , tmp_file.name , score_filename , score_filename , score_filename , silent_filename , silent_filename , silent_filename) print 'Parallel relax driver command:', command # restore option values relax_options['nstruct'] = str( nstruct ) relax_options['out:file:scorefile'] = score_filename relax_options['out:file:silent'] = silent_filename if restoreJran: relax_options['run:jran'] = jran if run: return (command , tmp_file.name , score_filename , silent_filename) if tmp_file: os.unlink( tmp_file.name ) else: command = create_executable_str( PATH_TO_ROSETTA_RELAX , args = [] , options = relax_options ) if run: run_local_commandline( command ) # command = create_executable_str( PATH_TO_ROSETTA_RELAX , args = [] , options = relax_options ) # run_local_commandline( command ) # the only output we need # return relax_options['out:file:scorefile'] return relax_options['out:file:silent']
def run_VIPUR_PBS( pdb_filename = '' , variants_filename = '' , out_path = '' , write_numbering_map = True , single_relax = True , delete_intermediate_relax_files = True , demo = False , rerun_preprocessing = False ):
    """
    Top-level PBS driver for VIPUR

    Resolves the inputs (a single (.pdb , variants) pair, a whole directory of
    pairs, or the bundled demo when  <demo>  ), runs preprocessing for each
    target (skipped when a task summary already exists unless
    <rerun_preprocessing>  ), rewrites every generated command into a PBS
    bash script (parallelizing relax commands), submits/monitors them via
    run_VIPUR_task_summaries_PBS, and finally runs postprocessing

    Returns the list of (postprocessed) task summaries
    """
    # for the example input
    if demo:
        pdb_filename = PATH_TO_VIPUR + '/example_input/2C35.pdb'
        variants_filename = PATH_TO_VIPUR + '/example_input/2C35.txt'

        out_path = PATH_TO_VIPUR + '/example_output'

    # alternatively, run on an entire directory
    if not pdb_filename and not variants_filename:
        # current directory
        print 'no input provided, assuming you want to run on every (.pdb,.txt) file pair found in the current directory'
        pdb_filename = os.getcwd()

    if os.path.isdir( pdb_filename ) and not variants_filename:
        # assume variants_filename from pdb_filename
        variants_filename = '.txt'

    if os.path.isdir( pdb_filename ) and variants_filename[0] == '.':
        # look for file extension
        # instead, run on the directory
        if not out_path:
            out_path = os.path.abspath( pdb_filename )

        # sequence-only targets: .fa files with a matching variants file but NO matching .pdb
        fa_filenames = [(out_path +'/')*bool( out_path ) + i for i in os.listdir( pdb_filename ) if get_file_extension( i ) == 'fa']
        fa_filenames = [[i , get_root_filename( i ) + variants_filename] for i in fa_filenames if os.path.isfile( get_root_filename( i ) + variants_filename ) and not os.path.isfile( get_root_filename( i ) + '.pdb' )]

        print 'running VIPUR on all (.pdb,' + variants_filename + ') file pairs found in ' + pdb_filename
        # find .pdb files
        pdb_filenames = [(out_path +'/')*bool( out_path ) + i for i in os.listdir( pdb_filename ) if get_file_extension( i ) == 'pdb']

        # look for pairs
        pdb_filenames = [[i , get_root_filename( i ) + variants_filename] for i in pdb_filenames if os.path.isfile( get_root_filename( i ) + variants_filename )]

        print str( len( pdb_filenames ) ) + ' pairs found'
        print str( len( fa_filenames ) ) + ' pairs found for sequence only mode'

        # go there...
        #        os.chdir( pdb_filename )

        if not pdb_filenames:
            if not fa_filenames:
                raise IOError( '!!! no (.pdb,' + variants_filename + ') file pairs found in ' + pdb_filename + '!!?!\nAND no (.fa,' + variants_filename + ') file pairs were found...' )
            else:
                print '...only (.fa,' + variants_filename + ') file pairs were found, running in sequence only mode'
    else:
        # file extension etc.
        file_extension = get_file_extension( pdb_filename )
        root_filename = get_root_filename( pdb_filename )

        # normal execution, generalize by turning into list
        pdb_filenames = []
        fa_filenames = []
        if file_extension == 'pdb':
            pdb_filenames = [[(out_path +'/')*bool( out_path ) + pdb_filename , (out_path +'/')*bool( out_path ) + variants_filename]]
        else:
            fa_filenames = [[]]

    # combine all "filenames" to run into unified framework
    # each target is [input filename , variants filename , sequence_only flag , out path]
    target_proteins = []    #None]*(len( pdb_filenames ) + len( fa_filenames ))
    for i in pdb_filenames:
        this_out_path = get_root_filename( i[0] ) +'_VIPUR'    # directory to create
        target_proteins.append( i + [False , this_out_path] )
    for i in fa_filenames:
        this_out_path = get_root_filename( i[0] ) +'_VIPUR'    # directory to create
        target_proteins.append( i + [True , this_out_path] )

    # setup environment variables BEFORE pre processing
    # no need to setup a command, just run it
    if PBS_ENVIRONMENT_SETUP:
        print 'setting up environment variables'
        run_local_commandline( PBS_ENVIRONMENT_SETUP )

    # pre processing
    task_summaries = []
    for i in target_proteins:
        # guess what the task summary filename 'would' be, if it exists, keep going...
        task_summary_filename = i[3]*bool( i[3] ) +'/'+ get_root_filename( i[0] ).split( '/' )[-1] + '.task_summary'
        if os.path.isfile( task_summary_filename ) and not rerun_preprocessing:
            print 'hmmm, ' + i[0] + ' seems to have run preprocessing already, skipping now'
            #continue    # skip this one, do not add to list of tasks...?
            # actually, skip running pre-processing BUT DO add it to the list of tasks
        else:
            task_summary_filename = run_preprocessing( i[0] , i[1] , sequence_only = i[2] , out_path = i[3] , task_summary_filename = task_summary_filename , write_numbering_map = write_numbering_map , single_relax = single_relax , pymol_environment_setup = PBS_ENVIRONMENT_SETUP )

        # modify for PBS script
        task_summary = load_task_summary( task_summary_filename )
        for j in xrange( len( task_summary['commands'] ) ):
            command = task_summary['commands'][j]['command']

            # add for relax
            # relax (but NOT rescore) commands are rewritten for parallel execution
            if task_summary['commands'][j]['feature'].replace( '_native' , '' ) == 'relax' and not 'rescore' in task_summary['commands'][j]['feature']:
                if not PBS_PARALLEL_ROSETTA_ENDING in command:
                    command = command.replace( ROSETTA_ENDING , PBS_PARALLEL_ROSETTA_ENDING )
                command = PBS_PARALLEL_ROSETTA_EXECUTION_COMMAND + ' '*bool( PBS_PARALLEL_ROSETTA_EXECUTION_COMMAND ) + command

                if ROSETTA_RELAX_PARALLEL_OPTIONS:
                    command += ' '+ ' '.join( ['-'+ k + (' '+ ROSETTA_RELAX_PARALLEL_OPTIONS[k])*bool( ROSETTA_RELAX_PARALLEL_OPTIONS[k] ) for k in ROSETTA_RELAX_PARALLEL_OPTIONS] )
                #                command += ' -jd2:mpi_file_buf_job_distributor false'
                #                command += ' -run:multiple_processes_writing_to_one_directory'

                # also use the parallel options
                pbs_options = 'parallel'    #.update( PBS_PARALLEL_JOB_OPTIONS )
            else:
                pbs_options = 'serial'    #.update( PBS_SERIAL_JOB_OPTIONS )

            # put "cd" in front
            command = ('cd '+ i[3] +';')*bool( i[3] ) + command

            # modify the task summary
            task_summary['commands'][j]['command'] = command

            # actually write the script...
            # don't worry about optional #PBS header info
            # need to add the variant? no, just use the output_filename for this
            script_filename = i[3] + '/'*bool( i[3] ) + get_root_filename( task_summary['commands'][j]['output_filename'].split( '/' )[-1] ) +'.'+ task_summary['commands'][j]['feature'] + '.pbs_script.sh'
            task_summary['commands'][j]['script_filename'] = script_filename
            f = open( script_filename , 'w' )
            f.write( PBS_BASH_SCRIPT( command ) )
            f.close()

            # use the script filename as the source for any log files
            # control the output and error paths
            # also generate the pbs call? might as well, keep it simple
            # no, uses ":" and "," characters...
            task_summary['commands'][j]['queue'] = pbs_options

        # rewrite the task summary
        write_task_summary( task_summary , task_summary_filename )

        task_summaries.append( task_summary )    #_filename )

    # run them all
    run_VIPUR_task_summaries_PBS( task_summaries , single_relax = single_relax , delete_intermediate_relax_files = delete_intermediate_relax_files )

    # post processing
    # this look identical!!! :)
    for i in xrange( len( task_summaries ) ):
        # always okay to rerun post processing...should not make any difference
        sequence_only = target_proteins[i][2]
        print '\n\n\nExtracting and Analyzing the Results:\n\n'
        task_summaries[i] = run_postprocessing( task_summaries[i] , sequence_only = sequence_only )

    return task_summaries
def run_VIPUR_tasks_PBS( task_summaries , task_list , max_pbs_tries = 2 , ddg_monomer_cleanup = True , single_relax = False , delete_intermediate_relax_files = False ): # run the non_rescore tasks completed = [i for i in task_list if 'run' in task_summaries[i[0]]['commands'][i[1]] and 'success' in task_summaries[i[0]]['commands'][i[1]]['run']] # should running_or_queued be saved? written to file? running_or_queued = {} while not len( completed ) == len( task_list ): # check queue status queue_status = get_pbs_queue_status() # update "running_or_queued" list (?) # err, no, does not have information on which job it is...:( #for i in queue_status.keys(): # if queue_status[i] in ['R' , 'Q']: queue_space_occupied = len( [i for i in queue_status.values() if not i in ['C' , 'R']] ) # ignore "C"ompleted jobs, "R"unning job quota are not set by us... # if your queue system does not have a separate "R"un quota, remove 'R' from the above! available_space = queue_space_occupied - PBS_QUEUE_QUOTA # launch next jobs in available slots if available_space: # choose the next job jobs_to_run = [i for i in task_list if not i in completed and not i in running_or_queued.values() and not ( 'run' in task_summaries[i[0]]['commands'][i[1]] and ('success' in task_summaries[i[0]]['commands'][i[1]]['run'] or 'failure' in task_summaries[i[0]]['commands'][i[1]]['run']) ) ] for i in jobs_to_run: command_dict = task_summaries[i[0]]['commands'][i[1]] # write scripts as part of pre processing?...yeah... # write the command to a script #script_filename = command_dict['out_path'] +'/'*bool( command_dict['out_path'] )+ #script_filename = command_dict['script_filename'] # if its a rescore and relax jobs were separated, need to recombine them! 
if 'rescore' in command_dict['feature']: # combine the individual relax runs #relax_commands = [i for i in task_summary['commands'] if i['feature'].replace( '_native' , '' ) == 'relax'] #silent_filenames = [j['output_filename'] for j in relax_commands if j['variant'] == i['variant'] and 'run' in j.keys() and j['run'] == 'success'] silent_filenames = [j['output_filename'] for j in task_summaries[i[0]]['commands'] if j['feature'].replace( '_native' , '' ) == 'relax' and j['variant'] == command_dict['variant'] and 'run' in j.keys() and 'success' in j['run'] ] # actually need to identify the combined_silent_filename, be sure the relax files have not already been merged # which variant target_variant = [j for j in task_summaries[i[0]]['variants'].keys() if j.split( '_' )[-1] == command_dict['variant'] and j.split( '_' )[0] in command_dict['command']] if not target_variant: # its native combined_silent_filename = task_summaries[i[0]]['other']['combined_native_silent_filename'] combined_score_filename = task_summaries[i[0]]['other']['combined_native_score_filename'] elif len( target_variant ) > 1: raise Exception( '??? found more than on matching variant ???\n' + ', '.join( target_variant ) ) else: # found it combined_silent_filename = task_summaries[i[0]]['variants'][target_variant[0]]['combined_silent_filename'] combined_score_filename = task_summaries[i[0]]['variants'][target_variant[0]]['combined_score_filename'] #if not single_relax: # AND post processing has not already be run...scan for the combined silent file if not single_relax and not os.path.isfile( combined_silent_filename ): if not len( silent_filenames ) == ROSETTA_RELAX_OPTIONS['nstruct']: raise Exception( '??? 
somehow the matching relax run(s) has failed ???\n' + str( i ) ) score_filenames = [j.replace( '.silent' , '.sc' ) for j in silent_filenames] merge_rosetta_relax_output( silent_filenames , combined_silent_filename , score_filenames , combined_score_filename , delete_old_files = delete_intermediate_relax_files ) # rescore already knows the proper filename else: # just a single match for each # output filename should be correct as is :) None # submit this script using a queue command pbs_command = command_dict['qsub_command'] # SHOULD already have an abspath to the script new_job_id = run_local_commandline( pbs_command , collect_stdout = True ) new_job_id = new_job_id[:new_job_id.find( '.' )] # save the job id # assume its queue running_or_queued[new_job_id] = i # assess outcome of completed jobs for job_id in queue_status.keys(): if queue_status[job_id] == 'C' and job_id in running_or_queued.keys(): task_id = running_or_queued[job_id][0] command_index = running_or_queued[job_id][1] command_dict = task_summaries[task_id]['commands'][command_index] check_successful = determine_check_successful_function( command_dict , single_relax = single_relax ) success = check_successful( command_dict ) failure_summary = '' if isinstance( success , bool ): complete = success elif len( success ) > 1 and isinstance( success[0] , bool ): complete = success[0] failure_summary += ' '+ ';'.join( [str( j ) for j in success[1:]] ) +' ' # track the number of attempts? # try until failure - how many times? 
tries = 0 if 'run' in command_dict.keys() and command_dict['run'] and not 'success' in command_dict['run'] and not 'failure' in command_dict['run']: tries = int( command_dict['run'] ) tries += 1 if tries >= max_pbs_tries: # its a failure failure_summary = 'success'*complete + (str( tries ) +' tries;failure ' + failure_summary)*(not complete) else: # record the number of tries failure_summary = str( tries ) # update the record task_summaries[task_id]['commands'][command_index]['run'] = failure_summary # optionally cleanup if ddg_monomer_cleanup and command_dict['feature'] == 'ddg_monomer':#'ddg' in i['output_filename']: print 'ddg_monomer writes useless output files, deleting these now...' remove_intermediate_ddg_monomer_files() # jobs that have since been completed - consider them complete? completed.append( running_or_queued[job_id] ) del running_or_queued[job_id] # write out "completed"? or "running_or_queued"? # update task_summaries e.g. write them! # modified: so the task summary records its own name...bah! for i in task_summaries: if not 'task_summary_filename' in i['filenames'].keys(): raise NotImplementedError( 'should input the task summary filename (not the summary itself)...' ) else: # write it out write_task_summary( i , i['filenames']['task_summary_filename'] ) # pause... time.sleep( PBS_QUEUE_MONITOR_DELAY ) # return anything? # write one last time? for i in task_summaries: if not 'task_summary_filename' in i['filenames'].keys(): raise NotImplementedError( 'should input the task summary filename (not the summary itself)...' ) else: # write it out write_task_summary( i , i['filenames']['task_summary_filename'] )
def run_VIPUR_tasks_in_batch_SLURM( task_summaries , task_list , max_slurm_tries = 2 , ddg_monomer_cleanup = True , single_relax = True ):
    """
    Run the commands referenced by <task_list> (pairs of
    [task summary index , command index]) by concatenating them into a
    single bash script and submitting it to SLURM as one batch job
    ("sbatch"), then polling the queue until the batch job disappears

    each command's outcome is written into its "run" entry ("success" ,
    "<tries> tries;failure ..." , or the current try count) and every
    task summary is rewritten to its own "task_summary_filename" after
    each batch

    task_summaries = list of task summary dicts; each must carry its own
        "task_summary_filename" in ['filenames'] (raises
        NotImplementedError otherwise)
    task_list = list of [task index , command index] pairs to run
    max_slurm_tries = attempts before a command is recorded as "failure"
    ddg_monomer_cleanup = currently unused here (the cleanup block below
        is commented out)
    single_relax = forwarded to determine_check_successful_function

    NOTE(review): every job is appended to "completed" after the batch
    regardless of success/failure, so the outer while loop exits after a
    single pass - confirm this one-shot behaviour is intended
    """
    # also setup to do start-stop
    # skip anything already recorded as "success" from a prior session
    completed = [i for i in task_list if 'run' in task_summaries[i[0]]['commands'][i[1]] and 'success' in task_summaries[i[0]]['commands'][i[1]]['run']]

    attempt = 1
    while not len( completed ) == len( task_list ):
        # do not worry about the queue in this mode
        # assume you will be able to submit etc.
        # not such thing as "running or queued" either, just run one batch at a time
        # gather all the commands into a single script

        # choose the next job: not completed and not already marked
        # "success"/"failure" in its "run" record
        jobs_to_run = [i for i in task_list if not i in completed and
            not ( 'run' in task_summaries[i[0]]['commands'][i[1]] and
                ('success' in task_summaries[i[0]]['commands'][i[1]]['run'] or
                'failure' in task_summaries[i[0]]['commands'][i[1]]['run']) )
            ]
        print str( len( jobs_to_run ) ) + ' processes still need to finish'

        # write the script: one command per job, blank-line separated
        master_script_text = '\n\n'.join( [task_summaries[i[0]]['commands'][i[1]]['command'] for i in jobs_to_run] )
        # test without relax processes
        #master_script_text = '\n\n'.join( [task_summaries[i[0]]['commands'][i[1]]['command'] for i in jobs_to_run if not 'relax' in task_summaries[i[0]]['commands'][i[1]]['command']] )
        # wrap the commands in the standard SLURM bash scaffold
        master_script_text = SLURM_BASH_SCRIPT( master_script_text )

        # check if they have different names!?...wait...they will...
        # can just use the first one now...
        slurm_script_filename = task_summaries[0]['filenames']['slurm_script_filename']
        slurm_output_filename = task_summaries[0]['filenames']['slurm_output_filename']
        slurm_error_filename = task_summaries[0]['filenames']['slurm_error_filename']
        # suffix with the attempt number so each round writes distinct
        # script/stdout/stderr files
        slurm_script_filename = slurm_script_filename.replace( '.sh' , '_'+ str( attempt ) + '.sh' )
        slurm_output_filename = slurm_output_filename.replace( '.out' , '_'+ str( attempt ) + '.out' )
        slurm_error_filename = slurm_error_filename.replace( '.err' , '_'+ str( attempt ) + '.err' )

        f = open( slurm_script_filename , 'w' )
        f.write( master_script_text )
        f.close()
        # save a copy of this script for reference?
        # successive runs with overwrite the file...

        # debug
        #raw_input( 'everything okay?' )

        # submit sbatch
        # simple for now...
        command = 'sbatch -n 40'
        if slurm_output_filename:
            command += ' -o ' + slurm_output_filename
        if slurm_error_filename:
            command += ' -e ' + slurm_error_filename
        command += ' ' + slurm_script_filename
        batch_job_id = run_local_commandline( command , collect_stdout = True )
        #batch_job_id = run_local_commandline( 'sbatch -n 40 ' + slurm_script_filename , collect_stdout = True )
        # srun or sbatch?
        # sbatch stdout ends with the job id ("Submitted batch job <id>")
        batch_job_id = batch_job_id.strip().split( ' ' )[-1]
        print 'submitted ' + batch_job_id

        # monitor the job until it is complete
        # pause...
        batch_complete = False
        while not batch_complete:
            queue_status = get_slurm_queue_status( only_job_status = True )
            #batch_complete = bool( [i for i in queue_status if i[0] == batch_job_id] )
            # SLURM drops finished jobs from the queue listing, so "done"
            # means the batch id is no longer present
            batch_complete = not batch_job_id in queue_status.keys()
            # could be an immediate failure...but don't want to linger here anyway in that case

            # debug
            #print queue_status
            #print queue_status.keys()
            #print batch_complete , batch_job_id , batch_job_id in queue_status.keys()
            for i in queue_status.keys():
                print i + '\t' + queue_status[i]

            # can be sure it doesn't need to wait if empty
            if queue_status:
                print 'waiting ' + str( SLURM_QUEUE_MONITOR_DELAY ) +'s...'
                time.sleep( SLURM_QUEUE_MONITOR_DELAY )

        # evaluate if it ran successfully
        for job_pair in jobs_to_run:
            command_dict = task_summaries[job_pair[0]]['commands'][job_pair[1]]
            check_successful = determine_check_successful_function( command_dict , single_relax = single_relax )
            success = check_successful( command_dict )

            failure_summary = ''
            if isinstance( success , bool ):
                complete = success
            elif len( success ) > 1 and isinstance( success[0] , bool ):
                # (bool , details...) form: keep the details for the record
                complete = success[0]
                failure_summary += ' '+ ';'.join( [str( j ) for j in success[1:]] ) +' '

            # track the number of attempts?
            # try until failure - how many times?
            # a "run" value that is neither success nor failure holds the
            # current try count
            tries = 0
            if 'run' in command_dict.keys() and command_dict['run'] and not 'success' in command_dict['run'] and not 'failure' in command_dict['run']:
                tries = int( command_dict['run'] )
            tries += 1

            # build a human readable job description for status printing
            job_task = task_summaries[job_pair[0]]['root_filename'].split( '/' )[-1]
            job_feature = task_summaries[job_pair[0]]['commands'][job_pair[1]]['feature']
            job_variant = ''
            if 'variant' in task_summaries[job_pair[0]]['commands'][job_pair[1]].keys():
                job_variant = task_summaries[job_pair[0]]['commands'][job_pair[1]]['variant']
            this_job_description = job_task +' '+ job_feature + (' ' + job_variant)*bool( job_variant )

            if tries >= max_slurm_tries:
                # its a failure
                print this_job_description + ' completed successfully'*complete + (' failed with ' + str( tries ) + ' attempts')*(not complete)
                failure_summary = 'success'*complete + (str( tries ) +' tries;failure ' + failure_summary)*(not complete)
                completed.append( job_pair )
            elif complete:
                print this_job_description + ' simply completed successfully'
                failure_summary = 'success'    #+ str( tries ) + ' tries'
                completed.append( job_pair )
            else:
                # record the number of tries
                print this_job_description + ' completed' + ' successfully'*complete
                failure_summary = str( tries )
                completed.append( job_pair )

            # update the record
            task_summaries[job_pair[0]]['commands'][job_pair[1]]['run'] = failure_summary
            # no need to be here anymore

        # optionally cleanup
        #if ddg_monomer_cleanup and command_dict['feature'] == 'ddg_monomer':    #'ddg' in i['output_filename']:
        #    print 'ddg_monomer writes useless output files, deleting these now...'
        #    remove_intermediate_ddg_monomer_files()

        # update task_summaries e.g. write them!
        # modified: so the task summary records its own name...bah!
        for i in task_summaries:
            if not 'task_summary_filename' in i['filenames'].keys():
                raise NotImplementedError( 'should input the task summary filename (not the summary itself)...' )
            else:
                # write it out
                print 'updating: ' + i['filenames']['task_summary_filename']
                write_task_summary( i , i['filenames']['task_summary_filename'] )

        # debug
        #print attempt
        #print len( completed ) , len( task_list )
        #raw_input( 'start the next round?' )

        # need to run another batch?
        attempt += 1
def create_variant_protein_structures(pdb_filename, variants, chain, use_pyrosetta=USE_PYROSETTA, pymol_environment_setup=''): # optionally run environment setup if pymol_environment_setup: print 'setting up environment variables' run_local_commandline(pymol_environment_setup) # make sure the variants have been filtered if use_pyrosetta: # load the PDB as a pose pose = pose_from_pdb(pdb_filename) failed = {} root_filename = pdb_filename.rstrip('pdb') # currently cannot handle multi-chain input # handle this before VIPUR if pose.chain(pose.total_residue()) > 1: print 'CANNOT currently handle multi-chain PDBs (as pose), using PyMOL instead!' if PATH_TO_PYMOL: create_variant_protein_structures(pdb_filename, variants, chain, use_pyrosetta=False) return else: faulty = 'clean before VIPUR, cannot handle multi-chain PDBs' failed[faulty] = variants elif not pose.pdb_info().chain(1) == chain: print '...not sure what it happening, you wanted chain ' + chain + ' but VIPUR found chain ' + pose.chain( 1) + ', skipping this entire sample!' faulty = 'clean before VIPUR, improper chain ID' failed[faulty] = variants # in case this condition is found: faulty = 'could not load position from PDB' variant_structures = [] for variation in variants: # make a copy test_pose = Pose() test_pose.assign(pose) native = variation[0] position = variation[1:-1] mutant = variation[-1] # make sure the position was loaded icode = ' ' # default...this could cause problems... 
if not position[-1].isdigit(): icode = position[-1] position = position[:-1] position = int(position) if not test_pose.pdb_info().pdb2pose(chain, position, icode): if faulty in failed.keys(): failed[faulty].append(variation) else: failed[faulty] = [variation] break # stop the loop position = test_pose.pdb_info().pdb2pose(chain, position, icode) # simple, use a mover to make the change # appears to have trouble with N terminal variants since it uses "replace_residue" # code is available that does not have this problem, however reloading into Rosetta with accurately determine the position of these atoms make_variant = MutateResidue(position, mutant) make_variant.apply(test_pose) # write out out_filename = self.root_filename + '.chain_' + chain + '_' + variation + '.pdb' variant_structures.append(out_filename) test_pose.dump_pdb(out_filename) print 'generated ' + variation + ' variant structure and wrote to ' + out_filename return variant_structures else: # use the pymol script #for variants in self.variants['permissible']: # use default output naming # create command explicitly here, slightly different root_filename = pdb_filename.rstrip('.pdb') command = PATH_TO_PYMOL + ' -qcr ' + PATH_TO_VIPUR + '/pymol_make_variant_structure.py -- -p ' + pdb_filename + ' -m ' + ','.join( variants) + ' -c ' + chain + ' -r ' + root_filename if pymol_environment_setup: command = pymol_environment_setup + '\n\n' + command run_local_commandline(command) # reconstruct the names variant_structures = [ root_filename + '.chain_' + chain + '_' + i + '.pdb' for i in variants ] # verify they have been made if [None for i in variant_structures if not os.path.isfile(i)]: raise IOError( 'could not make variant protein structures,\ntry checking the input PDB file or the pymol script pymol_make_variant_structure.py' ) return variant_structures
def run_VIPUR_tasks_SLURM( task_summaries , task_list , max_pbs_tries = 2 , ddg_monomer_cleanup = True , single_relax = False , delete_intermediate_relax_files = False ):
    """
    Submit the commands referenced by <task_list> (pairs of
    [task summary index , command index]) to SLURM one job at a time
    (each command dict carries its own "sbatch_command"), monitor the
    queue, and record each command's outcome in its "run" entry
    ("success" , "<tries> tries;failure ..." , or the current try
    count); every task summary is rewritten to its own
    "task_summary_filename" each monitoring round

    before submitting a "rescore" command, the matching successful
    relax outputs are merged into the combined silent/score files
    (skipped when single_relax or the combined file already exists)

    max_pbs_tries = attempts before a command is marked "failure"
        (parameter name kept from the PBS version of this runner)
    ddg_monomer_cleanup = delete intermediate ddg_monomer files once a
        ddg_monomer command finishes
    single_relax = forwarded to determine_check_successful_function and
        controls whether relax output merging is needed
    delete_intermediate_relax_files = forwarded to
        merge_rosetta_relax_output

    NOTE(review): "jobs_to_run" is referenced in the final pause check
    but only assigned inside "if available_space:" - a first round with
    a full queue would raise NameError; confirm
    NOTE(review): assessed jobs are appended to "completed" even when
    they failed with tries < max_pbs_tries, so failed jobs are never
    actually resubmitted - confirm intended
    """
    # run the non_rescore tasks
    # skip anything already recorded as "success" from a prior session
    completed = [i for i in task_list if 'run' in task_summaries[i[0]]['commands'][i[1]] and 'success' in task_summaries[i[0]]['commands'][i[1]]['run']]

    # should running_or_queued be saved? written to file?
    # maps SLURM job id -> [task index , command index]
    running_or_queued = {}
    rounds = 0
    while not len( completed ) == len( task_list ):
        rounds += 1
        print '\n\nQUEUE MONITOR ROUND ' + str( rounds )

        # debug
        #print running_or_queued

        # check queue status
        queue_status = get_slurm_queue_status( only_job_status = True )
        # update "running_or_queued" list (?)
        # err, no, does not have information on which job it is...:(
        #for i in queue_status.keys():
        #    if queue_status[i] in ['R' , 'Q']:

        # used to be after submission, not occurs first
        # debug, need to know
        running_jobs = len( [i for i in queue_status.values() if i in ['R']] )
        if running_jobs:
            print str( running_jobs ) + ' are still running...'

        # assess outcome of completed jobs
        #still_running = 0
        # need to add the jobs that completed, removed themselves from the queue in SLURM
        #print queue_status.keys() + [j for j in running_or_queued.keys() if not j in queue_status.keys()]
        # SLURM drops finished jobs from the listing, so scan both the
        # queue and any tracked jobs that have vanished from it
        for job_id in queue_status.keys() + [j for j in running_or_queued.keys() if not j in queue_status.keys()]:
            # debug
            #if job_id in queue_status.keys():
            #    print '\t'+ job_id , queue_status[job_id] , job_id in running_or_queued.keys()
            #else:
            ##    print '\t'+ job_id , None , job_id in running_or_queued.keys()
            #    print '\t'+ job_id , job_id in running_or_queued.keys()

            # a tracked job is done if it left the queue or shows "C"
            if (not job_id in queue_status.keys()) or (queue_status[job_id] == 'C' and job_id in running_or_queued.keys()):
                task_id = running_or_queued[job_id][0]
                command_index = running_or_queued[job_id][1]
                command_dict = task_summaries[task_id]['commands'][command_index]
                check_successful = determine_check_successful_function( command_dict , single_relax = single_relax )
                success = check_successful( command_dict )

                failure_summary = ''
                if isinstance( success , bool ):
                    complete = success
                elif len( success ) > 1 and isinstance( success[0] , bool ):
                    # (bool , details...) form: keep details for the record
                    complete = success[0]
                    failure_summary += ' '+ ';'.join( [str( j ) for j in success[1:]] ) +' '

                # track the number of attempts?
                # try until failure - how many times?
                # a "run" value that is neither success nor failure holds
                # the current try count
                tries = 0
                if 'run' in command_dict.keys() and command_dict['run'] and not 'success' in command_dict['run'] and not 'failure' in command_dict['run']:
                    tries = int( command_dict['run'] )
                tries += 1

                if tries >= max_pbs_tries:
                    # its a failure
                    print job_id + ' completed successfully'*complete + (' failed with ' + str( tries ) + ' attempts')*(not complete)
                    failure_summary = 'success'*complete + (str( tries ) +' tries;failure ' + failure_summary)*(not complete)
                elif complete:
                    print job_id + ' simply completed successfully'
                    failure_summary = 'success'    #+ str( tries ) + ' tries'
                else:
                    # record the number of tries
                    print job_id + ' completed' + ' successfully'*complete
                    failure_summary = str( tries )

                # update the record
                task_summaries[task_id]['commands'][command_index]['run'] = failure_summary

                # optionally cleanup
                if ddg_monomer_cleanup and command_dict['feature'] == 'ddg_monomer':    #'ddg' in i['output_filename']:
                    print 'ddg_monomer writes useless output files, deleting these now...'
                    remove_intermediate_ddg_monomer_files()

                # jobs that have since been completed - consider them complete?
                completed.append( running_or_queued[job_id] )
                del running_or_queued[job_id]
                # write out "completed"? or "running_or_queued"?
            #else:
            #    still_running += 1
        #print str( still_running) + ' jobs still running (or queued)...'

        # update task_summaries e.g. write them!
        # modified: so the task summary records its own name...bah!
        for i in task_summaries:
            if not 'task_summary_filename' in i['filenames'].keys():
                raise NotImplementedError( 'should input the task summary filename (not the summary itself)...' )
            else:
                # write it out
                print 'updating: ' + i['filenames']['task_summary_filename']
                write_task_summary( i , i['filenames']['task_summary_filename'] )

        # used to be first, submit jobs them check complete
        # but slurm removes jobs from the list
        queue_space_occupied = len( [i for i in queue_status.values() if not i in ['C' , 'PD']] )
        # ignore "C"ompleted jobs, "R"unning job quota are not set by us...
        # if your queue system does not have a separate "R"un quota, remove 'R' from the above!
        available_space = SLURM_QUEUE_QUOTA - queue_space_occupied

        # launch next jobs in available slots
        if available_space:
            print str( queue_space_occupied ) + ' jobs queued or running, could submit up to ' + str( available_space ) + ' more'
            # choose the next job: not completed, not currently tracked,
            # and not already marked "success"/"failure"
            jobs_to_run = [i for i in task_list if not i in completed and
                not i in running_or_queued.values() and
                not ( 'run' in task_summaries[i[0]]['commands'][i[1]] and
                    ('success' in task_summaries[i[0]]['commands'][i[1]]['run'] or
                    'failure' in task_summaries[i[0]]['commands'][i[1]]['run']) )
                ]
            print str( len( jobs_to_run ) ) + ' jobs still need to finish (after the currently running jobs complete)'

            # only the next few
            for i in jobs_to_run[:available_space]:
                command_dict = task_summaries[i[0]]['commands'][i[1]]
                # write scripts as part of pre processing?...yeah...
                # write the command to a script
                #script_filename = command_dict['out_path'] +'/'*bool( command_dict['out_path'] )+
                #script_filename = command_dict['script_filename']

                # submission is specific to the job
                slurm_command = ''

                # if its a rescore and relax jobs were separated, need to recombine them!
                if 'rescore' in command_dict['feature']:
                    # combine the individual relax runs
                    #relax_commands = [i for i in task_summary['commands'] if i['feature'].replace( '_native' , '' ) == 'relax']
                    #silent_filenames = [j['output_filename'] for j in relax_commands if j['variant'] == i['variant'] and 'run' in j.keys() and j['run'] == 'success']
                    # collect output of the successful relax runs for the
                    # same variant as this rescore
                    silent_filenames = [j['output_filename'] for j in task_summaries[i[0]]['commands'] if
                        j['feature'].replace( '_native' , '' ) == 'relax' and
                        j['variant'] == command_dict['variant'] and
                        'run' in j.keys() and 'success' in j['run']
                        ]

                    # actually need to identify the combined_silent_filename, be sure the relax files have not already been merged
                    # which variant
                    target_variant = [j for j in task_summaries[i[0]]['variants'].keys() if j.split( '_' )[-1] == command_dict['variant'] and j.split( '_' )[0] in command_dict['command']]
                    if not target_variant:
                        # its native
                        combined_silent_filename = task_summaries[i[0]]['other']['combined_native_silent_filename']
                        combined_score_filename = task_summaries[i[0]]['other']['combined_native_score_filename']
                    elif len( target_variant ) > 1:
                        raise Exception( '??? found more than on matching variant ???\n' + ', '.join( target_variant ) )
                    else:
                        # found it
                        combined_silent_filename = task_summaries[i[0]]['variants'][target_variant[0]]['combined_silent_filename']
                        combined_score_filename = task_summaries[i[0]]['variants'][target_variant[0]]['combined_score_filename']

                    #if not single_relax:
                    # AND post processing has not already be run...scan for the combined silent file
                    if not single_relax and not os.path.isfile( combined_silent_filename ):
                        # every relax run must have succeeded before merging
                        if not len( silent_filenames ) == ROSETTA_RELAX_OPTIONS['nstruct']:
                            raise Exception( '??? somehow the matching relax run(s) has failed ???\n' + str( i ) )
                        score_filenames = [j.replace( '.silent' , '.sc' ) for j in silent_filenames]
                        merge_rosetta_relax_output( silent_filenames , combined_silent_filename , score_filenames , combined_score_filename , delete_old_files = delete_intermediate_relax_files )
                        # rescore already knows the proper filename
                    else:
                        # just a single match for each
                        # output filename should be correct as is :)
                        None

                # do this above, in initial processing
                #elif 'ddg_monomer' in command_dict['feature']:
                    # must do it this way for now
                    # write the run script
                    #ddg_monomer_script_filename = task_summary['filenames']['slurm_script_filename'].replace( 'slurm_script_this_batch.sh' , 'run_ddg_momomer_script.sh' )
                    #f = open( ddg_monomer_script_filename , 'w' )
                    #f.write( command_dict['command'] )
                    #f.close()

                # submit this script using a queue command
                # srun or sbatch?
                slurm_command = command_dict['sbatch_command']

                # SHOULD already have an abspath to the script
                new_job_id = run_local_commandline( slurm_command , collect_stdout = True )
                #new_job_id = new_job_id[:new_job_id.find( ' ' )]
                # sbatch stdout ends with the job id
                new_job_id = new_job_id.strip().split( ' ' )[-1]
                print 'submitted ' + new_job_id

                # save the job id
                # assume its queue
                running_or_queued[new_job_id] = i
                #print 'added ' + new_job_id
        else:
            print 'no new \"positions\" are available'

        # OKAY, move the "updating" to just after the status check
        # problem with ddg_monomer, runs so fast...
        # make a specific exception:
        # ...move the pause to here
        # prevents odd behaviour...um...sorta? maybe not
        #if 'ddg_monomer' in command_dict['feature']:

        # used to be where the updates were compared
        # pause...
        if queue_space_occupied or jobs_to_run:
            print 'waiting ' + str( SLURM_QUEUE_MONITOR_DELAY ) +'s...'
            time.sleep( SLURM_QUEUE_MONITOR_DELAY )

    # return anything?
    # write one last time?
    for i in task_summaries:
        if not 'task_summary_filename' in i['filenames'].keys():
            raise NotImplementedError( 'should input the task summary filename (not the summary itself)...' )
        else:
            # write it out
            write_task_summary( i , i['filenames']['task_summary_filename'] )