def run_probe( pdb_filename , variants , probe_output_filename = '' , run = True ):
    """
    Runs PROBE on  <pdb_filename>  at the positions found among  <variants>
    using the default options in PROBE_OPTIONS and writes the output to
    <probe_output_filename>  (also returns this output filename)
    """
    if not probe_output_filename:
        # strip the extension explicitly; rstrip would remove trailing characters, not the suffix
        probe_output_filename = os.path.abspath( pdb_filename ).replace( '.pdb' , '' ) + '.probe_out'

    # get the unique variant positions
    positions = list( set( [i[1:-1] for i in variants] ) )
    positions.sort()

    # generate the commands to run
#    command = '#!/bin/sh\nrm ' + probe_output_filename + '\ntouch ' + probe_output_filename + '\n'
    command = 'rm ' + probe_output_filename + ';touch ' + probe_output_filename + ';'    # delete any prior copy since we will append to it
    for i in positions:
        probe_options = {}
        probe_options.update( PROBE_OPTIONS )
        probe_options['out'] = pdb_filename
        probe_options['Q'] = str( i )
        command += create_executable_str( PATH_TO_PROBE , [] , probe_options , probe_output_filename , append = True ) +';'    #'\n'

    # run PROBE, store the output
    if run:
        run_local_commandline( command )

        return probe_output_filename , positions
    else:
        # return the command, plus the output filename and positions
        return command , probe_output_filename , positions
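
# Usage sketch (illustrative only, not called by the pipeline): with run = False the function
# only assembles the shell command, so nothing is executed and PROBE does not need to be
# installed. The PDB path and variant strings below are placeholders; variants are written as
# <wt><position><mutant> (e.g. P12A) and run_probe only uses the position field.
def example_probe_command( pdb_filename = 'example_input/2C35.pdb' ):
    variants = ['P12A' , 'G45D']    # hypothetical variants, one PROBE call per unique position
    command , probe_output_filename , positions = run_probe( pdb_filename , variants , run = False )
    print 'positions to probe: ' + ', '.join( positions )
    print 'would write ' + probe_output_filename + ' by running:\n' + command
    return command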
def run_psiblast( sequence_filename , run = True ):
    """
    Runs PSIBLAST on  <sequence_filename>  using the default options in
    PSIBLAST_OPTIONS and returns the relevant output file: "out_ascii_pssm"
    """
    # strip the extension explicitly; rstrip would remove trailing characters, not the suffix
    root_filename = os.path.abspath( sequence_filename ).replace( '.fa' , '' )

    # collect the options, set the input, derive the output filenames
    psiblast_options = {}
    psiblast_options.update( PSIBLAST_OPTIONS )
    psiblast_options['query'] = sequence_filename
    for i in psiblast_options.keys():
        if '__call__' in dir( psiblast_options[i] ):
            psiblast_options[i] = psiblast_options[i]( root_filename )
    for i in psiblast_options.keys():
        if isinstance( psiblast_options[i] , str ) and os.path.isfile( psiblast_options[i] ):
            psiblast_options[i] = os.path.abspath( psiblast_options[i] )

    command = create_executable_str( PATH_TO_PSIBLAST , args = [] , options = psiblast_options )

    if run:
        run_local_commandline( command )

        # the only output we need
        return psiblast_options['out_ascii_pssm']
    else:
        # just send the command
        return command , psiblast_options['out_ascii_pssm']
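
# Usage sketch (illustrative only): build the PSIBLAST command for a FASTA file without running
# it (run = False), e.g. to inspect the flags or hand the command to a queue. The filename is a
# placeholder; PSIBLAST_OPTIONS and PATH_TO_PSIBLAST are assumed to be configured in this module.
def example_psiblast_command( sequence_filename = 'example_input/2C35.fa' ):
    command , pssm_filename = run_psiblast( sequence_filename , run = False )
    print 'the PSSM would be written to ' + pssm_filename + ' by running:\n' + command
    return command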
def run_rosetta_relax_local( pdb_filename , extra_options = {} , run = True ):
    """
    Runs Rosetta relax locally on  <pdb_filename>  using the default options in
    ROSETTA_RELAX_OPTIONS (updated with  <extra_options> ) and returns the
    output silent filename
    """
    root_filename = os.path.abspath( pdb_filename ).replace( '.pdb' , '' )

    # collect the options, set the input, derive the output filenames
    relax_options = {}
    relax_options.update( ROSETTA_RELAX_OPTIONS )
    relax_options.update( extra_options )
    relax_options['s'] = pdb_filename
    relax_options['native'] = pdb_filename    # required to get gdtmm scores
    for i in relax_options.keys():
        if '__call__' in dir( relax_options[i] ):
            relax_options[i] = relax_options[i]( root_filename )
    for i in relax_options.keys():
        if isinstance( relax_options[i] , str ) and os.path.isfile( relax_options[i] ):
            relax_options[i] = os.path.abspath( relax_options[i] )

    # ...weird Rosetta append behavior...
    if os.path.isfile( relax_options['out:file:silent'] ):
        os.remove( relax_options['out:file:silent'] )
    if os.path.isfile( relax_options['out:file:scorefile'] ):
        os.remove( relax_options['out:file:scorefile'] )

    command = create_executable_str( PATH_TO_ROSETTA_RELAX , args = [] , options = relax_options )

    if run:
        run_local_commandline( command )

        # the only output we need
        return relax_options['out:file:silent']
    else:
        return command , relax_options['out:file:silent']
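
# Usage sketch (illustrative only): preview the local relax command and the silent file it would
# produce. The PDB path is a placeholder and the nstruct override is hypothetical; any extra
# flags can be layered on top of ROSETTA_RELAX_OPTIONS through extra_options this way.
def example_relax_local_command( pdb_filename = 'example_input/2C35.pdb' ):
    command , silent_filename = run_rosetta_relax_local( pdb_filename , extra_options = {'nstruct' : '2'} , run = False )
    print 'relax decoys would be written to ' + silent_filename + ' by running:\n' + command
    return command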
def run_rosetta_ddg_monomer( pdb_filename , mut_filename , out_filename = '' , out_path = '' , cleanup = True , run = True ):
    """
    Runs Rosetta ddg_monomer on  <pdb_filename>  with the mutations listed in
    <mut_filename>  using the default options in ROSETTA_DDG_MONOMER_OPTIONS
    and returns the output filename; the protocol hardcodes this to
    "ddg_predictions.out", so the  <out_filename>  argument is currently ignored
    """
    root_filename = os.path.abspath( pdb_filename ).replace( '.pdb' , '' )

    # hardcoded...ddg_monomer is such a painful protocol...
    out_filename = ''
    if '/' in root_filename:
        out_filename += '/'.join( root_filename.split( '/' )[:-1] ) +'/'
    out_filename += 'ddg_predictions.out'

    # clear it out if it exists, otherwise it will be appended to...
    if os.path.exists( out_filename ):
        os.remove( out_filename )

    # collect the options, set the input, derive the output filenames
    ddg_monomer_options = {}
    ddg_monomer_options.update( ROSETTA_DDG_MONOMER_OPTIONS )
    ddg_monomer_options['in:file:s'] = pdb_filename
    ddg_monomer_options['ddg::mut_file'] = mut_filename
    for i in ddg_monomer_options.keys():
        if '__call__' in dir( ddg_monomer_options[i] ):
            ddg_monomer_options[i] = ddg_monomer_options[i]( root_filename )
    for i in ddg_monomer_options.keys():
        if isinstance( ddg_monomer_options[i] , str ) and os.path.isfile( ddg_monomer_options[i] ):
            ddg_monomer_options[i] = os.path.abspath( ddg_monomer_options[i] )

    command = ''
    # optionally move into the specific directory...
    if out_path:
        command += 'cd '+ out_path +'; '
    command += create_executable_str( PATH_TO_ROSETTA_DDG_MONOMER , args = [] , options = ddg_monomer_options )

    if run:
        run_local_commandline( command )

        # optionally cleanup
        if cleanup:
            print 'ddg_monomer writes useless output files, deleting these now...'
            remove_intermediate_ddg_monomer_files()

        # the only output we need
        return out_filename
    else:
        return command , out_filename
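
# Usage sketch (illustrative only): ddg_monomer always writes "ddg_predictions.out" next to the
# input PDB, so the returned filename is derived rather than chosen. The PDB and mutation-list
# paths are placeholders; with run = False only the command string is constructed (although any
# stale ddg_predictions.out in that directory is still removed).
def example_ddg_monomer_command( pdb_filename = 'example_input/2C35.pdb' , mut_filename = 'example_input/2C35.mut' ):
    command , out_filename = run_rosetta_ddg_monomer( pdb_filename , mut_filename , run = False )
    print 'predictions would be written to ' + out_filename + ' by running:\n' + command
    return command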
def run_rosetta_rescore( silent_filename , native_filename , score_filename = '' , run = True ):
    """
    Rescores the decoys in  <silent_filename>  against  <native_filename>  using
    the "score" protocol of Rosetta (built against 3.5) and the default options
    in ROSETTA_SCORE_OPTIONS; returns the "_rescore.sc" scorefile filename

    Note:  <score_filename>  is currently unused
    """
    root_filename = os.path.abspath( silent_filename ).replace( '.silent' , '' )

    score_options = {}
    score_options.update( ROSETTA_SCORE_OPTIONS )
    score_options['in:file:silent'] = silent_filename
    score_options['in:file:native'] = native_filename    # required to get gdtmm scores
    for i in score_options.keys():
        if '__call__' in dir( score_options[i] ):
            score_options[i] = score_options[i]( root_filename )

    # necessary...
    if 'out:file:scorefile' in score_options.keys() and not 'rescore.sc' in score_options['out:file:scorefile']:
        score_options['out:file:scorefile'] = score_options['out:file:scorefile'].replace( '.sc' , '_rescore.sc' )

    for i in score_options.keys():
        if isinstance( score_options[i] , str ) and os.path.isfile( score_options[i] ):
            score_options[i] = os.path.abspath( score_options[i] )

    # ...weird Rosetta append behavior...
    if os.path.isfile( score_options['out:file:scorefile'] ):
        os.remove( score_options['out:file:scorefile'] )

    # default options
    command = create_executable_str( PATH_TO_ROSETTA_SCORE , args = [] , options = score_options )

    if run:
        run_local_commandline( command )

        return score_options['out:file:scorefile']
    else:
        return command , score_options['out:file:scorefile']
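
# Usage sketch (illustrative only): rescoring takes the silent file produced by relax and the
# native PDB it was relaxed from; the "_rescore.sc" scorefile name is derived automatically.
# Both input paths below are placeholders.
def example_rescore_command( silent_filename = 'example_output/2C35.silent' , native_filename = 'example_input/2C35.pdb' ):
    command , rescore_filename = run_rosetta_rescore( silent_filename , native_filename , run = False )
    print 'the rescored scorefile would be ' + rescore_filename + ' from running:\n' + command
    return command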
def run_rosetta_relax( pdb_filename , extra_options = {} , run = True , parallel = ROSETTA_RELAX_PARALLEL ):
    """
    Runs Rosetta relax on  <pdb_filename>  using the default options in
    ROSETTA_RELAX_OPTIONS (updated with  <extra_options> ) and returns the
    output silent filename; when "nstruct" and  <parallel>  are both >1, one
    command per decoy is written to a temporary file and a GNU parallel driver
    command is returned instead (for the caller to execute)
    """
    # strip the extension explicitly; rstrip would remove trailing characters, not the suffix
    root_filename = pdb_filename.replace( '.pdb' , '' )

    # collect the options, set the input, derive the output filenames
    relax_options = {}
    relax_options.update( ROSETTA_RELAX_OPTIONS )
    relax_options.update( extra_options )
    relax_options['s'] = pdb_filename
    relax_options['native'] = pdb_filename    # required to get gdtmm scores
    for i in relax_options.keys():
        if '__call__' in dir( relax_options[i] ):
            relax_options[i] = relax_options[i]( root_filename )

    # ...weird Rosetta append behavior...
#    if os.path.isfile( relax_options['out:file:silent'] ):
#        os.remove( relax_options['out:file:silent'] )
#    if os.path.isfile( relax_options['out:file:scorefile'] ):
#        os.remove( relax_options['out:file:scorefile'] )

    # for njc parallelization
    nstruct = int( relax_options.get( 'nstruct' , '0' ) )
    parallel = int( parallel )
    tmp_file = None
    if nstruct > 1 and parallel > 1:
        relax_options['nstruct'] = 1    #TODO: Add chunking option?
        score_filename = relax_options['out:file:scorefile']
        silent_filename = relax_options['out:file:silent']
        if 'run:jran' in relax_options:
            restoreJran = True
            jran = int( relax_options['run:jran'] )
        else:
            restoreJran = False
            jran = 123

        tmp_file = tempfile.NamedTemporaryFile( delete = False )
        print 'Parallel relax commands are in ' + tmp_file.name
        for s in xrange( nstruct ):
            tag = '_%05d' % s
            relax_options['run:jran'] = jran*nstruct + s
            relax_options['out:file:scorefile'] = score_filename + tag
            relax_options['out:file:silent'] = silent_filename + tag
            print >>tmp_file , create_executable_str( PATH_TO_ROSETTA_RELAX , args = [] , options = relax_options ) + " > %s 2>&1; echo '[[VIPURLOG]]' %s %d" % ((silent_filename + tag).replace( 'silent_' , 'log_' ) , pdb_filename , s + 1 )
        tmp_file.close()

        # the "find ... | xargs ..." idiom is used just in case nstruct is ever a *very* large number.
        command = '''\
parallel -j %d -a %s
find . -name '%s_[0-9]*[0-9]' | xargs cat | awk 'NR == 1 || $2 != "score" {print $0}' > %s
find . -name '%s_[0-9]*[0-9]' | xargs rm
find . -name '%s_[0-9]*[0-9]' | xargs cat | awk 'NR <= 2 || !($2 == "score" || $1 == "SEQUENCE:") {print $0}' > %s
find . -name '%s_[0-9]*[0-9]' | xargs rm
''' % (parallel , tmp_file.name , score_filename , score_filename , score_filename , silent_filename , silent_filename , silent_filename)
        print 'Parallel relax driver command:' , command

        # restore option values
        relax_options['nstruct'] = str( nstruct )
        relax_options['out:file:scorefile'] = score_filename
        relax_options['out:file:silent'] = silent_filename
        if restoreJran:
            relax_options['run:jran'] = jran

        if run:
            return (command , tmp_file.name , score_filename , silent_filename)

        if tmp_file:
            os.unlink( tmp_file.name )
    else:
        command = create_executable_str( PATH_TO_ROSETTA_RELAX , args = [] , options = relax_options )

        if run:
            run_local_commandline( command )

#    command = create_executable_str( PATH_TO_ROSETTA_RELAX , args = [] , options = relax_options )
#    run_local_commandline( command )

    # the only output we need
#    return relax_options['out:file:scorefile']
    return relax_options['out:file:silent']
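
# Usage sketch (illustrative only): with ROSETTA_RELAX_OPTIONS['nstruct'] > 1 and parallel > 1
# this wrapper does not run Rosetta itself; it writes one relax command per decoy to a temporary
# file and returns a GNU parallel driver command plus the merged score/silent filenames, leaving
# execution to the caller (e.g. via run_local_commandline). The PDB path and nstruct override
# below are placeholders.
def example_parallel_relax_command( pdb_filename = 'example_input/2C35.pdb' ):
    driver_command , command_list_filename , score_filename , silent_filename = run_rosetta_relax( pdb_filename , extra_options = {'nstruct' : '4'} , parallel = 2 )
    print 'per-decoy commands are listed in ' + command_list_filename
    print 'merged output would be ' + score_filename + ' and ' + silent_filename
    return driver_command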
def run_VIPUR_PBS( pdb_filename = '' , variants_filename = '' , out_path = '' , write_numbering_map = True , single_relax = True , delete_intermediate_relax_files = True , demo = False , rerun_preprocessing = False ):
    """
    Runs the full VIPUR pipeline (preprocessing, PBS job submission, postprocessing)
    on  <pdb_filename>  and  <variants_filename> , or on every (.pdb,.txt) file pair
    in a directory if  <pdb_filename>  is a directory (or empty, for the current directory)
    """
    # for the example input
    if demo:
        pdb_filename = PATH_TO_VIPUR + '/example_input/2C35.pdb'
        variants_filename = PATH_TO_VIPUR + '/example_input/2C35.txt'

        out_path = PATH_TO_VIPUR + '/example_output'

    # alternatively, run on an entire directory
    if not pdb_filename and not variants_filename:
        # current directory
        print 'no input provided, assuming you want to run on every (.pdb,.txt) file pair found in the current directory'
        pdb_filename = os.getcwd()

    if os.path.isdir( pdb_filename ) and not variants_filename:
        # assume variants_filename from pdb_filename
        variants_filename = '.txt'

    if os.path.isdir( pdb_filename ) and variants_filename[0] == '.':
        # look for file extension
        # instead, run on the directory
        if not out_path:
            out_path = os.path.abspath( pdb_filename )
#        print out_path

        fa_filenames = [(out_path +'/')*bool( out_path ) + i for i in os.listdir( pdb_filename ) if get_file_extension( i ) == 'fa']
        fa_filenames = [[i , get_root_filename( i ) + variants_filename] for i in fa_filenames if os.path.isfile( get_root_filename( i ) + variants_filename ) and not os.path.isfile( get_root_filename( i ) + '.pdb' )]

        print 'running VIPUR on all (.pdb,' + variants_filename + ') file pairs found in ' + pdb_filename
        # find .pdb files
        pdb_filenames = [(out_path +'/')*bool( out_path ) + i for i in os.listdir( pdb_filename ) if get_file_extension( i ) == 'pdb']

        # look for pairs
        pdb_filenames = [[i , get_root_filename( i ) + variants_filename] for i in pdb_filenames if os.path.isfile( get_root_filename( i ) + variants_filename )]
#        print [i for i in pdb_filenames if os.path.isfile( pdb_filename +'/'+ get_root_filename( i ) + variants_filename )]

        print str( len( pdb_filenames ) ) + ' pairs found'
        print str( len( fa_filenames ) ) + ' pairs found (for sequence only)'

        # go there...
#        os.chdir( pdb_filename )

        if not pdb_filenames:
            if not fa_filenames:
                raise IOError( '!!! no (.pdb,' + variants_filename + ') file pairs found in ' + pdb_filename + '!!?!\nAND no (.fa,' + variants_filename + ') file pairs were found...' )
            else:
                print '...only (.fa,' + variants_filename + ') file pairs were found, running in sequence only mode'
    else:
        # file extension etc.
        file_extension = get_file_extension( pdb_filename )
        root_filename = get_root_filename( pdb_filename )

        # normal execution, generalize by turning into list
        pdb_filenames = []
        fa_filenames = []
        if file_extension == 'pdb':
            pdb_filenames = [[(out_path +'/')*bool( out_path ) + pdb_filename , (out_path +'/')*bool( out_path ) + variants_filename]]
        else:
            fa_filenames = [[]]

    # combine all "filenames" to run into unified framework
    target_proteins = []    #None]*(len( pdb_filenames ) + len( fa_filenames ))
    for i in pdb_filenames:
        this_out_path = get_root_filename( i[0] ) +'_VIPUR'    # directory to create
        target_proteins.append( i + [False , this_out_path] )
    for i in fa_filenames:
        this_out_path = get_root_filename( i[0] ) +'_VIPUR'    # directory to create
        target_proteins.append( i + [True , this_out_path] )

    # pre processing
    task_summaries = []
    for i in target_proteins:
        # guess what the task summary filename 'would' be; if it exists, keep going...
        task_summary_filename = i[3]*bool( i[3] ) +'/'+ get_root_filename( i[0] ).split( '/' )[-1] + '.task_summary'
        if os.path.isfile( task_summary_filename ) and not rerun_preprocessing:
            print 'hmmm, ' + i[0] + ' seems to have run preprocessing already, skipping now'
            #continue    # skip this one, do not add to list of tasks...?
            # actually, skip running pre-processing BUT DO add it to the list of tasks
        else:
            task_summary_filename = run_preprocessing( i[0] , i[1] , sequence_only = i[2] , out_path = i[3] , task_summary_filename = task_summary_filename , write_numbering_map = write_numbering_map , single_relax = single_relax )

        # modify for PBS script
        task_summary = load_task_summary( task_summary_filename )
        for j in xrange( len( task_summary['commands'] ) ):
            pbs_options = {}

            command = task_summary['commands'][j]['command']
            # add for relax
            if task_summary['commands'][j]['feature'].replace( '_native' , '' ) == 'relax' and not 'rescore' in task_summary['commands'][j]['feature']:
                command = command.replace( '.linuxgccrelease' , '.mpi.linuxgccrelease' )
                command = 'module load mvapich2/gnu/1.8.1; /share/apps/mvapich2/1.8.1/gnu/bin/mpiexec -n 36 ' + command
                command += ' -jd2:mpi_file_buf_job_distributor false'
                command += ' -run:multiple_processes_writing_to_one_directory'

                # also use the parallel options
                pbs_options.update( PBS_PARALLEL_JOB_OPTIONS )
            else:
                pbs_options.update( PBS_SERIAL_JOB_OPTIONS )

            # put "cd" in front
            command = ('cd '+ i[3] +';')*bool( i[3] ) + command

            # modify the task summary
            task_summary['commands'][j]['command'] = command

            # actually write the script...
            # don't worry about optional #PBS header info
            script_filename = i[3] + '/'*bool( i[3] ) + get_root_filename( i[0] ).split( '/' )[-1] +'.'+ task_summary['commands'][j]['feature'] + '.pbs_script.sh'
            task_summary['commands'][j]['script_filename'] = script_filename
            f = open( script_filename , 'w' )
            f.write( command )
            f.close()

            # use the script filename as the source for any log files
            # control the output and error paths
            for k in pbs_options.keys():
                if '__call__' in dir( pbs_options[k] ):
                    pbs_options[k] = pbs_options[k]( script_filename )

            # also generate the pbs call? might as well, keep it simple...
            task_summary['commands'][j]['qsub_command'] = create_executable_str( 'qsub' , [script_filename] , pbs_options )

        # rewrite the task summary
        write_task_summary( task_summary , task_summary_filename )

        task_summaries.append( task_summary_filename )

    # run them all
#    run_VIPUR_task_summaries_serially( task_summaries , single_relax = single_relax , delete_intermediate_relax_files = delete_intermediate_relax_files )
    run_VIPUR_task_summaries_PBS( task_summaries , single_relax = single_relax , delete_intermediate_relax_files = delete_intermediate_relax_files )

    # post processing
    # this looks identical (to the other run_VIPUR_* drivers)!!! :)
    for i in xrange( len( task_summaries ) ):
        # always okay to rerun post processing...should not make any difference
        sequence_only = target_proteins[i][2]
        task_summaries[i] = run_postprocessing( task_summaries[i] , sequence_only = sequence_only )

    return task_summaries
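
# Usage sketch (illustrative only): the simplest end-to-end PBS run uses the bundled demo input
# (demo = True fills in the example 2C35 files); pointing pdb_filename at a directory instead
# scans it for (.pdb , .txt) pairs. This assumes the PBS_* job options and the qsub/qstat setup
# configured in this module match your cluster.
#
#     task_summaries = run_VIPUR_PBS( demo = True )
#     task_summaries = run_VIPUR_PBS( pdb_filename = '/path/to/inputs' , variants_filename = '.txt' )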
def run_VIPUR_SLURM( pdb_filename = '' , variants_filename = '' , out_path = '' , write_numbering_map = True , single_relax = False , delete_intermediate_relax_files = True , demo = False , rerun_preprocessing = False ):
    """
    Runs the full VIPUR pipeline (preprocessing, SLURM job submission, postprocessing)
    on  <pdb_filename>  and  <variants_filename> , or on every (.pdb,.txt) file pair
    in a directory if  <pdb_filename>  is a directory (or empty, for the current directory)
    """
    # the following should probably be a separate method...
    # for the example input
    if demo:
        pdb_filename = PATH_TO_VIPUR + '/example_input/2C35.pdb'
        variants_filename = PATH_TO_VIPUR + '/example_input/2C35.txt'

        out_path = PATH_TO_VIPUR + '/example_output'

    # alternatively, run on an entire directory
    if not pdb_filename and not variants_filename:
        # current directory
        print 'no input provided, assuming you want to run on every (.pdb,.txt) file pair found in the current directory'
        pdb_filename = os.getcwd()

    if os.path.isdir( pdb_filename ) and not variants_filename:
        # assume variants_filename from pdb_filename
        variants_filename = '.txt'

    if os.path.isdir( pdb_filename ) and variants_filename[0] == '.':
        # look for file extension
        # instead, run on the directory
        if not out_path:
            out_path = os.path.abspath( pdb_filename )
#        print out_path

        fa_filenames = [(out_path +'/')*bool( out_path ) + i for i in os.listdir( pdb_filename ) if get_file_extension( i ) == 'fa']
        fa_filenames = [[i , get_root_filename( i ) + variants_filename] for i in fa_filenames if os.path.isfile( get_root_filename( i ) + variants_filename ) and not os.path.isfile( get_root_filename( i ) + '.pdb' )]

        print 'running VIPUR on all (.pdb,' + variants_filename + ') file pairs found in ' + pdb_filename
        # find .pdb files
        pdb_filenames = [(out_path +'/')*bool( out_path ) + i for i in os.listdir( pdb_filename ) if get_file_extension( i ) == 'pdb']

        # look for pairs
        pdb_filenames = [[i , get_root_filename( i ) + variants_filename] for i in pdb_filenames if os.path.isfile( get_root_filename( i ) + variants_filename )]
#        print [i for i in pdb_filenames if os.path.isfile( pdb_filename +'/'+ get_root_filename( i ) + variants_filename )]

        print str( len( pdb_filenames ) ) + ' pairs found'
        print str( len( fa_filenames ) ) + ' pairs found for sequence only mode'

        # go there...
#        os.chdir( pdb_filename )

        if not pdb_filenames:
            if not fa_filenames:
                raise IOError( '!!! no (.pdb,' + variants_filename + ') file pairs found in ' + pdb_filename + '!!?!\nAND no (.fa,' + variants_filename + ') file pairs were found...' )
            else:
                print '...only (.fa,' + variants_filename + ') file pairs were found, running in sequence only mode'
    else:
        # file extension etc.
        file_extension = get_file_extension( pdb_filename )
        root_filename = get_root_filename( pdb_filename )

        # normal execution, generalize by turning into list
        pdb_filenames = []
        fa_filenames = []
        if file_extension == 'pdb':
            pdb_filenames = [[(out_path +'/')*bool( out_path ) + pdb_filename , (out_path +'/')*bool( out_path ) + variants_filename]]
        else:
            fa_filenames = [[]]

    # combine all "filenames" to run into unified framework
    target_proteins = []    #None]*(len( pdb_filenames ) + len( fa_filenames ))
    for i in pdb_filenames:
        this_out_path = get_root_filename( i[0] ) +'_VIPUR'    # directory to create
        target_proteins.append( i + [False , this_out_path] )
    for i in fa_filenames:
        this_out_path = get_root_filename( i[0] ) +'_VIPUR'    # directory to create
        target_proteins.append( i + [True , this_out_path] )

    # setup environment variables BEFORE pre processing
    # not needed with current SLURM setup...

    # pre processing
    task_summaries = []
    for i in target_proteins:
        # guess what the task summary filename 'would' be; if it exists, keep going...
        task_summary_filename = i[3]*bool( i[3] ) +'/'+ get_root_filename( i[0] ).split( '/' )[-1] + '.task_summary'
        if os.path.isfile( task_summary_filename ) and not rerun_preprocessing:
            print 'hmmm, ' + i[0] + ' seems to have run preprocessing already, skipping now'
            #continue    # skip this one, do not add to list of tasks...?
            # actually, skip running pre-processing BUT DO add it to the list of tasks
            # is this actually working?
        else:
            task_summary_filename = run_preprocessing( i[0] , i[1] , sequence_only = i[2] , out_path = i[3] , task_summary_filename = task_summary_filename , write_numbering_map = write_numbering_map , single_relax = single_relax )

        # modify for SLURM script
        task_summary = load_task_summary( task_summary_filename )
#        task_summary['filenames']['slurm_script_filename'] = 'slurm_' + get_root_filename( i[0] ) + '.sh'
        task_summary['filenames']['slurm_script_filename'] = out_path + '/slurm_script_this_batch.sh'
        task_summary['filenames']['slurm_output_filename'] = out_path + '/slurm_output_batch.out'
        task_summary['filenames']['slurm_error_filename'] = out_path + '/slurm_error_batch.err'
        # ...awkward...they all have individual task summarization of the same script...but nowhere else to put it...

        for j in xrange( len( task_summary['commands'] ) ):
            slurm_options = {}

            command = task_summary['commands'][j]['command']
            # add for relax
            if task_summary['commands'][j]['feature'].replace( '_native' , '' ) == 'relax' and not 'rescore' in task_summary['commands'][j]['feature']:
                command = command.replace( '.linuxgccrelease' , '.mpi.linuxgccrelease' )
#                command = 'module load mvapich2/gnu/1.8.1;/share/apps/mvapich2/1.8.1/gnu/bin/mpiexec -n 36 ' + command
                command = 'mpiexec -n 40 ' + command
                command += ' -jd2:mpi_file_buf_job_distributor false'
                command += ' -run:multiple_processes_writing_to_one_directory'

                # also use the parallel options
#                pbs_options.update( PBS_PARALLEL_JOB_OPTIONS )
#            else:
            slurm_options.update( SLURM_JOB_OPTIONS )

            # put "cd" in front
#            command = ('#!/bin/bash\n\ncd '+ i[3] +'\n\n')*bool( i[3] ) + command +'\n\n'
#            command = ('cd '+ i[3] +';')*bool( i[3] ) + command
            # not needed for slurm, use abspaths and one big batch

            # special...
            if task_summary['commands'][j]['feature'] == 'psiblast' and not 'num_threads' in task_summary['commands'][j]['command']:
                command += ' -num_threads 40'    # append to the local command so it is not overwritten below

            # modify the task summary
            task_summary['commands'][j]['command'] = command

            # MUST still do ddg_monomer on single process...
            if 'ddg_monomer' in task_summary['commands'][j]['feature'] or 'rescore' in task_summary['commands'][j]['feature']:
                if 'rescore' in task_summary['commands'][j]['feature']:
                    # sanity check
                    if not 'variant' in task_summary['commands'][j].keys():
                        raise Exception( 'rescore command without the variant information...!?' )
                    # need variant in the script, otherwise overwrite :(
                    script_filename = i[3] + '/'*bool( i[3] ) + get_root_filename( i[0] ).split( '/' )[-1] +'.'+ task_summary['commands'][j]['feature'] +'_'+ task_summary['commands'][j]['variant'] + '.slurm_script.sh'
                else:
                    # ddg monomer is "per protein", no need for more detail
                    script_filename = i[3] + '/'*bool( i[3] ) + get_root_filename( i[0] ).split( '/' )[-1] +'.'+ task_summary['commands'][j]['feature'] + '.slurm_script.sh'
                task_summary['commands'][j]['script_filename'] = script_filename

                # only write ONE submission script per batch = run of VIPUR
                f = open( script_filename , 'w' )
                f.write( SLURM_BASH_SCRIPT( command ) )
                f.close()

                # use the script filename as the source for any log files
                # control the output and error paths
                for k in slurm_options.keys():
                    if '__call__' in dir( slurm_options[k] ):
                        slurm_options[k] = slurm_options[k]( script_filename )
                slurm_options['N'] = '1'
                slurm_options['n'] = '1'

                # also generate the pbs call? might as well, keep it simple...
                # srun or sbatch?
                task_summary['commands'][j]['sbatch_command'] = create_executable_str( 'sbatch' , [script_filename] , slurm_options )

            # actually write the script...
            # don't worry about optional #PBS header info
#            script_filename = i[3] + '/'*bool( i[3] ) + get_root_filename( i[0] ).split( '/' )[-1] +'.'+ task_summary['commands'][j]['feature'] + '.slurm_script.sh'
#            task_summary['commands'][j]['script_filename'] = script_filename

            # only write ONE submission script per batch = run of VIPUR
#            f = open( script_filename , 'w' )
#            f.write( SLURM_BASH_SCRIPT( command ) )
#            f.close()

            # use the script filename as the source for any log files
            # control the output and error paths
#            for k in slurm_options.keys():
#                if '__call__' in dir( slurm_options[k] ):
#                    slurm_options[k] = slurm_options[k]( script_filename )

            # also generate the pbs call? might as well, keep it simple...
            # srun or sbatch?
#            task_summary['commands'][j]['srun_command'] = create_executable_str( 'srun' , [script_filename] , slurm_options )

        # rewrite the task summary
        write_task_summary( task_summary , task_summary_filename )

        task_summaries.append( task_summary )    #_filename )

    # run them all
#    run_VIPUR_task_summaries_serially( task_summaries , single_relax = single_relax , delete_intermediate_relax_files = delete_intermediate_relax_files )
    run_VIPUR_task_summaries_SLURM( task_summaries , single_relax = single_relax , delete_intermediate_relax_files = delete_intermediate_relax_files )

    # post processing
    # this looks identical (to the other run_VIPUR_* drivers)!!! :)
    for i in xrange( len( task_summaries ) ):
        # always okay to rerun post processing...should not make any difference
        sequence_only = target_proteins[i][2]
        task_summaries[i] = run_postprocessing( task_summaries[i] , sequence_only = sequence_only )

    return task_summaries
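
# Usage sketch (illustrative only): mirrors run_VIPUR_PBS but writes sbatch scripts and
# mpiexec-based relax commands sized for the SLURM cluster assumed in SLURM_JOB_OPTIONS
# (40-way MPI relax, 40 psiblast threads). A demo run looks the same as the PBS version:
#
#     task_summaries = run_VIPUR_SLURM( demo = True )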
def run_VIPUR_tasks_PBS( task_summaries , task_list , max_pbs_tries = 2 , ddg_monomer_cleanup = True , single_relax = True , delete_intermediate_relax_files = False ):
    """
    Monitors the PBS queue and submits the commands referenced by  <task_list>
    (pairs of summary index and command index into  <task_summaries> ) until all
    of them have either succeeded or failed  <max_pbs_tries>  times, writing the
    updated task summaries back to file as it goes
    """
    # run the non_rescore tasks
    completed = [i for i in task_list if 'run' in task_summaries[i[0]]['commands'][i[1]] and 'success' in task_summaries[i[0]]['commands'][i[1]]['run']]

    # should running_or_queued be saved? written to file?
    running_or_queued = {}
    rounds = 0
    all_completed_jobs = []    # prevents annoying bulk output, only see it the first time it completes
    while not len( completed ) == len( task_list ):
        rounds += 1
        print '\n\nQUEUE MONITOR ROUND ' + str( rounds )

        # debug
#        print running_or_queued

        # check queue status
        queue_status = get_pbs_queue_status()

        # update "running_or_queued" list (?)
        # err, no, does not have information on which job it is...:(
        #for i in queue_status.keys():
        #    if queue_status[i] in ['R' , 'Q']:

        queue_space_occupied = len( [i for i in queue_status.values() if not i in ['C' , 'R']] )
        # ignore "C"ompleted jobs; "R"unning job quotas are not set by us...
        # if your queue system does not have a separate "R"un quota, remove 'R' from the above!
        available_space = PBS_QUEUE_QUOTA - queue_space_occupied

        # launch next jobs in available slots
        if available_space:
            print str( queue_space_occupied ) + ' jobs queued or running, could submit up to ' + str( available_space ) + ' more'

            # choose the next jobs
            jobs_to_run = [i for i in task_list if
                not i in completed and
                not i in running_or_queued.values() and
                not ('run' in task_summaries[i[0]]['commands'][i[1]] and
                    ('success' in task_summaries[i[0]]['commands'][i[1]]['run'] or
                     'failure' in task_summaries[i[0]]['commands'][i[1]]['run']))]
            print str( len( jobs_to_run ) ) + ' jobs still need to finish (after the currently running jobs complete)'

            # only the next few
            for i in jobs_to_run[:available_space]:
                command_dict = task_summaries[i[0]]['commands'][i[1]]

                # write scripts as part of pre processing?...yeah...
                # write the command to a script
                #script_filename = command_dict['out_path'] +'/'*bool( command_dict['out_path'] )+
                #script_filename = command_dict['script_filename']

                # if its a rescore and relax jobs were separated, need to recombine them!
                if 'rescore' in command_dict['feature']:
                    # combine the individual relax runs
                    #relax_commands = [i for i in task_summary['commands'] if i['feature'].replace( '_native' , '' ) == 'relax']
                    #silent_filenames = [j['output_filename'] for j in relax_commands if j['variant'] == i['variant'] and 'run' in j.keys() and j['run'] == 'success']
                    silent_filenames = [j['output_filename'] for j in task_summaries[i[0]]['commands'] if
                        j['feature'].replace( '_native' , '' ) == 'relax' and
                        j['variant'] == command_dict['variant'] and
                        'run' in j.keys() and 'success' in j['run']]

                    # actually need to identify the combined_silent_filename, be sure the relax files have not already been merged

                    # which variant
                    target_variant = [j for j in task_summaries[i[0]]['variants'].keys() if j.split( '_' )[-1] == command_dict['variant'] and j.split( '_' )[0] in command_dict['command']]
                    if not target_variant:
                        # its native
                        combined_silent_filename = task_summaries[i[0]]['other']['combined_native_silent_filename']
                        combined_score_filename = task_summaries[i[0]]['other']['combined_native_score_filename']
                    elif len( target_variant ) > 1:
                        raise Exception( '??? found more than one matching variant ???\n' + ', '.join( target_variant ) )
                    else:
                        # found it
                        combined_silent_filename = task_summaries[i[0]]['variants'][target_variant[0]]['combined_silent_filename']
                        combined_score_filename = task_summaries[i[0]]['variants'][target_variant[0]]['combined_score_filename']

                    #if not single_relax:
                    # AND post processing has not already been run...scan for the combined silent file
                    if not single_relax and not os.path.isfile( combined_silent_filename ):
                        if not len( silent_filenames ) == ROSETTA_RELAX_OPTIONS['nstruct']:
                            raise Exception( '??? somehow the matching relax run(s) has failed ???\n' + str( i ) )

                        score_filenames = [j.replace( '.silent' , '.sc' ) for j in silent_filenames]
                        merge_rosetta_relax_output( silent_filenames , combined_silent_filename , score_filenames , combined_score_filename , delete_old_files = delete_intermediate_relax_files )
                        # rescore already knows the proper filename
                    else:
                        # just a single match for each
                        # output filename should be correct as is :)
                        None

                # submit this script using a queue command
                # generate it here instead
                pbs_options = {}
                if command_dict['queue'] == 'parallel':
                    pbs_options.update( PBS_PARALLEL_JOB_OPTIONS )
                elif command_dict['queue'] == 'serial':
                    pbs_options.update( PBS_SERIAL_JOB_OPTIONS )

                # make sure they are satisfied
                script_filename = command_dict['script_filename']
                for k in pbs_options.keys():
                    if '__call__' in dir( pbs_options[k] ):
                        pbs_options[k] = pbs_options[k]( script_filename )

                pbs_command = create_executable_str( 'qsub' , [script_filename] , pbs_options )

                new_job_id = run_local_commandline( pbs_command , collect_stdout = True )
                new_job_id = new_job_id.strip()
                if '.' in new_job_id:
                    new_job_id = new_job_id[:new_job_id.find( '.' )]
                print 'submitted ' + new_job_id

                # save the job id
                # assume its queued
                running_or_queued[new_job_id] = i
        else:
            print 'no new \"positions\" are available'

        # debug, need to know
        running_jobs = len( [i for i in queue_status.values() if i in ['R']] )
        if running_jobs:
            print str( running_jobs ) + ' are still running...(excluding the jobs just submitted and including your other jobs)'

        # assess outcome of completed jobs
        for job_id in sorted( queue_status.keys() ):    # sort in numerical order, right?
            # debug
            if not job_id in all_completed_jobs:
                print '\t'+ job_id , queue_status[job_id]    # , job_id in running_or_queued.keys()

            # could just skip it all now?
            if queue_status[job_id] == 'C' and job_id in running_or_queued.keys():
                task_id = running_or_queued[job_id][0]
                command_index = running_or_queued[job_id][1]
                command_dict = task_summaries[task_id]['commands'][command_index]

                check_successful = determine_check_successful_function( command_dict , single_relax = single_relax )
                success = check_successful( command_dict )

                failure_summary = ''
                if isinstance( success , bool ):
                    complete = success
                elif len( success ) > 1 and isinstance( success[0] , bool ):
                    complete = success[0]
                    failure_summary += ' '+ ';'.join( [str( j ) for j in success[1:]] ) +' '
                print complete , failure_summary , 'try again?'*bool( not complete )    # debug

                # track the number of attempts?
                # try until failure - how many times?
                tries = 0
                if 'run' in command_dict.keys() and command_dict['run'] and not 'success' in command_dict['run'] and not 'failure' in command_dict['run']:
                    tries = int( command_dict['run'] )
                tries += 1
                print tries , 'attempts so far'    # debug

                if tries >= max_pbs_tries:
                    # its a failure
                    print job_id + ' completed successfully'*complete + (' failed with ' + str( tries ) + ' attempts')*(not complete)
                    failure_summary = 'success'*complete + (str( tries ) +' tries;failure ' + failure_summary)*(not complete)
                elif complete:
                    print job_id + ' completed successfully'
                    failure_summary = 'success'    #+ str( tries ) + ' tries'
                else:
                    # record the number of tries
                    print job_id + ' completed' + ' successfully'*complete
                    failure_summary = str( tries )

                # update the record
                print 'updating with: ' + failure_summary    # debug
                task_summaries[task_id]['commands'][command_index]['run'] = failure_summary

                # optionally cleanup
                if ddg_monomer_cleanup and command_dict['feature'] == 'ddg_monomer':    #'ddg' in i['output_filename']:
                    print 'ddg_monomer writes useless output files, deleting these now...'
                    remove_intermediate_ddg_monomer_files()

                # jobs that have since been completed - consider them complete?
                completed.append( running_or_queued[job_id] )    # good, so this grows
                del running_or_queued[job_id]
                # remove jobs to run?

#            print 'updating the status...'    # debug
            # write out "completed"? or "running_or_queued"?

            if queue_status[job_id] == 'C' and not job_id in all_completed_jobs:
                all_completed_jobs.append( job_id )    # prevent redundant update info

        # update task_summaries e.g. write them!
        # modified: so the task summary records its own name...bah!
        for i in task_summaries:
            if not 'task_summary_filename' in i['filenames'].keys():
                raise NotImplementedError( 'should input the task summary filename (not the summary itself)...' )
            else:
                # write it out
                print 'updating: ' + i['filenames']['task_summary_filename']
                write_task_summary( i , i['filenames']['task_summary_filename'] )

        # pause...
        print '\n' , len( completed ) , 'completed of' , len( task_list ) , 'total tasks'    # debug
        if len( completed ) < len( task_list ):    # no need for edge-case end wait
            print 'waiting ' + str( PBS_QUEUE_MONITOR_DELAY ) +'s...'
            time.sleep( PBS_QUEUE_MONITOR_DELAY )

    # return anything?
    # write one last time?
    for i in task_summaries:
        if not 'task_summary_filename' in i['filenames'].keys():
            raise NotImplementedError( 'should input the task summary filename (not the summary itself)...' )
        else:
            # write it out
            write_task_summary( i , i['filenames']['task_summary_filename'] )
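
# Usage sketch (illustrative only): the queue monitor expects loaded task summary dicts and a
# task_list of (summary_index , command_index) pairs pointing at the commands to submit, e.g.
#
#     task_summaries = [load_task_summary( f ) for f in summary_filenames]
#     task_list = [(s , c) for s in range( len( task_summaries ) )
#         for c in range( len( task_summaries[s]['commands'] ) )]
#     run_VIPUR_tasks_PBS( task_summaries , task_list )
#
# where summary_filenames is a hypothetical list of ".task_summary" files written by
# run_preprocessing; each summary must record its own filename under
# ['filenames']['task_summary_filename'] so progress can be written back out.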