def run_probe(pdb_filename, variants, probe_output_filename='', run=True):
    """
    Runs PROBE on  <pdb_filename>  once for every unique position found among
    <variants>  using the default options in PROBE_OPTIONS, every run appends
    its output to  <probe_output_filename>

    The position of a variant is everything between its first and its last
    character (ex. "A123G" -> "123")

    If  <probe_output_filename>  is not provided, it is the absolute path of
    <pdb_filename>  with a trailing ".pdb" replaced by ".probe_out"

    Returns  (probe_output_filename, positions)  if  <run>  is True, otherwise
    nothing is executed and  (command, probe_output_filename, positions)  is
    returned
    """
    if not probe_output_filename:
        # only remove a real ".pdb" extension, str.rstrip( '.pdb' ) strips
        # ANY trailing ".", "p", "d", "b" characters (ex. "1bdp.pdb" -> "1")
        root_filename = os.path.abspath(pdb_filename)
        if root_filename.endswith('.pdb'):
            root_filename = root_filename[:-len('.pdb')]
        probe_output_filename = root_filename + '.probe_out'

    # get the unique variant positions (strings, so sorted as text)
    positions = sorted(set(i[1:-1] for i in variants))

    # generate the commands to run
    # delete any prior copy since we will append to it
    command = ('rm ' + probe_output_filename +
               ';touch ' + probe_output_filename + ';')

    for i in positions:
        probe_options = {}
        probe_options.update(PROBE_OPTIONS)

        probe_options['out'] = pdb_filename
        probe_options['Q'] = str(i)

        command += create_executable_str(PATH_TO_PROBE, [],
                                         probe_options,
                                         probe_output_filename,
                                         append=True) + ';'

    # run PROBE, store the output
    if run:
        run_local_commandline(command)

        return probe_output_filename, positions

    # the command, well, get positions etc. too
    return command, probe_output_filename, positions
Example #2
0
def run_psiblast(sequence_filename, run=True):
    """
    Runs PSIBLAST on  <sequence_filename>  using the default options in
    PSIBLAST_OPTIONS and returns the relevant output file: "out_ascii_pssm"

    Callable option values are replaced by the result of calling them with
    the root filename (the absolute path of  <sequence_filename>  without a
    trailing ".fa"), string options naming an existing file are made absolute

    Returns the "out_ascii_pssm" option if  <run>  is True, otherwise nothing
    is executed and  (command, out_ascii_pssm)  is returned
    """
    # only remove a real ".fa" extension, str.rstrip( '.fa' ) strips ANY
    # trailing ".", "f", "a" characters (ex. "alpha.fa" -> "alph")
    root_filename = os.path.abspath(sequence_filename)
    if root_filename.endswith('.fa'):
        root_filename = root_filename[:-len('.fa')]

    # collect the options, set the input, derive the output filenames
    psiblast_options = {}
    psiblast_options.update(PSIBLAST_OPTIONS)
    psiblast_options['query'] = sequence_filename
    for i in list(psiblast_options):
        if callable(psiblast_options[i]):
            psiblast_options[i] = psiblast_options[i](root_filename)

    # second pass, the derived filenames above may also be existing files
    for i in list(psiblast_options):
        if isinstance(psiblast_options[i], str) and os.path.isfile(
                psiblast_options[i]):
            psiblast_options[i] = os.path.abspath(psiblast_options[i])

    command = create_executable_str(PATH_TO_PSIBLAST,
                                    args=[],
                                    options=psiblast_options)

    if run:
        run_local_commandline(command)

        # the only output we need
        return psiblast_options['out_ascii_pssm']

    # just send the command
    return command, psiblast_options['out_ascii_pssm']
def run_rosetta_relax_local(pdb_filename, extra_options=None, run=True):
    """
    Runs Rosetta relax on  <pdb_filename>  using the default options in
    ROSETTA_RELAX_OPTIONS, optionally overridden by  <extra_options>

    <pdb_filename>  is used both as the input ("s") and as the "native"
    Callable option values are replaced by the result of calling them with
    the root filename (the absolute path of  <pdb_filename>  without a
    trailing ".pdb"), string options naming an existing file are made absolute

    Any existing "out:file:silent" and "out:file:scorefile" files are deleted
    first (even if  <run>  is False)

    Returns the "out:file:silent" option if  <run>  is True, otherwise nothing
    is executed and  (command, out:file:silent)  is returned
    """
    # only remove the extension, .replace( '.pdb' , '' ) would also remove
    # ".pdb" from the middle of the path (ex. a directory "1abc.pdb_runs")
    root_filename = os.path.abspath(pdb_filename)
    if root_filename.endswith('.pdb'):
        root_filename = root_filename[:-len('.pdb')]

    # collect the options, set the input, derive the output filenames
    relax_options = {}
    relax_options.update(ROSETTA_RELAX_OPTIONS)
    if extra_options:
        relax_options.update(extra_options)
    relax_options['s'] = pdb_filename
    relax_options['native'] = pdb_filename  # required to get gdtmm scores
    for i in list(relax_options):
        if callable(relax_options[i]):
            relax_options[i] = relax_options[i](root_filename)

    for i in list(relax_options):
        if isinstance(relax_options[i], str) and os.path.isfile(
                relax_options[i]):
            relax_options[i] = os.path.abspath(relax_options[i])

    # ...weird Rosetta append behavior...
    # remove old output so the new results are not appended to it
    for i in ('out:file:silent', 'out:file:scorefile'):
        if os.path.isfile(relax_options[i]):
            os.remove(relax_options[i])

    command = create_executable_str(PATH_TO_ROSETTA_RELAX,
                                    args=[],
                                    options=relax_options)

    if run:
        run_local_commandline(command)

        # the only output we need
        return relax_options['out:file:silent']

    return command, relax_options['out:file:silent']
Example #4
0
def run_rosetta_relax_local(pdb_filename, extra_options=None, run=True):
    """
    Runs Rosetta relax on  <pdb_filename>  using the default options in
    ROSETTA_RELAX_OPTIONS, optionally overridden by  <extra_options>

    <pdb_filename>  is used both as the input ("s") and as the "native"
    Callable option values are replaced by the result of calling them with
    the root filename (the absolute path of  <pdb_filename>  without a
    trailing ".pdb"), string options naming an existing file are made absolute

    Any existing "out:file:silent" and "out:file:scorefile" files are deleted
    first (even if  <run>  is False)

    Returns the "out:file:silent" option if  <run>  is True, otherwise nothing
    is executed and  (command, out:file:silent)  is returned
    """
    # only remove the extension, .replace( '.pdb' , '' ) would also remove
    # ".pdb" from the middle of the path (ex. a directory "1abc.pdb_runs")
    root_filename = os.path.abspath(pdb_filename)
    if root_filename.endswith('.pdb'):
        root_filename = root_filename[:-len('.pdb')]

    # collect the options, set the input, derive the output filenames
    relax_options = {}
    relax_options.update(ROSETTA_RELAX_OPTIONS)
    if extra_options:
        relax_options.update(extra_options)
    relax_options['s'] = pdb_filename
    relax_options['native'] = pdb_filename  # required to get gdtmm scores
    for i in list(relax_options):
        if callable(relax_options[i]):
            relax_options[i] = relax_options[i](root_filename)

    for i in list(relax_options):
        if isinstance(relax_options[i], str) and os.path.isfile(
                relax_options[i]):
            relax_options[i] = os.path.abspath(relax_options[i])

    # ...weird Rosetta append behavior...
    # remove old output so the new results are not appended to it
    for i in ('out:file:silent', 'out:file:scorefile'):
        if os.path.isfile(relax_options[i]):
            os.remove(relax_options[i])

    command = create_executable_str(PATH_TO_ROSETTA_RELAX,
                                    args=[],
                                    options=relax_options)

    if run:
        run_local_commandline(command)

        # the only output we need
        return relax_options['out:file:silent']

    return command, relax_options['out:file:silent']
def run_psiblast(sequence_filename, run=True):
    """
    Runs PSIBLAST on  <sequence_filename>  using the default options in
    PSIBLAST_OPTIONS and returns the relevant output file: "out_ascii_pssm"

    Callable option values are replaced by the result of calling them with
    the root filename (the absolute path of  <sequence_filename>  without a
    trailing ".fa"), string options naming an existing file are made absolute

    Returns the "out_ascii_pssm" option if  <run>  is True, otherwise nothing
    is executed and  (command, out_ascii_pssm)  is returned
    """
    # only remove a real ".fa" extension, str.rstrip( '.fa' ) strips ANY
    # trailing ".", "f", "a" characters (ex. "alpha.fa" -> "alph")
    root_filename = os.path.abspath(sequence_filename)
    if root_filename.endswith('.fa'):
        root_filename = root_filename[:-len('.fa')]

    # collect the options, set the input, derive the output filenames
    psiblast_options = {}
    psiblast_options.update(PSIBLAST_OPTIONS)
    psiblast_options['query'] = sequence_filename
    for i in list(psiblast_options):
        if callable(psiblast_options[i]):
            psiblast_options[i] = psiblast_options[i](root_filename)

    # second pass, the derived filenames above may also be existing files
    for i in list(psiblast_options):
        if isinstance(psiblast_options[i], str) and os.path.isfile(
                psiblast_options[i]):
            psiblast_options[i] = os.path.abspath(psiblast_options[i])

    command = create_executable_str(PATH_TO_PSIBLAST,
                                    args=[],
                                    options=psiblast_options)

    if run:
        run_local_commandline(command)

        # the only output we need
        return psiblast_options['out_ascii_pssm']

    # just send the command
    return command, psiblast_options['out_ascii_pssm']
Example #6
0
def run_rosetta_ddg_monomer(pdb_filename,
                            mut_filename,
                            out_filename='',
                            out_path='',
                            cleanup=True,
                            run=True):
    """
    Runs the Rosetta ddg_monomer protocol on  <pdb_filename>  for the
    mutations in  <mut_filename>  using the default options in
    ROSETTA_DDG_MONOMER_OPTIONS

    The output filename is hardcoded to "ddg_predictions.out" in the directory
    of  <pdb_filename> , the  <out_filename>  argument is IGNORED (it is only
    kept for compatibility), an existing output file is deleted first (even
    if  <run>  is False)

    If  <out_path>  is provided, the command first changes into it
    If  <cleanup>  is True, remove_intermediate_ddg_monomer_files is called
    after the run

    Returns the output filename if  <run>  is True, otherwise nothing is
    executed and  (command, out_filename)  is returned
    """
    # only remove a real ".pdb" extension, str.rstrip( '.pdb' ) strips ANY
    # trailing ".", "p", "d", "b" characters (ex. "1bdp.pdb" -> "1")
    root_filename = os.path.abspath(pdb_filename)
    if root_filename.endswith('.pdb'):
        root_filename = root_filename[:-len('.pdb')]

    # hardcoded...ddg_monomer is such a painful protocol...
    out_filename = ''
    if '/' in root_filename:
        out_filename += '/'.join(root_filename.split('/')[:-1]) + '/'
    out_filename += 'ddg_predictions.out'
    # clear it out if it exists, otherwise it will be appended to...
    if os.path.exists(out_filename):
        os.remove(out_filename)

    # collect the options, set the input, derive the output filenames
    ddg_monomer_options = {}
    ddg_monomer_options.update(ROSETTA_DDG_MONOMER_OPTIONS)
    ddg_monomer_options['in:file:s'] = pdb_filename
    ddg_monomer_options['ddg::mut_file'] = mut_filename
    for i in list(ddg_monomer_options):
        if callable(ddg_monomer_options[i]):
            ddg_monomer_options[i] = ddg_monomer_options[i](root_filename)

    for i in list(ddg_monomer_options):
        if isinstance(ddg_monomer_options[i], str) and os.path.isfile(
                ddg_monomer_options[i]):
            ddg_monomer_options[i] = os.path.abspath(ddg_monomer_options[i])

    command = ''
    # optionally move into the specific directory...
    if out_path:
        command += 'cd ' + out_path + '; '

    command += create_executable_str(PATH_TO_ROSETTA_DDG_MONOMER,
                                     args=[],
                                     options=ddg_monomer_options)

    if run:
        run_local_commandline(command)

        # optionally cleanup
        if cleanup:
            # single argument in parentheses, same output on Python 2 and 3
            print('ddg_monomer writes useless output files, deleting these now...')
            remove_intermediate_ddg_monomer_files()

        # the only output we need
        return out_filename

    return command, out_filename
Example #7
0
def run_rosetta_rescore(silent_filename,
                        native_filename,
                        score_filename='',
                        run=True):
    """
    Rescores the structures in  <silent_filename>  against  <native_filename>
    using the Rosetta "score" protocol and the default options in
    ROSETTA_SCORE_OPTIONS

    Callable option values are replaced by the result of calling them with
    the root filename (the absolute path of  <silent_filename>  without a
    trailing ".silent"), ".sc" in the "out:file:scorefile" option is replaced
    with "_rescore.sc" (unless it already contains "rescore.sc") and an
    existing scorefile of that name is deleted first (even if  <run>  is
    False)

    <score_filename>  is currently not used

    Returns the "out:file:scorefile" option if  <run>  is True, otherwise
    nothing is executed and  (command, out:file:scorefile)  is returned
    """
    # only remove a real ".silent" extension, str.rstrip( '.silent' ) strips
    # ANY trailing ".", "s", "i", "l", "e", "n", "t" characters
    # (ex. "results.silent" -> "resu")
    root_filename = os.path.abspath(silent_filename)
    if root_filename.endswith('.silent'):
        root_filename = root_filename[:-len('.silent')]

    score_options = {}
    score_options.update(ROSETTA_SCORE_OPTIONS)
    score_options['in:file:silent'] = silent_filename
    score_options['in:file:native'] = native_filename  # required to get gdtmm scores
    for i in list(score_options):
        if callable(score_options[i]):
            score_options[i] = score_options[i](root_filename)

    # necessary...
    # keep the rescore output apart from the scorefile of the original run
    if ('out:file:scorefile' in score_options
            and 'rescore.sc' not in score_options['out:file:scorefile']):
        score_options['out:file:scorefile'] = score_options[
            'out:file:scorefile'].replace('.sc', '_rescore.sc')

    for i in list(score_options):
        if isinstance(score_options[i], str) and os.path.isfile(
                score_options[i]):
            score_options[i] = os.path.abspath(score_options[i])

    # ...weird Rosetta append behavior...
    if os.path.isfile(score_options['out:file:scorefile']):
        os.remove(score_options['out:file:scorefile'])

    # default options
    command = create_executable_str(PATH_TO_ROSETTA_SCORE,
                                    args=[],
                                    options=score_options)

    if run:
        run_local_commandline(command)

        return score_options['out:file:scorefile']

    return command, score_options['out:file:scorefile']
Example #8
0
def get_slurm_queue_status( user = SLURM_USER , header_lines = 1 , trailer_lines = 0 , only_job_status = True ):
    """
    Runs "squeue" (limited to  <user>  with "-u" if provided) and parses its
    output line by line

    The first  <header_lines>  and the last  <trailer_lines>  lines are
    dropped, blank lines are skipped and every remaining line is split on
    spaces

    Returns a dict mapping the first field of every line to its fifth field
    if  <only_job_status>  is True, otherwise the list of split lines
    """
    squeue_command = 'squeue' + ( ' -u ' + user if user else '' )

    raw_lines = run_local_commandline( squeue_command , collect_stdout = True ).split( '\n' )

    # trim the header and trailer
    if trailer_lines:
        raw_lines = raw_lines[header_lines:-trailer_lines]
    elif header_lines:
        raw_lines = raw_lines[header_lines:]

    # simple parsing, keep only the non-blank fields of the non-blank lines
    rows = []
    for line in raw_lines:
        if not line.strip():
            continue
        rows.append( [field for field in line.split( ' ' ) if field.strip()] )

    if not only_job_status:
        return rows

    # only report the job statuses, first field -> fifth field
    return dict( (row[0] , row[4]) for row in rows )
def run_rosetta_ddg_monomer(pdb_filename,
                            mut_filename,
                            out_filename='',
                            out_path='',
                            cleanup=True,
                            run=True):
    """
    Runs the Rosetta ddg_monomer protocol on  <pdb_filename>  for the
    mutations in  <mut_filename>  using the default options in
    ROSETTA_DDG_MONOMER_OPTIONS

    The output filename is hardcoded to "ddg_predictions.out" in the directory
    of  <pdb_filename> , the  <out_filename>  argument is IGNORED (it is only
    kept for compatibility), an existing output file is deleted first (even
    if  <run>  is False)

    If  <out_path>  is provided, the command first changes into it
    If  <cleanup>  is True, remove_intermediate_ddg_monomer_files is called
    after the run

    Returns the output filename if  <run>  is True, otherwise nothing is
    executed and  (command, out_filename)  is returned
    """
    # only remove a real ".pdb" extension, str.rstrip( '.pdb' ) strips ANY
    # trailing ".", "p", "d", "b" characters (ex. "1bdp.pdb" -> "1")
    root_filename = os.path.abspath(pdb_filename)
    if root_filename.endswith('.pdb'):
        root_filename = root_filename[:-len('.pdb')]

    # hardcoded...ddg_monomer is such a painful protocol...
    out_filename = ''
    if '/' in root_filename:
        out_filename += '/'.join(root_filename.split('/')[:-1]) + '/'
    out_filename += 'ddg_predictions.out'
    # clear it out if it exists, otherwise it will be appended to...
    if os.path.exists(out_filename):
        os.remove(out_filename)

    # collect the options, set the input, derive the output filenames
    ddg_monomer_options = {}
    ddg_monomer_options.update(ROSETTA_DDG_MONOMER_OPTIONS)
    ddg_monomer_options['in:file:s'] = pdb_filename
    ddg_monomer_options['ddg::mut_file'] = mut_filename
    for i in list(ddg_monomer_options):
        if callable(ddg_monomer_options[i]):
            ddg_monomer_options[i] = ddg_monomer_options[i](root_filename)

    for i in list(ddg_monomer_options):
        if isinstance(ddg_monomer_options[i], str) and os.path.isfile(
                ddg_monomer_options[i]):
            ddg_monomer_options[i] = os.path.abspath(ddg_monomer_options[i])

    command = ''
    # optionally move into the specific directory...
    if out_path:
        command += 'cd ' + out_path + '; '

    command += create_executable_str(PATH_TO_ROSETTA_DDG_MONOMER,
                                     args=[],
                                     options=ddg_monomer_options)

    if run:
        run_local_commandline(command)

        # optionally cleanup
        if cleanup:
            # single argument in parentheses, same output on Python 2 and 3
            print('ddg_monomer writes useless output files, deleting these now...')
            remove_intermediate_ddg_monomer_files()

        # the only output we need
        return out_filename

    return command, out_filename
Example #10
0
def run_probe(pdb_filename, variants, probe_output_filename='', run=True):
    """
    Runs PROBE on  <pdb_filename>  once for every unique position found among
    <variants>  using the default options in PROBE_OPTIONS, every run appends
    its output to  <probe_output_filename>

    The position of a variant is everything between its first and its last
    character (ex. "A123G" -> "123")

    If  <probe_output_filename>  is not provided, it is the absolute path of
    <pdb_filename>  with a trailing ".pdb" replaced by ".probe_out"

    Returns  (probe_output_filename, positions)  if  <run>  is True, otherwise
    nothing is executed and  (command, probe_output_filename, positions)  is
    returned
    """
    if not probe_output_filename:
        # only remove a real ".pdb" extension, str.rstrip( '.pdb' ) strips
        # ANY trailing ".", "p", "d", "b" characters (ex. "1bdp.pdb" -> "1")
        root_filename = os.path.abspath(pdb_filename)
        if root_filename.endswith('.pdb'):
            root_filename = root_filename[:-len('.pdb')]
        probe_output_filename = root_filename + '.probe_out'

    # get the unique variant positions (strings, so sorted as text)
    positions = sorted(set(i[1:-1] for i in variants))

    # generate the commands to run
    # delete any prior copy since we will append to it
    command = ('rm ' + probe_output_filename +
               ';touch ' + probe_output_filename + ';')

    for i in positions:
        probe_options = {}
        probe_options.update(PROBE_OPTIONS)

        probe_options['out'] = pdb_filename
        probe_options['Q'] = str(i)

        command += create_executable_str(PATH_TO_PROBE, [],
                                         probe_options,
                                         probe_output_filename,
                                         append=True) + ';'

    # run PROBE, store the output
    if run:
        run_local_commandline(command)

        return probe_output_filename, positions

    # the command, well, get positions etc. too
    return command, probe_output_filename, positions
def run_rosetta_rescore(silent_filename,
                        native_filename,
                        score_filename='',
                        run=True):
    """
    Rescores the structures in  <silent_filename>  against  <native_filename>
    using the Rosetta "score" protocol and the default options in
    ROSETTA_SCORE_OPTIONS

    Callable option values are replaced by the result of calling them with
    the root filename (the absolute path of  <silent_filename>  without a
    trailing ".silent"), ".sc" in the "out:file:scorefile" option is replaced
    with "_rescore.sc" (unless it already contains "rescore.sc") and an
    existing scorefile of that name is deleted first (even if  <run>  is
    False)

    <score_filename>  is currently not used

    Returns the "out:file:scorefile" option if  <run>  is True, otherwise
    nothing is executed and  (command, out:file:scorefile)  is returned
    """
    # only remove a real ".silent" extension, str.rstrip( '.silent' ) strips
    # ANY trailing ".", "s", "i", "l", "e", "n", "t" characters
    # (ex. "results.silent" -> "resu")
    root_filename = os.path.abspath(silent_filename)
    if root_filename.endswith('.silent'):
        root_filename = root_filename[:-len('.silent')]

    score_options = {}
    score_options.update(ROSETTA_SCORE_OPTIONS)
    score_options['in:file:silent'] = silent_filename
    score_options['in:file:native'] = native_filename  # required to get gdtmm scores
    for i in list(score_options):
        if callable(score_options[i]):
            score_options[i] = score_options[i](root_filename)

    # necessary...
    # keep the rescore output apart from the scorefile of the original run
    if ('out:file:scorefile' in score_options
            and 'rescore.sc' not in score_options['out:file:scorefile']):
        score_options['out:file:scorefile'] = score_options[
            'out:file:scorefile'].replace('.sc', '_rescore.sc')

    for i in list(score_options):
        if isinstance(score_options[i], str) and os.path.isfile(
                score_options[i]):
            score_options[i] = os.path.abspath(score_options[i])

    # ...weird Rosetta append behavior...
    if os.path.isfile(score_options['out:file:scorefile']):
        os.remove(score_options['out:file:scorefile'])

    # default options
    command = create_executable_str(PATH_TO_ROSETTA_SCORE,
                                    args=[],
                                    options=score_options)

    if run:
        run_local_commandline(command)

        return score_options['out:file:scorefile']

    return command, score_options['out:file:scorefile']
Example #12
0
def run_slurm_job( script_filename , slurm_run_command = 'sbatch' , output_filename = 'temp_slurm.sh' ):
    """
    Submits  <script_filename>  with  <slurm_run_command>  and returns the
    job id, taken as the last space separated word of the submission output

    If  <script_filename>  is a string that is not an existing file, it is
    treated as the script text itself: it is written to  <output_filename>
    (overwriting it) and that file is submitted instead
    """
    # optionally write a file
    # check the type first, os.path.isfile can raise on non-string input
    if isinstance( script_filename , str ) and not os.path.isfile( script_filename ):
        # "with" closes the file even if the write fails
        with open( output_filename , 'w' ) as f:
            f.write( script_filename )

        script_filename = output_filename

    # submit it, grab the name :)
    submit_output = run_local_commandline( slurm_run_command +' '+ script_filename , collect_stdout = True )

    # NOTE(review): get_slurm_queue_status treats the collected stdout as one
    # string while this function indexed it with [0], accept both until the
    # return type of run_local_commandline is confirmed
    if isinstance( submit_output , (list , tuple) ):
        submit_output = submit_output[0]

    # strip, otherwise a trailing newline ends up inside the job id
    job_id = submit_output.strip().split( ' ' )[-1]

    return job_id
Example #13
0
def get_pbs_queue_status( user = PBS_USER , header_lines = 5 , trailer_lines = 1 , only_job_status = True ):
    """
    Runs "qstat" (limited to  <user>  with "-u" if provided) and parses its
    output line by line

    The first  <header_lines>  and the last  <trailer_lines>  lines are
    dropped, blank lines are skipped and every remaining line is split on
    spaces

    Returns a dict mapping the job id (the first field, cut at its first ".")
    to the second to last field of its line if  <only_job_status>  is True,
    otherwise the list of split lines
    """
    # header_lines = 2 for FULL queue, = 5 for USER queue ("-u")
    command = 'qstat'
    if user:
        command += ' -u ' + user

    queue_info = run_local_commandline( command , collect_stdout = True )

    # simple parsing
    queue_info = queue_info.split( '\n' )
    # without a trailer do not slice with "-0", that is "0" and keeps nothing
    if trailer_lines:
        queue_info = queue_info[header_lines:-1*trailer_lines]
    else:
        queue_info = queue_info[header_lines:]
    # blank lines have no fields, skip them to avoid an IndexError below
    queue_info = [[j for j in i.split( ' ' ) if j.strip()] for i in queue_info if i.strip()]

    # optionally only report the job statuses
    if only_job_status:
        # job ids should be unique...
        job_statuses = {}
        for i in queue_info:
            job_id = i[0]
            if '.' in job_id:
                job_id = job_id[:job_id.find( '.' )]
            job_statuses[job_id] = i[-2]
        queue_info = job_statuses

    return queue_info
Example #14
0
def get_pbs_queue_status(user=PBS_USER,
                         header_lines=5,
                         trailer_lines=1,
                         only_job_status=True):
    """
    Runs "qstat" (limited to  <user>  with "-u" if provided) and parses its
    output line by line

    The first  <header_lines>  and the last  <trailer_lines>  lines are
    dropped, blank lines are skipped and every remaining line is split on
    spaces

    Returns a dict mapping the job id (the first field, cut at its first ".")
    to the second to last field of its line if  <only_job_status>  is True,
    otherwise the list of split lines
    """
    # header_lines = 2 for FULL queue, = 5 for USER queue ("-u")
    command = 'qstat'
    if user:
        command += ' -u ' + user

    queue_info = run_local_commandline(command, collect_stdout=True)

    # simple parsing
    queue_info = queue_info.split('\n')
    # without a trailer do not slice with "-0", that is "0" and keeps nothing
    if trailer_lines:
        queue_info = queue_info[header_lines:-1 * trailer_lines]
    else:
        queue_info = queue_info[header_lines:]
    # blank lines have no fields, skip them to avoid an IndexError below
    queue_info = [[j for j in i.split(' ') if j.strip()]
                  for i in queue_info if i.strip()]

    # optionally only report the job statuses
    if only_job_status:
        # job ids should be unique...
        job_statuses = {}
        for i in queue_info:
            job_id = i[0]
            if '.' in job_id:
                job_id = job_id[:job_id.find('.')]
            job_statuses[job_id] = i[-2]
        queue_info = job_statuses

    return queue_info
Example #15
0
def run_VIPUR_PBS(pdb_filename='',
                  variants_filename='',
                  out_path='',
                  write_numbering_map=True,
                  single_relax=True,
                  delete_intermediate_relax_files=True,
                  demo=False,
                  rerun_preprocessing=False):
    """
    Runs VIPUR through a PBS queue: preprocessing, writing one bash script
    per task, running all tasks with run_VIPUR_task_summaries_PBS and
    postprocessing

    Input selection:
        <demo>               use the example input and output directories
                             under PATH_TO_VIPUR
        no input at all      run on the current directory
        a directory          run on every ".pdb" file with a matching
                             variants file, ".fa" files with a matching
                             variants file and no ".pdb" are run in sequence
                             only mode ( <variants_filename>  is then the
                             variants file extension, default ".txt")
        a single file        run on  <pdb_filename>  and
                             <variants_filename> , any extension other than
                             "pdb" is run in sequence only mode

    Preprocessing is skipped for a target whose task summary file already
    exists unless  <rerun_preprocessing>  is True

    Raises IOError if a directory contains no usable file pairs

    Returns the list of task summaries (after postprocessing)
    """
    # for the example input
    if demo:
        pdb_filename = PATH_TO_VIPUR + '/example_input/2C35.pdb'
        variants_filename = PATH_TO_VIPUR + '/example_input/2C35.txt'

        out_path = PATH_TO_VIPUR + '/example_output'

    # alternatively, run on an entire directory
    if not pdb_filename and not variants_filename:
        # current directory
        print('no input provided, assuming you want to run on every (.pdb,.txt) file pair found in the current directory')
        pdb_filename = os.getcwd()

    if os.path.isdir(pdb_filename) and not variants_filename:
        # assume variants_filename from pdb_filename
        variants_filename = '.txt'

    if os.path.isdir(pdb_filename) and variants_filename[0] == '.':
        # look for file extension
        # instead, run on the directory
        if not out_path:
            out_path = os.path.abspath(pdb_filename)

        # find .fa files that have a variants file but no .pdb file
        fa_filenames = [(out_path + '/') * bool(out_path) + i
                        for i in os.listdir(pdb_filename)
                        if get_file_extension(i) == 'fa']
        fa_filenames = [
            [i, get_root_filename(i) + variants_filename] for i in fa_filenames
            if os.path.isfile(get_root_filename(i) + variants_filename)
            and not os.path.isfile(get_root_filename(i) + '.pdb')
        ]

        print('running VIPUR on all (.pdb,' + variants_filename + ') file pairs found in ' + pdb_filename)
        # find .pdb files
        pdb_filenames = [(out_path + '/') * bool(out_path) + i
                         for i in os.listdir(pdb_filename)
                         if get_file_extension(i) == 'pdb']

        # look for pairs
        pdb_filenames = [
            [i, get_root_filename(i) + variants_filename]
            for i in pdb_filenames
            if os.path.isfile(get_root_filename(i) + variants_filename)
        ]

        print(str(len(pdb_filenames)) + ' pairs found')
        print(str(len(fa_filenames)) + ' pairs found for sequence only mode')

        if not pdb_filenames:
            if not fa_filenames:
                raise IOError('!!! no (.pdb,' + variants_filename +
                              ') file pairs found in ' + pdb_filename +
                              '!!?!\nAND no (.fa,' + variants_filename +
                              ') file pairs were found...')
            else:
                print('...only (.fa,' + variants_filename + ') file pairs were found, running in sequence only mode')

    else:
        # file extension etc.
        file_extension = get_file_extension(pdb_filename)

        # normal execution, generalize by turning into list
        single_input = [
            (out_path + '/') * bool(out_path) + pdb_filename,
            (out_path + '/') * bool(out_path) + variants_filename
        ]
        pdb_filenames = []
        fa_filenames = []
        if file_extension == 'pdb':
            pdb_filenames = [single_input]
        else:
            # this used to be [[]], an entry without filenames that raised an
            # IndexError as soon as its first element was read below
            fa_filenames = [single_input]

    # combine all "filenames" to run into unified framework
    # every target is [input filename, variants filename, sequence only?, out path]
    target_proteins = []
    for i in pdb_filenames:
        this_out_path = get_root_filename(i[0]) + '_VIPUR'  # directory to create
        target_proteins.append(i + [False, this_out_path])
    for i in fa_filenames:
        this_out_path = get_root_filename(i[0]) + '_VIPUR'  # directory to create
        target_proteins.append(i + [True, this_out_path])

    # setup environment variables BEFORE pre processing
    # no need to setup a command, just run it
    if PBS_ENVIRONMENT_SETUP:
        print('setting up environment variables')
        run_local_commandline(PBS_ENVIRONMENT_SETUP)

    # pre processing
    task_summaries = []
    for input_filename, target_variants, sequence_only, target_out_path in target_proteins:
        # guess what the task summary filename 'would' be, if it exists, keep going...
        task_summary_filename = target_out_path * bool(
            target_out_path) + '/' + get_root_filename(input_filename).split(
                '/')[-1] + '.task_summary'
        if os.path.isfile(task_summary_filename) and not rerun_preprocessing:
            # skip running pre-processing BUT DO add it to the list of tasks
            print('hmmm, ' + input_filename + ' seems to have run preprocessing already, skipping now')
        else:
            task_summary_filename = run_preprocessing(
                input_filename,
                target_variants,
                sequence_only=sequence_only,
                out_path=target_out_path,
                task_summary_filename=task_summary_filename,
                write_numbering_map=write_numbering_map,
                single_relax=single_relax,
                pymol_environment_setup=PBS_ENVIRONMENT_SETUP)

        # modify for PBS script
        task_summary = load_task_summary(task_summary_filename)
        for j in range(len(task_summary['commands'])):
            task = task_summary['commands'][j]

            command = task['command']

            # relax (but not its rescore) runs with the parallel settings
            if task['feature'].replace(
                    '_native', '') == 'relax' and 'rescore' not in task['feature']:
                if PBS_PARALLEL_ROSETTA_ENDING not in command:
                    command = command.replace(ROSETTA_ENDING,
                                              PBS_PARALLEL_ROSETTA_ENDING)
                command = PBS_PARALLEL_ROSETTA_EXECUTION_COMMAND + ' ' * bool(
                    PBS_PARALLEL_ROSETTA_EXECUTION_COMMAND) + command

                # append "-option value" (or just "-option" for empty values)
                if ROSETTA_RELAX_PARALLEL_OPTIONS:
                    command += ' ' + ' '.join([
                        '-' + k + (' ' + ROSETTA_RELAX_PARALLEL_OPTIONS[k]) *
                        bool(ROSETTA_RELAX_PARALLEL_OPTIONS[k])
                        for k in ROSETTA_RELAX_PARALLEL_OPTIONS
                    ])

                # also use the parallel options
                pbs_options = 'parallel'
            else:
                pbs_options = 'serial'

            # put "cd" in front
            command = ('cd ' + target_out_path + ';') * bool(
                target_out_path) + command

            # modify the task summary
            task['command'] = command

            # actually write the script...
            # don't worry about optional #PBS header info
            # need to add the variant? no, just use the output_filename for this
            script_filename = target_out_path + '/' * bool(
                target_out_path) + get_root_filename(
                    task['output_filename'].split('/')[-1]
                ) + '.' + task['feature'] + '.pbs_script.sh'
            task['script_filename'] = script_filename

            # "with" closes the script even if writing it fails
            with open(script_filename, 'w') as f:
                f.write(PBS_BASH_SCRIPT(command))

            # only the name of the queue settings is stored, not the pbs call
            # itself (it uses ":" and "," characters...)
            task['queue'] = pbs_options

        # rewrite the task summary
        write_task_summary(task_summary, task_summary_filename)

        task_summaries.append(task_summary)

    # run them all
    run_VIPUR_task_summaries_PBS(
        task_summaries,
        single_relax=single_relax,
        delete_intermediate_relax_files=delete_intermediate_relax_files)

    # post processing
    # task_summaries and target_proteins are in the same order
    for i in range(len(task_summaries)):
        # always okay to rerun post processing...should not make any difference
        sequence_only = target_proteins[i][2]
        print('\n\n\nExtracting and Analyzing the Results:\n\n')
        task_summaries[i] = run_postprocessing(task_summaries[i],
                                               sequence_only=sequence_only)

    return task_summaries
Example #16
0
def run_VIPUR_tasks_PBS(task_summaries,
                        task_list,
                        max_pbs_tries=2,
                        ddg_monomer_cleanup=True,
                        single_relax=True,
                        delete_intermediate_relax_files=False):
    # run the non_rescore tasks
    completed = [
        i for i in task_list if 'run' in task_summaries[i[0]]['commands'][i[1]]
        and 'success' in task_summaries[i[0]]['commands'][i[1]]['run']
    ]
    # should running_or_queued be saved? written to file?
    running_or_queued = {}
    rounds = 0
    all_completed_jobs = [
    ]  # prevents annoying bulk output, only see it the first time it completes
    while not len(completed) == len(task_list):
        rounds += 1
        print '\n\nQUEUE MONITOR ROUND ' + str(rounds)

        # debug
        #        print running_or_queued

        # check queue status
        queue_status = get_pbs_queue_status()

        # update "running_or_queued" list (?)
        # err, no, does not have information on which job it is...:(
        #for i in queue_status.keys():
        #    if queue_status[i] in ['R' , 'Q']:

        queue_space_occupied = len([
            i for i in queue_status.values() if not i in ['C', 'R']
        ])  # ignore "C"ompleted jobs, "R"unning job quota are not set by us...
        # if your queue system does not have a separate "R"un quota, remove 'R' from the above!
        available_space = PBS_QUEUE_QUOTA - queue_space_occupied

        # launch next jobs in available slots
        if available_space:
            print str(queue_space_occupied
                      ) + ' jobs queued or running, could submit up to ' + str(
                          available_space) + ' more'
            # choose the next job
            jobs_to_run = [
                i for i in task_list
                if not i in completed and not i in running_or_queued.values()
                and not ('run' in task_summaries[i[0]]['commands'][i[1]] and
                         ('success' in task_summaries[i[0]]['commands'][
                             i[1]]['run'] or 'failure' in task_summaries[i[0]]
                          ['commands'][i[1]]['run']))
            ]
            print str(
                len(jobs_to_run)
            ) + ' jobs still need to finish (after the currently running jobs complete)'

            # only the next few
            for i in jobs_to_run[:available_space]:
                command_dict = task_summaries[i[0]]['commands'][i[1]]

                # write scripts as part of pre processing?...yeah...
                # write the command to a script
                #script_filename = command_dict['out_path'] +'/'*bool( command_dict['out_path'] )+
                #script_filename = command_dict['script_filename']

                # if its a rescore and relax jobs were separated, need to recombine them!
                if 'rescore' in command_dict['feature']:
                    # combine the individual relax runs
                    #relax_commands = [i for i in task_summary['commands'] if i['feature'].replace( '_native' , '' ) == 'relax']
                    #silent_filenames = [j['output_filename'] for j in relax_commands if j['variant'] == i['variant'] and 'run' in j.keys() and j['run'] == 'success']
                    silent_filenames = [
                        j['output_filename']
                        for j in task_summaries[i[0]]['commands']
                        if j['feature'].replace('_native', '') == 'relax'
                        and j['variant'] == command_dict['variant']
                        and 'run' in j.keys() and 'success' in j['run']
                    ]
                    # actually need to identify the combined_silent_filename, be sure the relax files have not already been merged
                    # which variant
                    target_variant = [
                        j for j in task_summaries[i[0]]['variants'].keys()
                        if j.split('_')[-1] == command_dict['variant']
                        and j.split('_')[0] in command_dict['command']
                    ]
                    if not target_variant:
                        # its native
                        combined_silent_filename = task_summaries[
                            i[0]]['other']['combined_native_silent_filename']
                        combined_score_filename = task_summaries[
                            i[0]]['other']['combined_native_score_filename']
                    elif len(target_variant) > 1:
                        raise Exception(
                            '??? found more than on matching variant ???\n' +
                            ', '.join(target_variant))
                    else:
                        # found it
                        combined_silent_filename = task_summaries[
                            i[0]]['variants'][
                                target_variant[0]]['combined_silent_filename']
                        combined_score_filename = task_summaries[
                            i[0]]['variants'][
                                target_variant[0]]['combined_score_filename']

                    #if not single_relax:    # AND post processing has not already be run...scan for the combined silent file
                    if not single_relax and not os.path.isfile(
                            combined_silent_filename):
                        if not len(silent_filenames
                                   ) == ROSETTA_RELAX_OPTIONS['nstruct']:
                            raise Exception(
                                '??? somehow the matching relax run(s) has failed ???\n'
                                + str(i))
                        score_filenames = [
                            j.replace('.silent', '.sc')
                            for j in silent_filenames
                        ]

                        merge_rosetta_relax_output(
                            silent_filenames,
                            combined_silent_filename,
                            score_filenames,
                            combined_score_filename,
                            delete_old_files=delete_intermediate_relax_files)
                        # rescore already knows the proper filename
                    else:
                        # just a single match for each
                        # output filename should be correct as is :)
                        None

                # submit this script using a queue command
                # generate it here instead
                pbs_options = {}
                if command_dict['queue'] == 'parallel':
                    pbs_options.update(PBS_PARALLEL_JOB_OPTIONS)
                elif command_dict['queue'] == 'serial':
                    pbs_options.update(PBS_SERIAL_JOB_OPTIONS)
                # make sure they are satisfied
                script_filename = command_dict['script_filename']
                for k in pbs_options.keys():
                    if '__call__' in dir(pbs_options[k]):
                        pbs_options[k] = pbs_options[k](script_filename)

                pbs_command = create_executable_str('qsub', [script_filename],
                                                    pbs_options)
                new_job_id = run_local_commandline(pbs_command,
                                                   collect_stdout=True)
                new_job_id = new_job_id.strip()
                if '.' in new_job_id:
                    new_job_id = new_job_id[:new_job_id.find('.')]
                print 'submitted ' + new_job_id

                # save the job id
                # assume its queue
                running_or_queued[new_job_id] = i

        else:
            print 'no new \"positions\" are available'

        # debug, need to know
        running_jobs = len([i for i in queue_status.values() if i in ['R']])
        if running_jobs:
            print str(
                running_jobs
            ) + ' are still running...(excluding the jobs just submitted and including your other jobs)'

        # assess outcome of completed jobs
        for job_id in sorted(
                queue_status.keys()):  # sort in numerical order, right?
            # debug
            if not job_id in all_completed_jobs:
                print '\t' + job_id, queue_status[
                    job_id]  # , job_id in running_or_queued.keys()
                # could just skip it all now?

            if queue_status[
                    job_id] == 'C' and job_id in running_or_queued.keys():
                task_id = running_or_queued[job_id][0]
                command_index = running_or_queued[job_id][1]
                command_dict = task_summaries[task_id]['commands'][
                    command_index]

                check_successful = determine_check_successful_function(
                    command_dict, single_relax=single_relax)

                success = check_successful(command_dict)

                failure_summary = ''
                if isinstance(success, bool):
                    complete = success
                elif len(success) > 1 and isinstance(success[0], bool):
                    complete = success[0]
                    failure_summary += ' ' + ';'.join(
                        [str(j) for j in success[1:]]) + ' '
                    print complete, failure_summary, 'try again?' * bool(
                        not complete)  # debug

                # track the number of attempts?
                # try until failure - how many times?
                tries = 0
                if 'run' in command_dict.keys(
                ) and command_dict['run'] and not 'success' in command_dict[
                        'run'] and not 'failure' in command_dict['run']:
                    tries = int(command_dict['run'])
                tries += 1
                print tries, 'attempts so far'  # debug

                if tries >= max_pbs_tries:
                    # its a failure
                    print job_id + ' completed successfully' * complete + (
                        ' failed with ' + str(tries) +
                        ' attempts') * (not complete)
                    failure_summary = 'success' * complete + (
                        str(tries) + ' tries;failure ' +
                        failure_summary) * (not complete)
                elif complete:
                    print job_id + ' completed successfully'
                    failure_summary = 'success'  #+ str( tries ) + ' tries'
                else:
                    # record the number of tries
                    print job_id + ' completed' + ' successfully' * complete
                    failure_summary = str(tries)

                # update the record
                print 'updating with: ' + failure_summary  # debug
                task_summaries[task_id]['commands'][command_index][
                    'run'] = failure_summary

                # optionally cleanup
                if ddg_monomer_cleanup and command_dict[
                        'feature'] == 'ddg_monomer':  #'ddg' in i['output_filename']:
                    print 'ddg_monomer writes useless output files, deleting these now...'
                    remove_intermediate_ddg_monomer_files()

                # jobs that have since been completed - consider them complete?
                completed.append(
                    running_or_queued[job_id])  # good, so this grows
                del running_or_queued[job_id]
                # remove jobs to run?
#                print 'updating the status...'    # debug

# write out "completed"? or "running_or_queued"?

            if queue_status[job_id] == 'C' and not job_id in all_completed_jobs:
                all_completed_jobs.append(
                    job_id)  # prevent redundant update info

        # update task_summaries e.g. write them!
        # modified: so the task summary records its own name...bah!
        for i in task_summaries:
            if not 'task_summary_filename' in i['filenames'].keys():
                raise NotImplementedError(
                    'should input the task summary filename (not the summary itself)...'
                )
            else:
                # write it out
                print 'updating: ' + i['filenames']['task_summary_filename']
                write_task_summary(i, i['filenames']['task_summary_filename'])

        # pause...
        print '\n', len(completed), 'completed', len(
            task_list), 'tasks remaining'  # debug
        if len(completed) <= len(task_list):  # no need for edge-case end wait
            print 'waiting ' + str(PBS_QUEUE_MONITOR_DELAY) + 's...'
            time.sleep(PBS_QUEUE_MONITOR_DELAY)

    # return anything?
    # write one last time?
    for i in task_summaries:
        if not 'task_summary_filename' in i['filenames'].keys():
            raise NotImplementedError(
                'should input the task summary filename (not the summary itself)...'
            )
        else:
            # write it out
            write_task_summary(i, i['filenames']['task_summary_filename'])
Example #17
0
def run_rosetta_relax(pdb_filename,
                      extra_options={},
                      run=True,
                      parallel=ROSETTA_RELAX_PARALLEL):
    root_filename = pdb_filename.rstrip('.pdb')

    # collect the options, set the input, derive the output filenames
    relax_options = {}
    relax_options.update(ROSETTA_RELAX_OPTIONS)
    relax_options.update(extra_options)
    relax_options['s'] = pdb_filename
    relax_options['native'] = pdb_filename  # required to get gdtmm scores
    for i in relax_options.keys():
        if '__call__' in dir(relax_options[i]):
            relax_options[i] = relax_options[i](root_filename)

    # ...weird Rosetta append behavior...
#    if os.path.isfile( relax_options['out:file:silent'] ):
#        os.remove( relax_options['out:file:silent'] )
#    if os.path.isfile( relax_options['out:file:scorefile'] ):
#        os.remove( relax_options['out:file:scorefile'] )

# for njc parallelization
    nstruct = int(relax_options.get('nstruct', '0'))
    parallel = int(parallel)
    tmp_file = None
    if nstruct > 1 and parallel > 1:
        relax_options['nstruct'] = 1  #TODO: Add chunking option?
        score_filename = relax_options['out:file:scorefile']
        silent_filename = relax_options['out:file:silent']

        if 'run:jran' in relax_options:
            restoreJran = True
            jran = int(relax_options['run:jran'])
        else:
            restoreJran = False
            jran = 123

        tmp_file = tempfile.NamedTemporaryFile(delete=False)
        print 'Parallel relax commands are in ' + tmp_file.name

        for s in xrange(nstruct):
            tag = '_%05d' % s
            relax_options['run:jran'] = jran * nstruct + s
            relax_options['out:file:scorefile'] = score_filename + tag
            relax_options['out:file:silent'] = silent_filename + tag
            print >> tmp_file, create_executable_str(
                PATH_TO_ROSETTA_RELAX, args=[], options=relax_options
            ) + " > %s 2>&1; echo '[[VIPURLOG]]' %s %d" % (
                (silent_filename + tag).replace('silent_',
                                                'log_'), pdb_filename, s + 1)

        tmp_file.close()
        # the "find ... | xargs ..." idiom is used just in case nstruct is ever a *very* large number.
        command = '''\
parallel -j %d -a %s
find . -name '%s_[0-9]*[0-9]' | xargs cat | awk 'NR == 1 || $2 != "score" {print $0}' > %s
find . -name '%s_[0-9]*[0-9]' | xargs rm
find . -name '%s_[0-9]*[0-9]' | xargs cat | awk 'NR <= 2 || !($2 == "score" || $1 == "SEQUENCE:") {print $0}' > %s
find . -name '%s_[0-9]*[0-9]' | xargs rm
''' % (parallel, tmp_file.name, score_filename, score_filename, score_filename,
        silent_filename, silent_filename, silent_filename)
        print 'Parallel relax driver command:', command

        # restore option values
        relax_options['nstruct'] = str(nstruct)
        relax_options['out:file:scorefile'] = score_filename
        relax_options['out:file:silent'] = silent_filename
        if restoreJran:
            relax_options['run:jran'] = jran

        if run:
            return (command, tmp_file.name, score_filename, silent_filename)

        if tmp_file:
            os.unlink(tmp_file.name)
    else:
        command = create_executable_str(PATH_TO_ROSETTA_RELAX,
                                        args=[],
                                        options=relax_options)

    if run:
        run_local_commandline(command)

    # the only output we need
#    return relax_options['out:file:scorefile']
    return relax_options['out:file:silent']
Example #18
0
def run_VIPUR_tasks_PBS( task_summaries , task_list , max_pbs_tries = 2 , ddg_monomer_cleanup = True , single_relax = True , delete_intermediate_relax_files = False ):
    # run the non_rescore tasks
    completed = [i for i in task_list if 'run' in task_summaries[i[0]]['commands'][i[1]] and 'success' in task_summaries[i[0]]['commands'][i[1]]['run']]
    # should running_or_queued be saved? written to file?
    running_or_queued = {}
    rounds = 0
    all_completed_jobs = []    # prevents annoying bulk output, only see it the first time it completes
    while not len( completed ) == len( task_list ):
        rounds += 1
        print '\n\nQUEUE MONITOR ROUND ' + str( rounds )
        
        # debug
#        print running_or_queued
    
        # check queue status
        queue_status = get_pbs_queue_status()

        # update "running_or_queued" list (?)
        # err, no, does not have information on which job it is...:(
        #for i in queue_status.keys():
        #    if queue_status[i] in ['R' , 'Q']:

        queue_space_occupied = len( [i for i in queue_status.values() if not i in ['C' , 'R']] )    # ignore "C"ompleted jobs, "R"unning job quota are not set by us...
        # if your queue system does not have a separate "R"un quota, remove 'R' from the above!
        available_space = PBS_QUEUE_QUOTA - queue_space_occupied

        
        # launch next jobs in available slots
        if available_space:
            print str( queue_space_occupied ) + ' jobs queued or running, could submit up to ' + str( available_space ) + ' more'
            # choose the next job
            jobs_to_run = [i for i in task_list if
                not i in completed and
                not i in running_or_queued.values() and
                not ( 'run' in task_summaries[i[0]]['commands'][i[1]] and
                    ('success' in task_summaries[i[0]]['commands'][i[1]]['run'] or
                    'failure' in task_summaries[i[0]]['commands'][i[1]]['run']) )
                ]
            print str( len( jobs_to_run ) ) + ' jobs still need to finish (after the currently running jobs complete)'
            
            # only the next few
            for i in jobs_to_run[:available_space]:
                command_dict = task_summaries[i[0]]['commands'][i[1]]
            
                # write scripts as part of pre processing?...yeah...
                # write the command to a script
                #script_filename = command_dict['out_path'] +'/'*bool( command_dict['out_path'] )+
                #script_filename = command_dict['script_filename']
            
            
                # if its a rescore and relax jobs were separated, need to recombine them!
                if 'rescore' in command_dict['feature']:
                    # combine the individual relax runs
                    #relax_commands = [i for i in task_summary['commands'] if i['feature'].replace( '_native' , '' ) == 'relax']
                    #silent_filenames = [j['output_filename'] for j in relax_commands if j['variant'] == i['variant'] and 'run' in j.keys() and j['run'] == 'success']
                    silent_filenames = [j['output_filename'] for j in task_summaries[i[0]]['commands'] if
                        j['feature'].replace( '_native' , '' ) == 'relax' and
                        j['variant'] == command_dict['variant'] and
                        'run' in j.keys() and
                        'success' in j['run']
                        ]
                    # actually need to identify the combined_silent_filename, be sure the relax files have not already been merged
                    # which variant
                    target_variant = [j for j in task_summaries[i[0]]['variants'].keys() if j.split( '_' )[-1] == command_dict['variant'] and j.split( '_' )[0] in command_dict['command']]
                    if not target_variant:
                        # its native
                        combined_silent_filename = task_summaries[i[0]]['other']['combined_native_silent_filename']
                        combined_score_filename = task_summaries[i[0]]['other']['combined_native_score_filename']
                    elif len( target_variant ) > 1:
                        raise Exception( '??? found more than on matching variant ???\n' + ', '.join( target_variant ) )
                    else:
                        # found it
                        combined_silent_filename = task_summaries[i[0]]['variants'][target_variant[0]]['combined_silent_filename']
                        combined_score_filename = task_summaries[i[0]]['variants'][target_variant[0]]['combined_score_filename']

                    #if not single_relax:    # AND post processing has not already be run...scan for the combined silent file
                    if not single_relax and not os.path.isfile( combined_silent_filename ):
                        if not len( silent_filenames ) == ROSETTA_RELAX_OPTIONS['nstruct']:
                            raise Exception( '??? somehow the matching relax run(s) has failed ???\n' + str( i ) )
                        score_filenames = [j.replace( '.silent' , '.sc' ) for j in silent_filenames]

                        merge_rosetta_relax_output( silent_filenames , combined_silent_filename , score_filenames , combined_score_filename , delete_old_files = delete_intermediate_relax_files )
                        # rescore already knows the proper filename
                    else:
                        # just a single match for each
                        # output filename should be correct as is :)
                        None

            
                # submit this script using a queue command
                # generate it here instead
                pbs_options = {}
                if command_dict['queue'] == 'parallel':
                    pbs_options.update( PBS_PARALLEL_JOB_OPTIONS )
                elif command_dict['queue'] == 'serial':
                    pbs_options.update( PBS_SERIAL_JOB_OPTIONS )
                # make sure they are satisfied
                script_filename = command_dict['script_filename']
                for k in pbs_options.keys():
                    if '__call__' in dir( pbs_options[k] ):
                        pbs_options[k] = pbs_options[k]( script_filename )

                pbs_command = create_executable_str( 'qsub' , [script_filename] , pbs_options )
                new_job_id = run_local_commandline( pbs_command , collect_stdout = True )
                new_job_id = new_job_id.strip()
                if '.' in new_job_id:
                    new_job_id = new_job_id[:new_job_id.find( '.' )]
                print 'submitted ' + new_job_id
                
                # save the job id
                # assume its queue
                running_or_queued[new_job_id] = i

        else:
            print 'no new \"positions\" are available'

        # debug, need to know
        running_jobs = len( [i for i in queue_status.values() if i in ['R']] )
        if running_jobs:
            print str( running_jobs ) + ' are still running...(excluding the jobs just submitted and including your other jobs)'
        
        # assess outcome of completed jobs
        for job_id in sorted( queue_status.keys() ):    # sort in numerical order, right?
            # debug
            if not job_id in all_completed_jobs:
                print '\t'+ job_id , queue_status[job_id]# , job_id in running_or_queued.keys()
                # could just skip it all now?
        
            if queue_status[job_id] == 'C' and job_id in running_or_queued.keys():
                task_id = running_or_queued[job_id][0]
                command_index = running_or_queued[job_id][1]
                command_dict = task_summaries[task_id]['commands'][command_index]

                check_successful = determine_check_successful_function( command_dict , single_relax = single_relax )

                success = check_successful( command_dict )

                failure_summary = ''
                if isinstance( success , bool ):
                    complete = success
                elif len( success ) > 1 and isinstance( success[0] , bool ):
                    complete = success[0]
                    failure_summary += ' '+ ';'.join( [str( j ) for j in success[1:]] ) +' '
                    print complete , failure_summary , 'try again?'*bool( not complete )    # debug

                # track the number of attempts?
                # try until failure - how many times?
                tries = 0
                if 'run' in command_dict.keys() and command_dict['run'] and not 'success' in command_dict['run'] and not 'failure' in command_dict['run']:
                    tries = int( command_dict['run'] )
                tries += 1
                print tries , 'attempts so far'    # debug
                
                if tries >= max_pbs_tries:
                    # its a failure
                    print job_id + ' completed successfully'*complete + (' failed with ' + str( tries ) + ' attempts')*(not complete)
                    failure_summary = 'success'*complete + (str( tries ) +' tries;failure ' + failure_summary)*(not complete)
                elif complete:
                    print job_id + ' completed successfully'
                    failure_summary = 'success' #+ str( tries ) + ' tries'
                else:
                    # record the number of tries
                    print job_id + ' completed' + ' successfully'*complete
                    failure_summary = str( tries )
                
                # update the record
                print 'updating with: ' + failure_summary    # debug
                task_summaries[task_id]['commands'][command_index]['run'] = failure_summary
            
                # optionally cleanup
                if ddg_monomer_cleanup and command_dict['feature'] == 'ddg_monomer':#'ddg' in i['output_filename']:
                    print 'ddg_monomer writes useless output files, deleting these now...'
                    remove_intermediate_ddg_monomer_files()

                # jobs that have since been completed - consider them complete?
                completed.append( running_or_queued[job_id] )    # good, so this grows
                del running_or_queued[job_id]
                # remove jobs to run?
#                print 'updating the status...'    # debug

                # write out "completed"? or "running_or_queued"?

            if queue_status[job_id] == 'C' and not job_id in all_completed_jobs:
                all_completed_jobs.append( job_id )    # prevent redundant update info


        # update task_summaries e.g. write them!
        # modified: so the task summary records its own name...bah!
        for i in task_summaries:
            if not 'task_summary_filename' in i['filenames'].keys():
                raise NotImplementedError( 'should input the task summary filename (not the summary itself)...' )
            else:
                # write it out
                print 'updating: ' + i['filenames']['task_summary_filename']
                write_task_summary( i , i['filenames']['task_summary_filename'] )

        
        # pause...
        print '\n' , len( completed ) , 'completed' , len( task_list ) , 'tasks remaining'    # debug
        if len( completed ) <= len( task_list ):    # no need for edge-case end wait
            print 'waiting ' + str( PBS_QUEUE_MONITOR_DELAY ) +'s...'
            time.sleep( PBS_QUEUE_MONITOR_DELAY )


    # return anything?
    # write one last time?
    for i in task_summaries:
        if not 'task_summary_filename' in i['filenames'].keys():
            raise NotImplementedError( 'should input the task summary filename (not the summary itself)...' )
        else:
            # write it out
            write_task_summary( i , i['filenames']['task_summary_filename'] )
def create_variant_protein_structures( pdb_filename , variants , chain , use_pyrosetta = USE_PYROSETTA ):
    # make sure the variants have been filtered
    if use_pyrosetta:
        # load the PDB as a pose
        pose = pose_from_pdb( pdb_filename )
        failed = {}
        root_filename = pdb_filename.rstrip( 'pdb' )
            
        # currently cannot handle multi-chain input
        # handle this before VIPUR
        if pose.chain( pose.total_residue() ) > 1:
            print 'CANNOT currently handle multi-chain PDBs (as pose), using PyMOL instead!'
            if PATH_TO_PYMOL:
                create_variant_protein_structures( pdb_filename , variants , chain , use_pyrosetta = False )
                return
            else:
                faulty = 'clean before VIPUR, cannot handle multi-chain PDBs'
                failed[faulty] = variants
        elif not pose.pdb_info().chain( 1 ) == chain:
            print '...not sure what it happening, you wanted chain ' + chain + ' but VIPUR found chain ' + pose.chain( 1 ) + ', skipping this entire sample!'
            faulty = 'clean before VIPUR, improper chain ID'
            failed[faulty] = variants
                
        # in case this condition is found:
        faulty = 'could not load position from PDB'
        variant_structures = []
        for variation in variants:
            # make a copy
            test_pose = Pose()
            test_pose.assign( pose )

            native = variation[0]
            position = variation[1:-1]
            mutant = variation[-1]
                
            # make sure the position was loaded
            icode = ' '    # default...this could cause problems...
            if not position[-1].isdigit():
                icode = position[-1]
                position = position[:-1]
                position = int( position )
            if not test_pose.pdb_info().pdb2pose( chain , position , icode ):
                if faulty in failed.keys():
                    failed[faulty].append( variation )
                else:
                    failed[faulty] = [variation]
                break    # stop the loop
            position = test_pose.pdb_info().pdb2pose( chain , position , icode )
                    
            # simple, use a mover to make the change
            # appears to have trouble with N terminal variants since it uses "replace_residue"
            # code is available that does not have this problem, however reloading into Rosetta with accurately determine the position of these atoms
            make_variant = MutateResidue( position , mutant )
            make_variant.apply( test_pose )

            # write out
            out_filename = self.root_filename +'.chain_'+ chain +'_'+ variation +'.pdb'
            variant_structures.append( out_filename )
            test_pose.dump_pdb( out_filename )
            print 'generated ' + variation + ' variant structure and wrote to ' + out_filename

        return variant_structures        
    else:
        # use the pymol script
        #for variants in self.variants['permissible']:
        # use default output naming
        # create command explicitly here, slightly different
        root_filename = pdb_filename.rstrip( '.pdb' )
        command = PATH_TO_PYMOL + ' -qcr ' + PATH_TO_VIPUR + '/pymol_make_variant_structure.py -- -p ' + pdb_filename + ' -m ' + ','.join( variants ) + ' -c ' + chain + ' -r ' + root_filename
#            print command
        run_local_commandline( command )
            
        # reconstruct the names
        variant_structures = [root_filename + '.chain_' + chain +'_'+ i +'.pdb' for i in variants]
        
        # verify they have been made
        if [None for i in variant_structures if not os.path.isfile( i )]:
            raise IOError( 'could not make variant protein structures,\ntry checking the input PDB file or the pymol script pymol_make_variant_structure.py' )
        
        return variant_structures
def run_rosetta_relax( pdb_filename , extra_options = {} , run = True , parallel = ROSETTA_RELAX_PARALLEL ):
    """
    Builds, and optionally runs, the Rosetta relax command for  <pdb_filename>
    using the default options in ROSETTA_RELAX_OPTIONS updated with
    <extra_options>  (<pdb_filename>  is used as both "s" and "native")

    Option values that are callable are replaced by the result of calling
    them with the root filename (<pdb_filename>  without a ".pdb" suffix)

    When "nstruct" > 1 and  <parallel>  > 1 the run is split into one
    single-structure command per structure, each with its own "run:jran"
    seed and its own tagged score/silent/log filenames; these commands are
    written to a temporary file and a driver command is generated that feeds
    the file to "parallel" and then merges (and removes) the tagged score and
    silent files with find/xargs/awk

    Returns the silent filename (the "out:file:silent" option), EXCEPT in the
    split case when  <run>  is True, where the tuple
        (command , temporary command filename , score filename , silent filename)
    is returned and nothing is executed here
    In the split case when  <run>  is False the temporary file is deleted,
    nothing is executed and the silent filename is returned
    """
    # remove a literal ".pdb" suffix only
    # (str.rstrip treats its argument as a SET of characters, so
    # "1abd.pdb".rstrip( '.pdb' ) gives "1a" and corrupts the derived names)
    if pdb_filename.endswith( '.pdb' ):
        root_filename = pdb_filename[:-len( '.pdb' )]
    else:
        root_filename = pdb_filename

    # collect the options, set the input, derive the output filenames
    # extra_options is only read, never modified
    relax_options = {}
    relax_options.update( ROSETTA_RELAX_OPTIONS )
    relax_options.update( extra_options )
    relax_options['s'] = pdb_filename
    relax_options['native'] = pdb_filename    # required to get gdtmm scores
    for i in relax_options.keys():
        if '__call__' in dir( relax_options[i] ):
            relax_options[i] = relax_options[i]( root_filename )

    # for njc parallelization
    nstruct = int( relax_options.get( 'nstruct' , '0' ) )
    parallel = int( parallel )
    tmp_file = None
    if nstruct > 1 and parallel > 1:
        relax_options['nstruct'] = 1 #TODO: Add chunking option?
        score_filename = relax_options['out:file:scorefile']
        silent_filename = relax_options['out:file:silent']

        # remember if a seed was given so it can be restored afterwards
        if 'run:jran' in relax_options:
            restoreJran = True
            jran = int( relax_options['run:jran'] )
        else:
            restoreJran = False
            jran = 123

        # delete = False: the file must outlive this handle, the driver
        # command reads it by name
        tmp_file = tempfile.NamedTemporaryFile( delete = False )
        print( 'Parallel relax commands are in ' + tmp_file.name )

        # one command line per structure
        for s in xrange( nstruct ):
            tag = '_%05d' % s
            relax_options['run:jran'] = jran*nstruct + s    # distinct seed per structure
            relax_options['out:file:scorefile'] = score_filename + tag
            relax_options['out:file:silent'] = silent_filename + tag

            log_filename = (silent_filename + tag).replace( 'silent_' , 'log_' )
            redirect_and_marker = " > %s 2>&1; echo '[[VIPURLOG]]' %s %d" % (log_filename , pdb_filename , s + 1 )
            tmp_file.write( create_executable_str( PATH_TO_ROSETTA_RELAX , args = [] , options = relax_options ) + redirect_and_marker + '\n' )

        tmp_file.close()
        # the "find ... | xargs ..." idiom is used just in case nstruct is ever a *very* large number.
        command = '''\
parallel -j %d -a %s
find . -name '%s_[0-9]*[0-9]' | xargs cat | awk 'NR == 1 || $2 != "score" {print $0}' > %s
find . -name '%s_[0-9]*[0-9]' | xargs rm
find . -name '%s_[0-9]*[0-9]' | xargs cat | awk 'NR <= 2 || !($2 == "score" || $1 == "SEQUENCE:") {print $0}' > %s
find . -name '%s_[0-9]*[0-9]' | xargs rm
''' % (parallel , tmp_file.name , score_filename , score_filename , score_filename , silent_filename , silent_filename , silent_filename)
        print( 'Parallel relax driver command: ' + command )

        # restore option values
        relax_options['nstruct'] = str( nstruct )
        relax_options['out:file:scorefile'] = score_filename
        relax_options['out:file:silent'] = silent_filename
        if restoreJran:
            relax_options['run:jran'] = jran

        # NOTE(review): with run = True the driver command is handed back to
        # the caller instead of being executed here, and with run = False the
        # command file is deleted - presumably the caller is expected to run
        # the returned command, confirm this is the intended contract
        if run:
            return (command , tmp_file.name , score_filename , silent_filename)

        if tmp_file:
            os.unlink( tmp_file.name )
    else:
        command = create_executable_str( PATH_TO_ROSETTA_RELAX , args = [] , options = relax_options )

    if run:
        run_local_commandline( command )

    # the only output we need
    return relax_options['out:file:silent']
Example #21
0
def run_VIPUR_PBS( pdb_filename = '' , variants_filename = '' ,
        out_path = '' , write_numbering_map = True ,
        single_relax = True , delete_intermediate_relax_files = True ,
        demo = False , rerun_preprocessing = False ):
    """
    Runs VIPUR through a PBS queue: pre processing, writing one PBS bash
    script per command, running everything with
    run_VIPUR_task_summaries_PBS, then post processing

    <pdb_filename>  can be:
        a ".pdb" file, paired with  <variants_filename>
        any other single file, run in sequence only mode with
            <variants_filename>
        a directory, every (.pdb,<variants_filename>) pair found in it is
            run, here  <variants_filename>  is a file extension (".txt" if
            not given), (.fa,<variants_filename>) pairs that have no
            matching ".pdb" file are run in sequence only mode
        empty (together with  <variants_filename>), the current directory
            is used
    <demo>  replaces the input with the example files under PATH_TO_VIPUR
    <rerun_preprocessing>  forces pre processing even if the expected
    ".task_summary" file already exists

    Returns the list of task summaries (after post processing)

    Raises IOError if a directory is given and no file pairs are found
    """
    # for the example input
    if demo:
        pdb_filename = PATH_TO_VIPUR + '/example_input/2C35.pdb'
        variants_filename = PATH_TO_VIPUR + '/example_input/2C35.txt'

        out_path = PATH_TO_VIPUR + '/example_output'

    # alternatively, run on an entire directory
    if not pdb_filename and not variants_filename:
        # current directory
        print( 'no input provided, assuming you want to run on every (.pdb,.txt) file pair found in the current directory' )
        pdb_filename = os.getcwd()

    if os.path.isdir( pdb_filename ) and not variants_filename:
        # assume variants_filename from pdb_filename
        variants_filename = '.txt'

    if os.path.isdir( pdb_filename ) and variants_filename[0] == '.':
        # look for file extension
        # instead, run on the directory
        if not out_path:
            out_path = os.path.abspath( pdb_filename )

        # sequence only candidates: .fa files with a variants file but no .pdb
        fa_filenames = [(out_path +'/')*bool( out_path ) + i for i in os.listdir( pdb_filename ) if get_file_extension( i ) == 'fa']
        fa_filenames = [[i , get_root_filename( i ) + variants_filename] for i in fa_filenames if os.path.isfile( get_root_filename( i ) + variants_filename ) and not os.path.isfile( get_root_filename( i ) + '.pdb' )]

        print( 'running VIPUR on all (.pdb,' + variants_filename + ') file pairs found in ' + pdb_filename )
        # find .pdb files
        pdb_filenames = [(out_path +'/')*bool( out_path ) + i for i in os.listdir( pdb_filename ) if get_file_extension( i ) == 'pdb']

        # look for pairs
        pdb_filenames = [[i , get_root_filename( i ) + variants_filename] for i in pdb_filenames if os.path.isfile( get_root_filename( i ) + variants_filename )]

        print( str( len( pdb_filenames ) ) + ' pairs found' )
        print( str( len( fa_filenames ) ) + ' pairs found for sequence only mode' )

        if not pdb_filenames:
            if not fa_filenames:
                raise IOError( '!!! no (.pdb,' + variants_filename + ') file pairs found in ' + pdb_filename + '!!?!\nAND no (.fa,' + variants_filename + ') file pairs were found...' )
            else:
                print( '...only (.fa,' + variants_filename + ') file pairs were found, running in sequence only mode' )

    else:
        # a single input file, generalize by turning into list
        file_extension = get_file_extension( pdb_filename )
        prefix = (out_path +'/')*bool( out_path )
        single_pair = [prefix + pdb_filename , prefix + variants_filename]

        pdb_filenames = []
        fa_filenames = []
        if file_extension == 'pdb':
            pdb_filenames = [single_pair]
        else:
            # sequence only mode
            # this was  [[]]  which always raised an IndexError below, when
            # the (empty) entry was indexed to derive the output directory
            fa_filenames = [single_pair]


    # combine all "filenames" to run into unified framework
    # each target is [input filename , variants filename , sequence only? , output directory]
    target_proteins = []
    for i in pdb_filenames:
        this_out_path = get_root_filename( i[0] ) +'_VIPUR'    # directory to create
        target_proteins.append( i + [False , this_out_path] )
    for i in fa_filenames:
        this_out_path = get_root_filename( i[0] ) +'_VIPUR'    # directory to create
        target_proteins.append( i + [True , this_out_path] )

    # setup environment variables BEFORE pre processing
    # no need to setup a command, just run it
    if PBS_ENVIRONMENT_SETUP:
        print( 'setting up environment variables' )
        run_local_commandline( PBS_ENVIRONMENT_SETUP )

    # pre processing
    task_summaries = []
    for i in target_proteins:
        # guess what the task summary filename 'would' be, if it exists, keep going...
        task_summary_filename = i[3]*bool( i[3] ) +'/'+ get_root_filename( i[0] ).split( '/' )[-1] + '.task_summary'
        if os.path.isfile( task_summary_filename ) and not rerun_preprocessing:
            # skip running pre-processing BUT DO add it to the list of tasks
            print( 'hmmm, ' + i[0] + ' seems to have run preprocessing already, skipping now' )
        else:
            task_summary_filename = run_preprocessing( i[0] , i[1] ,
                sequence_only = i[2] , out_path = i[3] ,
                task_summary_filename = task_summary_filename ,
                write_numbering_map = write_numbering_map , single_relax = single_relax ,
                pymol_environment_setup = PBS_ENVIRONMENT_SETUP )


        # modify for PBS script
        task_summary = load_task_summary( task_summary_filename )
        for command_dict in task_summary['commands']:

            command = command_dict['command']

            # add for relax
            if command_dict['feature'].replace( '_native' , '' ) == 'relax' and not 'rescore' in command_dict['feature']:
                # switch to the parallel Rosetta executable, unless already done
                if not PBS_PARALLEL_ROSETTA_ENDING in command:
                    command = command.replace( ROSETTA_ENDING , PBS_PARALLEL_ROSETTA_ENDING )
                command = PBS_PARALLEL_ROSETTA_EXECUTION_COMMAND + ' '*bool( PBS_PARALLEL_ROSETTA_EXECUTION_COMMAND ) + command

                # append "-option value" (or just "-option" for empty values)
                if ROSETTA_RELAX_PARALLEL_OPTIONS:
                    command += ' '+ ' '.join( ['-'+ k + (' '+ ROSETTA_RELAX_PARALLEL_OPTIONS[k])*bool( ROSETTA_RELAX_PARALLEL_OPTIONS[k] ) for k in ROSETTA_RELAX_PARALLEL_OPTIONS] )

                # also use the parallel options
                pbs_options = 'parallel'
            else:
                pbs_options = 'serial'

            # put "cd" in front
            command = ('cd '+ i[3] +';')*bool( i[3] ) + command

            # modify the task summary
            command_dict['command'] = command


            # actually write the script...
            # don't worry about optional #PBS header info
            # need to add the variant? no, just use the output_filename for this
            script_filename = i[3] + '/'*bool( i[3] ) + get_root_filename( command_dict['output_filename'].split( '/' )[-1] ) +'.'+ command_dict['feature'] + '.pbs_script.sh'
            command_dict['script_filename'] = script_filename

            with open( script_filename , 'w' ) as f:
                f.write( PBS_BASH_SCRIPT( command ) )

            # only record which kind of queue options apply
            # the pbs call itself is not stored here, it uses ":" and "," characters...
            command_dict['queue'] = pbs_options

        # rewrite the task summary
        write_task_summary( task_summary , task_summary_filename )

        task_summaries.append( task_summary )


    # run them all
    run_VIPUR_task_summaries_PBS( task_summaries , single_relax = single_relax , delete_intermediate_relax_files = delete_intermediate_relax_files )


    # post processing
    # task_summaries and target_proteins are in the same order
    for i in range( len( task_summaries ) ):
        # always okay to rerun post processing...should not make any difference
        sequence_only = target_proteins[i][2]
        print( '\n\n\nExtracting and Analyzing the Results:\n\n' )
        task_summaries[i] = run_postprocessing( task_summaries[i] , sequence_only = sequence_only )

    return task_summaries
Example #22
0
def run_VIPUR_tasks_PBS( task_summaries , task_list , max_pbs_tries = 2 , ddg_monomer_cleanup = True , single_relax = False , delete_intermediate_relax_files = False ):
    """
    Submits the commands in  <task_list>  to the PBS queue and monitors the
    queue until every one of them has been processed

    <task_summaries>  is a list of task summary dicts and  <task_list>  a
    list of (task summary index , command index) pairs, each naming
    task_summaries[index]['commands'][command index]

    The outcome of each command is recorded in its "run" entry:
        "success", "<N> tries;failure ...", or just the number of tries
    Commands that already have "success" or "failure" recorded are not run
    Before a "rescore" command is submitted, the matching relax outputs are
    merged (only when not  <single_relax>  and the combined silent file
    does not exist yet)
    <ddg_monomer_cleanup>  removes intermediate ddg_monomer files after each
    "ddg_monomer" command finishes

    Every task summary is rewritten after each monitoring round and once
    more at the end, nothing is returned

    Raises NotImplementedError if a task summary does not record its own
    filename, Exception if the relax runs needed by a rescore are missing
    """
    def finished( pair ):
        # True once a final outcome (success or failure) has been recorded
        record = task_summaries[pair[0]]['commands'][pair[1]]
        return 'run' in record and ('success' in record['run'] or 'failure' in record['run'])

    def write_all_task_summaries():
        # each task summary must record its own filename
        for summary in task_summaries:
            if not 'task_summary_filename' in summary['filenames'].keys():
                raise NotImplementedError( 'should input the task summary filename (not the summary itself)...' )
            else:
                write_task_summary( summary , summary['filenames']['task_summary_filename'] )

    # anything with a final outcome is done, recorded failures are never
    # resubmitted so they must count here or the loop below cannot end
    completed = [i for i in task_list if finished( i )]
    # job id -> (task summary index , command index)
    running_or_queued = {}
    while not len( completed ) == len( task_list ):
        # check queue status
        queue_status = get_pbs_queue_status()

        queue_space_occupied = len( [i for i in queue_status.values() if not i in ['C' , 'R']] )    # ignore "C"ompleted jobs, "R"unning job quota are not set by us...
        # if your queue system does not have a separate "R"un quota, remove 'R' from the above!
        available_space = PBS_QUEUE_QUOTA - queue_space_occupied


        # launch next jobs in available slots
        if available_space > 0:
            # choose the next jobs
            jobs_to_run = [i for i in task_list if
                not i in completed and
                not i in running_or_queued.values() and
                not finished( i )
                ]

            # never submit more than the free slots
            for i in jobs_to_run[:available_space]:
                command_dict = task_summaries[i[0]]['commands'][i[1]]

                # if its a rescore and relax jobs were separated, need to recombine them!
                if 'rescore' in command_dict['feature']:
                    # the successful relax runs for this variant
                    silent_filenames = [j['output_filename'] for j in task_summaries[i[0]]['commands'] if
                        j['feature'].replace( '_native' , '' ) == 'relax' and
                        j['variant'] == command_dict['variant'] and
                        'run' in j.keys() and
                        'success' in j['run']
                        ]
                    # actually need to identify the combined_silent_filename, be sure the relax files have not already been merged
                    # which variant
                    target_variant = [j for j in task_summaries[i[0]]['variants'].keys() if j.split( '_' )[-1] == command_dict['variant'] and j.split( '_' )[0] in command_dict['command']]
                    if not target_variant:
                        # its native
                        combined_silent_filename = task_summaries[i[0]]['other']['combined_native_silent_filename']
                        combined_score_filename = task_summaries[i[0]]['other']['combined_native_score_filename']
                    elif len( target_variant ) > 1:
                        raise Exception( '??? found more than on matching variant ???\n' + ', '.join( target_variant ) )
                    else:
                        # found it
                        combined_silent_filename = task_summaries[i[0]]['variants'][target_variant[0]]['combined_silent_filename']
                        combined_score_filename = task_summaries[i[0]]['variants'][target_variant[0]]['combined_score_filename']

                    # otherwise there is a single match for each and the
                    # output filename is correct as is
                    if not single_relax and not os.path.isfile( combined_silent_filename ):
                        # int(): the option value may be stored as text
                        if not len( silent_filenames ) == int( ROSETTA_RELAX_OPTIONS['nstruct'] ):
                            raise Exception( '??? somehow the matching relax run(s) has failed ???\n' + str( i ) )
                        score_filenames = [j.replace( '.silent' , '.sc' ) for j in silent_filenames]

                        merge_rosetta_relax_output( silent_filenames , combined_silent_filename , score_filenames , combined_score_filename , delete_old_files = delete_intermediate_relax_files )
                        # rescore already knows the proper filename


                # submit this script using a queue command
                pbs_command = command_dict['qsub_command']    # SHOULD already have an abspath to the script
                new_job_id = run_local_commandline( pbs_command , collect_stdout = True )
                # keep only what comes before the first "."
                new_job_id = new_job_id[:new_job_id.find( '.' )]

                # save the job id
                # assume its queue
                running_or_queued[new_job_id] = i


        # assess outcome of completed jobs
        for job_id in queue_status.keys():
            if queue_status[job_id] == 'C' and job_id in running_or_queued.keys():
                task_id = running_or_queued[job_id][0]
                command_index = running_or_queued[job_id][1]
                command_dict = task_summaries[task_id]['commands'][command_index]

                check_successful = determine_check_successful_function( command_dict , single_relax = single_relax )

                success = check_successful( command_dict )

                # the check returns either a bool or (bool , details...)
                complete = False    # anything unrecognized counts as not successful
                failure_summary = ''
                if isinstance( success , bool ):
                    complete = success
                elif len( success ) > 1 and isinstance( success[0] , bool ):
                    complete = success[0]
                    failure_summary += ' '+ ';'.join( [str( j ) for j in success[1:]] ) +' '

                # a "run" entry that is neither success nor failure is the
                # number of tries so far
                tries = 0
                if 'run' in command_dict.keys() and command_dict['run'] and not 'success' in command_dict['run'] and not 'failure' in command_dict['run']:
                    tries = int( command_dict['run'] )
                tries += 1

                if complete:
                    # a success is a success, whichever try it was
                    failure_summary = 'success'
                elif tries >= max_pbs_tries:
                    # its a failure
                    failure_summary = str( tries ) +' tries;failure ' + failure_summary
                else:
                    # record the number of tries
                    failure_summary = str( tries )

                # update the record
                task_summaries[task_id]['commands'][command_index]['run'] = failure_summary

                # optionally cleanup
                if ddg_monomer_cleanup and command_dict['feature'] == 'ddg_monomer':
                    print( 'ddg_monomer writes useless output files, deleting these now...' )
                    remove_intermediate_ddg_monomer_files()

                # NOTE(review): a job that only has a try count recorded is
                # still marked as completed here, so it is not resubmitted
                # during this call - confirm if an in-call retry is wanted
                completed.append( running_or_queued[job_id] )
                del running_or_queued[job_id]


        # update task_summaries e.g. write them!
        write_all_task_summaries()


        # pause...
        time.sleep( PBS_QUEUE_MONITOR_DELAY )

    # write one last time
    write_all_task_summaries()
Example #23
0
def run_VIPUR_tasks_in_batch_SLURM( task_summaries , task_list , max_slurm_tries = 2 , ddg_monomer_cleanup = True , single_relax = True ):
    """
    Runs the commands in  <task_list>  as a single SLURM batch job: all the
    commands are joined into one script (SLURM_BASH_SCRIPT), submitted with
    "sbatch -n 40", and the queue is polled until the job id is gone

    <task_summaries>  is a list of task summary dicts and  <task_list>  a
    list of (task summary index , command index) pairs, each naming
    task_summaries[index]['commands'][command index]

    The script, output and error filenames are taken from the first task
    summary, with "_<attempt>" inserted before the file extension

    The outcome of each command is recorded in its "run" entry:
        "success", "<N> tries;failure ...", or just the number of tries
    Commands that already have "success" or "failure" recorded are not run
    Every task summary is rewritten after each batch, nothing is returned
    <ddg_monomer_cleanup>  is accepted but not used in this mode

    Raises NotImplementedError if a task summary does not record its own
    filename
    """
    def finished( pair ):
        # True once a final outcome (success or failure) has been recorded
        record = task_summaries[pair[0]]['commands'][pair[1]]
        return 'run' in record and ('success' in record['run'] or 'failure' in record['run'])

    # also setup to do start-stop
    # recorded failures are never put in a batch, so they must count as done
    # here, otherwise the loop below would submit empty batches forever
    completed = [i for i in task_list if finished( i )]

    attempt = 1
    while not len( completed ) == len( task_list ):
        # do not worry about the queue in this mode
        # assume you will be able to submit etc.
        # not such thing as "running or queued" either, just run one batch at a time

        # gather all the commands into a single script
        jobs_to_run = [i for i in task_list if
            not i in completed and
            not finished( i )
            ]
        print( str( len( jobs_to_run ) ) + ' processes still need to finish' )

        # write the script
        master_script_text = '\n\n'.join( [task_summaries[i[0]]['commands'][i[1]]['command'] for i in jobs_to_run] )
        master_script_text = SLURM_BASH_SCRIPT( master_script_text )

        # can just use the first one now...
        slurm_script_filename = task_summaries[0]['filenames']['slurm_script_filename']
        slurm_output_filename = task_summaries[0]['filenames']['slurm_output_filename']
        slurm_error_filename = task_summaries[0]['filenames']['slurm_error_filename']

        # one set of files per attempt, so earlier attempts are not overwritten
        slurm_script_filename = slurm_script_filename.replace( '.sh' , '_'+ str( attempt ) + '.sh' )
        slurm_output_filename = slurm_output_filename.replace( '.out' , '_'+ str( attempt ) + '.out' )
        slurm_error_filename = slurm_error_filename.replace( '.err' , '_'+ str( attempt ) + '.err' )

        with open( slurm_script_filename , 'w' ) as f:
            f.write( master_script_text )

        # submit sbatch
        # simple for now...
        command = 'sbatch -n 40'
        if slurm_output_filename:
            command += ' -o ' + slurm_output_filename
        if slurm_error_filename:
            command += ' -e ' + slurm_error_filename
        command += ' ' + slurm_script_filename
        batch_job_id = run_local_commandline( command , collect_stdout = True )
        # the job id is the last word of what sbatch printed
        batch_job_id = batch_job_id.strip().split( ' ' )[-1]
        print( 'submitted ' + batch_job_id )


        # monitor the job until it is complete
        batch_complete = False
        while not batch_complete:
            queue_status = get_slurm_queue_status( only_job_status = True )
            batch_complete = not batch_job_id in queue_status.keys()
            # could be an immediate failure...but don't want to linger here anyway in that case

            for i in queue_status.keys():
                print( i + '\t' + queue_status[i] )

            # can be sure it doesn't need to wait if empty
            if queue_status:
                print( 'waiting ' + str( SLURM_QUEUE_MONITOR_DELAY ) +'s...' )
                time.sleep( SLURM_QUEUE_MONITOR_DELAY )


        # evaluate if it ran successfully
        for job_pair in jobs_to_run:
            command_dict = task_summaries[job_pair[0]]['commands'][job_pair[1]]

            check_successful = determine_check_successful_function( command_dict , single_relax = single_relax )

            success = check_successful( command_dict )

            # the check returns either a bool or (bool , details...)
            complete = False    # anything unrecognized counts as not successful
            failure_summary = ''
            if isinstance( success , bool ):
                complete = success
            elif len( success ) > 1 and isinstance( success[0] , bool ):
                complete = success[0]
                failure_summary += ' '+ ';'.join( [str( j ) for j in success[1:]] ) +' '

            # a "run" entry that is neither success nor failure is the
            # number of tries so far
            tries = 0
            if 'run' in command_dict.keys() and command_dict['run'] and not 'success' in command_dict['run'] and not 'failure' in command_dict['run']:
                tries = int( command_dict['run'] )
            tries += 1

            # short description for the messages below
            job_task = task_summaries[job_pair[0]]['root_filename'].split( '/' )[-1]
            job_feature = command_dict['feature']
            job_variant = ''
            if 'variant' in command_dict.keys():
                job_variant = command_dict['variant']
            this_job_description = job_task +' '+ job_feature + (' ' + job_variant)*bool( job_variant )

            if tries >= max_slurm_tries:
                # last try, success or its a failure
                print( this_job_description + ' completed successfully'*complete + (' failed with ' + str( tries ) + ' attempts')*(not complete) )
                failure_summary = 'success'*complete + (str( tries ) +' tries;failure ' + failure_summary)*(not complete)
            elif complete:
                print( this_job_description + ' simply completed successfully' )
                failure_summary = 'success'
            else:
                # record the number of tries
                print( this_job_description + ' completed' + ' successfully'*complete )
                failure_summary = str( tries )

            # NOTE(review): every job of the batch is marked as completed,
            # including those that only have a try count recorded, so they
            # are not put in another batch during this call - confirm if an
            # in-call retry is wanted
            completed.append( job_pair )

            # update the record
            command_dict['run'] = failure_summary

        # update task_summaries e.g. write them!
        # modified: so the task summary records its own name...bah!
        for i in task_summaries:
            if not 'task_summary_filename' in i['filenames'].keys():
                raise NotImplementedError( 'should input the task summary filename (not the summary itself)...' )
            else:
                # write it out
                print( 'updating: ' + i['filenames']['task_summary_filename'] )
                write_task_summary( i , i['filenames']['task_summary_filename'] )

        # need to run another batch?
        attempt += 1
Example #24
0
def create_variant_protein_structures(pdb_filename,
                                      variants,
                                      chain,
                                      use_pyrosetta=USE_PYROSETTA,
                                      pymol_environment_setup=''):
    """
    Creates one PDB file per variant in  <variants>  for chain  <chain>  of
    <pdb_filename>  and returns the list of filenames written, each named
        <root>.chain_<chain>_<variant>.pdb
    where  <root>  is  <pdb_filename>  without its ".pdb" suffix

    Each variant is text of the form
        <native residue><PDB position, optionally with insertion code><mutant residue>

    With  <use_pyrosetta>  the structure is loaded as a pose and changed
    with MutateResidue; multi-chain structures are handed to PyMOL instead
    (when PATH_TO_PYMOL is set); the loop stops at the first variant whose
    position cannot be found in the pose, so the returned list may be
    shorter than  <variants>
    Otherwise the script pymol_make_variant_structure.py is run with PyMOL,
    with  <pymol_environment_setup>  placed in front of the command

    <pymol_environment_setup>  , if given, is also run on its own first

    Raises IOError if the PyMOL script did not produce every expected file
    """
    # optionally run environment setup
    if pymol_environment_setup:
        print('setting up environment variables')
        run_local_commandline(pymol_environment_setup)

    # remove a literal ".pdb" suffix only
    # (str.rstrip treats its argument as a SET of characters, so
    # "1abd.pdb".rstrip('.pdb') gives "1a")
    if pdb_filename.endswith('.pdb'):
        root_filename = pdb_filename[:-len('.pdb')]
    else:
        root_filename = pdb_filename

    # make sure the variants have been filtered
    if use_pyrosetta:
        # load the PDB as a pose
        pose = pose_from_pdb(pdb_filename)
        # reason -> variants, collected but currently not returned
        failed = {}

        # currently cannot handle multi-chain input
        # handle this before VIPUR
        if pose.chain(pose.total_residue()) > 1:
            print('CANNOT currently handle multi-chain PDBs (as pose), using PyMOL instead!')
            if PATH_TO_PYMOL:
                # hand back what PyMOL made (this result used to be dropped)
                # the environment setup is passed on so it is also placed in
                # front of the PyMOL command
                return create_variant_protein_structures(
                    pdb_filename,
                    variants,
                    chain,
                    use_pyrosetta=False,
                    pymol_environment_setup=pymol_environment_setup)
            else:
                faulty = 'clean before VIPUR, cannot handle multi-chain PDBs'
                failed[faulty] = variants
        elif not pose.pdb_info().chain(1) == chain:
            # report the same chain ID that was just compared
            # NOTE(review): the message says the sample is skipped, but the
            # variants are still attempted below
            print('...not sure what it happening, you wanted chain ' + chain +
                  ' but VIPUR found chain ' + str(pose.pdb_info().chain(1)) +
                  ', skipping this entire sample!')
            faulty = 'clean before VIPUR, improper chain ID'
            failed[faulty] = variants

        # in case this condition is found:
        faulty = 'could not load position from PDB'
        variant_structures = []
        for variation in variants:
            # make a copy
            test_pose = Pose()
            test_pose.assign(pose)

            position = variation[1:-1]
            mutant = variation[-1]

            # split off the insertion code, if there is one
            icode = ' '  # default...this could cause problems...
            if not position[-1].isdigit():
                icode = position[-1]
                position = position[:-1]
            # always convert, with or without an insertion code
            position = int(position)

            # make sure the position was loaded
            pose_position = test_pose.pdb_info().pdb2pose(
                chain, position, icode)
            if not pose_position:
                failed.setdefault(faulty, []).append(variation)
                break  # stop the loop

            # simple, use a mover to make the change
            # appears to have trouble with N terminal variants since it uses "replace_residue"
            # code is available that does not have this problem, however reloading into Rosetta with accurately determine the position of these atoms
            make_variant = MutateResidue(pose_position, mutant)
            make_variant.apply(test_pose)

            # write out
            # (this is a plain function, there is no "self" to take the root from)
            out_filename = root_filename + '.chain_' + chain + '_' + variation + '.pdb'
            variant_structures.append(out_filename)
            test_pose.dump_pdb(out_filename)
            print('generated ' + variation +
                  ' variant structure and wrote to ' + out_filename)

        return variant_structures
    else:
        # use the pymol script
        # use default output naming
        # create command explicitly here, slightly different
        command = PATH_TO_PYMOL + ' -qcr ' + PATH_TO_VIPUR + '/pymol_make_variant_structure.py -- -p ' + pdb_filename + ' -m ' + ','.join(
            variants) + ' -c ' + chain + ' -r ' + root_filename
        if pymol_environment_setup:
            command = pymol_environment_setup + '\n\n' + command
        run_local_commandline(command)

        # reconstruct the names
        variant_structures = [
            root_filename + '.chain_' + chain + '_' + i + '.pdb'
            for i in variants
        ]

        # verify they have been made
        missing = [i for i in variant_structures if not os.path.isfile(i)]
        if missing:
            raise IOError(
                'could not make variant protein structures,\ntry checking the input PDB file or the pymol script pymol_make_variant_structure.py'
            )

        return variant_structures
Example #25
0
def run_VIPUR_tasks_SLURM( task_summaries , task_list , max_pbs_tries = 2 , ddg_monomer_cleanup = True , single_relax = False , delete_intermediate_relax_files = False ):
    """
    Submits the commands named in  <task_list>  to SLURM and monitors the
    queue until every one of them has been assessed, recording each outcome
    in  <task_summaries>  and writing the summaries to file every round
    (and once more at the end)

    <task_summaries>  is a sequence of task summary dicts, each needs
        ['filenames']['task_summary_filename'] (where it is written) and a
        ['commands'] list of command dicts
    <task_list>  holds entries  i  where  i[0]  indexes  <task_summaries>
        and  i[1]  indexes the ['commands'] of that summary
    <max_pbs_tries>  is the number of recorded attempts at which a job that
        is not complete is marked as a "failure"
    <ddg_monomer_cleanup>  calls  remove_intermediate_ddg_monomer_files
        after each assessed "ddg_monomer" command
    <single_relax>  is passed on when choosing the success check, if it is
        False the separate relax outputs are merged before a rescore
    <delete_intermediate_relax_files>  is passed on to the merge as
        "delete_old_files"

    The ['run'] entry of each assessed command is set to a string containing
    "success", a string containing "failure", or the number of attempts so
    far, nothing is returned

    Raises NotImplementedError if a task summary lacks its
    "task_summary_filename", raises Exception if a rescore matches more
    than one variant or its relax runs are not all successful
    """
    # commands already recorded as successful are not submitted again
    completed = [i for i in task_list if 'run' in task_summaries[i[0]]['commands'][i[1]] and 'success' in task_summaries[i[0]]['commands'][i[1]]['run']]
    # maps the SLURM job id to the  <task_list>  entry submitted under it
    running_or_queued = {}
    rounds = 0
    while not len( completed ) == len( task_list ):
        rounds += 1
        print( '\n\nQUEUE MONITOR ROUND ' + str( rounds ) )

        # check queue status
        queue_status = get_slurm_queue_status( only_job_status = True )

        running_jobs = len( [i for i in queue_status.values() if i in ['R']] )
        if running_jobs:
            print( str( running_jobs ) + ' are still running...' )

        # assess outcome of completed jobs
        # SLURM removes finished jobs from the queue listing, so any job we
        # submitted that is no longer listed must be assessed as well
        for job_id in list( queue_status.keys() ) + [j for j in running_or_queued.keys() if not j in queue_status.keys()]:
            if (not job_id in queue_status.keys()) or (queue_status[job_id] == 'C' and job_id in running_or_queued.keys()):
                task_id = running_or_queued[job_id][0]
                command_index = running_or_queued[job_id][1]
                command_dict = task_summaries[task_id]['commands'][command_index]

                check_successful = determine_check_successful_function( command_dict , single_relax = single_relax )

                success = check_successful( command_dict )

                # the check gives either a bool or a sequence starting with a
                # bool followed by details on the failure
                # NOTE(review): any other return shape leaves "complete"
                # unset (or holding the value from the previous job)
                failure_summary = ''
                if isinstance( success , bool ):
                    complete = success
                elif len( success ) > 1 and isinstance( success[0] , bool ):
                    complete = success[0]
                    failure_summary += ' '+ ';'.join( [str( j ) for j in success[1:]] ) +' '

                # a ['run'] entry that is neither "success" nor "failure"
                # holds the number of earlier attempts
                tries = 0
                if 'run' in command_dict.keys() and command_dict['run'] and not 'success' in command_dict['run'] and not 'failure' in command_dict['run']:
                    tries = int( command_dict['run'] )
                tries += 1

                if tries >= max_pbs_tries:
                    # out of attempts, this outcome is final
                    print( job_id + ' completed successfully'*complete + (' failed with ' + str( tries ) + ' attempts')*(not complete) )
                    failure_summary = 'success'*complete + (str( tries ) +' tries;failure ' + failure_summary)*(not complete)
                elif complete:
                    print( job_id + ' simply completed successfully' )
                    failure_summary = 'success'
                else:
                    # record the number of tries
                    print( job_id + ' completed' + ' successfully'*complete )
                    failure_summary = str( tries )

                # update the record
                task_summaries[task_id]['commands'][command_index]['run'] = failure_summary

                # optionally cleanup
                if ddg_monomer_cleanup and command_dict['feature'] == 'ddg_monomer':
                    print( 'ddg_monomer writes useless output files, deleting these now...' )
                    remove_intermediate_ddg_monomer_files()

                # NOTE(review): the job is counted as completed even when the
                # attempt failed with tries left, so it is not resubmitted
                # during this call - confirm this is intended
                completed.append( running_or_queued[job_id] )
                del running_or_queued[job_id]

        # write the updated task summaries, each records its own filename
        for i in task_summaries:
            if not 'task_summary_filename' in i['filenames'].keys():
                raise NotImplementedError( 'should input the task summary filename (not the summary itself)...' )
            else:
                # write it out
                print( 'updating: ' + i['filenames']['task_summary_filename'] )
                write_task_summary( i , i['filenames']['task_summary_filename'] )

        # jobs with status "C" or "PD" are not counted against the quota
        queue_space_occupied = len( [i for i in queue_status.values() if not i in ['C' , 'PD']] )
        available_space = SLURM_QUEUE_QUOTA - queue_space_occupied

        # determine what is left to submit EVERY round, the pause and the
        # exit test below need it even when no slot is available
        jobs_to_run = []
        for i in task_list:
            if i in completed or i in running_or_queued.values():
                continue
            candidate = task_summaries[i[0]]['commands'][i[1]]
            if 'run' in candidate and ('success' in candidate['run'] or 'failure' in candidate['run']):
                continue
            jobs_to_run.append( i )

        # launch next jobs in available slots
        # must be strictly positive, a negative count (queue over quota)
        # would slice from the end of the list and submit jobs anyway
        if available_space > 0:
            print( str( queue_space_occupied ) + ' jobs queued or running, could submit up to ' + str( available_space ) + ' more' )
            print( str( len( jobs_to_run ) ) + ' jobs still need to finish (after the currently running jobs complete)' )

            # only the next few
            for i in jobs_to_run[:available_space]:
                command_dict = task_summaries[i[0]]['commands'][i[1]]

                # if its a rescore and relax jobs were separated, need to recombine them!
                if 'rescore' in command_dict['feature']:
                    # the successful relax runs on the same variant
                    silent_filenames = [j['output_filename'] for j in task_summaries[i[0]]['commands'] if
                        j['feature'].replace( '_native' , '' ) == 'relax' and
                        j['variant'] == command_dict['variant'] and
                        'run' in j.keys() and
                        'success' in j['run']
                        ]
                    # identify the combined filenames of the matching variant
                    target_variant = [j for j in task_summaries[i[0]]['variants'].keys() if j.split( '_' )[-1] == command_dict['variant'] and j.split( '_' )[0] in command_dict['command']]
                    if not target_variant:
                        # its native
                        combined_silent_filename = task_summaries[i[0]]['other']['combined_native_silent_filename']
                        combined_score_filename = task_summaries[i[0]]['other']['combined_native_score_filename']
                    elif len( target_variant ) > 1:
                        raise Exception( '??? found more than on matching variant ???\n' + ', '.join( target_variant ) )
                    else:
                        # found it
                        combined_silent_filename = task_summaries[i[0]]['variants'][target_variant[0]]['combined_silent_filename']
                        combined_score_filename = task_summaries[i[0]]['variants'][target_variant[0]]['combined_score_filename']

                    # merge only if the combined file does not exist yet,
                    # otherwise the output filename is already correct
                    if not single_relax and not os.path.isfile( combined_silent_filename ):
                        if not len( silent_filenames ) == ROSETTA_RELAX_OPTIONS['nstruct']:
                            raise Exception( '??? somehow the matching relax run(s) has failed ???\n' + str( i ) )
                        score_filenames = [j.replace( '.silent' , '.sc' ) for j in silent_filenames]

                        merge_rosetta_relax_output( silent_filenames , combined_silent_filename , score_filenames , combined_score_filename , delete_old_files = delete_intermediate_relax_files )

                # submit the script using the queue command
                slurm_command = command_dict['sbatch_command']    # SHOULD already have an abspath to the script
                new_job_id = run_local_commandline( slurm_command , collect_stdout = True )
                # the job id is taken as the last word of the output
                new_job_id = new_job_id.strip().split( ' ' )[-1]
                print( 'submitted ' + new_job_id )

                # save the job id, assume its queued
                running_or_queued[new_job_id] = i

        else:
            print( 'no new \"positions\" are available' )

        # nothing is tracked and nothing is left to submit, no further round
        # can change "completed" (e.g. commands already marked "failure"),
        # stop here rather than loop forever
        if not running_or_queued and not jobs_to_run:
            if not len( completed ) == len( task_list ):
                print( 'no jobs are running or left to submit, but not all tasks were completed, stopping...' )
            break

        # pause...
        if queue_space_occupied or jobs_to_run:
            print( 'waiting ' + str( SLURM_QUEUE_MONITOR_DELAY ) +'s...' )
            time.sleep( SLURM_QUEUE_MONITOR_DELAY )

    # write one last time
    for i in task_summaries:
        if not 'task_summary_filename' in i['filenames'].keys():
            raise NotImplementedError( 'should input the task summary filename (not the summary itself)...' )
        else:
            # write it out
            write_task_summary( i , i['filenames']['task_summary_filename'] )