Example #1
def initiate():
    """Return the workspace and some relevant information about the
    currently running job."""
    workspace = pipeline.workspace_from_dir(sys.argv[1])
    workspace.cd_to_root()

    try:
        print('Trying qsub')
        print_debug_header()
        job_info = read_job_info(workspace.job_info_path(os.environ['JOB_ID']))
        job_info['job_id'] = int(os.environ['JOB_ID'])
        job_info['task_id'] = int(os.environ['SGE_TASK_ID']) - 1
    except Exception:
        try:
            print('Trying slurm')
            # If not qsub, slurm?
            job_info = read_job_info(workspace.slurm_cmd_file)
            print('Read job info')
            job_info['task_id'] = int(sys.argv[2])
            print('Assigned task id')
        except Exception:
            print('Trying local')
            # Apparently this is a local job.
            # TODO: Need a better way to get job info for local jobs.
            job_info = {
                'inputs': [x for x in workspace.unclaimed_inputs],
                'nstruct': 1,
                'test_run': False,
                'task_id': int(sys.argv[2])
            }

    return workspace, job_info
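For context, read_job_info() and print_debug_header() are helpers defined alongside initiate() in the same module. A minimal sketch of the reader, assuming the job parameters are serialized as JSON (the on-disk format is an assumption, not something the snippet shows):

import json

def read_job_info(params_path):
    # Load the parameters recorded at submission time, e.g. a dict with
    # 'inputs', 'nstruct', and 'test_run' keys as consumed above.
    with open(params_path) as file:
        return json.load(file)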
Example #2
def main():
    args = docopt.docopt(__doc__)
    prefix = args['--prefix'] or ''
    workspaces = [pipeline.workspace_from_dir(x) for x in args['<workspaces>']]
    print(pipeline.load_resfile(args['<workspaces>'][0]))
    print(workspaces[0].resfile_path)
    if args['--design-list']:
        designs = designs_from_list(workspaces,
                                    args['--design-list'],
                                    verbose=args['--verbose'])
    else:
        designs = find_reasonable_designs(workspaces, args['--reasonable'],
                                          args['--verbose'])
    metrics = [
        DesignNameMetric(),
        ResfileSequenceMetric(),
        SequenceClusterMetric(args['--subs-matrix']),
        # StructureClusterMetric(args['--structure-threshold']),
    ]

    discover_extra_metrics(metrics, designs)
    calculate_quality_metrics(metrics, designs, args['--verbose'])
    report_quality_metrics(designs, metrics, prefix + 'quality_metrics.xlsx')
    #report_score_vs_rmsd_funnels(designs, prefix + 'score_vs_rmsd.pdf')
    #report_pymol_sessions(designs, prefix + 'pymol_sessions')
    annotate_designs(designs, args['--symbol'])
Example #3
def main():
    args = docopt.docopt(__doc__)
    folder = args['<folder>']
    init()
    target = pose_from_file(args['<input>'])
    workspace = pipeline.workspace_from_dir(folder)
    loop = workspace.largest_loop

    #mobile = pose_from_file('/Users/benjaminrjagger/UCSF/cas_des/CPP_sims/4un3_1051_penetratin/input.pdb.gz')

    # ins_len = chain_end_res(mobile, 1) - chain_end_res(target, 1)
    # des_res = list(range(1, int(loop.start)-1)) + list(range(int(loop.end)+1, chain_end_res(mobile, 1)))
    # wt_res = list(range(1,int(loop.start) -1)) + list(range(int(loop.end)+1 - ins_len, chain_end_res(target, 1)))

    # res_map = map_unsigned_long_unsigned_long()
    # for i in range(len(des_res)):
    #     res_map[des_res[i]] = wt_res[i]

    # rmsd = CA_rmsd(mobile, target, res_map)

    for root, dirs, files in os.walk(workspace.output_dir):
        for name in files:
            if name.endswith('.pdb.gz') or name.endswith('.pdb'):
                pdbpath = os.path.join(root, name)
                mobile = pose_from_file(pdbpath)
                ins_len = chain_end_res(mobile, 1) - chain_end_res(target, 1)
                des_res = (list(range(1, int(loop.start) - 1))
                           + list(range(int(loop.end) + 1,
                                        chain_end_res(mobile, 1))))
                wt_res = (list(range(1, int(loop.start) - 1))
                          + list(range(int(loop.end) + 1 - ins_len,
                                       chain_end_res(target, 1))))

                res_map = map_unsigned_long_unsigned_long()
                for i in range(len(des_res)):
                    res_map[des_res[i]] = wt_res[i]

                rmsd = CA_rmsd(mobile, target, res_map)
                metric_name = 'EXTRA_METRIC_CA_RMSD_NO_LOOP [[-]]'

                if name.endswith('.pdb.gz'):
                    add_lines_to_gzip(pdbpath, [metric_name + ' ' + str(rmsd)])

                if name.endswith('.pdb'):
                    add_lines_reg(pdbpath, [metric_name + ' ' + str(rmsd)])
                #rmsd = all_atom_rmsd(mobile, target)
                print(rmsd)
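The des_res/wt_res bookkeeping is the heart of this example: residues before the loop map one-to-one, while residues after the loop are shifted by the insertion length. The same logic, factored into a helper for clarity (a sketch; the helper name is illustrative and the map type import assumes a standard PyRosetta build):

from pyrosetta.rosetta.std import map_unsigned_long_unsigned_long

def no_loop_res_map(loop_start, loop_end, ins_len, mobile_end, target_end):
    # Pair every non-loop residue in the designed pose with its
    # counterpart in the wild-type target, skipping the loop itself.
    des_res = (list(range(1, loop_start - 1))
               + list(range(loop_end + 1, mobile_end)))
    wt_res = (list(range(1, loop_start - 1))
              + list(range(loop_end + 1 - ins_len, target_end)))
    res_map = map_unsigned_long_unsigned_long()
    for d, w in zip(des_res, wt_res):
        res_map[d] = w
    return res_map

With chain_end_res() from the snippet, the loop body could then call no_loop_res_map(int(loop.start), int(loop.end), ins_len, chain_end_res(mobile, 1), chain_end_res(target, 1)).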
Example #4
def main():
    args = docopt.docopt(__doc__)
    print(args)
    cluster.require_qsub()

    workspace = pipeline.workspace_from_dir(args['<workspace>'])
    # If not a fragment workspace, make a new one
    if not hasattr(workspace, 'fasta_path'):
        step = workspace.get_next_step()
        workspace = pipeline.ValidationWorkspace(workspace.root_dir, step)
    workspace.check_paths()
    workspace.make_dirs()
    workspace.clear_fragments()

    inputs = pick_inputs(workspace)
    if not inputs:
        print('All inputs already have fragments')
        return  # nothing left to do

    # if '--input_model' not in args:
    #     model = workspace.input_pdb_path
    # else:
    #     model = args['--input_model']

    # Run the fragment generation script.

    generate_fragments = [
        'klab_generate_fragments',
        '--outdir',
        workspace.fragments_dir,
        '--memfree',
        args['--mem-free'],
    ] + inputs
    if not args['--ignore-loop-file']:
        generate_fragments += [
            '--loops_file',
            workspace.loops_path,
        ]

    if args['--dry-run']:
        print(' '.join(generate_fragments))
    else:
        subprocess.call(generate_fragments)
Example #5
from roseasy.pipeline import workspace_from_dir
import sys, os, glob

workspace = workspace_from_dir(sys.argv[1])
outpath = os.path.join(workspace.focus_dir, 'outputs')

for silentfile in glob.glob(outpath + '/*/silent.out'):
    os.symlink(
        silentfile,
        os.path.join(sys.argv[2], 'inputs',
                     os.path.basename(os.path.dirname(silentfile)) + '.pdb'))
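Note that os.symlink() raises OSError if the link name already exists, and the target inputs directory must already be present. A slightly more defensive version of the same loop, under the same layout assumptions:

import glob, os, sys
from roseasy.pipeline import workspace_from_dir

workspace = workspace_from_dir(sys.argv[1])
outpath = os.path.join(workspace.focus_dir, 'outputs')
inputs_dir = os.path.join(sys.argv[2], 'inputs')
os.makedirs(inputs_dir, exist_ok=True)

for silentfile in glob.glob(os.path.join(outpath, '*', 'silent.out')):
    link_name = os.path.join(
        inputs_dir, os.path.basename(os.path.dirname(silentfile)) + '.pdb')
    if not os.path.islink(link_name):
        os.symlink(silentfile, link_name)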
Example #6
def main():
    args = docopt.docopt(__doc__)
    if not args['--local'] and not args['--slurm'] and not args['--make-dirs']:
        cluster.require_qsub()

    workspace = pipeline.workspace_from_dir(args['<workspace>'])
    if args['--step']:
        step = args['--step']
    elif hasattr(workspace, 'step'):
        step = workspace.step
    else:
        step = workspace.get_next_step()

    if not args['<script>']:
        script = os.path.join(workspace.focus_dir, 'run.py')
    else:
        script = args['<script>']

    if not os.path.exists(script):
        raise pipeline.PathNotFound(script)


    # Workspace type is defined in the run script, so we first need to
    # import that.
    script_path = os.path.dirname(script)
    sys.path.insert(1, script_path)
    script_name = os.path.basename(script)[:-3]
    imp = importlib.import_module(script_name)

    workspace = imp.get_workspace(workspace.root_dir, step)
    workspace.check_paths()
    # workspace.check_rosetta()
    workspace.make_dirs()
    if args['--make-dirs']:
        sys.exit()
    # Copying the script to the focus directory helps track exactly what
    # we did at each step.
    shutil.copyfile(script, workspace.script_path)

    if args['--clear'] or args['--test-run']:
        workspace.clear_outputs()

    inputs = list(workspace.unclaimed_inputs)
    # Even with no unclaimed inputs, run at least one task.
    num_inputs = max(len(inputs), 1)

    if args['--test-run']:
        nstruct = num_inputs * 10
    else:
        nstruct = num_inputs * int(args['--nstruct'])

    if workspace.subdirs:
        for inp in inputs:
            subdir = workspace.output_subdir(inp)
            # scripting.clear_directory(subdir)

    # Submit the job

    if args['--local']:
        print('Running locally.')
        for n in range(1, nstruct + 1):
            cmd = [workspace.python_path]
            cmd.append(workspace.script_path)
            cmd.append(workspace.focus_dir)
            cmd.append(str(n))
            execute(cmd)
            # read_and_display(cmd)

    elif args['--slurm']:
        big_jobs.submit_slurm(
                workspace, 
                nstruct=nstruct,
                max_runtime=args['--max-runtime'],
                max_memory=args['--max-memory'],
                test_run=args['--test-run'],
                job_name=script_name,
                inputs=inputs
                )

    else:
        big_jobs.submit(
                workspace.script_path, workspace,
                nstruct=nstruct,
                max_runtime=args['--max-runtime'],
                max_memory=args['--max-memory'],
                test_run=args['--test-run'],
                job_name=script_name,
                inputs=inputs
                )
Example #7
    # Tail of clash_based_taskfactory(), whose opening lines are not shown;
    # it returns the packer task factory together with a matching movemap.
    repack_mask = packertask.repacking_residues()
    design_mask = packertask.designing_residues()
    movemap = setup_movemap_from_resselectors(design_mask, repack_mask)

    return tf, movemap


if __name__ == '__main__':
    test = False
    # test = True
    if not test:
        workspace, job_info = big_jobs.initiate()
        test_run = job_info.get('test_run', False)
        pdbpath = workspace.input_path(job_info)
    else:
        workspace = pipeline.workspace_from_dir(sys.argv[1])
        pdbpath = '02_designs/inputs/model_0.pdb.gz'
        test_run = True
        task_id = 1
    init('-total_threads 1 -packing:ex1 -packing:ex2 -packing:ex1aro '\
            '-use_input_sc')
    pose = pose_from_file(pdbpath)
    task_factory, movemap = clash_based_taskfactory(pdbpath, pose)
    ref = create_score_function('ref2015')
    rot = RotamerTrialsMover(ref, task_factory)
    print('APPLYING ROTAMERTRIALS')
    rot.apply(pose)

    # cst_gen_str = '''
    # <CoordinateConstraintGenerator name="backbone"
    # ca_only="true"
Example #8
def make_picks(workspace, pick_file=None, clear=False, use_cache=True, dry_run=False, keep_dups=False):
    """
    Pick a subset of designs from the previous step, based on the
    conditions specified in the given "pick" file, and symlink the picks
    into the input directory for the next round.

    An example pick file is shown below::

        threshold:
        - restraint_dist < 1
        - buried_unsatisfied_h_bonds < 1

        pareto:
        - total_score
        - restraint_dist
        - foldability

        depth: 1
        epsilon: 0.5%

    Any designs not meeting the conditions set in the "threshold" section will 
    be discarded.  Any designs that are non-dominated with respect to the 
    metrics listed in the "Pareto" section will be kept.  The "depth" and 
    "epsilon" parameters provide a measure of control over how many designs 
    are included in the Pareto front.
    """
    # Read the rules for making picks from the given file.

    if pick_file is None:
        pick_file = workspace.pick_file

    if not os.path.exists(pick_file):
        raise IOError("""\
Could not find '{}'.

Either specify a pick file on the command line, or create a file called 
`pick.yml` and put in a directory in your workspace that corresponds to the 
step you want it to apply to.""")

    import yaml
    with open(pick_file) as file:
        rules = yaml.safe_load(file)

    print("Picking designs according to '{0}'.".format(os.path.relpath(pick_file)))
    print()

    pareto = rules.get('pareto', [])
    thresholds = rules.get('threshold', [])

    known_keys = 'threshold', 'pareto', 'depth', 'epsilon'
    unknown_keys = set(rules) - set(known_keys)

    if unknown_keys:
        not_understood = '\n'.join('    ' + x for x in sorted(unknown_keys))
        did_you_mean = '\n'.join('    ' + x for x in known_keys)
        raise IOError("""\
The following parameters in '{2}' are not understood:
{0}

Did you mean:
{1}\n""".format(not_understood, did_you_mean, os.path.relpath(pick_file)))

    # Load all the metrics for the models we're picking from.

    if clear:
        workspace.clear_inputs()

    predecessor = workspace.predecessor
    print(predecessor)
    if predecessor == workspace.input_pdb_path:
        raise("Cannot pick designs for initial step.")
    metrics = []
    metadata = {}

    for input_dir in pipeline.workspace_from_dir(predecessor).output_subdirs:
        submetrics, submetadata = load(
                input_dir,
                use_cache=use_cache,
        )
        submetrics['abspath'] = submetrics.apply(
                lambda row: os.path.abspath(os.path.join(input_dir, row['path'])),
                axis='columns',
        )
        metrics.append(submetrics)
        metadata.update(submetadata)
    print('Metrics', metrics)
    metrics = pd.concat(metrics, ignore_index=True)

    # Check to make sure we know about all the metrics we were given, and 
    # produce a helpful error if we find something unexpected (e.g. maybe a 
    # typo?).  This is a little complicated for the threshold queries, because 
    # running them is the only way to find out if they have any problems.

    unknown_metrics = set(pareto) - set(metadata)

    for query in thresholds:
        try:
            metrics.query(query)
        except pd.core.computation.ops.UndefinedVariableError as err:
            # Kinda gross, but we have to parse the error message to get the 
            # name of the metric causing the problem.
            unknown_metric = re.search("'(.+)'", str(err)).group(1)
            unknown_metrics.add(unknown_metric)

    if unknown_metrics:
        not_understood = '\n'.join('    ' + x for x in sorted(unknown_metrics))
        did_you_mean = '\n'.join('    ' + x for x in sorted(metadata))
        raise IOError("""\
The following metrics are not understood:
{0}

Did you mean:
{1}\n""".format(not_understood, did_you_mean))

    # Tell the user whether high or low values are favored for each metric 
    # included in the Pareto front, so they can confirm that we're doing the 
    # right thing.
    
    if pareto:
        print("""\
Please confirm whether high (+) or low (-) values should be preferred for each 
of the following metrics:""")

        for metric in rules['pareto']:
            print("  ({dir}) {metric}".format(
                    metric=metric,
                    dir=metadata[metric].direction))

        print()
        print("""\
If there's an error, it's probably because you didn't specify a direction in 
the name of the filter, e.g. "Foldability [+]".  To avoid this problem in the 
future, add the appropriate direction (in square brackets) to the filter name 
in 'filters.xml'.  To fix the immediate problem, go into the directory 
containing your design PDBs, manually edit the file called 'metrics.yml', and 
correct the 'dir' field for any metrics necessary.""")
        print()

    # Figure out how long the longest status message will be, so we can get our 
    # output to line up nicely.

    class StatusBar:
        update_line = "  {0}:"

        def __init__(self):
            self.w1 = 30

        def init(self, df):
            self.n = len(df)
            self.w2 = len(str(self.n))
            return "{message:<{w1}} {n:>{w2}}".format(
                    message="Total number of designs",
                    n=self.n, w1=self.w1, w2=self.w2)

        def update(self, df, status):
            dn = len(df) - self.n
            self.n = len(df)
            return "{message:<{w1}} {n:>{w2}} {dn:>{w3}}".format(
                    message=self.update_line.format(status),
                    n=self.n, dn='(-{})'.format(abs(dn)),
                    w1=self.w1, w2=self.w2, w3=self.w2+3)

        def adjust_width(self, status):
            self.w1 = max(self.w1, len(self.update_line.format(status)))



    status = StatusBar()
    for query in thresholds:
        status.adjust_width(repr(query))

    print(status.init(metrics))

    # Ignore any designs that are missing data.

    metrics.dropna(inplace=True)
    print(status.update(metrics, "minus missing data"))

    # Keep only the lowest scoring model for each set of identical sequences.

    if not keep_dups:
        groups = metrics.groupby('sequence', group_keys=False)
        metrics = groups.\
                apply(lambda df: df.loc[df.total_score.idxmin()]).\
                reset_index(drop=True)
        print(status.update(metrics, 'minus duplicate sequences'))

    # Remove designs that don't pass the given thresholds.

    for query in thresholds:
        metrics = metrics.query(query)
        print(status.update(metrics, repr(query)))

    # Remove designs that aren't in the Pareto front.

    if pareto:
        def progress(i, depth, j, front):
            sys.stdout.write(
                    '\x1b[2K\r  minus Pareto dominated:    '
                    'calculating... [{}/{}] [{}/{}]'.format(i, depth, j, front))
            if i == depth and j == front:
                sys.stdout.write('\x1b[2K\r')
            sys.stdout.flush()

        metrics = find_pareto_front(
                metrics, metadata, pareto,
                depth=rules.get('depth', 1),
                epsilon=rules.get('epsilon'),
                progress=progress,
        )
        print(status.update(metrics, 'minus Pareto dominated'))

    # Remove designs that have already been picked.

    existing_inputs = set(
            os.path.abspath(os.path.realpath(x))
            for x in workspace.input_paths)
    metrics = metrics.query('abspath not in @existing_inputs')
    print(status.update(metrics, 'minus current inputs'))

    # Symlink the picked designs into the input directory of the next round.

    if not dry_run:
        existing_ids = set(
                int(x[0:-len('.pdb.gz')])
                for x in os.listdir(workspace.input_dir)
                if x.endswith('.pdb.gz'))
        next_id = max(existing_ids) + 1 if existing_ids else 0

        for id, picked_index in enumerate(metrics.index, next_id):
            target = metrics.loc[picked_index]['abspath']
            link_name = os.path.join(workspace.input_dir, '{0:04}.pdb.gz')
            scripting.relative_symlink(target, link_name.format(id))

    print()
    print("Picked {} designs.".format(len(metrics)))

    if dry_run:
        print("(Dry run: no symlinks created.)")
Example #9
def load(pdb_dir, use_cache=True, job_report=None, require_io_dir=True):
    """
    Return a variety of score and distance metrics for the structures found in
    the given directory.  As much information as possible will be cached.  Note
    that new information will only be calculated for file names that haven't
    been seen before.  If a file changes or is deleted, the cache will not be
    updated to reflect this and you may be presented with stale data.
    """

    # Make sure the given directory seems to be a reasonable place to look for
    # data, i.e. it exists and contains PDB files.

    if not os.path.exists(pdb_dir):
        raise IOError("'{}' does not exist".format(pdb_dir))
    if not os.path.isdir(pdb_dir):
        raise IOError("'{}' is not a directory".format(pdb_dir))
    if not os.listdir(pdb_dir):
        raise IOError("'{}' is empty".format(pdb_dir))
    if not glob.glob(os.path.join(pdb_dir, '*.pdb*')):
        raise IOError("'{}' doesn't contain any PDB files".format(pdb_dir))

    # The given directory must also be a workspace, so that the restraint file
    # can be found and used to calculate the "restraint_dist" metric later on.

    try:
        workspace = pipeline.workspace_from_dir(pdb_dir)
    except pipeline.WorkspaceNotFound:
        raise IOError("'{}' is not a workspace".format(pdb_dir))
    if require_io_dir and not any(
            os.path.samefile(pdb_dir, x) for x in workspace.io_dirs):
        raise IOError("'{}' is not an input or output directory".format(pdb_dir))

    # Find all the structures in the given directory, then decide which have
    # already been cached and which haven't.

    pdb_paths = glob.glob(os.path.join(pdb_dir, '*.pdb.gz')) +\
            glob.glob(os.path.join(pdb_dir, '*.pdb'))
    base_pdb_names = set(os.path.basename(x) for x in pdb_paths)
    cache_path = os.path.join(pdb_dir, 'metrics.pkl')
    metadata_path = os.path.join(pdb_dir, 'metrics.yml')

    cached_records = []
    uncached_paths = pdb_paths
    metadata = {}

    if use_cache:
        try:
            cached_records = pd.read_pickle(cache_path).to_dict('records')
            cached_paths = set(x['path'] for x in cached_records)
            uncached_paths = [
                    pdb_path for pdb_path in pdb_paths
                    if os.path.basename(pdb_path) not in cached_paths]

            with codecs.open(metadata_path, encoding='utf8') as file:
                metadata_list = [ScoreMetadata(**x) for x in yaml.safe_load(file)]
                metadata = {x.name: x for x in metadata_list}

        except Exception:
            # The cache is missing or unreadable; recalculate everything.
            cached_records = []
            uncached_paths = pdb_paths
            metadata = {}

    # Calculate score and distance metrics for the uncached paths, then combine
    # the cached and uncached data into a single data frame.

    uncached_records, uncached_metadata = \
            read_and_calculate(workspace, uncached_paths)

    all_records = pd.DataFrame(cached_records + uncached_records)
    metadata.update(uncached_metadata)

    # Make sure all the expected metrics were calculated.

    expected_metrics = [
            'total_score',
            #'restraint_dist',
            'sequence',
    ]
    for metric in expected_metrics:
        if metric not in all_records:
            print(list(all_records.keys()))
            raise IOError("'{}' wasn't calculated for the models in '{}'".format(metric, pdb_dir))

    # If everything else looks good, cache the data frame so we can load faster
    # next time.

    all_records.to_pickle(cache_path)
    with codecs.open(metadata_path, 'w', encoding='utf8') as file:
        yaml.safe_dump([v.to_dict() for v in metadata.values()], file)

    # Report how many structures had to be calculated, in case the caller is
    # interested, and return the loaded data frame.

    if job_report is not None:
        job_report['new_records'] = len(uncached_records)
        job_report['old_records'] = len(cached_records)

    return all_records, metadata
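A sketch of how load() might be driven twice to see the cache at work, assuming the path points at an input or output directory inside a workspace, as the checks above require (the path itself is illustrative):

job_report = {}
records, metadata = load('02_designs/outputs/0', job_report=job_report)
print(records[['total_score', 'sequence']].head())
print('new: {new_records}, cached: {old_records}'.format(**job_report))

# A second call should find everything already cached in metrics.pkl.
job_report = {}
records, metadata = load('02_designs/outputs/0', job_report=job_report)
print('new: {new_records}, cached: {old_records}'.format(**job_report))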
Example #10
from roseasy.standard_params import filters
import os, sys
from pyrosetta import pose_from_file
from pyrosetta import init
from roseasy.pipeline import workspace_from_dir

#rosetta_dir = '~/software/rosetta'
ws = workspace_from_dir(os.path.expanduser('~/cas/test/'))
print(ws.largest_loop)
print(ws.loops_path)
#sys.exit()
dalphaball_path = os.path.join(ws.rosetta_dir, 'source', 'external',
                               'DAlphaBall', 'DAlphaBall.gcc')
init('-holes:dalphaball {} -in:file:s {}'.format(dalphaball_path,
                                                 ws.input_pdb_path))
pose = pose_from_file(ws.input_pdb_path)

filter_list = filters.get_filters(ws, score_fragments=True, test_run=True)
#for f in filter_list:
filter_list[-1].apply(pose)

pose.dump_pdb('test_out.pdb')
Example #11
import os
from pyrosetta import create_score_function, init
from pyrosetta.io import poses_from_silent
from roseasy import big_jobs, pipeline
from roseasy.movers import fastdesign
#from roseasy.standard_params.filters import FilterContainer

def get_workspace(root_dir, step):
    return pipeline.ValidationWorkspace(root_dir, step)

if __name__=='__main__':
    workspace, job_info = big_jobs.initiate()
    test_run = job_info.get('test_run', False)

    # Figure out input pdb and create a pose
    silentpath = workspace.input_path(job_info)

    # Input pdb path for RMSD comparisons
    predecessor = pipeline.workspace_from_dir(workspace.predecessor)
    pdbpath = os.path.join(
            workspace.root_dir,
            predecessor.input_dir,
            os.path.basename(silentpath)
            )
    print('CURRENT INPUT: {}'.format(pdbpath))
    dalphaball_path = os.path.join(workspace.rosetta_dir, 'source',
            'external', 'DAlphaBall', 'DAlphaBall.gcc')
    init('-total_threads 1 -holes:dalphaball {} -in:file:s {}'.format(dalphaball_path, pdbpath))
    poses = poses_from_silent(silentpath)
    i = 0
    tasknum = int(job_info['task_id'])
    posedict = {}
    for pose in poses:
        if i % 10 == (tasknum - 1) // len(job_info['inputs']):