Beispiel #1
0
def gather_templates_from_pdb(pdbids, uniprot_domain_regex=None, chainids=None, structure_dirs=None, loglevel=None):
    """
    Gather template structures and sequences for a given set of PDB IDs.

    Rank 0 retrieves structure files, extracts UniProt accession codes from
    the SIFTS files, queries UniProt, and selects PDB chains; the selection
    is then broadcast to all MPI ranks before template extraction.

    :param pdbids: list of str
    :param uniprot_domain_regex: str
    :param chainids: dict {pdbid (str): [chainid (str)]}
    :param structure_dirs: list of str
    :return:
    """
    ensembler.utils.set_loglevel(loglevel)
    manual_overrides = ensembler.core.ManualOverrides()
    selected_pdbchains = None
    # Only the root MPI rank performs the file-system and network work.
    if mpistate.rank == 0:
        for pdbid in pdbids:
            get_structure_files_for_single_pdbchain(pdbid, structure_dirs)
        uniprot_acs = extract_uniprot_acs_from_sifts_files(pdbids)
        logger.debug('Extracted UniProt ACs: {0}'.format(uniprot_acs))
        uniprot_ac_query_string = ensembler.uniprot.build_uniprot_query_string_from_acs(uniprot_acs)
        uniprotxml = ensembler.uniprot.get_uniprot_xml(uniprot_ac_query_string)
        selected_pdbchains = extract_template_pdbchains_from_uniprot_xml(uniprotxml, uniprot_domain_regex=uniprot_domain_regex, manual_overrides=manual_overrides, specified_pdbids=pdbids, specified_chainids=chainids)

    # Share the root-rank selection with every MPI process (collective call).
    selected_pdbchains = mpistate.comm.bcast(selected_pdbchains, root=0)
    logger.debug('Selected PDB chains: {0}'.format([pdbchain['templateid'] for pdbchain in selected_pdbchains]))

    selected_templates = extract_template_pdb_chain_residues(selected_pdbchains)
    write_template_seqs_to_fasta_file(selected_templates)
    extract_template_structures_from_pdb_files(selected_templates)
    write_gather_templates_from_pdb_metadata(pdbids, uniprot_domain_regex, len(selected_templates), chainids, structure_dirs)
Beispiel #2
0
    def __init__(self, targetid, traj_filepath=None, topol_filepath=None,
                 models_data_filepath=None, process_only_these_templates=None,
                 loglevel=None, run_main=True):
        """
        Makes trajectory of the model files with added hydrogens, but prior to any refinement.
        For the specified target, makes a single topology pdb file, a single trajectory xtc file,
        and individual pdb files for each model.

        See docs on `MkTraj` for further info on parameters.

        Examples
        --------
        MkTrajImplicitStart(targetid='EGFR_HUMAN_D0')
        """
        ensembler.utils.set_loglevel(loglevel)
        ensembler.core.check_project_toplevel_dir()
        self.models_target_dir = os.path.join(default_project_dirnames.models, targetid)

        logger.debug('Working on target %s' % targetid)

        self.ensembler_stage = 'implicit-start'
        self.model_filename = 'implicit-start.pdb.gz'

        # Fall back to default locations under the target's models directory
        # when explicit paths are not provided.
        if traj_filepath is None:
            self.traj_filepath = os.path.join(
                self.models_target_dir, 'traj-{0}.xtc'.format(self.ensembler_stage)
            )
        else:
            self.traj_filepath = traj_filepath

        if topol_filepath is None:
            self.topol_filepath = os.path.join(
                self.models_target_dir, 'traj-{0}-topol.pdb'.format(self.ensembler_stage)
            )
        else:
            self.topol_filepath = topol_filepath

        if models_data_filepath is None:
            self.models_data_filepath = os.path.join(
                self.models_target_dir, 'traj-{0}-data.csv'.format(self.ensembler_stage)
            )
        else:
            self.models_data_filepath = models_data_filepath

        if process_only_these_templates:
            self.templateids = process_only_these_templates
        else:
            # Only the immediate subdirectory names (template ids) are needed;
            # previously os.walk() was materialized over the entire directory
            # tree just to read the first entry.
            self.templateids = next(os.walk(self.models_target_dir))[1]

        if run_main:
            self._gen_implicit_start_models()
            self._gen_df(model_filename=self.model_filename)
            self.df.to_csv(self.models_data_filepath, columns=['templateid', 'seqid'])
            self._construct_traj()
            self._superpose()
            self._write_traj()
Beispiel #3
0
    def _construct_traj(self):
        # Build self.traj incrementally: load the first model, then append
        # the frames of each remaining model via mdtraj's '+' concatenation.
        # Disulfide bonds are removed from each topology (presumably so the
        # per-model topologies remain mutually compatible - TODO confirm).
        logger.debug('Loading Trajectory object for model {0} ({1}/{2})'.format(self.df.templateid.iloc[0], 0, len(self.df.model_filepath)))
        traj = mdtraj.load_pdb(self.df.model_filepath[0])
        remove_disulfide_bonds_from_topology(traj.topology)
        self.traj = traj

        for m, model_filepath in enumerate(self.df.model_filepath[1:]):
            logger.debug('Loading Trajectory object for model {0} ({1}/{2})'.format(self.df.templateid.iloc[m+1], m+1, len(self.df.model_filepath)))
            traj = mdtraj.load_pdb(model_filepath)
            remove_disulfide_bonds_from_topology(traj.topology)
            self.traj += traj
Beispiel #4
0
def calc_pme_parameters(system):
    """Calculate PME parameters using scheme similar to OpenMM OpenCL platform.

    Parameters
    ----------
    system : simtk.openmm.System
        The system for which parameters are to be computed.

    Returns
    -------
    alpha : float
        The PME alpha parameter
    nx, ny, nz : int
        The grid numbers in each dimension

    """

    # Find nonbonded force.
    forces = {
        system.getForce(index).__class__.__name__: system.getForce(index)
        for index in range(system.getNumForces())
    }
    force = forces['NonbondedForce']
    tol = force.getEwaldErrorTolerance()
    boxVectors = system.getDefaultPeriodicBoxVectors()

    from numpy import sqrt, log, ceil
    from math import pow
    # Heuristic mirrors the OpenMM OpenCL platform's PME parameter selection.
    alpha = (1.0 / force.getCutoffDistance()) * sqrt(-log(2.0 * tol))
    xsize = int(ceil(2 * alpha * boxVectors[0][0] / (3 * pow(tol, 0.2))))
    ysize = int(ceil(2 * alpha * boxVectors[1][1] / (3 * pow(tol, 0.2))))
    zsize = int(ceil(2 * alpha * boxVectors[2][2] / (3 * pow(tol, 0.2))))

    logger.debug('xsize = %d, ysize = %d, zsize = %d' % (xsize, ysize, zsize))

    def findLegalDimension(minimum):
        # Return the smallest integer >= minimum whose prime factors are
        # all smaller than 8 (a "legal" FFT grid dimension).
        while (True):
            # Attempt to factor the current value.
            unfactored = minimum
            for factor in range(2, 8):
                while (unfactored > 1) and (unfactored % factor == 0):
                    # Floor division keeps this exact integer arithmetic;
                    # true division ('/') produces floats on Python 3 and can
                    # lose precision for large grid sizes.
                    unfactored //= factor

            if (unfactored == 1):
                return int(minimum)

            minimum += 1

    nx = findLegalDimension(xsize)
    ny = findLegalDimension(ysize)
    nz = findLegalDimension(zsize)

    return (alpha, nx, ny, nz)
Beispiel #5
0
def create_dir(dirpath, quiet=True):
    """
    Create a directory (including intermediate directories), tolerating the
    case where it already exists.

    :param dirpath: str
    :param quiet: bool - if False, log an info message on successful creation
    """
    import errno
    try:
        os.makedirs(dirpath)
        if not quiet:
            logger.info('Created directory "%s"' % dirpath)
    except OSError as e:
        # EEXIST means the directory is already present, which is fine;
        # any other OSError (permissions, bad path component) is re-raised.
        if e.errno == errno.EEXIST:
            logger.debug('Directory "%s" already exists - will not overwrite' % dirpath)
        else:
            raise
Beispiel #6
0
    def _mk_traj(self):
        """
        Decompress each gzipped model PDB into a temporary directory, then
        load all of them into a single mdtraj Trajectory (self.traj).
        """
        with ensembler.utils.mk_temp_dir() as tmpdir:
            model_filepaths = []
            for m, model_filepath_gz in enumerate(self.model_filepaths):
                logger.debug('Unzipping model {0}/{1}'.format(m, len(self.model_filepaths)))
                with gzip.open(model_filepath_gz) as model_file:
                    model_filepath = os.path.join(tmpdir, '{0}.pdb'.format(m))
                    model_filepaths.append(model_filepath)
                    model_text = model_file.read()
                # gzip.open defaults to binary mode, so model_text is bytes on
                # Python 3; the destination must be opened in binary mode too
                # (text mode 'w' raises TypeError for bytes).
                with open(model_filepath, 'wb') as model_file:
                    model_file.write(model_text)

            self.traj = mdtraj.load(model_filepaths)
Beispiel #7
0
def get_valid_templates_for_target(
    target,
    templates_resolved_seq,
    process_only_these_templates=None,
    model_seqid_cutoff=None,
    model_validation_score_cutoff=None,
    model_validation_score_percentile=None,
):
    """
    Select template ids for a target, then keep only those whose model
    directories contain every file listed in
    filenames_necessary_for_fah_packaging (plain or gzipped).
    """
    logger.debug("Building list of valid templates...")
    models_target_dir = os.path.join(default_project_dirnames.models,
                                     target.id)

    # Choose the candidate template ids according to the first applicable
    # selection criterion.
    if model_seqid_cutoff:
        selected_template_ids = select_templates_by_seqid_cutoff(
            target.id, seqid_cutoff=model_seqid_cutoff)
    elif model_validation_score_cutoff or model_validation_score_percentile:
        selected_template_ids = select_templates_by_validation_score(
            targetid=target.id,
            validation_score_cutoff=model_validation_score_cutoff,
            validation_score_percentile=model_validation_score_percentile,
        )
    elif process_only_these_templates:
        selected_template_ids = [
            seq_obj.id for seq_obj in templates_resolved_seq
            if seq_obj.id in process_only_these_templates
        ]
    else:
        selected_template_ids = [
            seq_obj.id for seq_obj in templates_resolved_seq
        ]

    valid_templates = []

    for template_id in selected_template_ids:
        # A template is valid only if every required file is present,
        # either uncompressed or as a .gz archive.
        required_paths = (
            os.path.join(models_target_dir, template_id, filename)
            for filename in filenames_necessary_for_fah_packaging
        )
        if all(os.path.exists(path) or os.path.exists(path + '.gz')
               for path in required_paths):
            valid_templates.append(template_id)

    logger.debug('{} valid unique initial starting conditions found'.format(
        len(valid_templates)))

    return valid_templates
Beispiel #8
0
    def _mk_traj(self):
        """
        Decompress each gzipped model PDB into a temporary directory, then
        load all of them into a single mdtraj Trajectory (self.traj).
        """
        with ensembler.utils.mk_temp_dir() as tmpdir:
            model_filepaths = []
            for m, model_filepath_gz in enumerate(self.model_filepaths):
                logger.debug('Unzipping model {0}/{1}'.format(
                    m, len(self.model_filepaths)))
                with gzip.open(model_filepath_gz) as model_file:
                    model_filepath = os.path.join(tmpdir, '{0}.pdb'.format(m))
                    model_filepaths.append(model_filepath)
                    model_text = model_file.read()
                # gzip.open defaults to binary mode, so model_text is bytes on
                # Python 3; the destination must be opened in binary mode too
                # (text mode 'w' raises TypeError for bytes).
                with open(model_filepath, 'wb') as model_file:
                    model_file.write(model_text)

            self.traj = mdtraj.load(model_filepaths)
Beispiel #9
0
def create_dir(dirpath, quiet=True):
    """
    Create a directory (including intermediate directories), tolerating the
    case where it already exists.

    :param dirpath: str
    :param quiet: bool - if False, log an info message on successful creation
    """
    import errno
    try:
        os.makedirs(dirpath)
        if not quiet:
            logger.info('Created directory "%s"' % dirpath)
    except OSError as e:
        # EEXIST means the directory is already present, which is fine;
        # any other OSError (permissions, bad path component) is re-raised.
        if e.errno == errno.EEXIST:
            logger.debug('Directory "%s" already exists - will not overwrite' %
                         dirpath)
        else:
            raise
Beispiel #10
0
def calc_pme_parameters(system):
    """Calculate PME parameters using scheme similar to OpenMM OpenCL platform.

    Parameters
    ----------
    system : simtk.openmm.System
        The system for which parameters are to be computed.

    Returns
    -------
    alpha : float
        The PME alpha parameter
    nx, ny, nz : int
        The grid numbers in each dimension

    """

    # Find nonbonded force.
    forces = { system.getForce(index).__class__.__name__ : system.getForce(index) for index in range(system.getNumForces()) }
    force = forces['NonbondedForce']
    tol = force.getEwaldErrorTolerance()
    boxVectors = system.getDefaultPeriodicBoxVectors()

    from numpy import sqrt, log, ceil
    from math import pow
    # Heuristic mirrors the OpenMM OpenCL platform's PME parameter selection.
    alpha = (1.0/force.getCutoffDistance())*sqrt(-log(2.0*tol))
    xsize = int(ceil(2*alpha*boxVectors[0][0]/(3*pow(tol, 0.2))))
    ysize = int(ceil(2*alpha*boxVectors[1][1]/(3*pow(tol, 0.2))))
    zsize = int(ceil(2*alpha*boxVectors[2][2]/(3*pow(tol, 0.2))))

    logger.debug('xsize = %d, ysize = %d, zsize = %d' % (xsize,ysize,zsize))

    def findLegalDimension(minimum):
        # Return the smallest integer >= minimum whose prime factors are
        # all smaller than 8 (a "legal" FFT grid dimension).
        while (True):
            # Attempt to factor the current value.
            unfactored = minimum
            for factor in range(2, 8):
                while (unfactored > 1) and (unfactored%factor == 0):
                    # Floor division keeps this exact integer arithmetic;
                    # true division ('/') produces floats on Python 3 and can
                    # lose precision for large grid sizes.
                    unfactored //= factor

            if (unfactored == 1):
                return int(minimum)

            minimum += 1

    nx = findLegalDimension(xsize)
    ny = findLegalDimension(ysize)
    nz = findLegalDimension(zsize)

    return (alpha, nx, ny, nz)
Beispiel #11
0
def molprobity_validation(targetid, ensembler_stage=None, loglevel=None):
    """
    Run MolProbity oneline analysis for all valid models of a target,
    distributing models across MPI ranks, then write a sorted score list
    (rank 0 only).

    :param targetid: str
    :param ensembler_stage: str or None - modeling stage to validate; if None,
        the most advanced available stage is determined on rank 0 and
        broadcast to every rank.
    :param loglevel: str or None
    """
    set_loglevel(loglevel)
    # The stage argument is assumed identical on every rank, so all ranks
    # agree up front on whether a broadcast is required. MPI bcast is a
    # collective call and must be executed by every rank or none.
    stage_was_unspecified = ensembler_stage is None
    valid_model_ids = []
    if mpistate.rank == 0:
        if ensembler_stage is None:
            ensembler_stage = get_most_advanced_ensembler_modeling_stage(
                targetid)
        valid_model_ids = get_valid_model_ids(ensembler_stage, targetid)
    if stage_was_unspecified:
        # Previously this was gated on `ensembler_stage is None`, which was
        # already False on rank 0 after the assignment above - rank 0 skipped
        # the bcast while the other ranks entered it, mismatching the
        # collective call. All ranks now participate.
        ensembler_stage = mpistate.comm.bcast(ensembler_stage, root=0)
    valid_model_ids = mpistate.comm.bcast(valid_model_ids, root=0)
    nvalid_model_ids = len(valid_model_ids)
    model_structure_filename = model_filenames_by_ensembler_stage[
        ensembler_stage]

    models_target_dir = os.path.join(default_project_dirnames.models, targetid)
    molprobity_results_filepath = os.path.join(
        models_target_dir,
        'validation_scores_sorted-molprobity-{}'.format(ensembler_stage))

    # Round-robin distribution of models across MPI ranks.
    molprobity_scores_sublist = []
    for model_index in range(mpistate.rank, nvalid_model_ids, mpistate.size):
        model_id = valid_model_ids[model_index]

        logger.debug('MPI process {} working on model {}'.format(
            mpistate.rank, model_id))

        molprobity_score = run_molprobity_oneline_analysis_and_write_results(
            targetid,
            model_id,
            ensembler_stage,
            model_structure_filename=model_structure_filename,
            models_target_dir=models_target_dir,
        )

        molprobity_scores_sublist.append((model_id, molprobity_score))

    # Gather per-rank sublists on rank 0, flatten, sort by score, and write.
    molprobity_scores_gathered_list = mpistate.comm.gather(
        molprobity_scores_sublist, root=0)
    if mpistate.rank == 0:
        molprobity_scores_list_of_tuples = [
            item for sublist in molprobity_scores_gathered_list
            for item in sublist
        ]
        molprobity_scores_sorted = sorted(molprobity_scores_list_of_tuples,
                                          key=lambda x: x[1])
        write_molprobity_scores_list(molprobity_scores_sorted,
                                     molprobity_results_filepath)
Beispiel #12
0
    def _construct_traj(self):
        # Build self.traj incrementally: load the first model, then append
        # the frames of each remaining model via mdtraj's '+' concatenation.
        # Disulfide bonds are removed from each topology (presumably so the
        # per-model topologies remain mutually compatible - TODO confirm).
        logger.debug(
            'Loading Trajectory object for model {0} ({1}/{2})'.format(
                self.df.templateid.iloc[0], 0, len(self.df.model_filepath)))
        traj = mdtraj.load_pdb(self.df.model_filepath[0])
        remove_disulfide_bonds_from_topology(traj.topology)
        self.traj = traj

        for m, model_filepath in enumerate(self.df.model_filepath[1:]):
            logger.debug(
                'Loading Trajectory object for model {0} ({1}/{2})'.format(
                    self.df.templateid.iloc[m + 1], m + 1,
                    len(self.df.model_filepath)))
            traj = mdtraj.load_pdb(model_filepath)
            remove_disulfide_bonds_from_topology(traj.topology)
            self.traj += traj
Beispiel #13
0
    def __init__(self, targetid, traj_filepath=None, topol_filepath=None,
                 models_data_filepath=None, process_only_these_templates=None,
                 loglevel=None, run_main=True):
        """
        Build a trajectory from the target's models.

        Default output paths are derived from the target's models directory
        when not supplied explicitly.
        """
        ensembler.utils.set_loglevel(loglevel)
        ensembler.core.check_project_toplevel_dir()
        self.models_target_dir = os.path.join(default_project_dirnames.models, targetid)

        logger.debug('Working on target %s' % targetid)

        self.ensembler_stage = 'implicit-start'
        self.model_filename = 'implicit-start.pdb.gz'

        # Fall back to default locations under the target's models directory
        # when explicit paths are not provided.
        if traj_filepath is None:
            self.traj_filepath = os.path.join(
                self.models_target_dir, 'traj-{0}.xtc'.format(self.ensembler_stage)
            )
        else:
            self.traj_filepath = traj_filepath

        if topol_filepath is None:
            self.topol_filepath = os.path.join(
                self.models_target_dir, 'traj-{0}-topol.pdb'.format(self.ensembler_stage)
            )
        else:
            self.topol_filepath = topol_filepath

        if models_data_filepath is None:
            self.models_data_filepath = os.path.join(
                self.models_target_dir, 'traj-{0}-data.csv'.format(self.ensembler_stage)
            )
        else:
            self.models_data_filepath = models_data_filepath

        if process_only_these_templates:
            self.templateids = process_only_these_templates
        else:
            # `os.walk(...).next()` is Python-2-only; the builtin next() works
            # on both Python 2 and 3. The first walk entry's second element is
            # the list of immediate subdirectory names (template ids).
            self.templateids = next(os.walk(self.models_target_dir))[1]

        if run_main:
            self._gen_implicit_start_models()
            self._gen_df(model_filename=self.model_filename)
            self.df.to_csv(self.models_data_filepath, columns=['templateid', 'seqid'])
            self._construct_traj()
            self._superpose()
            self._write_traj()
Beispiel #14
0
def setup_system_and_integrator_files(target, template, temperature,
                                      collision_rate, timestep):
    """
    Write the system.xml and integrator.xml input files for a target's
    Folding@home project directory, based on one template's explicit-solvent
    system and state files.

    :param target: object with an `id` attribute
    :param template: str - template id
    :param temperature: simtk.unit temperature (used for the integrator and
        any MonteCarloBarostat)
    :param collision_rate: simtk.unit collision rate for the Langevin integrator
    :param timestep: simtk.unit timestep for the Langevin integrator
    :return: the deserialized (and modified) simtk.openmm.System
    """
    logger.debug(
        'Copying system and integrator files for template {}'.format(template))
    models_target_dir = os.path.join(default_project_dirnames.models,
                                     target.id)
    template_dir = os.path.join(models_target_dir, template)
    target_project_dir = os.path.join(fah_projects_dir, target.id)
    source_system_filepath = os.path.join(template_dir, 'explicit-system.xml')
    source_state_filepath = os.path.join(template_dir, 'explicit-state.xml')
    dest_system_filepath = os.path.join(target_project_dir, 'system.xml')
    dest_integrator_filepath = os.path.join(target_project_dir,
                                            'integrator.xml')

    # Source files may be stored gzipped or plain.
    system = mm.XmlSerializer.deserialize(
        read_file_contents_gz_or_not(source_system_filepath))
    state = mm.XmlSerializer.deserialize(
        read_file_contents_gz_or_not(source_state_filepath))

    # Substitute default box vectors in system with those from state.
    box_vectors = state.getPeriodicBoxVectors()
    system.setDefaultPeriodicBoxVectors(*box_vectors)

    # Set PME parameters explicitly to minimize discrepancy between Reference and OpenCL/CUDA if not already set explicitly.
    ensure_pme_parameters_are_explicit(system)

    # Create new integrator to use.
    integrator = mm.LangevinIntegrator(temperature, collision_rate, timestep)

    # Make sure MonteCarloBarostat temperature matches set temperature.
    forces = {
        system.getForce(index).__class__.__name__: system.getForce(index)
        for index in range(system.getNumForces())
    }
    if 'MonteCarloBarostat' in forces:
        forces['MonteCarloBarostat'].setTemperature(temperature)

    # Serialize System.
    with open(dest_system_filepath, 'w') as dest_system_file:
        dest_system_file.write(mm.XmlSerializer.serialize(system))

    # Serialize Integrator
    with open(dest_integrator_filepath, 'w') as dest_integrator_file:
        dest_integrator_file.write(mm.XmlSerializer.serialize(integrator))

    return system
Beispiel #15
0
def sort_valid_templates_by_seqid(target, valid_templates):
    """
    Sort template ids by decreasing sequence identity to the target.

    :param target: object with an `id` attribute
    :param valid_templates: iterable of template id str
    :return: tuple of template ids, highest sequence identity first
        (empty tuple if no templates are given)
    """
    logger.debug(
        "Sorting templates in order of decreasing sequence identity...")
    models_target_dir = os.path.join(default_project_dirnames.models,
                                     target.id)

    seqids = [get_seqid_for_model(models_target_dir, template)
              for template in valid_templates]

    sorted_valid_templates_and_seqids = sorted(zip(valid_templates, seqids),
                                               reverse=True,
                                               key=lambda x: x[1])

    # zip(*...) returns an iterator on Python 3 and cannot be indexed with
    # [0] as on Python 2; extract the template-id column explicitly.
    sorted_valid_templates = tuple(
        pair[0] for pair in sorted_valid_templates_and_seqids)
    return sorted_valid_templates
Beispiel #16
0
def setup_system_and_integrator_files(target,
                                      template,
                                      temperature,
                                      collision_rate,
                                      timestep
                                      ):
    """
    Write the system.xml and integrator.xml input files for a target's
    Folding@home project directory, based on one template's explicit-solvent
    system and state files.

    :param target: object with an `id` attribute
    :param template: str - template id
    :param temperature: simtk.unit temperature (used for the integrator and
        any MonteCarloBarostat)
    :param collision_rate: simtk.unit collision rate for the Langevin integrator
    :param timestep: simtk.unit timestep for the Langevin integrator
    :return: the deserialized (and modified) simtk.openmm.System
    """
    logger.debug('Copying system and integrator files for template {}'.format(template))
    models_target_dir = os.path.join(default_project_dirnames.models, target.id)
    template_dir = os.path.join(models_target_dir, template)
    target_project_dir = os.path.join(fah_projects_dir, target.id)
    source_system_filepath = os.path.join(template_dir, 'explicit-system.xml')
    source_state_filepath = os.path.join(template_dir, 'explicit-state.xml')
    dest_system_filepath = os.path.join(target_project_dir, 'system.xml')
    dest_integrator_filepath = os.path.join(target_project_dir, 'integrator.xml')

    # Source files may be stored gzipped or plain.
    system = mm.XmlSerializer.deserialize(
        read_file_contents_gz_or_not(source_system_filepath)
    )
    state = mm.XmlSerializer.deserialize(
        read_file_contents_gz_or_not(source_state_filepath)
    )

    # Substitute default box vectors in system with those from state.
    box_vectors = state.getPeriodicBoxVectors()
    system.setDefaultPeriodicBoxVectors(*box_vectors)

    # Set PME parameters explicitly to minimize discrepancy between Reference and OpenCL/CUDA if not already set explicitly.
    ensure_pme_parameters_are_explicit(system)

    # Create new integrator to use.
    integrator = mm.LangevinIntegrator(temperature, collision_rate, timestep)

    # Make sure MonteCarloBarostat temperature matches set temperature.
    forces = { system.getForce(index).__class__.__name__ : system.getForce(index) for index in range(system.getNumForces()) }
    if 'MonteCarloBarostat' in forces:
        forces['MonteCarloBarostat'].setTemperature(temperature)

    # Serialize System.
    with open(dest_system_filepath, 'w') as dest_system_file:
        dest_system_file.write(mm.XmlSerializer.serialize(system))

    # Serialize Integrator
    with open(dest_integrator_filepath, 'w') as dest_integrator_file:
        dest_integrator_file.write(mm.XmlSerializer.serialize(integrator))

    return system
Beispiel #17
0
def sort_valid_templates_by_seqid(target, valid_templates):
    """
    Sort template ids by decreasing sequence identity to the target.

    :param target: object with an `id` attribute
    :param valid_templates: iterable of template id str
    :return: tuple of template ids, highest sequence identity first
        (empty tuple if no templates are given)
    """
    logger.debug("Sorting templates in order of decreasing sequence identity...")
    models_target_dir = os.path.join(default_project_dirnames.models, target.id)

    seqids = [get_seqid_for_model(models_target_dir, template)
              for template in valid_templates]

    sorted_valid_templates_and_seqids = sorted(
        zip(valid_templates, seqids),
        reverse=True,
        key=lambda x: x[1]
    )

    # zip(*...) returns an iterator on Python 3 and cannot be indexed with
    # [0] as on Python 2; extract the template-id column explicitly.
    sorted_valid_templates = tuple(
        pair[0] for pair in sorted_valid_templates_and_seqids)
    return sorted_valid_templates
Beispiel #18
0
def get_valid_templates_for_target(target,
                                   templates_resolved_seq,
                                   process_only_these_templates=None,
                                   model_seqid_cutoff=None,
                                   model_validation_score_cutoff=None,
                                   model_validation_score_percentile=None,
                                   ):
    """
    Select template ids for a target, then keep only those whose model
    directories contain every file listed in
    filenames_necessary_for_fah_packaging (plain or gzipped).
    """
    logger.debug("Building list of valid templates...")
    models_target_dir = os.path.join(default_project_dirnames.models, target.id)

    # Choose the candidate template ids according to the first applicable
    # selection criterion.
    if model_seqid_cutoff:
        selected_template_ids = select_templates_by_seqid_cutoff(
            target.id, seqid_cutoff=model_seqid_cutoff
        )
    elif model_validation_score_cutoff or model_validation_score_percentile:
        selected_template_ids = select_templates_by_validation_score(
            targetid=target.id,
            validation_score_cutoff=model_validation_score_cutoff,
            validation_score_percentile=model_validation_score_percentile,
        )
    elif process_only_these_templates:
        selected_template_ids = [
            seq_obj.id for seq_obj in templates_resolved_seq
            if seq_obj.id in process_only_these_templates
        ]
    else:
        selected_template_ids = [seq_obj.id for seq_obj in templates_resolved_seq]

    valid_templates = []

    for template_id in selected_template_ids:
        # A template is valid only if every required file is present,
        # either uncompressed or as a .gz archive.
        required_paths = (
            os.path.join(models_target_dir, template_id, filename)
            for filename in filenames_necessary_for_fah_packaging
        )
        if all(os.path.exists(path) or os.path.exists(path + '.gz')
               for path in required_paths):
            valid_templates.append(template_id)

    logger.debug('{} valid unique initial starting conditions found'.format(len(valid_templates)))

    return valid_templates
Beispiel #19
0
def gather_templates_from_pdb(pdbids,
                              uniprot_domain_regex=None,
                              chainids=None,
                              structure_dirs=None,
                              loglevel=None):
    """
    Gather template structures and sequences for a given set of PDB IDs.

    Rank 0 retrieves structure files, extracts UniProt accession codes from
    the SIFTS files, queries UniProt, and selects PDB chains; the selection
    is then broadcast to all MPI ranks before template extraction.

    :param pdbids: list of str
    :param uniprot_domain_regex: str
    :param chainids: dict {pdbid (str): [chainid (str)]}
    :param structure_dirs: list of str
    :return:
    """
    ensembler.utils.set_loglevel(loglevel)
    manual_overrides = ensembler.core.ManualOverrides()
    selected_pdbchains = None
    # Only the root MPI rank performs the file-system and network work.
    if mpistate.rank == 0:
        for pdbid in pdbids:
            get_structure_files_for_single_pdbchain(pdbid, structure_dirs)
        uniprot_acs = extract_uniprot_acs_from_sifts_files(pdbids)
        logger.debug('Extracted UniProt ACs: {0}'.format(uniprot_acs))
        uniprot_ac_query_string = ensembler.uniprot.build_uniprot_query_string_from_acs(
            uniprot_acs)
        uniprotxml = ensembler.uniprot.get_uniprot_xml(uniprot_ac_query_string)
        selected_pdbchains = extract_template_pdbchains_from_uniprot_xml(
            uniprotxml,
            uniprot_domain_regex=uniprot_domain_regex,
            manual_overrides=manual_overrides,
            specified_pdbids=pdbids,
            specified_chainids=chainids)

    # Share the root-rank selection with every MPI process (collective call).
    selected_pdbchains = mpistate.comm.bcast(selected_pdbchains, root=0)
    logger.debug('Selected PDB chains: {0}'.format(
        [pdbchain['templateid'] for pdbchain in selected_pdbchains]))

    selected_templates = extract_template_pdb_chain_residues(
        selected_pdbchains)
    write_template_seqs_to_fasta_file(selected_templates)
    extract_template_structures_from_pdb_files(selected_templates)
    write_gather_templates_from_pdb_metadata(pdbids, uniprot_domain_regex,
                                             len(selected_templates), chainids,
                                             structure_dirs)
Beispiel #20
0
def gather_templates_from_uniprot(uniprot_query_string, uniprot_domain_regex=None, structure_dirs=None, pdbids=None, chainids=None, loglevel=None):
    """
    Search UniProt for a set of template proteins with a user-defined query
    string, then save IDs, sequences and structures.

    :param uniprot_query_string: str - UniProt search query
    :param uniprot_domain_regex: str or None
    :param structure_dirs: list of str or None
    :param pdbids: list of str or None - restrict to these PDB IDs
    :param chainids: dict or None - restrict to these chain IDs
    :param loglevel: str or None
    """
    ensembler.utils.set_loglevel(loglevel)
    manual_overrides = ensembler.core.ManualOverrides()
    selected_pdbchains = None
    # Only the root MPI rank performs the network and file-system work.
    if mpistate.rank == 0:
        uniprotxml = ensembler.uniprot.get_uniprot_xml(uniprot_query_string)
        log_unique_domain_names(uniprot_query_string, uniprotxml)
        if uniprot_domain_regex is not None:
            log_unique_domain_names_selected_by_regex(uniprot_domain_regex, uniprotxml)

        selected_pdbchains = extract_template_pdbchains_from_uniprot_xml(uniprotxml, uniprot_domain_regex=uniprot_domain_regex, manual_overrides=manual_overrides, specified_pdbids=pdbids, specified_chainids=chainids)
        get_structure_files(selected_pdbchains, structure_dirs)

    # Share the root-rank selection with every MPI process (collective call).
    selected_pdbchains = mpistate.comm.bcast(selected_pdbchains, root=0)
    logger.debug('Selected PDB chains: {0}'.format([pdbchain['templateid'] for pdbchain in selected_pdbchains]))

    selected_templates = extract_template_pdb_chain_residues(selected_pdbchains)
    write_template_seqs_to_fasta_file(selected_templates)
    extract_template_structures_from_pdb_files(selected_templates)
    write_gather_templates_from_uniprot_metadata(uniprot_query_string, uniprot_domain_regex, len(selected_templates), structure_dirs)
Beispiel #21
0
def molprobity_validation(targetid, ensembler_stage=None, loglevel=None):
    """
    Run MolProbity oneline analysis for all valid models of a target,
    distributing models across MPI ranks, then write a sorted score list
    (rank 0 only).

    :param targetid: str
    :param ensembler_stage: str or None - modeling stage to validate; if None,
        the most advanced available stage is determined on rank 0 and
        broadcast to every rank.
    :param loglevel: str or None
    """
    set_loglevel(loglevel)
    # The stage argument is assumed identical on every rank, so all ranks
    # agree up front on whether a broadcast is required. MPI bcast is a
    # collective call and must be executed by every rank or none.
    stage_was_unspecified = ensembler_stage is None
    valid_model_ids = []
    if mpistate.rank == 0:
        if ensembler_stage is None:
            ensembler_stage = get_most_advanced_ensembler_modeling_stage(targetid)
        valid_model_ids = get_valid_model_ids(ensembler_stage, targetid)
    if stage_was_unspecified:
        # Previously this was gated on `ensembler_stage is None`, which was
        # already False on rank 0 after the assignment above - rank 0 skipped
        # the bcast while the other ranks entered it, mismatching the
        # collective call. All ranks now participate.
        ensembler_stage = mpistate.comm.bcast(ensembler_stage, root=0)
    valid_model_ids = mpistate.comm.bcast(valid_model_ids, root=0)
    nvalid_model_ids = len(valid_model_ids)
    model_structure_filename = model_filenames_by_ensembler_stage[ensembler_stage]

    models_target_dir = os.path.join(default_project_dirnames.models, targetid)
    molprobity_results_filepath = os.path.join(
        models_target_dir, "validation_scores_sorted-molprobity-{}".format(ensembler_stage)
    )

    # Round-robin distribution of models across MPI ranks.
    molprobity_scores_sublist = []
    for model_index in range(mpistate.rank, nvalid_model_ids, mpistate.size):
        model_id = valid_model_ids[model_index]

        logger.debug("MPI process {} working on model {}".format(mpistate.rank, model_id))

        molprobity_score = run_molprobity_oneline_analysis_and_write_results(
            targetid,
            model_id,
            ensembler_stage,
            model_structure_filename=model_structure_filename,
            models_target_dir=models_target_dir,
        )

        molprobity_scores_sublist.append((model_id, molprobity_score))

    # Gather per-rank sublists on rank 0, flatten, sort by score, and write.
    molprobity_scores_gathered_list = mpistate.comm.gather(molprobity_scores_sublist, root=0)
    if mpistate.rank == 0:
        molprobity_scores_list_of_tuples = [item for sublist in molprobity_scores_gathered_list for item in sublist]
        molprobity_scores_sorted = sorted(molprobity_scores_list_of_tuples, key=lambda x: x[1])
        write_molprobity_scores_list(molprobity_scores_sorted, molprobity_results_filepath)
Beispiel #22
0
def gather_templates_from_uniprot(uniprot_query_string,
                                  uniprot_domain_regex=None,
                                  structure_dirs=None,
                                  pdbids=None,
                                  chainids=None,
                                  loglevel=None):
    """
    Search UniProt for a set of template proteins with a user-defined query
    string, then save IDs, sequences and structures.

    :param uniprot_query_string: str - UniProt search query
    :param uniprot_domain_regex: str or None
    :param structure_dirs: list of str or None
    :param pdbids: list of str or None - restrict to these PDB IDs
    :param chainids: dict or None - restrict to these chain IDs
    :param loglevel: str or None
    """
    ensembler.utils.set_loglevel(loglevel)
    manual_overrides = ensembler.core.ManualOverrides()
    selected_pdbchains = None
    # Only the root MPI rank performs the network and file-system work.
    if mpistate.rank == 0:
        uniprotxml = ensembler.uniprot.get_uniprot_xml(uniprot_query_string)
        log_unique_domain_names(uniprot_query_string, uniprotxml)
        if uniprot_domain_regex is not None:
            log_unique_domain_names_selected_by_regex(uniprot_domain_regex,
                                                      uniprotxml)

        selected_pdbchains = extract_template_pdbchains_from_uniprot_xml(
            uniprotxml,
            uniprot_domain_regex=uniprot_domain_regex,
            manual_overrides=manual_overrides,
            specified_pdbids=pdbids,
            specified_chainids=chainids)
        get_structure_files(selected_pdbchains, structure_dirs)

    # Share the root-rank selection with every MPI process (collective call).
    selected_pdbchains = mpistate.comm.bcast(selected_pdbchains, root=0)
    logger.debug('Selected PDB chains: {0}'.format(
        [pdbchain['templateid'] for pdbchain in selected_pdbchains]))

    selected_templates = extract_template_pdb_chain_residues(
        selected_pdbchains)
    write_template_seqs_to_fasta_file(selected_templates)
    extract_template_structures_from_pdb_files(selected_templates)
    write_gather_templates_from_uniprot_metadata(uniprot_query_string,
                                                 uniprot_domain_regex,
                                                 len(selected_templates),
                                                 structure_dirs)
Beispiel #23
0
def run_molprobity_oneline_analysis_and_write_results(
    targetid,
    model_id,
    ensembler_stage,
    model_structure_filename=None,
    models_target_dir=None,
    check_for_existing_results=True,
):
    """Run MolProbity's one-line analysis for a single model and write the
    results to a per-model YAML file.

    Returns the MolProbity score, or None if MolProbity produced no results.
    When ``check_for_existing_results`` is True and a previously computed
    score is found on disk, it is returned without rerunning the analysis.
    """
    if model_structure_filename is None:
        model_structure_filename = model_filenames_by_ensembler_stage[ensembler_stage]
    if models_target_dir is None:
        models_target_dir = os.path.join(default_project_dirnames.models, targetid)

    results_output_filepath = os.path.join(
        models_target_dir, model_id, 'molprobity-{}.yaml'.format(ensembler_stage))

    # Short-circuit if a previously computed score already exists on disk.
    if check_for_existing_results and os.path.exists(results_output_filepath):
        with open(results_output_filepath) as results_output_file:
            prev_results = yaml.load(stream=results_output_file, Loader=YamlLoader)
        prev_molprobity_score = prev_results.get('MolProbityScore')
        if prev_molprobity_score is not None:
            logger.debug('Existing MolProbity score of {} found for model {}'.format(
                prev_molprobity_score, model_id))
            return prev_molprobity_score

    molprobity_results = run_molprobity_oneline_analysis(
        targetid, model_id, model_structure_filename)
    if molprobity_results is None:
        logger.debug('MolProbity returned no results for model {}'.format(model_id))
        return None

    molprobity_score = molprobity_results.get('MolProbityScore')
    logger.debug('MolProbity score of {} calculated for model {}'.format(
        molprobity_score, model_id))

    # Only persist results when MolProbity actually produced a score.
    if molprobity_score is not None:
        write_molprobity_results_for_target(
            molprobity_results, models_target_dir, model_id, ensembler_stage)
    return molprobity_score
Beispiel #24
0
def run_molprobity_oneline_analysis_and_write_results(
    targetid,
    model_id,
    ensembler_stage,
    model_structure_filename=None,
    models_target_dir=None,
    check_for_existing_results=True,
):
    """Run MolProbity's one-line analysis for a single model and write the
    results to a per-model YAML file (``molprobity-<stage>.yaml``).

    Returns the MolProbity score (or None if MolProbity produced no results).
    When ``check_for_existing_results`` is True and a previously computed
    score is found on disk, it is returned without rerunning the analysis.
    """
    # Fall back to the stage's canonical model filename and the default
    # per-target models directory when not explicitly provided.
    if model_structure_filename is None:
        model_structure_filename = model_filenames_by_ensembler_stage[ensembler_stage]
    if models_target_dir is None:
        models_target_dir = os.path.join(default_project_dirnames.models, targetid)

    results_output_filepath = os.path.join(models_target_dir, model_id, "molprobity-{}.yaml".format(ensembler_stage))

    # Short-circuit: reuse an existing score rather than rerunning MolProbity.
    if check_for_existing_results:
        if os.path.exists(results_output_filepath):
            with open(results_output_filepath) as results_output_file:
                prev_results = yaml.load(stream=results_output_file, Loader=YamlLoader)
            prev_molprobity_score = prev_results.get("MolProbityScore")
            if prev_molprobity_score is not None:
                logger.debug(
                    "Existing MolProbity score of {} found for model {}".format(prev_molprobity_score, model_id)
                )
                return prev_molprobity_score

    molprobity_results = run_molprobity_oneline_analysis(targetid, model_id, model_structure_filename)
    if molprobity_results is None:
        logger.debug("MolProbity returned no results for model {}".format(model_id))
        return None

    logger.debug(
        "MolProbity score of {} calculated for model {}".format(molprobity_results.get("MolProbityScore"), model_id)
    )

    # Only persist results when MolProbity actually produced a score.
    molprobity_score = molprobity_results.get("MolProbityScore")
    if molprobity_score is not None:
        write_molprobity_results_for_target(molprobity_results, models_target_dir, model_id, ensembler_stage)
    return molprobity_score
Beispiel #25
0
def build_model(target,
                template_resolved_seq,
                target_setup_data,
                write_modeller_restraints_file=False,
                loglevel=None):
    """Uses Modeller to build a homology model for a given target and
    template.

    Will not run Modeller if the output files already exist.

    Parameters
    ----------
    target : BioPython SeqRecord
    template_resolved_seq : BioPython SeqRecord
        Must be a corresponding .pdb template file with the same ID in the
        templates/structures directory.
    target_setup_data : TargetSetupData obj
    write_modeller_restraints_file : bool
        Write file containing restraints used by Modeller - note that this file can be relatively
        large, e.g. ~300KB per model for a protein kinase domain target.
    loglevel : bool
    """
    ensembler.utils.set_loglevel(loglevel)

    # Prefer the loop-remodeled template structure if one exists; otherwise
    # fall back to the resolved (as-deposited) template structure.
    template_structure_dir = os.path.abspath(
        ensembler.core.default_project_dirnames.
        templates_structures_modeled_loops)

    if os.path.exists(
            os.path.join(template_structure_dir,
                         template_resolved_seq.id + '.pdb')):
        # A pdbfixed structure exists, so use its accompanying sequence
        # (which may differ from the resolved sequence).
        remodeled_seq_filepath = os.path.join(
            ensembler.core.default_project_dirnames.
            templates_structures_modeled_loops,
            template_resolved_seq.id + '-pdbfixed.fasta')
        template = list(Bio.SeqIO.parse(remodeled_seq_filepath, 'fasta'))[0]
    else:
        template = template_resolved_seq
        template_structure_dir = os.path.abspath(
            ensembler.core.default_project_dirnames.
            templates_structures_resolved)

    model_dir = os.path.abspath(
        os.path.join(target_setup_data.models_target_dir, template.id))
    if not os.path.exists(model_dir):
        ensembler.utils.create_dir(model_dir)
    model_pdbfilepath = os.path.abspath(os.path.join(model_dir,
                                                     'model.pdb.gz'))
    modeling_log_filepath = os.path.abspath(
        os.path.join(model_dir, 'modeling-log.yaml'))

    check_model_pdbfilepath_ends_in_pdbgz(model_pdbfilepath)
    # Strip the '.gz' suffix to get the uncompressed output path.
    model_pdbfilepath_uncompressed = model_pdbfilepath[:-3]

    # Skip work entirely if a previous run already produced all outputs.
    if check_all_model_files_present(model_dir):
        logger.debug(
            "Output files already exist for target '%s' // template '%s'; files were not overwritten."
            % (target.id, template.id))
        return

    logger.info(
        '-------------------------------------------------------------------------\n'
        'Modelling "%s" => "%s"\n'
        '-------------------------------------------------------------------------'
        % (target.id, template.id))

    # aln = align_target_template(target, template)
    # NOTE(review): the alignment-generation calls are commented out, so
    # 'alignment.pir' is presumably created by an earlier pipeline stage --
    # confirm it exists in model_dir before this function runs.
    aln_filepath = os.path.abspath(os.path.join(model_dir, 'alignment.pir'))
    # write_modeller_pir_aln_file(aln, target, template, pir_aln_filepath=aln_filepath)
    log_file = init_build_model_logfile(modeling_log_filepath)

    # Modeller is run inside a temporary directory to isolate its scratch
    # files; success/failure is recorded in the YAML modeling log.
    with ensembler.utils.enter_temp_dir():
        try:
            start = datetime.datetime.utcnow()
            shutil.copy(aln_filepath, 'alignment.pir')
            run_modeller(
                target,
                template,
                model_dir,
                model_pdbfilepath,
                model_pdbfilepath_uncompressed,
                template_structure_dir,
                write_modeller_restraints_file=write_modeller_restraints_file)
            # Guard against Modeller silently producing an empty output file.
            if os.path.getsize(model_pdbfilepath) < 1:
                raise Exception('Output PDB file is empty.')

            end_successful_build_model_logfile(log_file, start)

        except Exception as e:
            end_exception_build_model_logfile(e, log_file)
Beispiel #26
0
def package_for_fah(process_only_these_targets=None,
                    process_only_these_templates=None,
                    model_seqid_cutoff=None,
                    model_validation_score_cutoff=None,
                    model_validation_score_percentile=None,
                    nclones=1,
                    archive=False,
                    openmm_platform='Reference',
                    temperature=300.0 * unit.kelvin,
                    collision_rate=1.0 / unit.picosecond,
                    timestep=2.0 * unit.femtoseconds,
                    loglevel=None):
    """
    Create the input files and directory structure necessary to start a Folding@Home project.

    MPI-enabled. Rank 0 selects and sorts the valid templates for each target
    and sets up the system/integrator files; the RUN directories are then
    built in parallel across all ranks.

    Parameters
    ----------
    archive : Bool
        A .tgz compressed archive will be created for each individual RUN directory.
    nclones : int
        Number of clones to generate per RUN.
    openmm_platform : str
        OpenMM platform name used when generating the FAH runs.
    """
    set_loglevel(loglevel)

    # Only rank 0 creates the top-level projects directory; all ranks wait
    # until it exists.
    if mpistate.rank == 0:
        if not os.path.exists(fah_projects_dir):
            os.mkdir(fah_projects_dir)
    mpistate.comm.Barrier()

    targets, templates_resolved_seq = get_targets_and_templates()

    for target in targets:
        if process_only_these_targets and (target.id
                                           not in process_only_these_targets):
            continue

        target_project_dir = os.path.join(fah_projects_dir, target.id)

        models_target_dir = os.path.join(default_project_dirnames.models,
                                         target.id)
        if not os.path.exists(models_target_dir):
            continue

        mpistate.comm.Barrier()

        # Placeholders on non-root ranks; populated via broadcast below.
        sorted_valid_templates = []
        system = None
        renumbered_resnums = {}

        if mpistate.rank == 0:
            logger.info(
                '-------------------------------------------------------------------------'
            )
            logger.info('Building FAH OpenMM project for target {}'.format(
                target.id))
            logger.info(
                '-------------------------------------------------------------------------'
            )

            # Filter templates by the user-supplied cutoffs, then sort by
            # sequence identity to the target.
            valid_templates = get_valid_templates_for_target(
                target,
                templates_resolved_seq,
                process_only_these_templates=process_only_these_templates,
                model_seqid_cutoff=model_seqid_cutoff,
                model_validation_score_cutoff=model_validation_score_cutoff,
                model_validation_score_percentile=
                model_validation_score_percentile)

            sorted_valid_templates = sort_valid_templates_by_seqid(
                target, valid_templates)

            create_target_project_dir(target)

            # The system/integrator files are built once, from the
            # highest-seqid template.
            system = setup_system_and_integrator_files(
                target, sorted_valid_templates[0], temperature, collision_rate,
                timestep)

            renumbered_resnums = get_renumbered_topol_resnums(target)

        # Distribute rank-0 results to all ranks before parallel RUN building.
        sorted_valid_templates = mpistate.comm.bcast(sorted_valid_templates,
                                                     root=0)
        system = mpistate.comm.bcast(system, root=0)
        renumbered_resnums = mpistate.comm.bcast(renumbered_resnums, root=0)

        logger.debug("Building RUNs in parallel...")

        # Round-robin distribution of RUN indices across MPI ranks.
        for run_index in range(mpistate.rank, len(sorted_valid_templates),
                               mpistate.size):
            template = sorted_valid_templates[run_index]

            logger.info(
                '-------------------------------------------------------------------------'
            )
            logger.info('Building RUN{} for template {}'.format(
                run_index, template))
            logger.info(
                '-------------------------------------------------------------------------'
            )

            source_dir = os.path.join(models_target_dir, template)
            generate_fah_run(
                target_project_dir,
                template,
                source_dir,
                system,
                run_index,
                nclones,
                temperature,
                collision_rate,
                timestep,
                openmm_platform,
                renumbered_resnums,
            )

            if archive:
                tgz_fah_run(target, run_index)

    mpistate.comm.Barrier()
    if mpistate.rank == 0:
        logger.info('Done.')
Beispiel #27
0
def mktraj(targetid, ensembler_stage=None, traj_filepath=None, topol_filepath=None, models_data_filepath=None, process_only_these_templates=None):
    """Makes a trajectory for a given target, using mdtraj. The trajectory can be used with other
    software, e.g. for visualization with PyMOL or VMD.

    Parameters
    ----------
    targetid : str
        e.g. 'EGFR_HUMAN_D0'
    ensembler_stage : str
        The Ensembler stage from which to build models, e.g. 'build_models' results in a trajectory
        built from the 'model.pdb.gz' files output by the build_models command.
        options: build_models|refine_implicit_md|refine_explicit_md
        default: most advanced stage for which model files are available
    traj_filepath : str
        default: models/[targetid]/traj-[ensembler_stage].xtc
    topol_filepath : str
        default: models/[targetid]/traj-[ensembler_stage]-topol.pdb
    models_data_filepath :
        default: models/[targetid]/traj-[ensembler_stage]-data.csv
    process_only_these_templates : list of str

    Returns
    -------
    traj : mdtraj.Trajectory
    df : pandas.DataFrame
        models data (e.g. sequence identities):
    """
    ensembler.core.check_project_toplevel_dir()
    models_target_dir = os.path.join(ensembler.core.default_project_dirnames.models, targetid)

    logger.debug('Working on target %s' % targetid)

    # Default to the most advanced modeling stage with completed models.
    if ensembler_stage is None:
        for stagename in ['refine_explicit_md', 'refine_implicit_md', 'build_models']:
            if check_ensembler_modeling_stage_complete(stagename, targetid):
                ensembler_stage = stagename
                break

    if ensembler_stage is None:
        raise Exception('Models have not yet been built for this Ensembler project.')

    if traj_filepath is None:
        traj_filepath = os.path.join(models_target_dir, 'traj-{0}.xtc'.format(ensembler_stage))
    if topol_filepath is None:
        topol_filepath = os.path.join(models_target_dir, 'traj-{0}-topol.pdb'.format(ensembler_stage))
    if models_data_filepath is None:
        models_data_filepath = os.path.join(models_target_dir, 'traj-{0}-data.csv'.format(ensembler_stage))

    if process_only_these_templates:
        templateids = process_only_these_templates
    else:
        # BUGFIX: generator.next() is Python-2-only; the builtin next() works
        # on both Python 2 and 3. Template dirs are identified by the '_D'
        # domain marker in their names.
        dirnames = next(os.walk(models_target_dir))[1]
        templateids = [dirname for dirname in dirnames if '_D' in dirname]

    # Keep only templates for which the stage's model file actually exists.
    model_filename = ensembler.core.model_filenames_by_ensembler_stage[ensembler_stage]
    valid_model_templateids = [templateid for templateid in templateids if os.path.exists(os.path.join(models_target_dir, templateid, model_filename))]
    valid_model_filepaths = [os.path.join(models_target_dir, templateid, model_filename) for templateid in valid_model_templateids]

    seqid_filepaths = [os.path.join(models_target_dir, templateid, 'sequence-identity.txt') for templateid in valid_model_templateids]
    seqids = [float(open(seqid_filepath).read().strip()) if os.path.exists(seqid_filepath) else None for seqid_filepath in seqid_filepaths]

    df = pd.DataFrame({
        'templateid': valid_model_templateids,
        'model_filepath': valid_model_filepaths,
        'seqid': seqids,
    })
    # BUGFIX: DataFrame.sort(columns=...) was deprecated and then removed
    # from pandas; sort_values(by=...) is the supported equivalent.
    df.sort_values(by='seqid', inplace=True, ascending=False)
    df.reset_index(drop=True, inplace=True)

    df.to_csv(models_data_filepath, columns=['templateid', 'seqid'])

    # construct traj, ordered by decreasing sequence identity
    traj = mdtraj.load_pdb(df.model_filepath[0])
    for model_filepath in df.model_filepath[1:]:
        traj += mdtraj.load_pdb(model_filepath)

    # superpose structured C-alphas (helix/sheet residues only, so flexible
    # loops do not dominate the alignment)
    dssp = mdtraj.compute_dssp(traj[0])[0]
    structured_resis_bool = (dssp == 'H') + (dssp == 'E')
    alpha_indices = traj.topology.select_atom_indices('alpha')
    structured_alpha_indices = np.array([alpha_indices[x] for x in range(traj.n_residues) if structured_resis_bool[x]])
    traj.superpose(reference=traj, frame=0, atom_indices=structured_alpha_indices)

    # write traj, and write first frame as pdb file
    traj[0].save(topol_filepath)
    traj.save(traj_filepath)
    return traj, df
Beispiel #28
0
def build_model(target, template_resolved_seq, target_setup_data,
                write_modeller_restraints_file=False, loglevel=None):
    """Uses Modeller to build a homology model for a given target and
    template.

    Will not run Modeller if the output files already exist.

    Parameters
    ----------
    target : BioPython SeqRecord
    template_resolved_seq : BioPython SeqRecord
        Must be a corresponding .pdb template file with the same ID in the
        templates/structures directory.
    target_setup_data : TargetSetupData obj
    write_modeller_restraints_file : bool
        Write file containing restraints used by Modeller - note that this file can be relatively
        large, e.g. ~300KB per model for a protein kinase domain target.
    loglevel : bool
    """
    ensembler.utils.set_loglevel(loglevel)

    # Prefer the loop-remodeled template structure if one exists; otherwise
    # fall back to the resolved (as-deposited) template structure.
    template_structure_dir = os.path.abspath(
        ensembler.core.default_project_dirnames.templates_structures_modeled_loops
    )

    if os.path.exists(os.path.join(template_structure_dir, template_resolved_seq.id + '.pdb')):
        # A pdbfixed structure exists, so use its accompanying sequence
        # (which may differ from the resolved sequence).
        remodeled_seq_filepath = os.path.join(
            ensembler.core.default_project_dirnames.templates_structures_modeled_loops,
            template_resolved_seq.id + '-pdbfixed.fasta'
        )
        template = list(Bio.SeqIO.parse(remodeled_seq_filepath, 'fasta'))[0]
    else:
        template = template_resolved_seq
        template_structure_dir = os.path.abspath(
            ensembler.core.default_project_dirnames.templates_structures_resolved
        )

    model_dir = os.path.abspath(os.path.join(target_setup_data.models_target_dir, template.id))
    if not os.path.exists(model_dir):
        ensembler.utils.create_dir(model_dir)
    model_pdbfilepath = os.path.abspath(os.path.join(model_dir, 'model.pdb.gz'))
    modeling_log_filepath = os.path.abspath(os.path.join(model_dir, 'modeling-log.yaml'))

    check_model_pdbfilepath_ends_in_pdbgz(model_pdbfilepath)
    # Strip the '.gz' suffix to get the uncompressed output path.
    model_pdbfilepath_uncompressed = model_pdbfilepath[:-3]

    # Skip work entirely if a previous run already produced all outputs.
    if check_all_model_files_present(model_dir):
        logger.debug(
            "Output files already exist for target '%s' // template '%s'; files were not overwritten." %
            (target.id, template.id)
        )
        return

    logger.info(
        '-------------------------------------------------------------------------\n'
        'Modelling "%s" => "%s"\n'
        '-------------------------------------------------------------------------'
        % (target.id, template.id)
    )

    # aln = align_target_template(target, template)
    # NOTE(review): the alignment-generation calls are commented out, so
    # 'alignment.pir' is presumably created by an earlier pipeline stage --
    # confirm it exists in model_dir before this function runs.
    aln_filepath = os.path.abspath(os.path.join(model_dir, 'alignment.pir'))
    # write_modeller_pir_aln_file(aln, target, template, pir_aln_filepath=aln_filepath)
    log_file = init_build_model_logfile(modeling_log_filepath)

    # Modeller is run inside a temporary directory to isolate its scratch
    # files; success/failure is recorded in the YAML modeling log.
    with ensembler.utils.enter_temp_dir():
        try:
            start = datetime.datetime.utcnow()
            shutil.copy(aln_filepath, 'alignment.pir')
            run_modeller(target, template, model_dir, model_pdbfilepath,
                         model_pdbfilepath_uncompressed, template_structure_dir,
                         write_modeller_restraints_file=write_modeller_restraints_file)
            # Guard against Modeller silently producing an empty output file.
            if os.path.getsize(model_pdbfilepath) < 1:
                raise Exception('Output PDB file is empty.')

            end_successful_build_model_logfile(log_file, start)

        except Exception as e:
            end_exception_build_model_logfile(e, log_file)
Beispiel #29
0
def cluster_models(process_only_these_targets=None, cutoff=0.06, loglevel=None):
    """Cluster models based on RMSD, and filter out non-unique models as
    determined by a given cutoff.

    Parameters
    ----------
    process_only_these_targets : list of str, optional
        If set, only targets with these IDs are processed.
    cutoff : float
        Minimum distance cutoff for RMSD clustering (nm)
    loglevel : optional

    Runs serially.
    """
    # TODO refactor
    ensembler.utils.set_loglevel(loglevel)
    targets, templates_resolved_seq = get_targets_and_templates()
    templates = templates_resolved_seq

    for target in targets:
        if process_only_these_targets and (target.id not in process_only_these_targets): continue

        models_target_dir = os.path.join(ensembler.core.default_project_dirnames.models, target.id)
        if not os.path.exists(models_target_dir): continue

        # =============================
        # Construct a mdtraj trajectory containing all models
        # =============================

        starttime = datetime.datetime.utcnow()

        logger.debug('Building a list of valid models...')

        model_pdbfilenames_compressed = {
            template.id: os.path.join(models_target_dir, template.id, 'model.pdb.gz') for template in templates
        }
        model_pdbfilenames_uncompressed = {
            template.id: os.path.join(models_target_dir, template.id, 'model.pdb') for template in templates
        }
        valid_templateids = [
            templateid for templateid in model_pdbfilenames_compressed
            if os.path.exists(model_pdbfilenames_compressed[templateid])
        ]

        # Write uncompressed model.pdb files from model.pdb.gz if necessary
        for templateid in valid_templateids:
            if not os.path.exists(model_pdbfilenames_uncompressed[templateid]) or os.path.getsize(model_pdbfilenames_uncompressed[templateid]) == 0:
                # BUGFIX: gzip.open defaults to binary mode, so the payload is
                # bytes; the destination must be opened in binary mode ('wb')
                # or the write fails with a TypeError on Python 3.
                with gzip.open(model_pdbfilenames_compressed[templateid]) as model_pdbfile_compressed:
                    with open(model_pdbfilenames_uncompressed[templateid], 'wb') as model_pdbfile:
                        model_pdbfile.write(model_pdbfile_compressed.read())

        logger.info('Constructing a trajectory containing all valid models...')

        if len(valid_templateids) == 0:
            logger.info('No models found for target {0}.'.format(target.id))
            continue

        valid_model_pdbfilenames_uncompressed = [
            model_pdbfilenames_uncompressed[templateid] for templateid in valid_templateids
        ]

        traj = mdtraj.load(valid_model_pdbfilenames_uncompressed)

        # =============================
        # Clustering
        # =============================

        logger.info('Conducting RMSD-based clustering...')

        # Remove any existing unique_by_clustering files
        # NOTE(review): the '*_PK_*' glob hard-codes a protein-kinase naming
        # convention -- confirm this matches the project's template IDs.
        for f in glob.glob(models_target_dir+'/*_PK_*/unique_by_clustering'):
            os.unlink(f)

        # Cluster on C-alpha atoms only.
        CAatoms = [a.index for a in traj.topology.atoms if a.name == 'CA']
        unique_templateids = models_regular_spatial_clustering(
            valid_templateids, traj, atom_indices=CAatoms, cutoff=cutoff
        )
        write_unique_by_clustering_files(unique_templateids, models_target_dir)

        with open(os.path.join(models_target_dir, 'unique-models.txt'), 'w') as uniques_file:
            for u in unique_templateids:
                uniques_file.write(u+'\n')
            logger.info(
                '%d unique models (from original set of %d) using cutoff of %.3f nm' %
                        (len(unique_templateids), len(valid_templateids), cutoff)
            )

        # Clean up the temporary uncompressed model files.
        for template in templates:
            model_dir = os.path.join(models_target_dir, template.id)
            model_pdbfilename = os.path.join(model_dir, 'model.pdb')
            if os.path.exists(model_pdbfilename):
                os.remove(model_pdbfilename)

        # ========
        # Metadata
        # ========

        project_metadata = ensembler.core.ProjectMetadata(
            project_stage='cluster_models', target_id=target.id
        )
        datestamp = ensembler.core.get_utcnow_formatted()

        timedelta = datetime.datetime.utcnow() - starttime

        metadata = {
            'target_id': target.id,
            'datestamp': datestamp,
            'nunique_models': len(unique_templateids),
            'python_version': sys.version.split('|')[0].strip(),
            'python_full_version': ensembler.core.literal_str(sys.version),
            'ensembler_version': ensembler.version.short_version,
            'ensembler_commit': ensembler.version.git_revision,
            'biopython_version': Bio.__version__,
            'mdtraj_version': mdtraj.version.short_version,
            'mdtraj_commit': mdtraj.version.git_revision,
            'timing': ensembler.core.strf_timedelta(timedelta),
        }

        project_metadata.add_data(metadata)
        project_metadata.write()
Beispiel #30
0
    def _gen_implicit_start_models(self,
                                   ff='amber99sbildn.xml',
                                   implicit_water_model='amber99_obc.xml',
                                   ph=8.0):
        """Generate "implicit-start" models: for each template with a
        completed implicit-MD refinement but no output model yet, add
        hydrogens (at the given pH) to the build_models structure using the
        protonation variants determined for a single reference model, and
        write the result as a gzipped PDB.

        Parameters
        ----------
        ff : str
            OpenMM force field XML file.
        implicit_water_model : str
            OpenMM implicit-solvent XML file.
        ph : float
            pH used for hydrogen addition.
        """

        self.ph = ph
        from simtk.openmm import app

        # Templates that completed the implicit-MD refinement stage.
        valid_model_templateids = [
            templateid for templateid in self.templateids if os.path.exists(
                os.path.join(
                    self.models_target_dir, templateid, ensembler.core.
                    model_filenames_by_ensembler_stage['refine_implicit_md']))
        ]

        # ...minus those for which the output model already exists.
        gen_model_templateids = [
            templateid for templateid in valid_model_templateids
            if not os.path.exists(
                os.path.join(self.models_target_dir, templateid,
                             self.model_filename))
        ]

        # make reference model
        # The highest-seqid model determines the protonation variants, which
        # are then reused for every other model so protonation is consistent.
        forcefield = app.ForceField(ff, implicit_water_model)
        reference_model_id = get_highest_seqid_existing_model(
            models_target_dir=self.models_target_dir)
        logger.debug('Using {0} as reference model'.format(reference_model_id))
        reference_model_path = os.path.join(
            self.models_target_dir, reference_model_id,
            model_filenames_by_ensembler_stage['build_models'])
        with gzip.open(reference_model_path) as reference_pdb_file:
            reference_pdb = app.PDBFile(reference_pdb_file)
        remove_disulfide_bonds_from_topology(reference_pdb.topology)
        reference_topology = reference_pdb.topology
        reference_modeller = app.Modeller(reference_pdb.topology,
                                          reference_pdb.positions)
        reference_variants = reference_modeller.addHydrogens(forcefield,
                                                             pH=self.ph)

        # Round-robin distribution of templates across MPI ranks.
        for template_index in range(mpistate.rank, len(gen_model_templateids),
                                    mpistate.size):
            templateid = gen_model_templateids[template_index]
            logger.debug(
                'Generating implicit-start model for {0}'.format(templateid))

            try:
                input_model_filepath = os.path.join(
                    self.models_target_dir, templateid,
                    model_filenames_by_ensembler_stage['build_models'])
                output_model_filepath = os.path.join(self.models_target_dir,
                                                     templateid,
                                                     self.model_filename)

                with gzip.open(input_model_filepath) as pdb_file:
                    pdb = app.PDBFile(pdb_file)

                remove_disulfide_bonds_from_topology(pdb.topology)
                # NOTE(review): the reference topology is paired with this
                # model's positions -- presumably all models share an
                # identical topology, so the reference variants apply.
                # Confirm this assumption holds for every template.
                modeller = app.Modeller(reference_topology, pdb.positions)
                modeller.addHydrogens(forcefield,
                                      pH=self.ph,
                                      variants=reference_variants)
                topology = modeller.getTopology()
                positions = modeller.getPositions()

                # Write header, coordinates, and footer as gzipped text.
                with gzip.open(output_model_filepath,
                               'wt') as output_model_file:
                    app.PDBFile.writeHeader(topology, file=output_model_file)
                    app.PDBFile.writeFile(topology,
                                          positions,
                                          file=output_model_file)
                    app.PDBFile.writeFooter(topology, file=output_model_file)

            except Exception as e:
                # Best-effort: a failure for one model should not halt the
                # rest of the batch.
                print('Error for model {0}: {1}'.format(templateid, e))
                continue
Beispiel #31
0
def pdbfix_template(template_full_seq, overwrite_structures=False):
    """Use PDBFixer to rebuild missing (unresolved) residues of a template
    structure, writing a '-pdbfixed.pdb' structure and '-pdbfixed.fasta'
    sequence into the modeled-loops directory.

    Exceptions (other than KeyboardInterrupt/ImportError) are caught and
    logged to a per-template YAML logfile rather than raised.

    Parameters
    ----------
    template_full_seq: BioPython SeqRecord
        full UniProt sequence for span of the template (including unresolved residues)
    overwrite_structures: bool
    Returns
    -------
    fixer.missingResidues
    """
    try:
        template_pdbfixed_filepath = os.path.join(
            ensembler.core.default_project_dirnames.templates_structures_modeled_loops,
            template_full_seq.id + '-pdbfixed.pdb'
        )
        seq_pdbfixed_filepath = os.path.join(
            ensembler.core.default_project_dirnames.templates_structures_modeled_loops,
            template_full_seq.id + '-pdbfixed.fasta'
        )
        import pdbfixer
        import simtk.openmm.app
        template_filepath = os.path.join(
            ensembler.core.default_project_dirnames.templates_structures_resolved,
            template_full_seq.id + '.pdb'
        )
        fixer = pdbfixer.PDBFixer(filename=template_filepath)
        # Register the full UniProt sequence (three-letter codes) on the
        # structure's first chain so PDBFixer can detect missing residues.
        chainid = next(fixer.structure.iter_chains()).chain_id
        seq_obj = simtk.openmm.app.internal.pdbstructure.Sequence(chainid)
        for r in template_full_seq.seq:
            resi3 = Bio.SeqUtils.seq3(r).upper()
            seq_obj.residues.append(resi3)
        fixer.structure.sequences.append(seq_obj)
        fixer.findMissingResidues()
        # Terminal gaps are not remodeled -- only internal loops.
        remove_missing_residues_at_termini(fixer, len_full_seq=len(template_full_seq.seq))
        # Skip the (expensive) rebuild when output exists and overwriting is
        # disabled; still return the detected missing residues.
        if not overwrite_structures and os.path.exists(template_pdbfixed_filepath):
            return fixer.missingResidues
        fixer.findMissingAtoms()
        # NOTE(review): uses PDBFixer's private _addAtomsToTopology API --
        # may break across pdbfixer versions.
        (newTopology, newPositions, newAtoms, existingAtomMap) = fixer._addAtomsToTopology(True, True)
        fixer.topology = newTopology
        fixer.positions = newPositions
        with open(template_pdbfixed_filepath, 'w') as template_pdbfixed_file:
            simtk.openmm.app.PDBFile.writeFile(
                fixer.topology, fixer.positions, file=template_pdbfixed_file
            )

        # Write sequence to file
        seq_pdbfixed = ''.join([Bio.SeqUtils.seq1(r.name) for r in fixer.topology.residues()])
        seq_record_pdbfixed = SeqRecord(Seq(seq_pdbfixed), id=template_full_seq.id, description=template_full_seq.id)
        Bio.SeqIO.write([seq_record_pdbfixed], seq_pdbfixed_filepath, 'fasta')

        return fixer.missingResidues
    except (KeyboardInterrupt, ImportError):
        raise
    except Exception as e:
        # Log the failure per-template (with traceback and MPI rank) instead
        # of raising, so one bad template does not halt the batch.
        trbk = traceback.format_exc()
        log_filepath = os.path.abspath(os.path.join(
            ensembler.core.default_project_dirnames.templates_structures_modeled_loops,
            template_full_seq.id + '-pdbfixer-log.yaml'
        ))
        logfile = ensembler.core.LogFile(log_filepath)
        logfile.log({
            'templateid': str(template_full_seq.id),
            'exception': e,
            'traceback': ensembler.core.literal_str(trbk),
            'mpi_rank': mpistate.rank,
        })
        logger.error(
            'MPI rank %d pdbfixer error for template %s - see logfile' %
            (mpistate.rank, template_full_seq.id)
        )
        logger.debug(e)
        logger.debug(trbk)
Beispiel #32
0
def refine_implicit_md(
        openmm_platform=None, gpupn=1, process_only_these_targets=None,
        process_only_these_templates=None, model_seqid_cutoff=None,
        write_trajectory=False,
        include_disulfide_bonds=False,
        custom_residue_variants=None,
        ff='amber99sbildn',
        implicit_water_model='amber99_obc',
        sim_length=100.0 * unit.picoseconds,
        timestep=2.0 * unit.femtoseconds,             # integration timestep
        temperature=300.0 * unit.kelvin,              # simulation temperature
        collision_rate=20.0 / unit.picoseconds,       # Langevin collision rate
        cutoff=None,                                  # nonbonded cutoff; None selects app.NoCutoff
        minimization_tolerance=10.0 * unit.kilojoules_per_mole / unit.nanometer,
        minimization_steps=20,
        nsteps_per_iteration=500,
        ph=None,
        retry_failed_runs=False,
        cpu_platform_threads=1,
        loglevel=None):
    # TODO - refactor
    """Run MD refinement in implicit solvent.

    MPI-enabled: templates are distributed across ranks round-robin
    (``range(rank, ntemplates, size)``); each rank is pinned to GPU id
    ``rank % gpupn`` unless CUDA_VISIBLE_DEVICES is set.

    Parameters
    ----------
    openmm_platform : str or None
        OpenMM platform name ('CUDA', 'OpenCL', 'CPU', ...); auto-selected
        via auto_select_openmm_platform() if None.
    gpupn : int
        GPUs per node, used to map MPI ranks to device indices.
    process_only_these_targets, process_only_these_templates : list of str or None
        Restrict processing to these target/template ids.
    model_seqid_cutoff : float or None
        If set, templates are re-selected per target by sequence identity,
        overriding process_only_these_templates.
    write_trajectory : bool
        If True, write a gzipped PDB trajectory of each refinement.
    include_disulfide_bonds : bool
        If False, disulfide bonds are removed from the reference topology.
    custom_residue_variants : dict or None
        Per-target overrides applied to the protonation variants returned
        by Modeller.addHydrogens; defaults to manual-overrides file.
    ph : float or None
        Protonation pH; falls back to manual overrides, then 7.0.
    retry_failed_runs : bool
        If True, re-run refinements that finished unsuccessfully.
    cpu_platform_threads : int
        Thread count passed to the OpenMM CPU platform.
    """
    ensembler.utils.set_loglevel(loglevel)
    # Round-robin GPU assignment across the MPI ranks on a node.
    gpuid = mpistate.rank % gpupn
    manual_overrides = ManualOverrides()
    # pH precedence: explicit argument > manual-overrides file > 7.0 default.
    if ph is None:
        if manual_overrides.refinement.ph is not None:
            ph = manual_overrides.refinement.ph
        else:
            ph = 7.0
    if custom_residue_variants is None:
        # deepcopy so later per-target mutation cannot alter the overrides object.
        custom_residue_variants = deepcopy(
            manual_overrides.refinement.custom_residue_variants_by_targetid
        )

    # Clamp the iteration size so at least one reporting iteration occurs,
    # then derive how many iterations cover the requested simulation length.
    if (sim_length / timestep) < nsteps_per_iteration:
        nsteps_per_iteration = int(sim_length / timestep)

    niterations = int((sim_length / timestep) / nsteps_per_iteration)

    models_dir = os.path.abspath(ensembler.core.default_project_dirnames.models)

    targets, templates_resolved_seq = ensembler.core.get_targets_and_templates()

    if process_only_these_templates:
        selected_template_indices = [i for i, seq in enumerate(templates_resolved_seq) if seq.id in process_only_these_templates]
    else:
        selected_template_indices = range(len(templates_resolved_seq))

    if not openmm_platform:
        openmm_platform = auto_select_openmm_platform()

    if openmm_platform == 'CPU':
        platform_properties = {'CpuThreads': str(cpu_platform_threads)}
    else:
        platform_properties = {}

    # Force field plus implicit-solvent parameter file.
    ff_files = [ff+'.xml', implicit_water_model+'.xml']
    forcefield = app.ForceField(*ff_files)

    # kT is used to report energies in reduced (thermal) units.
    kB = unit.MOLAR_GAS_CONSTANT_R
    kT = kB * temperature

    def simulate_implicit_md():
        # NOTE: this closure reads `model_filename`, `model_dir`, `pdb_filename`,
        # `reference_topology` and `reference_variants` from the enclosing
        # per-target/per-template loop scope below — it must only be called
        # after those are bound for the current model.

        logger.debug("Reading model...")
        with gzip.open(model_filename) as model_file:
            pdb = app.PDBFile(model_file)

        # Set up Platform
        platform = openmm.Platform.getPlatformByName(openmm_platform)
        if 'CUDA_VISIBLE_DEVICES' not in os.environ:
            # Set GPU id.
            if openmm_platform == 'CUDA':
                platform.setPropertyDefaultValue('CudaDeviceIndex', '%d' % gpuid)
            elif openmm_platform == 'OpenCL':
                platform.setPropertyDefaultValue('OpenCLDeviceIndex', '%d' % gpuid)

        # Construct Modeller object with same topology as ref structure
        # (necessary to keep disulfide bonds consistent)
        modeller = app.Modeller(reference_topology, pdb.positions)
        # set_openmm_topology_bonds_from_atom_indices(modeller.topology, reference_bonds)
        # Add missing protons, reusing the reference model's variant choices
        # so every model in the ensemble gets identical protonation states.
        modeller.addHydrogens(forcefield, pH=ph, variants=reference_variants)
        topology = modeller.getTopology()
        positions = modeller.getPositions()

        logger.debug("Constructing System object...")
        if cutoff is None:
            system = forcefield.createSystem(topology, nonbondedMethod=app.NoCutoff, constraints=app.HBonds)
        else:
            system = forcefield.createSystem(topology, nonbondedMethod=app.CutoffNonPeriodic, nonbondedCutoff=cutoff, constraints=app.HBonds)

        logger.debug("Creating Context...")
        integrator = openmm.LangevinIntegrator(temperature, collision_rate, timestep)
        context = openmm.Context(system, integrator, platform, platform_properties)
        context.setPositions(positions)

        logger.debug("Minimizing structure...")
        openmm.LocalEnergyMinimizer.minimize(context, minimization_tolerance, minimization_steps)

        if write_trajectory:
            # Open trajectory for writing.
            # NOTE(review): gzip.open(..., 'w') is binary mode on Python 3,
            # while PDBFile.writeHeader writes text (cf. the 'wt' mode used
            # for the final PDB below) — confirm this path works on Python 3.
            logger.debug("Opening trajectory for writing...")
            trajectory_filename = os.path.join(model_dir, 'implicit-trajectory.pdb.gz')
            trajectory_outfile = gzip.open(trajectory_filename, 'w')
            app.PDBFile.writeHeader(topology, file=trajectory_outfile)

        # Open energy trajectory for writing
        energy_filename = os.path.join(model_dir, 'implicit-energies.txt')
        energy_outfile = open(energy_filename, 'w')
        energy_outfile.write('# iteration | simulation time (ps) | potential_energy (kT) | kinetic_energy (kT) | ns per day\n')

        logger.debug("Running dynamics...")
        import time
        initial_time = time.time()
        for iteration in range(niterations):
            # integrate dynamics
            integrator.step(nsteps_per_iteration)
            # get current state
            state = context.getState(getEnergy=True, getPositions=True)
            simulation_time = state.getTime()
            potential_energy = state.getPotentialEnergy()
            kinetic_energy = state.getKineticEnergy()
            final_time = time.time()
            elapsed_time = (final_time - initial_time) * unit.seconds
            # Throughput estimate in ns/day for progress reporting.
            ns_per_day = (simulation_time / elapsed_time) / (unit.nanoseconds / unit.day)
            logger.debug(
                "  %8.1f ps : potential %8.3f kT | kinetic %8.3f kT | %.3f ns/day | %.3f s remain"
                % (
                    simulation_time / unit.picoseconds, potential_energy / kT, kinetic_energy / kT,
                    ns_per_day,
                    elapsed_time * (niterations-iteration-1) / (iteration+1) / unit.seconds
                )
            )

            # Check energies are still finite.
            if np.isnan(potential_energy/kT) or np.isnan(kinetic_energy/kT):
                raise Exception("Potential or kinetic energies are nan.")

            if write_trajectory:
                app.PDBFile.writeModel(topology, state.getPositions(), file=trajectory_outfile, modelIndex=iteration)

            # write data
            energy_outfile.write("  %8d %8.1f %8.3f %8.3f %.3f\n" % (iteration, simulation_time / unit.picoseconds, potential_energy / kT, kinetic_energy / kT, ns_per_day))
            energy_outfile.flush()

        if write_trajectory:
            app.PDBFile.writeFooter(topology, file=trajectory_outfile)
            trajectory_outfile.close()

        energy_outfile.close()

        # Write final PDB file.
        # NOTE(review): `state` is the last loop iteration's state; if
        # niterations were 0 this would raise NameError — confirm callers
        # always request a nonzero simulation length.
        pdb_outfile = gzip.open(pdb_filename, 'wt')
        app.PDBFile.writeHeader(topology, file=pdb_outfile)
        app.PDBFile.writeFile(topology, state.getPositions(), file=pdb_outfile)
        app.PDBFile.writeFooter(topology, file=pdb_outfile)
        pdb_outfile.close()

    # Process targets
    # NOTE(review): the print() calls below look like leftover debug output;
    # the rest of this function logs via `logger`.
    print('Processing targets...') # DEBUG
    for target in targets:
        if (process_only_these_targets is not None) and (target.id not in process_only_these_targets):
            print('Skipping because %s is not in process_only_these_targets' % target.id)
            print(process_only_these_targets)
            continue
        logger.info('Processing %s' % target)
        models_target_dir = os.path.join(models_dir, target.id)
        if mpistate.rank == 0:
            target_starttime = datetime.datetime.utcnow()
            if not os.path.exists(models_target_dir):
                # NOTE(review): this `continue` executes only on rank 0 while
                # the other ranks fall through to the collective Barrier below
                # — if the directory is missing this looks like an MPI
                # deadlock hazard; confirm intended behavior.
                print('%s does not exist, skipping' % models_target_dir)
                continue

        mpistate.comm.Barrier()

        # ========
        # Determine topology (including protonation state) to use throughout
        # ========

        reference_model_id = get_highest_seqid_existing_model(models_target_dir=models_target_dir)
        if reference_model_id is None:
            continue

        reference_model_path = os.path.join(models_target_dir, reference_model_id, 'model.pdb.gz')

        with gzip.open(reference_model_path) as reference_pdb_file:
            reference_pdb = app.PDBFile(reference_pdb_file)

        logger.debug("Using %s as highest identity model" % (reference_model_id))

        if not include_disulfide_bonds:
            remove_disulfide_bonds_from_topology(reference_pdb.topology)

        # Build topology for reference model
        logger.debug("Creating app.Modeller instance...")
        modeller = app.Modeller(reference_pdb.topology, reference_pdb.positions)
        reference_topology = modeller.topology
        logger.debug("Adding hydrogens...")
        # The variants chosen here are reused for every model of this target,
        # keeping protonation states consistent across the ensemble.
        reference_variants = modeller.addHydrogens(forcefield, pH=ph)
        if target.id in custom_residue_variants:
            apply_custom_residue_variants(reference_variants, custom_residue_variants[target.id])
        logger.debug("Reference variants extracted:")
        if reference_variants is not None:
            for (residue_index, residue) in enumerate(reference_variants):
                if residue is not None:
                    logger.debug("%8d %s" % (residue_index+1, residue))
            logger.debug("")
        else:
            logger.debug(reference_variants)

        # Optional per-target re-selection of templates by sequence identity.
        if model_seqid_cutoff:
            process_only_these_templates = ensembler.core.select_templates_by_seqid_cutoff(target.id, seqid_cutoff=model_seqid_cutoff)
            selected_template_indices = [i for i, seq in enumerate(templates_resolved_seq) if seq.id in process_only_these_templates]

        ntemplates_selected = len(selected_template_indices)

        # Round-robin distribution of templates across MPI ranks.
        for template_index in range(mpistate.rank, ntemplates_selected, mpistate.size):
            template = templates_resolved_seq[selected_template_indices[template_index]]

            model_dir = os.path.join(models_target_dir, template.id)
            if not os.path.exists(model_dir): continue

            # Only simulate models that are unique following filtering by clustering.
            unique_by_clustering = os.path.exists(os.path.join(model_dir, 'unique_by_clustering'))
            if not unique_by_clustering: continue

            # Pass if this simulation has already been run.
            log_filepath = os.path.join(model_dir, 'implicit-log.yaml')
            if os.path.exists(log_filepath):
                with open(log_filepath) as log_file:
                    log_data = yaml.load(log_file, Loader=ensembler.core.YamlLoader)
                    if log_data.get('successful') is True:
                        continue
                    if log_data.get('finished') is True and (retry_failed_runs is False and log_data.get('successful') is False):
                        continue

            # Check to make sure the initial model file is present.
            model_filename = os.path.join(model_dir, 'model.pdb.gz')
            if not os.path.exists(model_filename):
                logger.debug('model.pdb.gz not present: target %s template %s rank %d gpuid %d' % (target.id, template.id, mpistate.rank, gpuid))
                continue

            pdb_filename = os.path.join(model_dir, 'implicit-refined.pdb.gz')

            logger.info("-------------------------------------------------------------------------")
            logger.info("Simulating %s => %s in implicit solvent for %.1f ps (MPI rank: %d, GPU ID: %d)" % (target.id, template.id, niterations * nsteps_per_iteration * timestep / unit.picoseconds, mpistate.rank, gpuid))
            logger.info("-------------------------------------------------------------------------")

            # Open log file
            log_data = {
                'mpi_rank': mpistate.rank,
                'gpuid': gpuid if 'CUDA_VISIBLE_DEVICES' not in os.environ else os.environ['CUDA_VISIBLE_DEVICES'],
                'openmm_platform': openmm_platform,
                'finished': False,
                'sim_length': str(sim_length),
                'timestep': str(timestep),
                'temperature': str(temperature),
                'ph': ph,
            }
            log_file = ensembler.core.LogFile(log_filepath)
            log_file.log(new_log_data=log_data)

            try:
                start = datetime.datetime.utcnow()
                simulate_implicit_md()
                timing = ensembler.core.strf_timedelta(datetime.datetime.utcnow() - start)
                log_data = {
                    'finished': True,
                    'timing': timing,
                    'successful': True,
                }
                log_file.log(new_log_data=log_data)
            except Exception as e:
                # A failed refinement is logged and recorded as unsuccessful,
                # but does not abort the remaining templates on this rank.
                trbk = traceback.format_exc()
                warnings.warn(
                    '= ERROR start: MPI rank {0} hostname {1} gpuid {2} =\n{3}\n{4}\n= ERROR end: MPI rank {0} hostname {1} gpuid {2}'.format(
                        mpistate.rank, socket.gethostname(), gpuid, e, trbk
                    )
                )
                timing = ensembler.core.strf_timedelta(datetime.datetime.utcnow() - start)
                log_data = {
                    'exception': e,
                    'traceback': ensembler.core.literal_str(trbk),
                    'timing': timing,
                    'finished': True,
                    'successful': False,
                }
                log_file.log(new_log_data=log_data)

        logger.debug('Finished template loop: rank %d' % mpistate.rank)

        mpistate.comm.Barrier()

        # Rank 0 writes per-target project metadata once all ranks are done.
        if mpistate.rank == 0:
            project_metadata = ensembler.core.ProjectMetadata(project_stage='refine_implicit_md', target_id=target.id)

            datestamp = ensembler.core.get_utcnow_formatted()
            # Count successful refinements by finding the output PDBs on disk.
            command = ['find', models_target_dir, '-name', 'implicit-refined.pdb.gz']
            output = subprocess.check_output(command)
            nsuccessful_refinements = output.decode('UTF-8').count('\n')
            target_timedelta = datetime.datetime.utcnow() - target_starttime

            metadata = {
                'target_id': target.id,
                'datestamp': datestamp,
                'timing': ensembler.core.strf_timedelta(target_timedelta),
                'openmm_platform': openmm_platform,
                'process_only_these_targets': process_only_these_targets,
                'process_only_these_templates': process_only_these_templates,
                'model_seqid_cutoff': model_seqid_cutoff,
                'write_trajectory': write_trajectory,
                'include_disulfide_bonds': include_disulfide_bonds,
                'custom_residue_variants': custom_residue_variants,
                'ff': ff,
                'implicit_water_model': implicit_water_model,
                'sim_length': str(sim_length),
                'timestep': str(timestep),
                'temperature': str(temperature),
                'collision_rate': str(collision_rate),
                'cutoff': str(cutoff),
                'nsteps_per_iteration': nsteps_per_iteration,
                'ph': ph,
                'nsuccessful_refinements': nsuccessful_refinements,
                'python_version': sys.version.split('|')[0].strip(),
                'python_full_version': ensembler.core.literal_str(sys.version),
                'ensembler_version': ensembler.version.short_version,
                'ensembler_commit': ensembler.version.git_revision,
                'biopython_version': Bio.__version__,
                'openmm_version': simtk.openmm.version.short_version,
                'openmm_commit': simtk.openmm.version.git_revision,
            }

            project_metadata.add_data(metadata)
            project_metadata.write()

        mpistate.comm.Barrier()

    mpistate.comm.Barrier()
    if mpistate.rank == 0:
        logger.info('Done.')
Beispiel #33
0
    def __init__(self,
                 targetid,
                 traj_filepath=None,
                 topol_filepath=None,
                 models_data_filepath=None,
                 process_only_these_templates=None,
                 loglevel=None,
                 run_main=True):
        """
        Build a trajectory from the hydrogenated-but-unrefined model files.

        For the given target this produces a single topology pdb file, a
        single xtc trajectory file, and one pdb file per model.

        See docs on `MkTraj` for further info on paramters.

        Examples
        --------
        MkTrajImplicitStart(targetid='EGFR_HUMAN_D0')
        """
        ensembler.utils.set_loglevel(loglevel)
        ensembler.core.check_project_toplevel_dir()
        self.models_target_dir = os.path.join(default_project_dirnames.models,
                                              targetid)

        logger.debug('Working on target %s' % targetid)

        # This subclass is pinned to the 'implicit-start' stage.
        self.ensembler_stage = 'implicit-start'
        self.model_filename = 'implicit-start.pdb.gz'

        # Fill in default output paths for any filepath argument left as None.
        self.traj_filepath = (
            traj_filepath if traj_filepath is not None
            else os.path.join(self.models_target_dir,
                              'traj-{0}.xtc'.format(self.ensembler_stage))
        )

        self.topol_filepath = (
            topol_filepath if topol_filepath is not None
            else os.path.join(self.models_target_dir,
                              'traj-{0}-topol.pdb'.format(self.ensembler_stage))
        )

        self.models_data_filepath = (
            models_data_filepath if models_data_filepath is not None
            else os.path.join(self.models_target_dir,
                              'traj-{0}-data.csv'.format(self.ensembler_stage))
        )

        # Template ids default to the immediate subdirectories of the
        # target's models directory.
        if process_only_these_templates:
            self.templateids = process_only_these_templates
        else:
            walk_entries = list(os.walk(self.models_target_dir))
            self.templateids = walk_entries[0][1]

        if run_main:
            # Generate the implicit-start models, then tabulate, assemble,
            # align and write out the trajectory.
            self._gen_implicit_start_models()
            self._gen_df(model_filename=self.model_filename)
            self.df.to_csv(self.models_data_filepath,
                           columns=['templateid', 'seqid'])
            self._construct_traj()
            self._superpose()
            self._write_traj()
Beispiel #34
0
    def __init__(self,
                 targetid,
                 ensembler_stage=None,
                 traj_filepath=None,
                 topol_filepath=None,
                 models_data_filepath=None,
                 process_only_these_templates=None,
                 loglevel=None,
                 run_main=True):
        """Build an mdtraj trajectory for a given target. The resulting files
        can be consumed by other software, e.g. for visualization with PyMOL
        or VMD.

        Parameters
        ----------
        targetid : str
            e.g. 'EGFR_HUMAN_D0'
        ensembler_stage : str
            The Ensembler stage from which to build models, e.g. 'build_models'
            results in a trajectory built from the 'model.pdb.gz' files output
            by the build_models command.
            options: build_models|refine_implicit_md|refine_explicit_md
            default: most advanced stage for which model files are available
        traj_filepath : str
            default: models/[targetid]/traj-[ensembler_stage].xtc
        topol_filepath : str
            default: models/[targetid]/traj-[ensembler_stage]-topol.pdb
        models_data_filepath :
            default: models/[targetid]/traj-[ensembler_stage]-data.csv
        process_only_these_templates : list of str

        Returns
        -------
        traj : mdtraj.Trajectory
        df : pandas.DataFrame
            models data (e.g. sequence identities)
        """
        ensembler.utils.set_loglevel(loglevel)
        ensembler.core.check_project_toplevel_dir()
        self.models_target_dir = os.path.join(default_project_dirnames.models,
                                              targetid)

        logger.debug('Working on target %s' % targetid)

        # Default to the most advanced modeling stage with model files on disk.
        self.ensembler_stage = (
            ensembler_stage if ensembler_stage is not None
            else get_most_advanced_ensembler_modeling_stage(targetid)
        )

        # Fill in default output paths for any filepath argument left as None.
        self.traj_filepath = (
            traj_filepath if traj_filepath is not None
            else os.path.join(self.models_target_dir,
                              'traj-{0}.xtc'.format(self.ensembler_stage))
        )

        self.topol_filepath = (
            topol_filepath if topol_filepath is not None
            else os.path.join(self.models_target_dir,
                              'traj-{0}-topol.pdb'.format(self.ensembler_stage))
        )

        self.models_data_filepath = (
            models_data_filepath if models_data_filepath is not None
            else os.path.join(self.models_target_dir,
                              'traj-{0}-data.csv'.format(self.ensembler_stage))
        )

        # Template ids default to the immediate subdirectories of the
        # target's models directory.
        if process_only_these_templates:
            self.templateids = process_only_these_templates
        else:
            walk_entries = list(os.walk(self.models_target_dir))
            self.templateids = walk_entries[0][1]

        if run_main:
            # Tabulate model data, then assemble, align and write the trajectory.
            self._gen_df()
            self.df.to_csv(self.models_data_filepath,
                           columns=['templateid', 'seqid'])
            self._construct_traj()
            self._superpose()
            self._write_traj()
Beispiel #35
0
def pdbfix_template(template_full_seq, overwrite_structures=False):
    """Run PDBFixer on a resolved template structure to rebuild missing
    internal residues and atoms, writing a fixed PDB and its sequence.

    Parameters
    ----------
    template_full_seq: BioPython SeqRecord
        full UniProt sequence for span of the template (including unresolved residues)
    overwrite_structures: bool
        If False, an existing fixed structure is kept and only the missing
        residues are reported.
    Returns
    -------
    fixer.missingResidues
    """
    try:
        loops_dir = ensembler.core.default_project_dirnames.\
            templates_structures_modeled_loops
        template_pdbfixed_filepath = os.path.join(
            loops_dir, template_full_seq.id + '-pdbfixed.pdb')
        seq_pdbfixed_filepath = os.path.join(
            loops_dir, template_full_seq.id + '-pdbfixed.fasta')
        # Imported lazily so an ImportError propagates (re-raised below)
        # rather than being swallowed by the generic handler.
        import pdbfixer
        import simtk.openmm.app
        template_filepath = os.path.join(
            ensembler.core.default_project_dirnames.
            templates_structures_resolved, template_full_seq.id + '.pdb')
        fixer = pdbfixer.PDBFixer(filename=template_filepath)

        # Register the full UniProt sequence for the first chain so PDBFixer
        # can identify which residues are missing from the structure.
        chain_id = next(fixer.topology.chains()).id
        three_letter_codes = [
            Bio.SeqUtils.seq3(residue).upper()
            for residue in template_full_seq.seq
        ]
        fixer.sequences.append(
            pdbfixer.pdbfixer.Sequence(chain_id, three_letter_codes))
        fixer.findMissingResidues()
        # Only internal gaps are modeled; drop terminal missing residues.
        remove_missing_residues_at_termini(
            fixer, len_full_seq=len(template_full_seq.seq))

        # If the fixed structure already exists and overwriting is disabled,
        # just report the missing residues found above.
        if not overwrite_structures and os.path.exists(
                template_pdbfixed_filepath):
            return fixer.missingResidues

        fixer.findMissingAtoms()
        # Uses a private PDBFixer API to build the completed topology/positions.
        newTopology, newPositions, newAtoms, existingAtomMap = \
            fixer._addAtomsToTopology(True, True)
        fixer.topology = newTopology
        fixer.positions = newPositions
        with open(template_pdbfixed_filepath, 'w') as template_pdbfixed_file:
            simtk.openmm.app.PDBFile.writeFile(fixer.topology,
                                               fixer.positions,
                                               file=template_pdbfixed_file)

        # Write sequence to file
        one_letter_seq = ''.join(
            Bio.SeqUtils.seq1(residue.name)
            for residue in fixer.topology.residues())
        seq_record_pdbfixed = SeqRecord(Seq(one_letter_seq),
                                        id=template_full_seq.id,
                                        description=template_full_seq.id)
        Bio.SeqIO.write([seq_record_pdbfixed], seq_pdbfixed_filepath, 'fasta')

        return fixer.missingResidues
    except (KeyboardInterrupt, ImportError):
        raise
    except Exception as e:
        # Log the failure to a per-template YAML file and continue; other
        # templates on this MPI rank should still be processed.
        trbk = traceback.format_exc()
        log_filepath = os.path.abspath(
            os.path.join(
                ensembler.core.default_project_dirnames.
                templates_structures_modeled_loops,
                template_full_seq.id + '-pdbfixer-log.yaml'))
        logfile = ensembler.core.LogFile(log_filepath)
        logfile.log({
            'templateid': str(template_full_seq.id),
            'exception': e,
            'traceback': ensembler.core.literal_str(trbk),
            'mpi_rank': mpistate.rank,
        })
        logger.error(
            'MPI rank %d pdbfixer error for template %s - see logfile' %
            (mpistate.rank, template_full_seq.id))
        logger.debug(e)
        logger.debug(trbk)
Beispiel #36
0
def package_for_fah(process_only_these_targets=None,
                    process_only_these_templates=None,
                    model_seqid_cutoff=None,
                    model_validation_score_cutoff=None,
                    model_validation_score_percentile=None,
                    nclones=1, archive=False,
                    openmm_platform='Reference',
                    temperature=300.0 * unit.kelvin,
                    collision_rate=1.0 / unit.picosecond,
                    timestep=2.0 * unit.femtoseconds,
                    loglevel=None):
    """
    Create the input files and directory structure necessary to start a Folding@Home project.

    MPI-enabled: rank 0 selects and ranks templates and prepares the System;
    RUN directories are then built round-robin across all ranks.

    Parameters
    ----------
    archive : Bool
        A .tgz compressed archive will be created for each individual RUN directory.
    """
    set_loglevel(loglevel)

    # Rank 0 creates the top-level FAH projects directory; everyone waits.
    if mpistate.rank == 0:
        if not os.path.exists(fah_projects_dir):
            os.mkdir(fah_projects_dir)
    mpistate.comm.Barrier()

    targets, templates_resolved_seq = get_targets_and_templates()

    for target in targets:
        # Honor an explicit target whitelist if one was provided.
        if process_only_these_targets:
            if target.id not in process_only_these_targets:
                continue

        target_project_dir = os.path.join(fah_projects_dir, target.id)

        # Skip targets that have no models on disk.
        models_target_dir = os.path.join(default_project_dirnames.models, target.id)
        if not os.path.exists(models_target_dir):
            continue

        mpistate.comm.Barrier()

        # Placeholders broadcast from rank 0 after selection/setup.
        ranked_templates = []
        system = None
        resnum_maps = {}

        if mpistate.rank == 0:
            logger.info('-------------------------------------------------------------------------')
            logger.info('Building FAH OpenMM project for target {}'.format(target.id))
            logger.info('-------------------------------------------------------------------------')

            # Filter templates by the validation/seqid criteria, then order
            # them by sequence identity to the target.
            candidate_templates = get_valid_templates_for_target(
                target,
                templates_resolved_seq,
                process_only_these_templates=process_only_these_templates,
                model_seqid_cutoff=model_seqid_cutoff,
                model_validation_score_cutoff=model_validation_score_cutoff,
                model_validation_score_percentile=model_validation_score_percentile
            )
            ranked_templates = sort_valid_templates_by_seqid(
                target,
                candidate_templates
            )

            create_target_project_dir(target)

            # The System/integrator files are derived from the top-ranked template.
            system = setup_system_and_integrator_files(
                target,
                ranked_templates[0],
                temperature,
                collision_rate,
                timestep
            )

            resnum_maps = get_renumbered_topol_resnums(target)

        # Share rank 0's selection and System with all ranks.
        ranked_templates = mpistate.comm.bcast(ranked_templates, root=0)
        system = mpistate.comm.bcast(system, root=0)
        resnum_maps = mpistate.comm.bcast(resnum_maps, root=0)

        logger.debug("Building RUNs in parallel...")

        # Round-robin the RUN directories across MPI ranks.
        for run_index in range(mpistate.rank, len(ranked_templates), mpistate.size):
            template = ranked_templates[run_index]

            logger.info('-------------------------------------------------------------------------')
            logger.info(
                'Building RUN{} for template {}'.format(
                    run_index, template
                )
            )
            logger.info('-------------------------------------------------------------------------')

            generate_fah_run(
                target_project_dir,
                template,
                os.path.join(models_target_dir, template),
                system,
                run_index,
                nclones,
                temperature,
                collision_rate,
                timestep,
                openmm_platform,
                resnum_maps,
            )

            if archive:
                tgz_fah_run(target, run_index)

    mpistate.comm.Barrier()
    if mpistate.rank == 0:
        logger.info('Done.')
Beispiel #37
0
def generate_fah_run(target_project_dir,
                     template,
                     source_dir,
                     system,
                     run_index,
                     nclones,
                     temperature,
                     collision_rate,
                     timestep,
                     openmm_platform,
                     renumbered_resnums,
                     ):
    """
    Build Folding@Home RUN and CLONE subdirectories from (possibly compressed) OpenMM serialized XML files.

    Parameters
    ----------
    target_project_dir : str
        Directory of the target's FAH project; RUN%d subdirectories are created here.
    template : str
        Template id; written to RUN%d/template.txt.
    source_dir : str
        Model directory containing implicit-refined.pdb.gz, explicit-refined.pdb.gz,
        explicit-state.xml and sequence-identity.txt.
    system : simtk.openmm.System
        System used to build a Context for velocity randomization.
    run_index : int
        RUN index.
    nclones : int
        Number of CLONEs; one state%d.xml is written per clone.
    renumbered_resnums : dict
        Optional {'implicit': ..., 'explicit': ...} residue renumbering maps;
        when absent, structures are copied through unchanged.

    Exceptions are logged and swallowed so that one failed RUN does not
    abort the other RUNs being built by this MPI rank.
    """
    logger.debug("Building RUN %d" % run_index)

    try:
        # Determine directory and pathnames.
        run_dir = os.path.join(target_project_dir, 'RUN%d' % run_index)
        run_template_id_filepath = os.path.join(run_dir, 'template.txt')
        run_seqid_filepath = os.path.join(run_dir, 'sequence-identity.txt')
        run_protein_structure_filepath = os.path.join(run_dir, 'protein.pdb')
        run_system_structure_filepath = os.path.join(run_dir, 'system.pdb')
        # The last clone's state file doubles as the "setup complete" marker.
        run_final_state_filepath = os.path.join(run_dir, 'state%d.xml' % (nclones - 1))
        source_seqid_filepath = os.path.join(source_dir, 'sequence-identity.txt')
        source_protein_structure_filepath = os.path.join(source_dir, 'implicit-refined.pdb.gz')
        source_system_structure_filepath = os.path.join(source_dir, 'explicit-refined.pdb.gz')
        source_openmm_state_filepath = os.path.join(source_dir, 'explicit-state.xml')

        # Return if this directory has already been fully set up;
        # otherwise make sure it exists.
        if os.path.exists(run_dir):
            expected_outputs = (
                run_template_id_filepath,
                run_seqid_filepath,
                run_protein_structure_filepath,
                run_system_structure_filepath,
                run_final_state_filepath,
            )
            if all(os.path.exists(filepath) for filepath in expected_outputs):
                return
        else:
            os.makedirs(run_dir)

        # Write template ID
        with open(run_template_id_filepath, 'w') as outfile:
            outfile.write(template + '\n')

        # Write the protein and system structure pdbs, renumbering residues
        # if a renumbering map was provided for that structure type.
        if 'implicit' in renumbered_resnums:
            write_renumbered_structure(
                source_protein_structure_filepath,
                run_protein_structure_filepath,
                renumbered_resnums['implicit'],
            )
        else:
            with open(run_protein_structure_filepath, 'w') as protein_structure_file:
                protein_structure_file.write(
                    read_file_contents_gz_or_not(source_protein_structure_filepath)
                )

        if 'explicit' in renumbered_resnums:
            write_renumbered_structure(
                source_system_structure_filepath,
                run_system_structure_filepath,
                renumbered_resnums['explicit'],
            )
        else:
            with open(run_system_structure_filepath, 'w') as system_structure_file:
                system_structure_file.write(
                    read_file_contents_gz_or_not(source_system_structure_filepath)
                )

        # Deserialize the explicit-solvent refinement end state; it provides
        # positions and box vectors for the clones.
        state = mm.XmlSerializer.deserialize(
            read_file_contents_gz_or_not(source_openmm_state_filepath)
        )

        # Write sequence identity.
        with open(run_seqid_filepath, 'w') as run_seqid_file:
            run_seqid_file.write(read_file_contents_gz_or_not(source_seqid_filepath))

        # Create new integrator to use.
        integrator = mm.LangevinIntegrator(temperature, collision_rate, timestep)

        # Create Context so we can randomize velocities.
        platform = mm.Platform.getPlatformByName(openmm_platform)
        context = mm.Context(system, integrator, platform)
        context.setPositions(state.getPositions())
        box_vectors = state.getPeriodicBoxVectors()
        context.setPeriodicBoxVectors(*box_vectors)

        # Create clones with different random initial velocities.
        for clone_index in range(nclones):
            state_filename = os.path.join(run_dir, 'state%d.xml' % clone_index)
            if os.path.exists(state_filename):
                continue
            context.setVelocitiesToTemperature(temperature)
            state = context.getState(
                getPositions=True,
                getVelocities=True,
                getForces=True,
                getEnergy=True,
                getParameters=True,
                enforcePeriodicBox=True
            )
            with open(state_filename, 'w') as state_file:
                state_file.write(mm.XmlSerializer.serialize(state))

    except Exception as e:
        # Log-and-continue (consistent with the module's logger-based error
        # handling) rather than printing to stdout, so failures are captured
        # even when stdout is not a console.
        logger.error(traceback.format_exc())
        logger.error(str(e))
Beispiel #38
0
def cluster_models(process_only_these_targets=None,
                   cutoff=0.06,
                   loglevel=None):
    """Cluster models based on RMSD, and filter out non-unique models as
    determined by a given cutoff.

    For each target, loads every available ``model.pdb.gz`` into a single
    mdtraj trajectory, clusters on alpha-carbon RMSD, marks the unique
    representatives with ``unique_by_clustering`` marker files, and writes
    ``unique-models.txt`` plus per-target project metadata.

    Parameters
    ----------
    process_only_these_targets : list of str, optional
        If given, only targets whose IDs appear in this list are processed.
    cutoff : float
        Minimum distance cutoff for RMSD clustering (nm)
    loglevel : str, optional
        Log level name passed to ensembler.utils.set_loglevel.

    Runs serially.
    """
    # TODO refactor
    ensembler.utils.set_loglevel(loglevel)
    targets, templates_resolved_seq = get_targets_and_templates()
    templates = templates_resolved_seq

    for target in targets:
        if process_only_these_targets and (target.id
                                           not in process_only_these_targets):
            continue

        models_target_dir = os.path.join(
            ensembler.core.default_project_dirnames.models, target.id)
        if not os.path.exists(models_target_dir):
            continue

        # =============================
        # Construct a mdtraj trajectory containing all models
        # =============================

        starttime = datetime.datetime.utcnow()

        logger.debug('Building a list of valid models...')

        model_pdbfilenames_compressed = {
            template.id: os.path.join(models_target_dir, template.id,
                                      'model.pdb.gz')
            for template in templates
        }
        model_pdbfilenames_uncompressed = {
            template.id: os.path.join(models_target_dir, template.id,
                                      'model.pdb')
            for template in templates
        }
        # A model is considered valid if its compressed model.pdb.gz exists.
        valid_templateids = [
            templateid for templateid in model_pdbfilenames_compressed
            if os.path.exists(model_pdbfilenames_compressed[templateid])
        ]

        # Write uncompressed model.pdb files from model.pdb.gz if necessary
        for templateid in valid_templateids:
            if not os.path.exists(
                    model_pdbfilenames_uncompressed[templateid]
            ) or os.path.getsize(
                    model_pdbfilenames_uncompressed[templateid]) == 0:
                # BUGFIX: open the gzip file in text mode ('rt'). The previous
                # default (binary) mode yields bytes, which raises TypeError
                # when written to the text-mode output file on Python 3.
                # (Other parts of this file already use gzip text modes.)
                with gzip.open(model_pdbfilenames_compressed[templateid],
                               'rt') as model_pdbfile_compressed:
                    with open(model_pdbfilenames_uncompressed[templateid],
                              'w') as model_pdbfile:
                        model_pdbfile.write(model_pdbfile_compressed.read())

        logger.info('Constructing a trajectory containing all valid models...')

        if len(valid_templateids) == 0:
            logger.info('No models found for target {0}.'.format(target.id))
            continue

        valid_model_pdbfilenames_uncompressed = [
            model_pdbfilenames_uncompressed[templateid]
            for templateid in valid_templateids
        ]

        traj = mdtraj.load(valid_model_pdbfilenames_uncompressed)

        # =============================
        # Clustering
        # =============================

        logger.info('Conducting RMSD-based clustering...')

        # Remove any existing unique_by_clustering files
        for f in glob.glob(models_target_dir + '/*_PK_*/unique_by_clustering'):
            os.unlink(f)

        # Cluster on alpha-carbon positions only.
        CAatoms = [a.index for a in traj.topology.atoms if a.name == 'CA']
        unique_templateids = models_regular_spatial_clustering(
            valid_templateids, traj, atom_indices=CAatoms, cutoff=cutoff)
        write_unique_by_clustering_files(unique_templateids, models_target_dir)

        with open(os.path.join(models_target_dir, 'unique-models.txt'),
                  'w') as uniques_file:
            for u in unique_templateids:
                uniques_file.write(u + '\n')
            logger.info(
                '%d unique models (from original set of %d) using cutoff of %.3f nm'
                % (len(unique_templateids), len(valid_templateids), cutoff))

        # Remove the temporary uncompressed model.pdb files now that the
        # trajectory has been built.
        for template in templates:
            model_dir = os.path.join(models_target_dir, template.id)
            model_pdbfilename = os.path.join(model_dir, 'model.pdb')
            if os.path.exists(model_pdbfilename):
                os.remove(model_pdbfilename)

        # ========
        # Metadata
        # ========

        project_metadata = ensembler.core.ProjectMetadata(
            project_stage='cluster_models', target_id=target.id)
        datestamp = ensembler.core.get_utcnow_formatted()

        timedelta = datetime.datetime.utcnow() - starttime

        metadata = {
            'target_id': target.id,
            'datestamp': datestamp,
            'nunique_models': len(unique_templateids),
            'python_version': sys.version.split('|')[0].strip(),
            'python_full_version': ensembler.core.literal_str(sys.version),
            'ensembler_version': ensembler.version.short_version,
            'ensembler_commit': ensembler.version.git_revision,
            'biopython_version': Bio.__version__,
            'mdtraj_version': mdtraj.version.short_version,
            'mdtraj_commit': mdtraj.version.git_revision,
            'timing': ensembler.core.strf_timedelta(timedelta),
        }

        project_metadata.add_data(metadata)
        project_metadata.write()
Beispiel #39
0
    def _gen_implicit_start_models(
            self,
            ff='amber99sbildn.xml', implicit_water_model='amber99_obc.xml',
            ph=8.0):
        """Generate protonated starting models from build_models-stage structures.

        Hydrogens are added at the given pH using protonation-state variants
        determined once from a single reference model (the highest-seqid
        existing model), so all generated models share a consistent topology.
        Templates whose output file already exists are skipped; the remaining
        work is striped across MPI ranks. Per-model failures are printed and
        skipped rather than aborting the whole run.
        """
        self.ph = ph
        from simtk.openmm import app

        # Templates for which a refine_implicit_md model file is present.
        candidates = []
        for tid in self.templateids:
            refined_path = os.path.join(
                self.models_target_dir, tid,
                ensembler.core.model_filenames_by_ensembler_stage['refine_implicit_md']
            )
            if os.path.exists(refined_path):
                candidates.append(tid)

        # Of those, the ones still lacking this stage's output file.
        pending = []
        for tid in candidates:
            out_path = os.path.join(self.models_target_dir, tid, self.model_filename)
            if not os.path.exists(out_path):
                pending.append(tid)

        # Build the reference model; its hydrogen variants are reused below.
        forcefield = app.ForceField(ff, implicit_water_model)
        ref_id = get_highest_seqid_existing_model(models_target_dir=self.models_target_dir)
        logger.debug('Using {0} as reference model'.format(ref_id))
        ref_path = os.path.join(
            self.models_target_dir, ref_id,
            model_filenames_by_ensembler_stage['build_models'])
        with gzip.open(ref_path) as ref_pdb_file:
            ref_pdb = app.PDBFile(ref_pdb_file)
        remove_disulfide_bonds_from_topology(ref_pdb.topology)
        ref_topology = ref_pdb.topology
        ref_modeller = app.Modeller(ref_pdb.topology, ref_pdb.positions)
        ref_variants = ref_modeller.addHydrogens(forcefield, pH=self.ph)

        # Stripe the remaining templates across MPI ranks.
        for tid in pending[mpistate.rank::mpistate.size]:
            logger.debug('Generating implicit-start model for {0}'.format(tid))

            try:
                in_filepath = os.path.join(
                    self.models_target_dir, tid,
                    model_filenames_by_ensembler_stage['build_models'])
                out_filepath = os.path.join(
                    self.models_target_dir, tid, self.model_filename)

                with gzip.open(in_filepath) as pdb_file:
                    pdb = app.PDBFile(pdb_file)

                remove_disulfide_bonds_from_topology(pdb.topology)
                # Reference topology + this model's positions keeps bonding
                # (and disulfide handling) identical across all models.
                modeller = app.Modeller(ref_topology, pdb.positions)
                modeller.addHydrogens(forcefield, pH=self.ph, variants=ref_variants)
                topology = modeller.getTopology()
                positions = modeller.getPositions()

                with gzip.open(out_filepath, 'wt') as out_file:
                    app.PDBFile.writeHeader(topology, file=out_file)
                    app.PDBFile.writeFile(topology, positions, file=out_file)
                    app.PDBFile.writeFooter(topology, file=out_file)

            except Exception as e:
                print('Error for model {0}: {1}'.format(tid, e))
                continue
Beispiel #40
0
    def simulate_implicit_md():
        """Run implicit-solvent MD refinement for a single model.

        Reads the model PDB, rebuilds it on the reference topology (to keep
        disulfide bonds consistent), minimizes, then runs ``niterations``
        Langevin iterations, logging energies to ``implicit-energies.txt`` and
        optionally writing a gzip'd PDB trajectory. The final state is written
        to ``pdb_filename``. Raises if potential or kinetic energy becomes NaN.

        NOTE: all configuration (forcefield, model_filename, temperature,
        niterations, kT, ...) comes from the enclosing scope — presumably the
        surrounding refine_implicit_md driver.
        """
        logger.debug("Reading model...")
        with gzip.open(model_filename) as model_file:
            pdb = app.PDBFile(model_file)

        # Set up Platform
        platform = openmm.Platform.getPlatformByName(openmm_platform)
        if 'CUDA_VISIBLE_DEVICES' not in os.environ:
            # Set GPU id.
            if openmm_platform == 'CUDA':
                platform.setPropertyDefaultValue('CudaDeviceIndex', '%d' % gpuid)
            elif openmm_platform == 'OpenCL':
                platform.setPropertyDefaultValue('OpenCLDeviceIndex', '%d' % gpuid)

        # Construct Modeller object with same topology as ref structure
        # (necessary to keep disulfide bonds consistent)
        modeller = app.Modeller(reference_topology, pdb.positions)
        # set_openmm_topology_bonds_from_atom_indices(modeller.topology, reference_bonds)
        # Add missing protons.
        modeller.addHydrogens(forcefield, pH=ph, variants=reference_variants)
        topology = modeller.getTopology()
        positions = modeller.getPositions()

        logger.debug("Constructing System object...")
        if cutoff is None:
            system = forcefield.createSystem(topology, nonbondedMethod=app.NoCutoff, constraints=app.HBonds)
        else:
            system = forcefield.createSystem(topology, nonbondedMethod=app.CutoffNonPeriodic, nonbondedCutoff=cutoff, constraints=app.HBonds)

        logger.debug("Creating Context...")
        integrator = openmm.LangevinIntegrator(temperature, collision_rate, timestep)
        context = openmm.Context(system, integrator, platform, platform_properties)
        context.setPositions(positions)

        logger.debug("Minimizing structure...")
        openmm.LocalEnergyMinimizer.minimize(context, minimization_tolerance, minimization_steps)

        if write_trajectory:
            # Open trajectory for writing.
            logger.debug("Opening trajectory for writing...")
            trajectory_filename = os.path.join(model_dir, 'implicit-trajectory.pdb.gz')
            # BUGFIX: open in text mode ('wt'). Binary-mode 'w' raises
            # TypeError when PDBFile writes str data on Python 3; the final
            # PDB file below already used 'wt'.
            trajectory_outfile = gzip.open(trajectory_filename, 'wt')
            app.PDBFile.writeHeader(topology, file=trajectory_outfile)

        # Open energy trajectory for writing
        energy_filename = os.path.join(model_dir, 'implicit-energies.txt')
        energy_outfile = open(energy_filename, 'w')
        energy_outfile.write('# iteration | simulation time (ps) | potential_energy (kT) | kinetic_energy (kT) | ns per day\n')

        logger.debug("Running dynamics...")
        import time
        initial_time = time.time()
        # NOTE(review): if niterations == 0, `state` is never assigned and the
        # final PDB write below would raise NameError — presumed unreachable
        # in practice; confirm against the enclosing driver's defaults.
        for iteration in range(niterations):
            # integrate dynamics
            integrator.step(nsteps_per_iteration)
            # get current state
            state = context.getState(getEnergy=True, getPositions=True)
            simulation_time = state.getTime()
            potential_energy = state.getPotentialEnergy()
            kinetic_energy = state.getKineticEnergy()
            final_time = time.time()
            elapsed_time = (final_time - initial_time) * unit.seconds
            ns_per_day = (simulation_time / elapsed_time) / (unit.nanoseconds / unit.day)
            logger.debug(
                "  %8.1f ps : potential %8.3f kT | kinetic %8.3f kT | %.3f ns/day | %.3f s remain"
                % (
                    simulation_time / unit.picoseconds, potential_energy / kT, kinetic_energy / kT,
                    ns_per_day,
                    elapsed_time * (niterations-iteration-1) / (iteration+1) / unit.seconds
                )
            )

            # Check energies are still finite.
            if np.isnan(potential_energy/kT) or np.isnan(kinetic_energy/kT):
                raise Exception("Potential or kinetic energies are nan.")

            if write_trajectory:
                app.PDBFile.writeModel(topology, state.getPositions(), file=trajectory_outfile, modelIndex=iteration)

            # write data
            energy_outfile.write("  %8d %8.1f %8.3f %8.3f %.3f\n" % (iteration, simulation_time / unit.picoseconds, potential_energy / kT, kinetic_energy / kT, ns_per_day))
            energy_outfile.flush()

        if write_trajectory:
            app.PDBFile.writeFooter(topology, file=trajectory_outfile)
            trajectory_outfile.close()

        energy_outfile.close()

        # Write final PDB file.
        pdb_outfile = gzip.open(pdb_filename, 'wt')
        app.PDBFile.writeHeader(topology, file=pdb_outfile)
        app.PDBFile.writeFile(topology, state.getPositions(), file=pdb_outfile)
        app.PDBFile.writeFooter(topology, file=pdb_outfile)
        pdb_outfile.close()
Beispiel #41
0
def generate_fah_run(
    target_project_dir,
    template,
    source_dir,
    system,
    run_index,
    nclones,
    temperature,
    collision_rate,
    timestep,
    openmm_platform,
    renumbered_resnums,
):
    """
    Build Folding@Home RUN and CLONE subdirectories from (possibly compressed) OpenMM serialized XML files.

    ARGUMENTS

    target_project_dir (str) - FAH project directory for the target; the
        RUN%d subdirectory is created here
    template (str) - template ID, written to the RUN's template.txt
    source_dir (str) - model directory holding the refined structures and
        the serialized explicit-solvent OpenMM state
    system (simtk.openmm.System) - system used to build a Context for
        randomizing clone velocities
    run_index (int) - run index
    nclones (int) - number of CLONEs; one state%d.xml is written per clone
    temperature - temperature at which initial velocities are drawn
    collision_rate - Langevin integrator collision rate
    timestep - Langevin integrator timestep
    openmm_platform (str) - OpenMM platform name, e.g. 'CUDA'
    renumbered_resnums (dict) - optional renumbered residue numbers keyed by
        'implicit'/'explicit'; when a key is present, the corresponding
        structure is written with renumbered residues

    Exceptions are caught and printed rather than re-raised, so one failed
    RUN does not abort setup of the remaining runs.
    """
    logger.debug("Building RUN %d" % run_index)

    try:
        # Determine directory and pathnames.
        run_dir = os.path.join(target_project_dir, 'RUN%d' % run_index)
        run_template_id_filepath = os.path.join(run_dir, 'template.txt')
        run_seqid_filepath = os.path.join(run_dir, 'sequence-identity.txt')
        run_protein_structure_filepath = os.path.join(run_dir, 'protein.pdb')
        run_system_structure_filepath = os.path.join(run_dir, 'system.pdb')
        run_final_state_filepath = os.path.join(run_dir,
                                                'state%d.xml' % (nclones - 1))
        source_seqid_filepath = os.path.join(source_dir,
                                             'sequence-identity.txt')
        source_protein_structure_filepath = os.path.join(
            source_dir, 'implicit-refined.pdb.gz')
        source_system_structure_filepath = os.path.join(
            source_dir, 'explicit-refined.pdb.gz')
        source_openmm_state_filepath = os.path.join(source_dir,
                                                    'explicit-state.xml')

        # Return if this directory has already been set up.
        if os.path.exists(run_dir):
            if (os.path.exists(run_template_id_filepath)
                    and os.path.exists(run_seqid_filepath)
                    and os.path.exists(run_protein_structure_filepath)
                    and os.path.exists(run_system_structure_filepath)
                    and os.path.exists(run_final_state_filepath)):
                return
        else:
            # Construct run directory, which does not exist yet.
            # (The previous redundant existence re-check has been removed:
            # this branch is only reached when run_dir does not exist.)
            os.makedirs(run_dir)

        # Write template ID
        with open(run_template_id_filepath, 'w') as outfile:
            outfile.write(template + '\n')

        # Write the protein and system structure pdbs, renumbering residues
        # if renumbered residue numbers were supplied.
        if 'implicit' in renumbered_resnums:
            write_renumbered_structure(
                source_protein_structure_filepath,
                run_protein_structure_filepath,
                renumbered_resnums['implicit'],
            )
        else:
            with open(run_protein_structure_filepath,
                      'w') as protein_structure_file:
                protein_structure_file.write(
                    read_file_contents_gz_or_not(
                        source_protein_structure_filepath))

        if 'explicit' in renumbered_resnums:
            write_renumbered_structure(
                source_system_structure_filepath,
                run_system_structure_filepath,
                renumbered_resnums['explicit'],
            )
        else:
            with open(run_system_structure_filepath,
                      'w') as system_structure_file:
                system_structure_file.write(
                    read_file_contents_gz_or_not(
                        source_system_structure_filepath))

        state = mm.XmlSerializer.deserialize(
            read_file_contents_gz_or_not(source_openmm_state_filepath))

        # Write sequence identity.
        with open(run_seqid_filepath, 'w') as run_seqid_file:
            run_seqid_file.write(
                read_file_contents_gz_or_not(source_seqid_filepath))

        # Create new integrator to use.
        integrator = mm.LangevinIntegrator(temperature, collision_rate,
                                           timestep)

        # Create Context so we can randomize velocities.
        platform = mm.Platform.getPlatformByName(openmm_platform)
        context = mm.Context(system, integrator, platform)
        context.setPositions(state.getPositions())
        box_vectors = state.getPeriodicBoxVectors()
        context.setPeriodicBoxVectors(*box_vectors)

        # Create clones with different random initial velocities.
        for clone_index in range(nclones):
            state_filename = os.path.join(run_dir, 'state%d.xml' % clone_index)
            if os.path.exists(state_filename):
                # Clone already generated (e.g. by a previous partial run).
                continue
            context.setVelocitiesToTemperature(temperature)
            state = context.getState(getPositions=True,
                                     getVelocities=True,
                                     getForces=True,
                                     getEnergy=True,
                                     getParameters=True,
                                     enforcePeriodicBox=True)
            with open(state_filename, 'w') as state_file:
                state_file.write(mm.XmlSerializer.serialize(state))

    except Exception as e:
        # Best-effort: report and continue with other runs.
        import traceback
        print(traceback.format_exc())
        print(str(e))
Beispiel #42
0
    def __init__(self, targetid, ensembler_stage=None, traj_filepath=None,
                 topol_filepath=None, models_data_filepath=None,
                 process_only_these_templates=None, loglevel=None,
                 run_main=True):
        """Makes a trajectory for a given target, using mdtraj. The trajectory can be used with other
        software, e.g. for visualization with PyMOL or VMD.

        Parameters
        ----------
        targetid : str
            e.g. 'EGFR_HUMAN_D0'
        ensembler_stage : str
            The Ensembler stage from which to build models, e.g. 'build_models' results in a trajectory
            built from the 'model.pdb.gz' files output by the build_models command.
            options: build_models|refine_implicit_md|refine_explicit_md
            default: most advanced stage for which model files are available
        traj_filepath : str
            default: models/[targetid]/traj-[ensembler_stage].xtc
        topol_filepath : str
            default: models/[targetid]/traj-[ensembler_stage]-topol.pdb
        models_data_filepath :
            default: models/[targetid]/traj-[ensembler_stage]-data.csv
        process_only_these_templates : list of str
        loglevel : str
            Log level name passed to ensembler.utils.set_loglevel.
        run_main : bool
            If True (default), immediately build, superpose and write the
            trajectory and models-data CSV.

        Returns
        -------
        traj : mdtraj.Trajectory
        df : pandas.DataFrame
            models data (e.g. sequence identities):

        """
        ensembler.utils.set_loglevel(loglevel)
        ensembler.core.check_project_toplevel_dir()
        self.models_target_dir = os.path.join(default_project_dirnames.models, targetid)

        logger.debug('Working on target %s' % targetid)

        # Default to the most advanced modeling stage with model files present.
        if ensembler_stage is None:
            self.ensembler_stage = get_most_advanced_ensembler_modeling_stage(targetid)
        else:
            self.ensembler_stage = ensembler_stage

        if traj_filepath is None:
            self.traj_filepath = os.path.join(
                self.models_target_dir, 'traj-{0}.xtc'.format(self.ensembler_stage)
            )
        else:
            self.traj_filepath = traj_filepath

        if topol_filepath is None:
            self.topol_filepath = os.path.join(
                self.models_target_dir, 'traj-{0}-topol.pdb'.format(self.ensembler_stage)
            )
        else:
            self.topol_filepath = topol_filepath

        if models_data_filepath is None:
            self.models_data_filepath = os.path.join(
                self.models_target_dir, 'traj-{0}-data.csv'.format(self.ensembler_stage)
            )
        else:
            self.models_data_filepath = models_data_filepath

        if process_only_these_templates:
            self.templateids = process_only_these_templates
        else:
            # Only the immediate subdirectory names (template IDs) are needed.
            # PERF: the previous code materialized os.walk() over the entire
            # models tree just to read the first tuple; taking only the first
            # walk entry avoids traversing every model subdirectory.
            self.templateids = next(os.walk(self.models_target_dir))[1]

        if run_main:
            self._gen_df()
            self.df.to_csv(self.models_data_filepath, columns=['templateid', 'seqid'])
            self._construct_traj()
            self._superpose()
            self._write_traj()