Beispiel #1
0
def log_unique_domain_names(uniprot_query_string, uniprotxml):
    """Log the unique domain names found in a UniProt search result.

    If the query string contains a quoted ``domain:`` selector (e.g.
    'domain:"Protein kinase" AND reviewed:yes'), only domain features whose
    description matches that selector (case-insensitive regex) are reported;
    otherwise all domain features in the XML are reported.

    Parameters
    ----------
    uniprot_query_string: str
        The query string used for the initial UniProt search.
    uniprotxml: lxml etree-like object
        Parsed UniProt search result (assumed lxml, from the `.xpath`
        extension-function usage).
    """
    # Example query string: 'domain:"Protein kinase" AND reviewed:yes'
    domain_match = re.search(r'domain:(["\'].*["\'])', uniprot_query_string)
    if domain_match and len(domain_match.groups()) > 0:
        # Strip the surrounding quotes from the captured selector text.
        query_string_domain_selection = domain_match.groups()[0].replace(
            '\'', '').replace('\"', '')
        uniprot_query_string_domains = uniprotxml.xpath(
            'entry/feature[@type="domain"][match_regex(@description, "%s")]' %
            query_string_domain_selection,
            extensions={
                (None, 'match_regex'):
                ensembler.core.xpath_match_regex_case_insensitive
            })
        # Set comprehension de-duplicates the description strings.
        uniprot_unique_domain_names = {
            domain.get('description')
            for domain in uniprot_query_string_domains
        }
        logger.info(
            'Set of unique domain names selected by the domain selector \'%s\' during the initial UniProt search:\n%s\n'
            % (query_string_domain_selection, uniprot_unique_domain_names))

    else:
        uniprot_domains = uniprotxml.xpath('entry/feature[@type="domain"]')
        uniprot_unique_domain_names = {
            domain.get('description') for domain in uniprot_domains
        }
        logger.info(
            'Set of unique domain names returned from the initial UniProt search using the query string \'%s\':\n%s\n'
            % (uniprot_query_string, uniprot_unique_domain_names))
Beispiel #2
0
def loopmodel_templates(templates,
                        missing_residues,
                        process_only_these_templates=None,
                        overwrite_structures=False):
    """Model missing loops for templates, distributing work over MPI ranks.

    Each rank processes templates round-robin (rank, rank + size, ...).

    Parameters
    ----------
    templates:  list of BioPython SeqRecord
        only the id is used
    missing_residues: list of list of OpenMM Residue
        Indexed in parallel with ``templates``.
    process_only_these_templates: list of str or None
        If given, only templates whose id appears here are processed.
        (Previously mis-documented as bool.)
    overwrite_structures: bool
        Passed through to ``loopmodel_template``.
    """
    # Build the membership set once rather than scanning a list per template.
    selected_ids = (set(process_only_these_templates)
                    if process_only_these_templates else None)
    for template_index in range(mpistate.rank, len(templates), mpistate.size):
        template = templates[template_index]
        if selected_ids is not None and template.id not in selected_ids:
            continue
        if mpistate.size > 1:
            logger.info('MPI rank %d modeling missing loops for template %s' %
                        (mpistate.rank, template.id))
        else:
            logger.info('Modeling missing loops for template %s' % template.id)
        loopmodel_template(template,
                           missing_residues[template_index],
                           overwrite_structures=overwrite_structures)
Beispiel #3
0
def molprobity_validation_multiple_targets(targetids=None, modeling_stage=None, loglevel=None):
    """
Calculate model quality using MolProbity ``oneline-analysis`` command.

For each target, this function outputs a text file named
``models/[targetid]/validation_scores_sorted-[method]-[ensembler_stage]`` which contains a list of
targetids sorted by validation score. This can be used by the subsequent ``package_models`` command
to filter out models below a specified quality threshold.

Typically, this should be run after models have been refined to the desired extent (e.g. after
implicit or explicit MD refinement)

More detailed validation results are written to the individual model directories.

MPI-enabled.

    Parameters
    ----------
    targetids: list of str or str
        None selects all targets from the project.
    modeling_stage: str
        {None|build_models|refine_implicit_md|refine_explicit_md}
        Default: None (automatically selects most advanced stage)
    loglevel: str or None
        Passed to ``set_loglevel``.
    """
    set_loglevel(loglevel)
    if targetids is None:
        targetids = [target.id for target in get_targets()]
    elif isinstance(targetids, str):
        # Accept a bare string as a single-target convenience.
        # (isinstance replaces the non-idiomatic `type(targetids) is str`.)
        targetids = [targetids]
    for targetid in targetids:
        logger.info("Working on target {}".format(targetid))
        molprobity_validation(targetid=targetid, ensembler_stage=modeling_stage, loglevel=loglevel)
Beispiel #4
0
def molprobity_validation_multiple_targets(targetids=None,
                                           modeling_stage=None,
                                           loglevel=None):
    """
Calculate model quality using MolProbity ``oneline-analysis`` command.

For each target, this function outputs a text file named
``models/[targetid]/validation_scores_sorted-[method]-[ensembler_stage]`` which contains a list of
targetids sorted by validation score. This can be used by the subsequent ``package_models`` command
to filter out models below a specified quality threshold.

Typically, this should be run after models have been refined to the desired extent (e.g. after
implicit or explicit MD refinement)

More detailed validation results are written to the individual model directories.

MPI-enabled.

    Parameters
    ----------
    targetids: list of str or str
    modeling_stage: str
        {None|build_models|refine_implicit_md|refine_explicit_md}
        Default: None (automatically selects most advanced stage)
    """
    set_loglevel(loglevel)
    # Normalize targetids into a list: default to every project target,
    # and accept a single target id given as a bare string.
    if targetids is None:
        targetids = [t.id for t in get_targets()]
    elif type(targetids) is str:
        targetids = [targetids]
    for tid in targetids:
        logger.info('Working on target {}'.format(tid))
        molprobity_validation(targetid=tid,
                              ensembler_stage=modeling_stage,
                              loglevel=loglevel)
Beispiel #5
0
def log_unique_domain_names_selected_by_regex(uniprot_domain_regex, uniprotxml):
    """Log the unique domain names matched by a case-sensitive regex.

    Parameters
    ----------
    uniprot_domain_regex: str
        Regex applied case-sensitively to each domain feature description.
    uniprotxml: lxml etree-like object
        Parsed UniProt search result (assumed lxml, from the `.xpath`
        extension-function usage).
    """
    regex_matched_domains = uniprotxml.xpath(
        'entry/feature[@type="domain"][match_regex(@description, "%s")]' % uniprot_domain_regex,
        extensions={(None, 'match_regex'): ensembler.core.xpath_match_regex_case_sensitive}
    )
    # Set comprehension de-duplicates description strings (was set([...])).
    regex_matched_domains_unique_names = {
        domain.get('description') for domain in regex_matched_domains
    }
    logger.info('Unique domain names selected after searching with the case-sensitive regex string \'%s\':\n%s\n'
        % (uniprot_domain_regex, regex_matched_domains_unique_names))
Beispiel #6
0
def align_targets_and_templates(process_only_these_targets=None, process_only_these_templates=None, loglevel=None):
    """
    Conducts pairwise alignments of target sequences against template sequences.
    Stores Modeller-compatible 'alignment.pir' files in each model directory,
    and also outputs a table of model IDs, sorted by sequence identity.

    MPI-parallelized: each rank aligns a round-robin subset of templates;
    per-template sequence-identity records are gathered on rank 0,
    interleaved back into template order, and broadcast before sorting.

    :param process_only_these_targets: optional collection of target ids; other targets are skipped
    :param process_only_these_templates: optional collection of template ids; other templates are skipped
    :param loglevel: optional log level passed to ensembler.utils.set_loglevel
    :return: None (results are written to disk)
    """
    ensembler.utils.set_loglevel(loglevel)
    targets, templates_resolved_seq = ensembler.core.get_targets_and_templates()
    ntemplates = len(templates_resolved_seq)
    nselected_templates = len(process_only_these_templates) if process_only_these_templates else ntemplates
    for target in targets:
        if process_only_these_targets and target.id not in process_only_these_targets: continue

        if mpistate.rank == 0:
            logger.info('Working on target %s...' % target.id)

        models_target_dir = os.path.join(ensembler.core.default_project_dirnames.models, target.id)
        ensembler.utils.create_dir(models_target_dir)

        seq_identity_data_sublist = []

        # Round-robin distribution of template indices across MPI ranks.
        for template_index in range(mpistate.rank, ntemplates, mpistate.size):
            template_id = templates_resolved_seq[template_index].id
            # Prefer the loop-remodeled (pdbfixed) template sequence when one exists.
            if os.path.exists(os.path.join(ensembler.core.default_project_dirnames.templates_structures_modeled_loops, template_id + '.pdb')):
                remodeled_seq_filepath = os.path.join(ensembler.core.default_project_dirnames.templates_structures_modeled_loops, template_id + '-pdbfixed.fasta')
                template = list(Bio.SeqIO.parse(remodeled_seq_filepath, 'fasta'))[0]
            else:
                template = templates_resolved_seq[template_index]

            if process_only_these_templates and template_id not in process_only_these_templates: continue

            model_dir = os.path.abspath(os.path.join(ensembler.core.default_project_dirnames.models, target.id, template_id))
            ensembler.utils.create_dir(model_dir)
            aln = align_target_template(target, template)
            aln_filepath = os.path.join(model_dir, 'alignment.pir')
            write_modeller_pir_aln_file(aln, target, template, pir_aln_filepath=aln_filepath)
            seq_identity_data_sublist.append({
                'templateid': template_id,
                'seq_identity': calculate_seq_identity(aln),
            })

        # Gather each rank's sublist on root. Rank r contributed template
        # indices r, r+size, r+2*size, ..., so element i of the full list is
        # sublist [i % size] position [i // size].
        # NOTE(review): this reassembly assumes every rank's sublist is dense;
        # when process_only_these_templates skips templates unevenly across
        # ranks, the indexing may misalign — confirm for that case.
        seq_identity_data_gathered = mpistate.comm.gather(seq_identity_data_sublist, root=0)

        seq_identity_data = []
        if mpistate.rank == 0:
            seq_identity_data = [None] * nselected_templates
            for i in range(nselected_templates):
                seq_identity_data[i] = seq_identity_data_gathered[i % mpistate.size][i // mpistate.size]

        seq_identity_data = mpistate.comm.bcast(seq_identity_data, root=0)

        # Highest sequence identity first.
        seq_identity_data = sorted(seq_identity_data, key=lambda x: x['seq_identity'], reverse=True)
        write_sorted_seq_identities(target, seq_identity_data)
def dep_extract_template_pdbchains_from_uniprot_xml(uniprotxml, uniprot_domain_regex=None, manual_overrides=None, specified_pdbids=None, specified_chainids=None):
    """Select PDB chains covering domain features from a UniProt XML document.

    For each UniProt entry, domain features are selected (optionally filtered
    by a case-sensitive regex on the description), then cross-referenced PDB
    entries solved by X-ray or NMR are examined; a chain is selected when its
    span covers the domain span to within 30 residues at each end.

    Parameters
    ----------
    uniprotxml: lxml etree-like object
        Parsed UniProt XML (assumed lxml, from the `.xpath`/`getparent` usage).
    uniprot_domain_regex: str or None
        Case-sensitive regex filtering domain descriptions.
    manual_overrides: object or None
        Project overrides: per-domain span strings ('begin-end'), min/max
        domain length, and PDB ids to skip.
    specified_pdbids: collection of str or None
        If given, only these PDB ids are considered.
    specified_chainids: dict or None
        Maps pdbid -> allowed chain ids (an empty list means no restriction).

    Returns
    -------
    list of dict
        Each with keys 'templateid', 'pdbid', 'chainid', 'domain_span'.
    """
    selected_pdbchains = []
    all_uniprot_entries = uniprotxml.findall('entry')
    for entry in all_uniprot_entries:
        entry_name = entry.find('name').text
        if uniprot_domain_regex:
            selected_domains = entry.xpath(
                'feature[@type="domain"][match_regex(@description, "%s")]' % uniprot_domain_regex,
                extensions={(None, 'match_regex'): ensembler.core.xpath_match_regex_case_sensitive}
            )
        else:
            selected_domains = entry.findall('feature[@type="domain"]')

        domain_iter = 0
        for domain in selected_domains:
            domain_id = '%s_D%d' % (entry_name, domain_iter)
            domain_span = [int(domain.find('location/begin').get('position')), int(domain.find('location/end').get('position'))]
            # Manual override span strings look like 'begin-end'.
            if manual_overrides and domain_id in manual_overrides.template.domain_spans:
                domain_span = [int(x) for x in manual_overrides.template.domain_spans[domain_id].split('-')]
            domain_len = domain_span[1] - domain_span[0] + 1
            # Skip domains outside the allowed length range, if one is configured.
            if manual_overrides and manual_overrides.template.min_domain_len is not None and domain_len < manual_overrides.template.min_domain_len:
                continue
            if manual_overrides and manual_overrides.template.max_domain_len is not None and domain_len > manual_overrides.template.max_domain_len:
                continue

            domain_iter += 1
            # Only experimentally determined structures (X-ray or NMR) qualify.
            pdbs = domain.getparent().xpath(
                'dbReference[@type="PDB"]/property[@type="method"][@value="X-ray" or @value="NMR"]/..')

            for pdb in pdbs:
                pdbid = pdb.get('id')
                if manual_overrides and pdbid in manual_overrides.template.skip_pdbs:
                    continue
                if specified_pdbids and pdbid not in specified_pdbids:
                    continue
                pdb_chain_span_nodes = pdb.findall('property[@type="chains"]')

                for PDB_chain_span_node in pdb_chain_span_nodes:
                    chain_span_string = PDB_chain_span_node.get('value')
                    chain_spans = ensembler.UniProt.parse_uniprot_pdbref_chains(chain_span_string)

                    for chainid in chain_spans:
                        if specified_chainids and len(specified_chainids[pdbid]) > 0 and chainid not in specified_chainids[pdbid]:
                            continue
                        span = chain_spans[chainid]
                        # Chain must cover the domain to within 30 residues at
                        # each end. (Was a bitwise '&' on two bools; 'and' is
                        # the idiomatic, short-circuiting equivalent.)
                        if span[0] < domain_span[0] + 30 and span[1] > domain_span[1] - 30:
                            templateid = '%s_%s_%s' % (domain_id, pdbid, chainid)
                            data = {
                                'templateid': templateid,
                                'pdbid': pdbid,
                                'chainid': chainid,
                                'domain_span': domain_span
                            }
                            selected_pdbchains.append(data)
    logger.info('%d PDB chains selected.' % len(selected_pdbchains))
    return selected_pdbchains
Beispiel #8
0
def auto_select_openmm_platform():
    """Return the name of the first available OpenMM platform.

    Platforms are tried in the order CUDA, OpenCL, CPU, Reference.

    Returns
    -------
    str
        The selected platform name.

    Raises
    ------
    Exception
        If no OpenMM platform can be instantiated.
    """
    for platform_name in ['CUDA', 'OpenCL', 'CPU', 'Reference']:
        try:
            platform = openmm.Platform.getPlatformByName(platform_name)
            # isinstance is the idiomatic type check (was: type(x) == T).
            if isinstance(platform, openmm.Platform):
                logger.info('Auto-selected OpenMM platform: %s' % platform_name)
                return platform_name
        except Exception:
            # Platform not available on this host; try the next candidate.
            continue
    raise Exception('No OpenMM platform found')
Beispiel #9
0
def extract_template_structures_from_pdb_files(selected_templates):
    """Extract each selected template's resolved residues from its source PDB.

    Reads gzipped PDB files from the project's structures directory and
    writes one '.pdb' file per template into templates/structures-resolved.
    """
    logger.info('Writing template structures...')
    # Hoist the directory lookups out of the loop; they do not change.
    pdb_dir = ensembler.core.default_project_dirnames.structures_pdb
    resolved_dir = ensembler.core.default_project_dirnames.templates_structures_resolved
    for template in selected_templates:
        source_pdb_path = os.path.join(pdb_dir, template.pdbid + '.pdb.gz')
        output_path = os.path.join(resolved_dir, template.templateid + '.pdb')
        ensembler.pdb.extract_residues_by_resnum(output_path, source_pdb_path,
                                                 template)
Beispiel #10
0
def extract_template_pdb_chain_residues(selected_pdbchains):
    """Extract template sequence data from PDB chains on the MPI root rank,
    then broadcast the resulting list to all ranks."""
    selected_templates = None
    if mpistate.rank == 0:
        logger.info('Extracting residues from PDB chains...')
        extracted = [extract_pdb_template_seq(chain) for chain in selected_pdbchains]
        # Drop chains for which extraction produced nothing.
        selected_templates = [seq for seq in extracted if seq is not None]
        logger.info('%d templates selected.\n' % len(selected_templates))
    return mpistate.comm.bcast(selected_templates, root=0)
Beispiel #11
0
def get_renumbered_topol_resnums(target):
    """Return residue numbers from renumbered topology PDBs for a target.

    Looks for 'topol-renumbered-implicit.pdb' and 'topol-renumbered-explicit.pdb'
    in the target's models directory; missing files are simply skipped.
    Returns a dict mapping topology type to a list of residue numbers.
    """
    target_dir = os.path.join(default_project_dirnames.models, target.id)
    renumbered = {}
    for solvent_kind in ('implicit', 'explicit'):
        pdb_path = os.path.join(target_dir, 'topol-renumbered-{}.pdb'.format(solvent_kind))
        if os.path.exists(pdb_path):
            topology = mdtraj.load_pdb(pdb_path).top
            renumbered[solvent_kind] = [residue.resSeq for residue in topology.residues]
            logger.info('Will use renumbered residues from {} for target {}'.format(pdb_path, target.id))
    return renumbered
Beispiel #12
0
def extract_template_pdb_chain_residues(selected_pdbchains):
    """On MPI rank 0, pull template sequence data out of each PDB chain and
    keep the non-empty results; every rank receives the list via broadcast."""
    selected_templates = None
    if mpistate.rank == 0:
        logger.info('Extracting residues from PDB chains...')
        selected_templates = []
        for pdbchain in selected_pdbchains:
            seq_data = extract_pdb_template_seq(pdbchain)
            if seq_data is None:
                # Extraction failed for this chain; skip it.
                continue
            selected_templates.append(seq_data)
        logger.info('%d templates selected.\n' % len(selected_templates))
    selected_templates = mpistate.comm.bcast(selected_templates, root=0)
    return selected_templates
Beispiel #13
0
def create_dir(dirpath, quiet=True):
    """Create a directory (including parents), tolerating prior existence.

    :param dirpath: str
        Path of the directory to create.
    :param quiet: bool
        If False, log an info message on successful creation.
    :raises OSError: for any failure other than the directory already existing.
    """
    import errno  # stdlib; local import keeps this block self-contained
    try:
        os.makedirs(dirpath)
        if not quiet:
            logger.info('Created directory "%s"' % dirpath)
    except OSError as e:
        # errno.EEXIST replaces the hard-coded magic number 17.
        if e.errno == errno.EEXIST:
            logger.debug('Directory "%s" already exists - will not overwrite' % dirpath)
        else:
            raise
Beispiel #14
0
def create_dir(dirpath, quiet=True):
    """Make `dirpath` (and any missing parents); an already-existing
    directory is not an error.

    :param dirpath: str
    """
    try:
        os.makedirs(dirpath)
        if not quiet:
            logger.info('Created directory "%s"' % dirpath)
    except OSError as err:
        # errno 17 == EEXIST: the directory is already there.
        if err.errno != 17:
            raise
        logger.debug('Directory "%s" already exists - will not overwrite' % dirpath)
Beispiel #15
0
def log_unique_domain_names_selected_by_regex(uniprot_domain_regex,
                                              uniprotxml):
    """Report the unique domain names matched by a case-sensitive regex
    search over the UniProt XML's domain features."""
    xpath_query = (
        'entry/feature[@type="domain"][match_regex(@description, "%s")]' %
        uniprot_domain_regex)
    xpath_extensions = {
        (None, 'match_regex'):
        ensembler.core.xpath_match_regex_case_sensitive}
    matched_domains = uniprotxml.xpath(xpath_query,
                                       extensions=xpath_extensions)
    unique_names = set(
        [feature.get('description') for feature in matched_domains])
    logger.info(
        'Unique domain names selected after searching with the case-sensitive regex string \'%s\':\n%s\n'
        % (uniprot_domain_regex, unique_names))
Beispiel #16
0
def build_models_target_setup(target):
    """On the MPI root, record the per-target start time and models
    directory, then broadcast the resulting TargetSetupData to all ranks."""
    target_setup_data = None
    if mpistate.rank == 0:
        target_dir = os.path.join(
            ensembler.core.default_project_dirnames.models, target.id)
        started_at = datetime.datetime.utcnow()
        banner = (
            '=========================================================================\n'
            'Working on target "%s"\n'
            '========================================================================='
            % target.id)
        logger.info(banner)
        target_setup_data = TargetSetupData(
            target_starttime=started_at,
            models_target_dir=target_dir)
    return mpistate.comm.bcast(target_setup_data, root=0)
Beispiel #17
0
def get_renumbered_topol_resnums(target):
    """Collect renumbered residue numbers for a target from its
    'topol-renumbered-*.pdb' files, keyed by topology type."""
    models_target_dir = os.path.join(default_project_dirnames.models,
                                     target.id)
    result = {}
    for topol_kind in ['implicit', 'explicit']:
        path = os.path.join(models_target_dir,
                            'topol-renumbered-{}.pdb'.format(topol_kind))
        if not os.path.exists(path):
            # No renumbered topology of this kind; nothing to record.
            continue
        topology = mdtraj.load_pdb(path).top
        result[topol_kind] = [residue.resSeq
                              for residue in topology.residues]
        logger.info(
            'Will use renumbered residues from {} for target {}'.format(
                path, target.id))
    return result
Beispiel #18
0
    def _gather_targets(self):
        """Query UniProt for targets, log domain summaries, and write the
        target FASTA file plus project metadata."""
        logger.info('Querying UniProt web server...')

        xml_kwargs = {}
        if self._save_uniprot_xml:
            xml_kwargs['write_to_filepath'] = 'targets-uniprot.xml'

        uniprotxml = ensembler.UniProt.get_uniprot_xml(
            self.uniprot_query_string, **xml_kwargs)

        logger.info('Number of entries returned from initial UniProt search: %r\n' % len(uniprotxml))
        log_unique_domain_names(self.uniprot_query_string, uniprotxml)
        if self.uniprot_domain_regex is not None:
            log_unique_domain_names_selected_by_regex(self.uniprot_domain_regex, uniprotxml)

        self.targets = self._extract_targets_from_uniprot_xml(uniprotxml)
        fasta_ofilepath = os.path.join(ensembler.core.default_project_dirnames.targets, 'targets.fa')
        Bio.SeqIO.write(self.targets, fasta_ofilepath, 'fasta')
        self._write_metadata()
Beispiel #19
0
def build_models_target_setup(target):
    """Initialize per-target modeling state (start time and models directory)
    on MPI rank 0 and share it with every rank via broadcast."""
    target_setup_data = None
    if mpistate.rank == 0:
        logger.info(
            '=========================================================================\n'
            'Working on target "%s"\n'
            '========================================================================='
            % target.id
        )
        target_setup_data = TargetSetupData(
            target_starttime=datetime.datetime.utcnow(),
            models_target_dir=os.path.join(
                ensembler.core.default_project_dirnames.models, target.id)
        )
    target_setup_data = mpistate.comm.bcast(target_setup_data, root=0)
    return target_setup_data
Beispiel #20
0
def loopmodel_templates(templates, missing_residues, process_only_these_templates=None, overwrite_structures=False):
    """Run loop modeling for the subset of templates owned by this MPI rank.

    templates: list of BioPython SeqRecord (only the id is used)
    missing_residues: list of list of OpenMM Residue, parallel to templates
    process_only_these_templates: optional collection of template ids
    overwrite_structures: bool
    """
    # This rank owns template indices rank, rank+size, rank+2*size, ...
    owned_indices = range(mpistate.rank, len(templates), mpistate.size)
    for idx in owned_indices:
        template = templates[idx]
        skip = (process_only_these_templates
                and template.id not in process_only_these_templates)
        if skip:
            continue
        if mpistate.size > 1:
            message = ('MPI rank %d modeling missing loops for template %s'
                       % (mpistate.rank, template.id))
        else:
            message = 'Modeling missing loops for template %s' % template.id
        logger.info(message)
        loopmodel_template(template, missing_residues[idx],
                           overwrite_structures=overwrite_structures)
Beispiel #21
0
def log_unique_domain_names(uniprot_query_string, uniprotxml):
    """Log the set of unique domain names in a UniProt search result,
    restricted to a quoted ``domain:`` selector when the query contains one."""
    # Example query string: 'domain:"Protein kinase" AND reviewed:yes'
    domain_match = re.search('domain:([\"\'].*[\"\'])', uniprot_query_string)
    if not (domain_match and len(domain_match.groups()) > 0):
        # No domain selector: report every domain feature in the XML.
        all_domains = uniprotxml.xpath('entry/feature[@type="domain"]')
        unique_names = set([feature.get('description') for feature in all_domains])
        logger.info('Set of unique domain names returned from the initial UniProt search using the query string \'%s\':\n%s\n'
                    % (uniprot_query_string, unique_names))
        return

    # Strip the quotes from the captured selector text.
    selector = domain_match.groups()[0].replace('\'', '').replace('\"', '')
    selected_domains = uniprotxml.xpath(
        'entry/feature[@type="domain"][match_regex(@description, "%s")]' % selector,
        extensions={
            (None, 'match_regex'): ensembler.core.xpath_match_regex_case_insensitive
        }
    )
    unique_names = set([feature.get('description') for feature in selected_domains])
    logger.info('Set of unique domain names selected by the domain selector \'%s\' during the initial UniProt search:\n%s\n'
                % (selector, unique_names))
Beispiel #22
0
    def _gather_targets(self, write_output_files=True):
        """Query UniProt for targets and extract them from the XML; when
        requested, also write the target FASTA file and project metadata."""
        logger.info('Querying UniProt web server...')

        xml_kwargs = {}
        if self._save_uniprot_xml:
            xml_kwargs['write_to_filepath'] = 'targets-uniprot.xml'

        self.uniprotxml = ensembler.uniprot.get_uniprot_xml(
            self.uniprot_query_string, **xml_kwargs)

        logger.info(
            'Number of entries returned from initial UniProt search: %r\n' %
            len(self.uniprotxml))
        log_unique_domain_names(self.uniprot_query_string, self.uniprotxml)
        if self.uniprot_domain_regex:
            log_unique_domain_names_selected_by_regex(
                self.uniprot_domain_regex, self.uniprotxml)

        self._extract_targets_from_uniprot_xml()
        if write_output_files:
            fasta_ofilepath = os.path.join(
                ensembler.core.default_project_dirnames.targets, 'targets.fa')
            Bio.SeqIO.write(self.targets, fasta_ofilepath, 'fasta')
            self._write_metadata()
Beispiel #23
0
def build_model(target, template_resolved_seq, target_setup_data,
                write_modeller_restraints_file=False, loglevel=None):
    """Uses Modeller to build a homology model for a given target and
    template.

    Will not run Modeller if the output files already exist.

    Parameters
    ----------
    target : BioPython SeqRecord
    template_resolved_seq : BioPython SeqRecord
        Must be a corresponding .pdb template file with the same ID in the
        templates/structures directory.
    target_setup_data : TargetSetupData obj
    write_modeller_restraints_file : bool
        Write file containing restraints used by Modeller - note that this file can be relatively
        large, e.g. ~300KB per model for a protein kinase domain target.
    loglevel : str or None
        Passed to ensembler.utils.set_loglevel.
    """
    ensembler.utils.set_loglevel(loglevel)

    template_structure_dir = os.path.abspath(
        ensembler.core.default_project_dirnames.templates_structures_modeled_loops
    )

    # Prefer the loop-remodeled (pdbfixed) template sequence/structure when
    # one exists; otherwise fall back to the resolved structure directory.
    if os.path.exists(os.path.join(template_structure_dir, template_resolved_seq.id + '.pdb')):
        remodeled_seq_filepath = os.path.join(
            ensembler.core.default_project_dirnames.templates_structures_modeled_loops,
            template_resolved_seq.id + '-pdbfixed.fasta'
        )
        template = list(Bio.SeqIO.parse(remodeled_seq_filepath, 'fasta'))[0]
    else:
        template = template_resolved_seq
        template_structure_dir = os.path.abspath(
            ensembler.core.default_project_dirnames.templates_structures_resolved
        )

    model_dir = os.path.abspath(os.path.join(target_setup_data.models_target_dir, template.id))
    if not os.path.exists(model_dir):
        ensembler.utils.create_dir(model_dir)
    model_pdbfilepath = os.path.abspath(os.path.join(model_dir, 'model.pdb.gz'))
    modeling_log_filepath = os.path.abspath(os.path.join(model_dir, 'modeling-log.yaml'))

    check_model_pdbfilepath_ends_in_pdbgz(model_pdbfilepath)
    # Strip the '.gz' suffix for the uncompressed output path.
    model_pdbfilepath_uncompressed = model_pdbfilepath[:-3]

    # Idempotency guard: never overwrite a completed model.
    if check_all_model_files_present(model_dir):
        logger.debug(
            "Output files already exist for target '%s' // template '%s'; files were not overwritten." %
            (target.id, template.id)
        )
        return

    logger.info(
        '-------------------------------------------------------------------------\n'
        'Modelling "%s" => "%s"\n'
        '-------------------------------------------------------------------------'
        % (target.id, template.id)
    )

    # The alignment is expected to have been written previously (see the
    # commented-out calls below); only the existing alignment.pir is used here.
    # aln = align_target_template(target, template)
    aln_filepath = os.path.abspath(os.path.join(model_dir, 'alignment.pir'))
    # write_modeller_pir_aln_file(aln, target, template, pir_aln_filepath=aln_filepath)
    log_file = init_build_model_logfile(modeling_log_filepath)

    # Modeller writes scratch files into the CWD, so run inside a temp dir.
    with ensembler.utils.enter_temp_dir():
        try:
            start = datetime.datetime.utcnow()
            shutil.copy(aln_filepath, 'alignment.pir')
            run_modeller(target, template, model_dir, model_pdbfilepath,
                         model_pdbfilepath_uncompressed, template_structure_dir,
                         write_modeller_restraints_file=write_modeller_restraints_file)
            if os.path.getsize(model_pdbfilepath) < 1:
                raise Exception('Output PDB file is empty.')

            end_successful_build_model_logfile(log_file, start)

        except Exception as e:
            # Failures are recorded in the per-model log rather than raised,
            # so one bad model does not abort the whole batch.
            end_exception_build_model_logfile(e, log_file)
 def _finish(self):
     """Log completion of this pipeline stage."""
     logger.info('Done.')
Beispiel #25
0
def download_sifts_file(pdbid, project_sifts_filepath):
    """Fetch the SIFTS file for a PDB entry and store it gzip-compressed
    at the given project path."""
    logger.info('Downloading sifts file for: %s', pdbid)
    sifts_content = ensembler.pdb.retrieve_sifts(pdbid)
    with gzip.open(project_sifts_filepath, 'wb') as out_file:
        out_file.write(sifts_content)
Beispiel #26
0
def refine_implicit_md(
        openmm_platform=None, gpupn=1, process_only_these_targets=None,
        process_only_these_templates=None, model_seqid_cutoff=None,
        write_trajectory=False,
        include_disulfide_bonds=False,
        custom_residue_variants=None,
        ff='amber99sbildn',
        implicit_water_model='amber99_obc',
        sim_length=100.0 * unit.picoseconds,
        timestep=2.0 * unit.femtoseconds,             # timestep
        temperature=300.0 * unit.kelvin,              # simulation temperature
        collision_rate=20.0 / unit.picoseconds,       # Langevin collision rate
        cutoff=None,                                  # nonbonded cutoff
        minimization_tolerance=10.0 * unit.kilojoules_per_mole / unit.nanometer,
        minimization_steps=20,
        nsteps_per_iteration=500,
        ph=None,
        retry_failed_runs=False,
        cpu_platform_threads=1,
        loglevel=None):
    # TODO - refactor
    """Run MD refinement in implicit solvent.

    MPI-enabled: templates are distributed round-robin over MPI ranks
    (stride = mpistate.size), and each rank uses GPU id (rank % gpupn).

    Parameters
    ----------
    openmm_platform: str or None
        OpenMM platform name ('CUDA', 'OpenCL', 'CPU'); auto-selected via
        auto_select_openmm_platform() when None.
    gpupn: int
        Number of GPUs per node, used for round-robin GPU assignment.
    process_only_these_targets: list of str or None
        If given, only targets whose ids appear in this list are processed.
    process_only_these_templates: list of str or None
        If given, only templates whose ids appear in this list are simulated.
    model_seqid_cutoff: float or None
        If given, templates are re-selected per target by sequence identity.
    write_trajectory: bool
        If True, a gzipped PDB trajectory is written for each simulation.
    include_disulfide_bonds: bool
        If False, disulfide bonds are removed from the reference topology.
    custom_residue_variants: dict or None
        Per-target residue-variant overrides; defaults to the project's
        manual overrides file.
    ff: str
        Force field XML basename (without '.xml').
    implicit_water_model: str
        Implicit-solvent XML basename (without '.xml').
    sim_length, timestep, temperature, collision_rate, cutoff,
    minimization_tolerance: simtk.unit.Quantity
        Simulation parameters; cutoff=None selects app.NoCutoff.
    minimization_steps: int
        Max iterations for local energy minimization.
    nsteps_per_iteration: int
        MD steps integrated between successive energy/trajectory records.
    ph: float or None
        pH for protonation-state assignment; defaults to the manual-override
        value if set, otherwise 7.0.
    retry_failed_runs: bool
        If True, models whose previous run finished unsuccessfully are re-run.
    cpu_platform_threads: int
        Thread count applied when the CPU platform is selected.
    loglevel: str or None
        Logging level passed to ensembler.utils.set_loglevel.
    """
    ensembler.utils.set_loglevel(loglevel)
    # Round-robin GPU assignment: this MPI rank uses GPU (rank % gpupn).
    gpuid = mpistate.rank % gpupn
    manual_overrides = ManualOverrides()
    # pH resolution order: explicit argument > manual override > 7.0.
    if ph is None:
        if manual_overrides.refinement.ph is not None:
            ph = manual_overrides.refinement.ph
        else:
            ph = 7.0
    if custom_residue_variants is None:
        # deepcopy so downstream use cannot mutate the shared overrides object
        custom_residue_variants = deepcopy(
            manual_overrides.refinement.custom_residue_variants_by_targetid
        )

    # Clamp nsteps_per_iteration so it never exceeds the total step count,
    # then derive the number of reporting iterations.
    if (sim_length / timestep) < nsteps_per_iteration:
        nsteps_per_iteration = int(sim_length / timestep)

    niterations = int((sim_length / timestep) / nsteps_per_iteration)

    models_dir = os.path.abspath(ensembler.core.default_project_dirnames.models)

    targets, templates_resolved_seq = ensembler.core.get_targets_and_templates()

    if process_only_these_templates:
        selected_template_indices = [i for i, seq in enumerate(templates_resolved_seq) if seq.id in process_only_these_templates]
    else:
        selected_template_indices = range(len(templates_resolved_seq))

    if not openmm_platform:
        openmm_platform = auto_select_openmm_platform()

    if openmm_platform == 'CPU':
        platform_properties = {'CpuThreads': str(cpu_platform_threads)}
    else:
        platform_properties = {}

    ff_files = [ff+'.xml', implicit_water_model+'.xml']
    forcefield = app.ForceField(*ff_files)

    # kT is used below to report energies in dimensionless (thermal) units.
    kB = unit.MOLAR_GAS_CONSTANT_R
    kT = kB * temperature

    # Closure: reads model_filename, model_dir, pdb_filename,
    # reference_topology and reference_variants (plus the simulation
    # parameters above) from the enclosing per-target/per-template loop scope.
    def simulate_implicit_md():

        logger.debug("Reading model...")
        with gzip.open(model_filename) as model_file:
            pdb = app.PDBFile(model_file)

        # Set up Platform
        platform = openmm.Platform.getPlatformByName(openmm_platform)
        if 'CUDA_VISIBLE_DEVICES' not in os.environ:
            # Set GPU id.
            if openmm_platform == 'CUDA':
                platform.setPropertyDefaultValue('CudaDeviceIndex', '%d' % gpuid)
            elif openmm_platform == 'OpenCL':
                platform.setPropertyDefaultValue('OpenCLDeviceIndex', '%d' % gpuid)

        # Construct Modeller object with same topology as ref structure
        # (necessary to keep disulfide bonds consistent)
        modeller = app.Modeller(reference_topology, pdb.positions)
        # set_openmm_topology_bonds_from_atom_indices(modeller.topology, reference_bonds)
        # Add missing protons.
        modeller.addHydrogens(forcefield, pH=ph, variants=reference_variants)
        topology = modeller.getTopology()
        positions = modeller.getPositions()

        logger.debug("Constructing System object...")
        # cutoff=None selects no cutoff; otherwise a non-periodic cutoff is used.
        if cutoff is None:
            system = forcefield.createSystem(topology, nonbondedMethod=app.NoCutoff, constraints=app.HBonds)
        else:
            system = forcefield.createSystem(topology, nonbondedMethod=app.CutoffNonPeriodic, nonbondedCutoff=cutoff, constraints=app.HBonds)

        logger.debug("Creating Context...")
        integrator = openmm.LangevinIntegrator(temperature, collision_rate, timestep)
        context = openmm.Context(system, integrator, platform, platform_properties)
        context.setPositions(positions)

        logger.debug("Minimizing structure...")
        openmm.LocalEnergyMinimizer.minimize(context, minimization_tolerance, minimization_steps)

        if write_trajectory:
            # Open trajectory for writing.
            logger.debug("Opening trajectory for writing...")
            trajectory_filename = os.path.join(model_dir, 'implicit-trajectory.pdb.gz')
            trajectory_outfile = gzip.open(trajectory_filename, 'w')
            app.PDBFile.writeHeader(topology, file=trajectory_outfile)

        # Open energy trajectory for writing
        energy_filename = os.path.join(model_dir, 'implicit-energies.txt')
        energy_outfile = open(energy_filename, 'w')
        energy_outfile.write('# iteration | simulation time (ps) | potential_energy (kT) | kinetic_energy (kT) | ns per day\n')

        logger.debug("Running dynamics...")
        import time
        initial_time = time.time()
        for iteration in range(niterations):
            # integrate dynamics
            integrator.step(nsteps_per_iteration)
            # get current state
            state = context.getState(getEnergy=True, getPositions=True)
            simulation_time = state.getTime()
            potential_energy = state.getPotentialEnergy()
            kinetic_energy = state.getKineticEnergy()
            final_time = time.time()
            elapsed_time = (final_time - initial_time) * unit.seconds
            ns_per_day = (simulation_time / elapsed_time) / (unit.nanoseconds / unit.day)
            logger.debug(
                "  %8.1f ps : potential %8.3f kT | kinetic %8.3f kT | %.3f ns/day | %.3f s remain"
                % (
                    simulation_time / unit.picoseconds, potential_energy / kT, kinetic_energy / kT,
                    ns_per_day,
                    elapsed_time * (niterations-iteration-1) / (iteration+1) / unit.seconds
                )
            )

            # Check energies are still finite.
            if np.isnan(potential_energy/kT) or np.isnan(kinetic_energy/kT):
                raise Exception("Potential or kinetic energies are nan.")

            if write_trajectory:
                app.PDBFile.writeModel(topology, state.getPositions(), file=trajectory_outfile, modelIndex=iteration)

            # write data
            energy_outfile.write("  %8d %8.1f %8.3f %8.3f %.3f\n" % (iteration, simulation_time / unit.picoseconds, potential_energy / kT, kinetic_energy / kT, ns_per_day))
            energy_outfile.flush()

        if write_trajectory:
            app.PDBFile.writeFooter(topology, file=trajectory_outfile)
            trajectory_outfile.close()

        energy_outfile.close()

        # Write final PDB file.
        # NOTE(review): 'state' here is the State captured in the last loop
        # iteration above; if niterations == 0 this raises NameError --
        # confirm zero-iteration runs cannot occur.
        pdb_outfile = gzip.open(pdb_filename, 'wt')
        app.PDBFile.writeHeader(topology, file=pdb_outfile)
        app.PDBFile.writeFile(topology, state.getPositions(), file=pdb_outfile)
        app.PDBFile.writeFooter(topology, file=pdb_outfile)
        pdb_outfile.close()

    # Process targets
    # NOTE(review): the print() calls below bypass the logger -- presumably
    # leftover debugging output; consider routing through logger.debug.
    print('Processing targets...') # DEBUG
    for target in targets:
        if (process_only_these_targets is not None) and (target.id not in process_only_these_targets):
            print('Skipping because %s is not in process_only_these_targets' % target.id)
            print(process_only_these_targets)
            continue
        logger.info('Processing %s' % target)
        models_target_dir = os.path.join(models_dir, target.id)
        if mpistate.rank == 0:
            target_starttime = datetime.datetime.utcnow()
            if not os.path.exists(models_target_dir):
                # NOTE(review): this 'continue' executes only on rank 0;
                # other ranks fall through to the Barrier below, which can
                # desynchronize the ranks when the directory is missing --
                # confirm this case cannot arise in practice.
                print('%s does not exist, skipping' % models_target_dir)
                continue

        mpistate.comm.Barrier()

        # ========
        # Determine topology (including protonation state) to use throughout
        # ========

        reference_model_id = get_highest_seqid_existing_model(models_target_dir=models_target_dir)
        if reference_model_id is None:
            continue

        reference_model_path = os.path.join(models_target_dir, reference_model_id, 'model.pdb.gz')

        with gzip.open(reference_model_path) as reference_pdb_file:
            reference_pdb = app.PDBFile(reference_pdb_file)

        logger.debug("Using %s as highest identity model" % (reference_model_id))

        if not include_disulfide_bonds:
            remove_disulfide_bonds_from_topology(reference_pdb.topology)

        # Build topology for reference model
        logger.debug("Creating app.Modeller instance...")
        modeller = app.Modeller(reference_pdb.topology, reference_pdb.positions)
        reference_topology = modeller.topology
        logger.debug("Adding hydrogens...")
        reference_variants = modeller.addHydrogens(forcefield, pH=ph)
        if target.id in custom_residue_variants:
            apply_custom_residue_variants(reference_variants, custom_residue_variants[target.id])
        logger.debug("Reference variants extracted:")
        if reference_variants is not None:
            for (residue_index, residue) in enumerate(reference_variants):
                if residue is not None:
                    logger.debug("%8d %s" % (residue_index+1, residue))
            logger.debug("")
        else:
            logger.debug(reference_variants)

        # Optionally narrow the template selection by sequence identity.
        if model_seqid_cutoff:
            process_only_these_templates = ensembler.core.select_templates_by_seqid_cutoff(target.id, seqid_cutoff=model_seqid_cutoff)
            selected_template_indices = [i for i, seq in enumerate(templates_resolved_seq) if seq.id in process_only_these_templates]

        ntemplates_selected = len(selected_template_indices)

        # Round-robin distribution of templates over MPI ranks.
        for template_index in range(mpistate.rank, ntemplates_selected, mpistate.size):
            template = templates_resolved_seq[selected_template_indices[template_index]]

            model_dir = os.path.join(models_target_dir, template.id)
            if not os.path.exists(model_dir): continue

            # Only simulate models that are unique following filtering by clustering.
            unique_by_clustering = os.path.exists(os.path.join(model_dir, 'unique_by_clustering'))
            if not unique_by_clustering: continue

            # Pass if this simulation has already been run.
            log_filepath = os.path.join(model_dir, 'implicit-log.yaml')
            if os.path.exists(log_filepath):
                with open(log_filepath) as log_file:
                    log_data = yaml.load(log_file, Loader=ensembler.core.YamlLoader)
                    if log_data.get('successful') is True:
                        continue
                    if log_data.get('finished') is True and (retry_failed_runs is False and log_data.get('successful') is False):
                        continue

            # Check to make sure the initial model file is present.
            model_filename = os.path.join(model_dir, 'model.pdb.gz')
            if not os.path.exists(model_filename):
                logger.debug('model.pdb.gz not present: target %s template %s rank %d gpuid %d' % (target.id, template.id, mpistate.rank, gpuid))
                continue

            pdb_filename = os.path.join(model_dir, 'implicit-refined.pdb.gz')

            logger.info("-------------------------------------------------------------------------")
            logger.info("Simulating %s => %s in implicit solvent for %.1f ps (MPI rank: %d, GPU ID: %d)" % (target.id, template.id, niterations * nsteps_per_iteration * timestep / unit.picoseconds, mpistate.rank, gpuid))
            logger.info("-------------------------------------------------------------------------")

            # Open log file
            log_data = {
                'mpi_rank': mpistate.rank,
                'gpuid': gpuid if 'CUDA_VISIBLE_DEVICES' not in os.environ else os.environ['CUDA_VISIBLE_DEVICES'],
                'openmm_platform': openmm_platform,
                'finished': False,
                'sim_length': str(sim_length),
                'timestep': str(timestep),
                'temperature': str(temperature),
                'ph': ph,
            }
            log_file = ensembler.core.LogFile(log_filepath)
            log_file.log(new_log_data=log_data)

            try:
                start = datetime.datetime.utcnow()
                simulate_implicit_md()
                timing = ensembler.core.strf_timedelta(datetime.datetime.utcnow() - start)
                log_data = {
                    'finished': True,
                    'timing': timing,
                    'successful': True,
                }
                log_file.log(new_log_data=log_data)
            except Exception as e:
                # Record the failure in the per-model log so the run can be
                # skipped (or retried with retry_failed_runs) next time.
                trbk = traceback.format_exc()
                warnings.warn(
                    '= ERROR start: MPI rank {0} hostname {1} gpuid {2} =\n{3}\n{4}\n= ERROR end: MPI rank {0} hostname {1} gpuid {2}'.format(
                        mpistate.rank, socket.gethostname(), gpuid, e, trbk
                    )
                )
                timing = ensembler.core.strf_timedelta(datetime.datetime.utcnow() - start)
                log_data = {
                    'exception': e,
                    'traceback': ensembler.core.literal_str(trbk),
                    'timing': timing,
                    'finished': True,
                    'successful': False,
                }
                log_file.log(new_log_data=log_data)

        logger.debug('Finished template loop: rank %d' % mpistate.rank)

        mpistate.comm.Barrier()

        # Rank 0 writes per-target project metadata after all ranks finish.
        if mpistate.rank == 0:
            project_metadata = ensembler.core.ProjectMetadata(project_stage='refine_implicit_md', target_id=target.id)

            datestamp = ensembler.core.get_utcnow_formatted()
            command = ['find', models_target_dir, '-name', 'implicit-refined.pdb.gz']
            output = subprocess.check_output(command)
            nsuccessful_refinements = output.decode('UTF-8').count('\n')
            target_timedelta = datetime.datetime.utcnow() - target_starttime

            metadata = {
                'target_id': target.id,
                'datestamp': datestamp,
                'timing': ensembler.core.strf_timedelta(target_timedelta),
                'openmm_platform': openmm_platform,
                'process_only_these_targets': process_only_these_targets,
                'process_only_these_templates': process_only_these_templates,
                'model_seqid_cutoff': model_seqid_cutoff,
                'write_trajectory': write_trajectory,
                'include_disulfide_bonds': include_disulfide_bonds,
                'custom_residue_variants': custom_residue_variants,
                'ff': ff,
                'implicit_water_model': implicit_water_model,
                'sim_length': str(sim_length),
                'timestep': str(timestep),
                'temperature': str(temperature),
                'collision_rate': str(collision_rate),
                'cutoff': str(cutoff),
                'nsteps_per_iteration': nsteps_per_iteration,
                'ph': ph,
                'nsuccessful_refinements': nsuccessful_refinements,
                'python_version': sys.version.split('|')[0].strip(),
                'python_full_version': ensembler.core.literal_str(sys.version),
                'ensembler_version': ensembler.version.short_version,
                'ensembler_commit': ensembler.version.git_revision,
                'biopython_version': Bio.__version__,
                'openmm_version': simtk.openmm.version.short_version,
                'openmm_commit': simtk.openmm.version.git_revision,
            }

            project_metadata.add_data(metadata)
            project_metadata.write()

        mpistate.comm.Barrier()

    mpistate.comm.Barrier()
    if mpistate.rank == 0:
        logger.info('Done.')
Beispiel #27
0
 def _finish(self):
     """Log a final 'Done.' message to mark completion of the run."""
     logger.info('Done.')
Beispiel #28
0
def log_done():
    """Emit the standard completion message via the module logger."""
    logger.info('Done.')
Beispiel #29
0
def cluster_models(process_only_these_targets=None,
                   cutoff=0.06,
                   loglevel=None):
    """Cluster models based on RMSD, and filter out non-unique models as
    determined by a given cutoff.

    Parameters
    ----------
    process_only_these_targets : list of str, optional
        If given, only targets whose ids appear in this list are processed.
    cutoff : float
        Minimum distance cutoff for RMSD clustering (nm)
    loglevel : str, optional
        Logging level passed to ensembler.utils.set_loglevel.

    Runs serially.
    """
    # TODO refactor
    ensembler.utils.set_loglevel(loglevel)
    targets, templates_resolved_seq = get_targets_and_templates()
    templates = templates_resolved_seq

    for target in targets:
        if process_only_these_targets and (target.id
                                           not in process_only_these_targets):
            continue

        models_target_dir = os.path.join(
            ensembler.core.default_project_dirnames.models, target.id)
        if not os.path.exists(models_target_dir): continue

        # =============================
        # Construct a mdtraj trajectory containing all models
        # =============================

        starttime = datetime.datetime.utcnow()

        logger.debug('Building a list of valid models...')

        model_pdbfilenames_compressed = {
            template.id: os.path.join(models_target_dir, template.id,
                                      'model.pdb.gz')
            for template in templates
        }
        model_pdbfilenames_uncompressed = {
            template.id: os.path.join(models_target_dir, template.id,
                                      'model.pdb')
            for template in templates
        }
        valid_templateids = [
            templateid for templateid in model_pdbfilenames_compressed
            if os.path.exists(model_pdbfilenames_compressed[templateid])
        ]

        # Write uncompressed model.pdb files from model.pdb.gz if necessary
        for templateid in valid_templateids:
            uncompressed_path = model_pdbfilenames_uncompressed[templateid]
            if not os.path.exists(uncompressed_path) or os.path.getsize(
                    uncompressed_path) == 0:
                # BUGFIX: gzip.open defaults to binary mode, so read()
                # returns bytes; the destination must be opened 'wb' --
                # writing bytes into a text-mode ('w') file raises TypeError
                # on Python 3.
                with gzip.open(model_pdbfilenames_compressed[templateid]
                               ) as model_pdbfile_compressed:
                    with open(uncompressed_path, 'wb') as model_pdbfile:
                        model_pdbfile.write(model_pdbfile_compressed.read())

        logger.info('Constructing a trajectory containing all valid models...')

        if len(valid_templateids) == 0:
            logger.info('No models found for target {0}.'.format(target.id))
            continue

        valid_model_pdbfilenames_uncompressed = [
            model_pdbfilenames_uncompressed[templateid]
            for templateid in valid_templateids
        ]

        traj = mdtraj.load(valid_model_pdbfilenames_uncompressed)

        # =============================
        # Clustering
        # =============================

        logger.info('Conducting RMSD-based clustering...')

        # Remove any existing unique_by_clustering files
        for f in glob.glob(models_target_dir + '/*_PK_*/unique_by_clustering'):
            os.unlink(f)

        # Cluster on C-alpha positions only.
        CAatoms = [a.index for a in traj.topology.atoms if a.name == 'CA']
        unique_templateids = models_regular_spatial_clustering(
            valid_templateids, traj, atom_indices=CAatoms, cutoff=cutoff)
        write_unique_by_clustering_files(unique_templateids, models_target_dir)

        with open(os.path.join(models_target_dir, 'unique-models.txt'),
                  'w') as uniques_file:
            for u in unique_templateids:
                uniques_file.write(u + '\n')
            logger.info(
                '%d unique models (from original set of %d) using cutoff of %.3f nm'
                % (len(unique_templateids), len(valid_templateids), cutoff))

        # Remove the temporary uncompressed model.pdb files; the .pdb.gz
        # copies remain the canonical storage.
        for template in templates:
            model_dir = os.path.join(models_target_dir, template.id)
            model_pdbfilename = os.path.join(model_dir, 'model.pdb')
            if os.path.exists(model_pdbfilename):
                os.remove(model_pdbfilename)

        # ========
        # Metadata
        # ========

        project_metadata = ensembler.core.ProjectMetadata(
            project_stage='cluster_models', target_id=target.id)
        datestamp = ensembler.core.get_utcnow_formatted()

        timedelta = datetime.datetime.utcnow() - starttime

        metadata = {
            'target_id': target.id,
            'datestamp': datestamp,
            'nunique_models': len(unique_templateids),
            'python_version': sys.version.split('|')[0].strip(),
            'python_full_version': ensembler.core.literal_str(sys.version),
            'ensembler_version': ensembler.version.short_version,
            'ensembler_commit': ensembler.version.git_revision,
            'biopython_version': Bio.__version__,
            'mdtraj_version': mdtraj.version.short_version,
            'mdtraj_commit': mdtraj.version.git_revision,
            'timing': ensembler.core.strf_timedelta(timedelta),
        }

        project_metadata.add_data(metadata)
        project_metadata.write()
Beispiel #30
0
def extract_template_pdbchains_from_uniprot_xml(uniprotxml,
                                                uniprot_domain_regex=None,
                                                manual_overrides=None,
                                                specified_pdbids=None,
                                                specified_chainids=None):
    """
    Select PDB chains to use as modeling templates from a UniProt XML document.

    Parameters
    ----------
    uniprotxml: lxml.etree.Element
    uniprot_domain_regex: str
        If given, only domains whose description matches this regex
        (case-sensitive) are considered, and one template per matching
        domain/PDB chain combination is produced.
    manual_overrides: ensembler.core.ManualOverrides
        Optional overrides for domain spans, domain length limits and
        PDB ids to skip.
    specified_pdbids: list of str
        ['2QR8', '4GU9']
    specified_chainids: dict of list of str
        {'2QR8': ['A'], '4GU9': ['A', 'B']}

    Returns
    -------
    selected_pdbchains: list of dict
        [
            {
                'templateid': str,
                'pdbid': str,
                'chainid': str,
                'residue_span': [
                    start (int),   # 1-based inclusive
                    end (int)      # 1-based inclusive
                ]
            }
        ]
    """
    selected_pdbchains = []
    all_uniprot_entries = uniprotxml.findall('entry')
    for entry in all_uniprot_entries:
        entry_name = entry.find('name').text
        if uniprot_domain_regex:
            # Select only domain features whose description matches the
            # user-supplied regex (case-sensitive XPath extension).
            selected_domains = entry.xpath(
                'feature[@type="domain"][match_regex(@description, "%s")]' %
                uniprot_domain_regex,
                extensions={
                    (None, 'match_regex'):
                    ensembler.core.xpath_match_regex_case_sensitive
                })

            domain_iter = 0
            for domain in selected_domains:
                # Domain ids are numbered in document order, e.g. 'ABL1_HUMAN_D0'.
                domain_id = '%s_D%d' % (entry_name, domain_iter)
                domain_span = [
                    int(domain.find('location/begin').get('position')),
                    int(domain.find('location/end').get('position'))
                ]
                # Manual overrides may replace the domain span ('start-end').
                if manual_overrides and domain_id in manual_overrides.template.domain_spans:
                    domain_span = [
                        int(x) for x in manual_overrides.template.
                        domain_spans[domain_id].split('-')
                    ]
                domain_len = domain_span[1] - domain_span[0] + 1
                if manual_overrides and manual_overrides.template.min_domain_len is not None and domain_len < manual_overrides.template.min_domain_len:
                    continue
                if manual_overrides and manual_overrides.template.max_domain_len is not None and domain_len > manual_overrides.template.max_domain_len:
                    continue

                domain_iter += 1
                # X-ray and NMR structures only.
                pdbs = domain.getparent().xpath(
                    'dbReference[@type="PDB"]/property[@type="method"][@value="X-ray" or @value="NMR"]/..'
                )

                for pdb in pdbs:
                    pdbid = pdb.get('id')
                    if manual_overrides and pdbid in manual_overrides.template.skip_pdbs:
                        continue
                    if specified_pdbids and pdbid not in specified_pdbids:
                        continue
                    pdb_chain_span_nodes = pdb.findall(
                        'property[@type="chains"]')

                    for pdb_chain_span_node in pdb_chain_span_nodes:
                        chain_span_string = pdb_chain_span_node.get('value')
                        chain_spans = ensembler.uniprot.parse_uniprot_pdbref_chains(
                            chain_span_string)

                        for chainid in chain_spans.keys():
                            if specified_chainids and len(
                                    specified_chainids[pdbid]
                            ) > 0 and chainid not in specified_chainids[pdbid]:
                                continue
                            span = chain_spans[chainid]
                            # Require the chain to cover the domain with at
                            # most 30 residues missing at either end.
                            # (Uses short-circuiting 'and' rather than the
                            # bitwise '&' of the original -- same result on
                            # bools, but idiomatic and lazily evaluated.)
                            if (span[0] < domain_span[0] + 30) and (
                                    span[1] > domain_span[1] - 30):
                                templateid = '%s_%s_%s' % (domain_id, pdbid,
                                                           chainid)
                                data = {
                                    'templateid': templateid,
                                    'pdbid': pdbid,
                                    'chainid': chainid,
                                    'residue_span': domain_span
                                }
                                selected_pdbchains.append(data)

        else:
            # No domain filter: take every X-ray/NMR PDB chain of the entry,
            # using the chain's own span as the residue span.
            pdbs = entry.xpath(
                'dbReference[@type="PDB"]/property[@type="method"][@value="X-ray" or @value="NMR"]/..'
            )

            for pdb in pdbs:
                pdbid = pdb.get('id')
                if manual_overrides and pdbid in manual_overrides.template.skip_pdbs:
                    continue
                if specified_pdbids and pdbid not in specified_pdbids:
                    continue
                pdb_chain_span_nodes = pdb.findall('property[@type="chains"]')

                for pdb_chain_span_node in pdb_chain_span_nodes:
                    chain_span_string = pdb_chain_span_node.get('value')
                    chain_spans = ensembler.uniprot.parse_uniprot_pdbref_chains(
                        chain_span_string)

                    for chainid in chain_spans.keys():
                        if specified_chainids and len(
                                specified_chainids[pdbid]
                        ) > 0 and chainid not in specified_chainids[pdbid]:
                            continue
                        span = chain_spans[chainid]
                        templateid = '%s_%s_%s' % (entry_name, pdbid, chainid)
                        data = {
                            'templateid': templateid,
                            'pdbid': pdbid,
                            'chainid': chainid,
                            'residue_span': span
                        }
                        selected_pdbchains.append(data)

    logger.info('%d PDB chains selected.' % len(selected_pdbchains))
    return selected_pdbchains
Beispiel #31
0
def cluster_models(process_only_these_targets=None, cutoff=0.06, loglevel=None):
    """Cluster models based on RMSD, and filter out non-unique models as
    determined by a given cutoff.

    Parameters
    ----------
    process_only_these_targets : list of str, optional
        If given, only targets whose ids appear in this list are processed.
    cutoff : float
        Minimum distance cutoff for RMSD clustering (nm)
    loglevel : str, optional
        Logging level passed to ensembler.utils.set_loglevel.

    Runs serially.
    """
    # TODO refactor
    ensembler.utils.set_loglevel(loglevel)
    targets, templates_resolved_seq = get_targets_and_templates()
    templates = templates_resolved_seq

    for target in targets:
        if process_only_these_targets and (target.id not in process_only_these_targets): continue

        models_target_dir = os.path.join(ensembler.core.default_project_dirnames.models, target.id)
        if not os.path.exists(models_target_dir): continue

        # =============================
        # Construct a mdtraj trajectory containing all models
        # =============================

        starttime = datetime.datetime.utcnow()

        logger.debug('Building a list of valid models...')

        model_pdbfilenames_compressed = {
            template.id: os.path.join(models_target_dir, template.id, 'model.pdb.gz') for template in templates
        }
        model_pdbfilenames_uncompressed = {
            template.id: os.path.join(models_target_dir, template.id, 'model.pdb') for template in templates
        }
        valid_templateids = [
            templateid for templateid in model_pdbfilenames_compressed
            if os.path.exists(model_pdbfilenames_compressed[templateid])
        ]

        # Write uncompressed model.pdb files from model.pdb.gz if necessary
        for templateid in valid_templateids:
            uncompressed_filepath = model_pdbfilenames_uncompressed[templateid]
            if not os.path.exists(uncompressed_filepath) or os.path.getsize(uncompressed_filepath) == 0:
                # BUGFIX: gzip.open defaults to binary mode, so read() yields
                # bytes; the target file must be opened 'wb' -- writing bytes
                # into a text-mode ('w') file raises TypeError on Python 3.
                with gzip.open(model_pdbfilenames_compressed[templateid]) as model_pdbfile_compressed:
                    with open(uncompressed_filepath, 'wb') as model_pdbfile:
                        model_pdbfile.write(model_pdbfile_compressed.read())

        logger.info('Constructing a trajectory containing all valid models...')

        if len(valid_templateids) == 0:
            logger.info('No models found for target {0}.'.format(target.id))
            continue

        valid_model_pdbfilenames_uncompressed = [
            model_pdbfilenames_uncompressed[templateid] for templateid in valid_templateids
        ]

        traj = mdtraj.load(valid_model_pdbfilenames_uncompressed)

        # =============================
        # Clustering
        # =============================

        logger.info('Conducting RMSD-based clustering...')

        # Remove any existing unique_by_clustering files
        for f in glob.glob(models_target_dir+'/*_PK_*/unique_by_clustering'):
            os.unlink(f)

        # Cluster on C-alpha positions only.
        CAatoms = [a.index for a in traj.topology.atoms if a.name == 'CA']
        unique_templateids = models_regular_spatial_clustering(
            valid_templateids, traj, atom_indices=CAatoms, cutoff=cutoff
        )
        write_unique_by_clustering_files(unique_templateids, models_target_dir)

        with open(os.path.join(models_target_dir, 'unique-models.txt'), 'w') as uniques_file:
            for u in unique_templateids:
                uniques_file.write(u+'\n')
            logger.info(
                '%d unique models (from original set of %d) using cutoff of %.3f nm' %
                        (len(unique_templateids), len(valid_templateids), cutoff)
            )

        # Remove the temporary uncompressed model.pdb files; the .pdb.gz
        # copies remain the canonical storage.
        for template in templates:
            model_dir = os.path.join(models_target_dir, template.id)
            model_pdbfilename = os.path.join(model_dir, 'model.pdb')
            if os.path.exists(model_pdbfilename):
                os.remove(model_pdbfilename)

        # ========
        # Metadata
        # ========

        project_metadata = ensembler.core.ProjectMetadata(
            project_stage='cluster_models', target_id=target.id
        )
        datestamp = ensembler.core.get_utcnow_formatted()

        timedelta = datetime.datetime.utcnow() - starttime

        metadata = {
            'target_id': target.id,
            'datestamp': datestamp,
            'nunique_models': len(unique_templateids),
            'python_version': sys.version.split('|')[0].strip(),
            'python_full_version': ensembler.core.literal_str(sys.version),
            'ensembler_version': ensembler.version.short_version,
            'ensembler_commit': ensembler.version.git_revision,
            'biopython_version': Bio.__version__,
            'mdtraj_version': mdtraj.version.short_version,
            'mdtraj_commit': mdtraj.version.git_revision,
            'timing': ensembler.core.strf_timedelta(timedelta),
        }

        project_metadata.add_data(metadata)
        project_metadata.write()
Beispiel #32
0
def package_for_fah(process_only_these_targets=None,
                    process_only_these_templates=None,
                    model_seqid_cutoff=None,
                    model_validation_score_cutoff=None,
                    model_validation_score_percentile=None,
                    nclones=1,
                    archive=False,
                    openmm_platform='Reference',
                    temperature=300.0 * unit.kelvin,
                    collision_rate=1.0 / unit.picosecond,
                    timestep=2.0 * unit.femtoseconds,
                    loglevel=None):
    """
    Create the input files and directory structure necessary to start a Folding@Home project.

    MPI-enabled: rank 0 selects and sorts the valid templates, creates the
    project directory and the shared OpenMM system/integrator files; the
    per-template RUN directories are then built in parallel across all ranks.

    Parameters
    ----------
    process_only_these_targets : list of str, optional
        If given, only targets with these IDs are processed.
    process_only_these_templates : list of str, optional
        If given, only templates with these IDs are considered.
    model_seqid_cutoff : float, optional
        Sequence-identity cutoff used when selecting valid templates.
    model_validation_score_cutoff : float, optional
        Validation-score cutoff used when selecting valid templates.
    model_validation_score_percentile : float, optional
        Validation-score percentile used when selecting valid templates.
    nclones : int
        Number of CLONEs to generate for each RUN.
    archive : Bool
        A .tgz compressed archive will be created for each individual RUN directory.
    openmm_platform : str
        OpenMM platform name used when generating each RUN.
    temperature : simtk.unit.Quantity
        Simulation temperature (units of temperature).
    collision_rate : simtk.unit.Quantity
        Langevin collision rate (units of inverse time).
    timestep : simtk.unit.Quantity
        Integration timestep (units of time).
    loglevel : str, optional
        Logging level name passed to set_loglevel.
    """
    set_loglevel(loglevel)

    # Only rank 0 creates the top-level FAH projects directory; all ranks
    # wait at the barrier until it exists.
    if mpistate.rank == 0:
        if not os.path.exists(fah_projects_dir):
            os.mkdir(fah_projects_dir)
    mpistate.comm.Barrier()

    targets, templates_resolved_seq = get_targets_and_templates()

    for target in targets:
        if process_only_these_targets and (target.id
                                           not in process_only_these_targets):
            continue

        target_project_dir = os.path.join(fah_projects_dir, target.id)

        models_target_dir = os.path.join(default_project_dirnames.models,
                                         target.id)
        # Skip targets for which no models have been built.
        if not os.path.exists(models_target_dir):
            continue

        mpistate.comm.Barrier()

        # Placeholders on non-root ranks; populated via the bcast calls below.
        sorted_valid_templates = []
        system = None
        renumbered_resnums = {}

        if mpistate.rank == 0:
            logger.info(
                '-------------------------------------------------------------------------'
            )
            logger.info('Building FAH OpenMM project for target {}'.format(
                target.id))
            logger.info(
                '-------------------------------------------------------------------------'
            )

            # Template selection and project setup happen once, on rank 0.
            valid_templates = get_valid_templates_for_target(
                target,
                templates_resolved_seq,
                process_only_these_templates=process_only_these_templates,
                model_seqid_cutoff=model_seqid_cutoff,
                model_validation_score_cutoff=model_validation_score_cutoff,
                model_validation_score_percentile=
                model_validation_score_percentile)

            sorted_valid_templates = sort_valid_templates_by_seqid(
                target, valid_templates)

            create_target_project_dir(target)

            # The system/integrator files are set up from the top-ranked
            # template and shared by all RUNs.
            system = setup_system_and_integrator_files(
                target, sorted_valid_templates[0], temperature, collision_rate,
                timestep)

            renumbered_resnums = get_renumbered_topol_resnums(target)

        # Distribute rank-0 results to every rank before the parallel build.
        sorted_valid_templates = mpistate.comm.bcast(sorted_valid_templates,
                                                     root=0)
        system = mpistate.comm.bcast(system, root=0)
        renumbered_resnums = mpistate.comm.bcast(renumbered_resnums, root=0)

        logger.debug("Building RUNs in parallel...")

        # Round-robin distribution of RUN indices across MPI ranks.
        for run_index in range(mpistate.rank, len(sorted_valid_templates),
                               mpistate.size):
            template = sorted_valid_templates[run_index]

            logger.info(
                '-------------------------------------------------------------------------'
            )
            logger.info('Building RUN{} for template {}'.format(
                run_index, template))
            logger.info(
                '-------------------------------------------------------------------------'
            )

            source_dir = os.path.join(models_target_dir, template)
            generate_fah_run(
                target_project_dir,
                template,
                source_dir,
                system,
                run_index,
                nclones,
                temperature,
                collision_rate,
                timestep,
                openmm_platform,
                renumbered_resnums,
            )

            if archive:
                tgz_fah_run(target, run_index)

    mpistate.comm.Barrier()
    if mpistate.rank == 0:
        logger.info('Done.')
Beispiel #33
0
def download_sifts_file(pdbid, project_sifts_filepath):
    """Retrieve the SIFTS residue-mapping data for a PDB entry and store it
    gzip-compressed at the given filepath.

    Parameters
    ----------
    pdbid : str
        PDB identifier to fetch SIFTS data for.
    project_sifts_filepath : str
        Destination path for the gzip-compressed SIFTS file.
    """
    logger.info('Downloading sifts file for: %s', pdbid)
    contents = ensembler.pdb.retrieve_sifts(pdbid)
    with gzip.open(project_sifts_filepath, 'wb') as outfile:
        outfile.write(contents)
Beispiel #34
0
def download_pdb_file(pdbid, project_pdb_filepath):
    """Download a gzip-compressed PDB file and write it to disk.

    Parameters
    ----------
    pdbid : str
        PDB identifier to download.
    project_pdb_filepath : str
        Destination path for the downloaded (gzip-compressed) file.
    """
    logger.info('Downloading PDB file for: %s' % pdbid)
    pdbgz_page = ensembler.pdb.retrieve_pdb(pdbid, compressed='yes')
    # retrieve_pdb(compressed='yes') returns a gzip-compressed (binary)
    # payload, so the file must be opened in binary mode: text mode ('w')
    # raises TypeError on Python 3 and can corrupt the data via newline
    # translation on Windows.
    with open(project_pdb_filepath, 'wb') as pdbgz_file:
        pdbgz_file.write(pdbgz_page)
Beispiel #35
0
def align_targets_and_templates(process_only_these_targets=None,
                                process_only_these_templates=None,
                                substitution_matrix='gonnet',
                                gap_open=-10,
                                gap_extend=-0.5,
                                loglevel=None):
    """
    Conducts pairwise alignments of target sequences against template sequences.
    Stores Modeller-compatible 'alignment.pir' files in each model directory,
    and also outputs a table of model IDs, sorted by sequence identity.

    MPI-enabled: templates are distributed across ranks in round-robin
    fashion, and per-template sequence identities are gathered back to rank 0
    before sorting.

    Parameters
    ----------
    process_only_these_targets : list of str, optional
        If given, only targets with these IDs are processed.
    process_only_these_templates : list of str, optional
        If given, only templates with these IDs are aligned.
    substitution_matrix: str
        Specify an amino acid substitution matrix available from Bio.SubsMat.MatrixInfo
    gap_open : float
        Gap-opening penalty passed to the pairwise aligner.
    gap_extend : float
        Gap-extension penalty passed to the pairwise aligner.
    loglevel : str, optional
        Logging level name passed to ensembler.utils.set_loglevel.
    """
    ensembler.utils.set_loglevel(loglevel)
    targets, templates_resolved_seq = ensembler.core.get_targets_and_templates(
    )
    ntemplates = len(templates_resolved_seq)
    nselected_templates = len(process_only_these_templates
                              ) if process_only_these_templates else ntemplates
    for target in targets:
        if process_only_these_targets and target.id not in process_only_these_targets:
            continue

        if mpistate.rank == 0:
            logger.info('Working on target %s...' % target.id)

        models_target_dir = os.path.join(
            ensembler.core.default_project_dirnames.models, target.id)
        ensembler.utils.create_dir(models_target_dir)

        seq_identity_data_sublist = []

        # Round-robin distribution of templates across MPI ranks.
        for template_index in range(mpistate.rank, ntemplates, mpistate.size):
            template_id = templates_resolved_seq[template_index].id
            # Prefer the pdbfixed sequence of a loop-remodeled template
            # structure if one exists; otherwise use the resolved sequence.
            if os.path.exists(
                    os.path.join(
                        ensembler.core.default_project_dirnames.
                        templates_structures_modeled_loops,
                        template_id + '.pdb')):
                remodeled_seq_filepath = os.path.join(
                    ensembler.core.default_project_dirnames.
                    templates_structures_modeled_loops,
                    template_id + '-pdbfixed.fasta')
                template = list(
                    Bio.SeqIO.parse(remodeled_seq_filepath, 'fasta'))[0]
            else:
                template = templates_resolved_seq[template_index]

            if process_only_these_templates and template_id not in process_only_these_templates:
                continue

            model_dir = os.path.abspath(
                os.path.join(ensembler.core.default_project_dirnames.models,
                             target.id, template_id))
            ensembler.utils.create_dir(model_dir)
            aln = align_target_template(
                target,
                template,
                substitution_matrix=substitution_matrix,
                gap_open=gap_open,
                gap_extend=gap_extend)
            aln_filepath = os.path.join(model_dir, 'alignment.pir')
            write_modeller_pir_aln_file(aln,
                                        target,
                                        template,
                                        pir_aln_filepath=aln_filepath)
            seq_identity_data_sublist.append({
                'templateid':
                template_id,
                'seq_identity':
                calculate_seq_identity(aln),
            })

        # Gather the per-rank sublists back to rank 0.
        seq_identity_data_gathered = mpistate.comm.gather(
            seq_identity_data_sublist, root=0)

        seq_identity_data = []
        if mpistate.rank == 0:
            # Undo the round-robin distribution: entry i was produced by rank
            # (i % size) at position (i // size) within that rank's sublist.
            # NOTE(review): this reconstruction assumes every rank appended
            # exactly one entry per stride position; if the `continue` above
            # (process_only_these_templates filtering) skips templates,
            # sublists may be shorter than expected and this indexing could
            # misalign or raise IndexError — verify against callers.
            seq_identity_data = [None] * nselected_templates
            for i in range(nselected_templates):
                seq_identity_data[i] = seq_identity_data_gathered[
                    i % mpistate.size][i // mpistate.size]

        seq_identity_data = mpistate.comm.bcast(seq_identity_data, root=0)

        # Highest sequence identity first.
        seq_identity_data = sorted(seq_identity_data,
                                   key=lambda x: x['seq_identity'],
                                   reverse=True)
        write_sorted_seq_identities(target, seq_identity_data)
Beispiel #36
0
def log_done():
    """Emit the final completion message at INFO level."""
    logger.info('Done.')
Beispiel #37
0
def build_model(target,
                template_resolved_seq,
                target_setup_data,
                write_modeller_restraints_file=False,
                loglevel=None):
    """Uses Modeller to build a homology model for a given target and
    template.

    Will not run Modeller if the output files already exist.

    Parameters
    ----------
    target : BioPython SeqRecord
    template_resolved_seq : BioPython SeqRecord
        Must be a corresponding .pdb template file with the same ID in the
        templates/structures directory.
    target_setup_data : TargetSetupData obj
    write_modeller_restraints_file : bool
        Write file containing restraints used by Modeller - note that this file can be relatively
        large, e.g. ~300KB per model for a protein kinase domain target.
    loglevel : str, optional
        Logging level name passed to ensembler.utils.set_loglevel.
    """
    ensembler.utils.set_loglevel(loglevel)

    # Prefer a loop-remodeled template structure (with its pdbfixed sequence)
    # if one exists; otherwise fall back to the resolved template structure.
    template_structure_dir = os.path.abspath(
        ensembler.core.default_project_dirnames.
        templates_structures_modeled_loops)

    if os.path.exists(
            os.path.join(template_structure_dir,
                         template_resolved_seq.id + '.pdb')):
        remodeled_seq_filepath = os.path.join(
            ensembler.core.default_project_dirnames.
            templates_structures_modeled_loops,
            template_resolved_seq.id + '-pdbfixed.fasta')
        template = list(Bio.SeqIO.parse(remodeled_seq_filepath, 'fasta'))[0]
    else:
        template = template_resolved_seq
        template_structure_dir = os.path.abspath(
            ensembler.core.default_project_dirnames.
            templates_structures_resolved)

    model_dir = os.path.abspath(
        os.path.join(target_setup_data.models_target_dir, template.id))
    if not os.path.exists(model_dir):
        ensembler.utils.create_dir(model_dir)
    model_pdbfilepath = os.path.abspath(os.path.join(model_dir,
                                                     'model.pdb.gz'))
    modeling_log_filepath = os.path.abspath(
        os.path.join(model_dir, 'modeling-log.yaml'))

    check_model_pdbfilepath_ends_in_pdbgz(model_pdbfilepath)
    # Path of the uncompressed output: strip the trailing '.gz'.
    model_pdbfilepath_uncompressed = model_pdbfilepath[:-3]

    # Skip the (expensive) Modeller run if all outputs already exist.
    if check_all_model_files_present(model_dir):
        logger.debug(
            "Output files already exist for target '%s' // template '%s'; files were not overwritten."
            % (target.id, template.id))
        return

    logger.info(
        '-------------------------------------------------------------------------\n'
        'Modelling "%s" => "%s"\n'
        '-------------------------------------------------------------------------'
        % (target.id, template.id))

    # aln = align_target_template(target, template)
    aln_filepath = os.path.abspath(os.path.join(model_dir, 'alignment.pir'))
    # write_modeller_pir_aln_file(aln, target, template, pir_aln_filepath=aln_filepath)
    log_file = init_build_model_logfile(modeling_log_filepath)

    # Modeller writes scratch files into the working directory, so run it
    # inside a temporary directory that is cleaned up afterwards.
    with ensembler.utils.enter_temp_dir():
        try:
            start = datetime.datetime.utcnow()
            shutil.copy(aln_filepath, 'alignment.pir')
            run_modeller(
                target,
                template,
                model_dir,
                model_pdbfilepath,
                model_pdbfilepath_uncompressed,
                template_structure_dir,
                write_modeller_restraints_file=write_modeller_restraints_file)
            if os.path.getsize(model_pdbfilepath) < 1:
                raise Exception('Output PDB file is empty.')

            end_successful_build_model_logfile(log_file, start)

        except Exception as e:
            # Failures are recorded in the per-model log rather than
            # propagated, so one failed model does not abort the batch.
            end_exception_build_model_logfile(e, log_file)
Beispiel #38
0
def download_pdb_file(pdbid, project_pdb_filepath):
    """Download a gzip-compressed PDB file and write it to disk.

    Parameters
    ----------
    pdbid : str
        PDB identifier to download.
    project_pdb_filepath : str
        Destination path for the downloaded (gzip-compressed) file.
    """
    logger.info('Downloading PDB file for: %s' % pdbid)
    pdbgz_page = ensembler.pdb.retrieve_pdb(pdbid, compressed='yes')
    # retrieve_pdb(compressed='yes') returns a gzip-compressed (binary)
    # payload, so the file must be opened in binary mode: text mode ('w')
    # raises TypeError on Python 3 and can corrupt the data via newline
    # translation on Windows.
    with open(project_pdb_filepath, 'wb') as pdbgz_file:
        pdbgz_file.write(pdbgz_page)
Beispiel #39
0
def extract_template_structures_from_pdb_files(selected_templates):
    """For each selected template, extract its resolved residues from the
    source PDB file and write them out as an individual template structure.

    Parameters
    ----------
    selected_templates : iterable
        Template objects carrying `pdbid` and `templateid` attributes.
    """
    logger.info('Writing template structures...')
    dirnames = ensembler.core.default_project_dirnames
    for template in selected_templates:
        source_pdb_path = os.path.join(dirnames.structures_pdb,
                                       template.pdbid + '.pdb.gz')
        output_path = os.path.join(dirnames.templates_structures_resolved,
                                   template.templateid + '.pdb')
        ensembler.pdb.extract_residues_by_resnum(output_path, source_pdb_path,
                                                 template)
Beispiel #40
0
def package_for_fah(process_only_these_targets=None,
                    process_only_these_templates=None,
                    model_seqid_cutoff=None,
                    model_validation_score_cutoff=None,
                    model_validation_score_percentile=None,
                    nclones=1, archive=False,
                    openmm_platform='Reference',
                    temperature=300.0 * unit.kelvin,
                    collision_rate=1.0 / unit.picosecond,
                    timestep=2.0 * unit.femtoseconds,
                    loglevel=None):
    """
    Create the input files and directory structure necessary to start a Folding@Home project.

    MPI-enabled: rank 0 selects and sorts the valid templates, creates the
    project directory and the shared OpenMM system/integrator files; the
    per-template RUN directories are then built in parallel across all ranks.

    Parameters
    ----------
    process_only_these_targets : list of str, optional
        If given, only targets with these IDs are processed.
    process_only_these_templates : list of str, optional
        If given, only templates with these IDs are considered.
    model_seqid_cutoff : float, optional
        Sequence-identity cutoff used when selecting valid templates.
    model_validation_score_cutoff : float, optional
        Validation-score cutoff used when selecting valid templates.
    model_validation_score_percentile : float, optional
        Validation-score percentile used when selecting valid templates.
    nclones : int
        Number of CLONEs to generate for each RUN.
    archive : Bool
        A .tgz compressed archive will be created for each individual RUN directory.
    openmm_platform : str
        OpenMM platform name used when generating each RUN.
    temperature : simtk.unit.Quantity
        Simulation temperature (units of temperature).
    collision_rate : simtk.unit.Quantity
        Langevin collision rate (units of inverse time).
    timestep : simtk.unit.Quantity
        Integration timestep (units of time).
    loglevel : str, optional
        Logging level name passed to set_loglevel.
    """
    set_loglevel(loglevel)

    # Only rank 0 creates the top-level FAH projects directory; all ranks
    # wait at the barrier until it exists.
    if mpistate.rank == 0:
        if not os.path.exists(fah_projects_dir):
            os.mkdir(fah_projects_dir)
    mpistate.comm.Barrier()

    targets, templates_resolved_seq = get_targets_and_templates()

    for target in targets:
        if process_only_these_targets and (target.id not in process_only_these_targets):
            continue

        target_project_dir = os.path.join(fah_projects_dir, target.id)

        models_target_dir = os.path.join(default_project_dirnames.models, target.id)
        # Skip targets for which no models have been built.
        if not os.path.exists(models_target_dir):
            continue

        mpistate.comm.Barrier()

        # Placeholders on non-root ranks; populated via the bcast calls below.
        sorted_valid_templates = []
        system = None
        renumbered_resnums = {}

        if mpistate.rank == 0:
            logger.info('-------------------------------------------------------------------------')
            logger.info('Building FAH OpenMM project for target {}'.format(target.id))
            logger.info('-------------------------------------------------------------------------')

            # Template selection and project setup happen once, on rank 0.
            valid_templates = get_valid_templates_for_target(
                target,
                templates_resolved_seq,
                process_only_these_templates=process_only_these_templates,
                model_seqid_cutoff=model_seqid_cutoff,
                model_validation_score_cutoff=model_validation_score_cutoff,
                model_validation_score_percentile=model_validation_score_percentile
            )

            sorted_valid_templates = sort_valid_templates_by_seqid(
                target,
                valid_templates
            )

            create_target_project_dir(target)

            # The system/integrator files are set up from the top-ranked
            # template and shared by all RUNs.
            system = setup_system_and_integrator_files(
                target,
                sorted_valid_templates[0],
                temperature,
                collision_rate,
                timestep
            )

            renumbered_resnums = get_renumbered_topol_resnums(target)

        # Distribute rank-0 results to every rank before the parallel build.
        sorted_valid_templates = mpistate.comm.bcast(sorted_valid_templates, root=0)
        system = mpistate.comm.bcast(system, root=0)
        renumbered_resnums = mpistate.comm.bcast(renumbered_resnums, root=0)

        logger.debug("Building RUNs in parallel...")

        # Round-robin distribution of RUN indices across MPI ranks.
        for run_index in range(mpistate.rank, len(sorted_valid_templates), mpistate.size):
            template = sorted_valid_templates[run_index]

            logger.info('-------------------------------------------------------------------------')
            logger.info(
                'Building RUN{} for template {}'.format(
                    run_index, template
                )
            )
            logger.info('-------------------------------------------------------------------------')

            source_dir = os.path.join(models_target_dir, template)
            generate_fah_run(
                target_project_dir,
                template,
                source_dir,
                system,
                run_index,
                nclones,
                temperature,
                collision_rate,
                timestep,
                openmm_platform,
                renumbered_resnums,
            )

            if archive:
                tgz_fah_run(target, run_index)

    mpistate.comm.Barrier()
    if mpistate.rank == 0:
        logger.info('Done.')
Beispiel #41
0
def determine_nwaters(process_only_these_targets=None,
                      process_only_these_templates=None, model_seqid_cutoff=None,
                      verbose=False,
                      select_at_percentile=None):
    """Determine distribution of nwaters, and select the value at a certain percentile.

    If not user-specified, the percentile is set to 100 if there are less than
    10 templates, otherwise it is set to 68. The percentile is computed per
    target (previously the first target's computed value leaked into all
    subsequent targets via parameter reassignment).

    All work runs serially on MPI rank 0; other ranks only participate in the
    final barrier.

    Parameters
    ----------
    process_only_these_targets : list of str, optional
        If given, only targets with these IDs are processed.
    process_only_these_templates : list of str, optional
        If given, only templates with these IDs are considered.
    model_seqid_cutoff : float, optional
        If given, the template selection is recomputed per target from this
        sequence-identity cutoff, overriding process_only_these_templates.
    verbose : bool
        Print a progress message per target.
    select_at_percentile : int, optional
        Percentile of the nwaters distribution to select; computed per target
        when not given (see above).
    """

    # Run serially on the root process.
    if mpistate.rank == 0:
        models_dir = os.path.abspath(ensembler.core.default_project_dirnames.models)

        targets, templates_resolved_seq = ensembler.core.get_targets_and_templates()

        if process_only_these_templates:
            selected_template_indices = [i for i, seq in enumerate(templates_resolved_seq) if seq.id in process_only_these_templates]
        else:
            selected_template_indices = range(len(templates_resolved_seq))

        for target in targets:

            # Process only specified targets if directed.
            if process_only_these_targets and (target.id not in process_only_these_targets):
                continue

            models_target_dir = os.path.join(models_dir, target.id)
            if not os.path.exists(models_target_dir):
                continue

            # Use a per-target local for the effective template selection so
            # the process_only_these_templates argument is not mutated in
            # place (the previous implementation clobbered it across targets).
            selected_templates = process_only_these_templates
            if model_seqid_cutoff:
                selected_templates = ensembler.core.select_templates_by_seqid_cutoff(target.id, seqid_cutoff=model_seqid_cutoff)
                selected_template_indices = [i for i, seq in enumerate(templates_resolved_seq) if seq.id in selected_templates]

            ntemplates_selected = len(selected_template_indices)

            # Compute the percentile per target instead of overwriting the
            # select_at_percentile parameter.
            percentile = select_at_percentile
            if not percentile:
                percentile = 100 if ntemplates_selected < 10 else 68

            if verbose:
                print("Determining number of waters in each system from target '%s'..." % target.id)

            # Collect nwaters counts; models without a readable nwaters.txt
            # are skipped (best-effort, as before).
            nwaters_list = []
            for template_index in range(ntemplates_selected):
                template = templates_resolved_seq[selected_template_indices[template_index]]
                if selected_templates and template.id not in selected_templates:
                    continue

                model_dir = os.path.join(models_target_dir, template.id)
                if not os.path.exists(model_dir):
                    continue

                nwaters_filename = os.path.join(model_dir, 'nwaters.txt')
                try:
                    with open(nwaters_filename, 'r') as nwaters_file:
                        firstline = nwaters_file.readline()
                    nwaters_list.append(int(firstline))
                except (IOError, OSError, ValueError):
                    # Missing or malformed nwaters.txt: skip this model.
                    pass

            nwaters_array = np.array(nwaters_list)
            nwaters_array.sort()

            nwaters_list_filename = os.path.join(models_target_dir, 'nwaters-list.txt')
            with open(nwaters_list_filename, 'w') as nwaters_list_file:
                for nwaters in nwaters_array:
                    nwaters_list_file.write('%12d\n' % nwaters)

            # Display statistics and write the max / selected values.
            index_selected = int((len(nwaters_array) - 1) * (float(percentile) / 100.0))
            index68 = int((len(nwaters_array) - 1) * 0.68)
            index95 = int((len(nwaters_array) - 1) * 0.95)
            if len(nwaters_array) > 0:
                logger.info('Number of waters in solvated models (target: %s): min = %d, max = %d, '
                            'mean = %.1f, 68%% = %.0f, 95%% = %.0f, chosen_percentile (%d%%) = %.0f' %
                            (
                                target.id,
                                nwaters_array.min(),
                                nwaters_array.max(),
                                nwaters_array.mean(),
                                nwaters_array[index68],
                                nwaters_array[index95],
                                percentile,
                                nwaters_array[index_selected]
                            )
                            )

                filename = os.path.join(models_target_dir, 'nwaters-max.txt')
                with open(filename, 'w') as outfile:
                    outfile.write('%d\n' % nwaters_array.max())

                filename = os.path.join(models_target_dir, 'nwaters-use.txt')
                with open(filename, 'w') as outfile:
                    outfile.write('%d\n' % nwaters_array[index_selected])

            else:
                logger.info('No nwaters information found.')

            project_metadata = ensembler.core.ProjectMetadata(project_stage='determine_nwaters', target_id=target.id)

            datestamp = ensembler.core.get_utcnow_formatted()

            metadata = {
                'target_id': target.id,
                'datestamp': datestamp,
                'model_seqid_cutoff': model_seqid_cutoff,
                'select_at_percentile': percentile,
                'process_only_these_targets': process_only_these_targets,
                'process_only_these_templates': selected_templates,
                'python_version': sys.version.split('|')[0].strip(),
                'python_full_version': ensembler.core.literal_str(sys.version),
                'ensembler_version': ensembler.version.short_version,
                'ensembler_commit': ensembler.version.git_revision,
                'biopython_version': Bio.__version__,
            }

            project_metadata.add_data(metadata)
            project_metadata.write()

        # BUGFIX: the original code called mpistate.comm.Barrier() here,
        # inside the rank-0-only block. Barrier is a collective operation
        # that must be entered by every rank; entering it on rank 0 alone
        # deadlocks any multi-rank run. The barrier below is entered by all
        # ranks and is sufficient.

    mpistate.comm.Barrier()
    if mpistate.rank == 0:
        print('Done.')