Example #1
def model_core_from_theseus(models, alignment_file, var_by_res, work_dir=None):
    """
    Only residues from the first protein are listed in the theseus output, but then not even all of them
    
    We assume the output is based on the original alignment so that where each residue in the first protein 
    lines up with either another residue in one of the other proteins or a gap
    
    SO - we need to go through the theseus data and for each residue that is core find the corresponding residues 
    in the other proteins
    
    We use the resSeq numbers to match the residues across the alignment
    """
    if not os.path.isdir(work_dir):
        os.mkdir(work_dir)

    seqalign = sequence_util.Sequence(fasta=alignment_file)

    # We now need to add the list of pdbs, chains and resSeqs of the other models to the Sequence object
    for m in models:
        seqalign.add_pdb_data(m)

    # Sanity check that the names of the pdb files match those from the fasta header
    # Format is expected to be: '>1ujb.pdb(A)'
    names = [h[1:].split('(')[0] for h in seqalign.headers]
    if not seqalign.pdbs == names:
        raise RuntimeError(
            "headers and names of pdb files do not match!\n{0}\n{1}".format(
                seqalign.pdbs, names))

    # Get the name of the first pdb that the alignment is based on
    first = seqalign.pdbs[0]

    # Dictionary mapping model pdb to resSeqs that are core
    model2core = {}
    for p in seqalign.pdbs:
        model2core[p] = []  # initialise

    # Get list of core resSeqs in the first sequence
    model2core[first] = [x.resSeq for x in var_by_res if x.core]

    # Now go through the first sequence and get the resSeqs of the corresponding core for the other models
    pointer = 0  # Tracks where we are in the first sequence
    for i, resSeq in enumerate(seqalign.resseqs[0]):
        if model2core[first][pointer] == resSeq:
            # Core residue in first sequence so append the corresponding resSeqs for the other proteins
            for j, pdb in enumerate(seqalign.pdbs[1:]):
                model2core[pdb].append(seqalign.resseqs[j + 1][i])
            pointer += 1
            if pointer >= len(model2core[first]):
                break

    core_models = []
    for m in models:
        name = os.path.basename(m)
        pdbout = ample_util.filename_append(m, astr='core', directory=work_dir)
        pdb_edit.select_residues(m, pdbout, tokeep=model2core[name])
        core_models.append(pdbout)

    return core_models
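
The resSeq-matching loop above can be exercised in isolation with toy data. The snippet below is a minimal sketch: the namedtuple stands in for the Theseus per-residue variance records, and the nested lists stand in for the aligned resSeq lists held by the Sequence object (None marks an alignment gap in the toy data); none of this is the real AMPLE data structure.

from collections import namedtuple

# Hypothetical stand-ins for var_by_res and seqalign.resseqs / seqalign.pdbs
VarRec = namedtuple('VarRec', ['resSeq', 'core'])
var_by_res = [VarRec(10, True), VarRec(11, False), VarRec(12, True)]
resseqs = [
    [10, 11, 12],    # first (reference) protein
    [5, None, 7],    # second protein: a gap where the first protein has residue 11
]
pdbs = ['first.pdb', 'second.pdb']

model2core = {p: [] for p in pdbs}
model2core[pdbs[0]] = [v.resSeq for v in var_by_res if v.core]  # [10, 12]

pointer = 0
for i, resSeq in enumerate(resseqs[0]):
    if model2core[pdbs[0]][pointer] == resSeq:
        for j, pdb in enumerate(pdbs[1:]):
            model2core[pdb].append(resseqs[j + 1][i])
        pointer += 1
        if pointer >= len(model2core[pdbs[0]]):
            break

print(model2core)  # {'first.pdb': [10, 12], 'second.pdb': [5, 7]}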
Example #2
def model_core_from_fasta(models, alignment_file, work_dir=None, case_sensitive=False):
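    """Cut each model down to the core residues defined by a multiple sequence alignment.

    Core positions are those where every sequence in the alignment has a residue (no gap);
    with case_sensitive=True, only positions where all residues are upper-case (i.e.
    structurally aligned) count as core. Truncated copies of the models are written to work_dir.
    """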
    if not os.path.isdir(work_dir): os.mkdir(work_dir)
    
    # Read in the alignment to get the sequences and headers
    align_seq = sequence_util.Sequence(fasta=alignment_file)
    
    # TODO: check that all aligned sequences are the same length
    
    # Get pdb names from alignment headers
    seq_names = [ h[1:].strip() for h in align_seq.headers ]
    
    # The alignment file may come from gesamt, in which case the names have the
    # chain name appended in brackets
    for i, s in enumerate(seq_names):
        x = re.search("\([a-zA-Z]*\)$", s)
        if x: seq_names[i] = s.replace(x.group(0), "")
    
    # Get array specifying which positions are core. If the positions all align, then there
    # will be a capital letter for the residue. Gaps are signified by "-" and non-structurally-
    # aligned residues by lower-case letters
    GAP = '-'
    # The case-sensitive check below cannot normally be used, as Theseus ignores lower-case letters in the alignment
    if case_sensitive:
        core = [ all([ x in pdb_edit.one2three for x in t ]) for t in zip(*align_seq.sequences) ]
    else:
        core = [ all([ x != GAP for x in t ]) for t in zip(*align_seq.sequences) ]

    if not any(core): raise RuntimeError("Cannot generate core for models: {0}".format(models))
    
    # For each sequence, get a list of which positions are core
    core_positions = []
    for seq in align_seq.sequences:
        p = []
        count = 0
        for i, pos in enumerate(seq):
            if pos != GAP:
                if core[i]: p.append(count)
                count += 1
        core_positions.append(p)
        
    # TODO: check that the sequence lengths match the number of amino acids in the pdbs
        
    # Create dict mapping seq_names to core positions
    core_dict = dict((s, core_positions[i]) for i, s in enumerate(seq_names))
    
    # Cut the models down to core
    core_models = []
    for m in models:
        name = os.path.basename(m)
        pdbout = ample_util.filename_append(m, astr='core', directory=work_dir)
        pdb_edit.select_residues(m, pdbout, tokeep_idx=core_dict[name])
        core_models.append(pdbout)
        
    return core_models
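
The column-wise core mask built from zip(*align_seq.sequences) is easiest to see on toy data. The aligned sequences below are hypothetical, and isupper() is used here as a stand-in for the membership test against the one-letter-to-three-letter lookup used above:

# Toy aligned sequences: '-' is a gap, lower case marks residues that were
# not structurally aligned.
sequences = [
    "MKV-alS",
    "MRVTg-S",
    "MKVTaaS",
]
GAP = '-'

# Gap-based core: a column is core only if no sequence has a gap there.
core = [all(x != GAP for x in col) for col in zip(*sequences)]
print(core)  # [True, True, True, False, True, False, True]

# Case-sensitive core: additionally require every letter to be upper case.
core_cs = [all(x.isupper() for x in col) for col in zip(*sequences)]
print(core_cs)  # [True, True, True, False, False, False, True]

# Per-sequence core positions, counting only non-gap residues
# (mirrors the core_positions loop above).
core_positions = []
for seq in sequences:
    p, count = [], 0
    for i, pos in enumerate(seq):
        if pos != GAP:
            if core[i]:
                p.append(count)
            count += 1
    core_positions.append(p)
print(core_positions)  # [[0, 1, 2, 3, 5], [0, 1, 2, 4, 5], [0, 1, 2, 4, 6]]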
Example #3
def model_core_from_theseus(models, alignment_file, var_by_res, work_dir=None):
    """
    Only residues from the first protein are listed in the theseus output, but then not even all of them
    
    We assume the output is based on the original alignment so that where each residue in the first protein 
    lines up with either another residue in one of the other proteins or a gap
    
    SO - we need to go through the theseus data and for each residue that is core find the corresponding residues 
    in the other proteins
    
    We use the resSeq numbers to match the residues across the alignment
    """
    if not os.path.isdir(work_dir): os.mkdir(work_dir)

    seqalign = sequence_util.Sequence(fasta=alignment_file)

    # We now need to add the list of pdbs, chains and resSeqs of the other models to the Sequence object
    for m in models: seqalign.add_pdb_data(m)
    
    # Sanity check that the names of the pdb files match those from the fasta header
    # Format is expected to be: '>1ujb.pdb(A)'
    names = [ h[1:].split('(')[0] for h in seqalign.headers ]
    if not seqalign.pdbs == names:
        raise RuntimeError("headers and names of pdb files do not match!\n{0}\n{1}".format(seqalign.pdbs, names))
    
    # Get the name of the first pdb that the alignment is based on
    first = seqalign.pdbs[0]
    
    # Dictionary mapping model pdb to resSeqs that are core
    model2core = {}
    for p in seqalign.pdbs: model2core[p] = [] # initialise
    
    # Get list of core resSeqs in the first sequence
    model2core[first] = [ x.resSeq for x in var_by_res if x.core ]
    
    # Now go through the first sequence and get the resSeqs of the corresponding core for the other models
    pointer = 0 # Tracks where we are in the first sequence
    for i, resSeq in enumerate(seqalign.resseqs[0]):
        if model2core[first][pointer] == resSeq:
            # Core residue in first sequence so append the corresponding resSeqs for the other proteins
            for j, pdb in enumerate(seqalign.pdbs[1:]):
                model2core[pdb].append(seqalign.resseqs[j+1][i])
            pointer += 1
            if pointer >= len(model2core[first]): break
            
    core_models = []
    for m in models:
        name = os.path.basename(m)
        pdbout = ample_util.filename_append(m, astr='core', directory=work_dir)
        pdb_edit.select_residues(m, pdbout, tokeep=model2core[name])
        core_models.append(pdbout)
        
    return core_models
Example #4
    def test_align_models_homo(self):
        work_dir = os.path.join(self.tests_dir, 'theseus_align_homo')
        if not os.path.isdir(work_dir):
            os.mkdir(work_dir)
        pdb_list = ['1D7M.pdb', '1GU8.pdb', '2UUI.pdb', '1K33.pdb', '1BYZ.pdb']
        models = []
        tokeep_idx = list(range(12))
        for pdb in pdb_list:
            pdbin = os.path.join(self.testfiles_dir, pdb)
            name = os.path.splitext(pdb)[0]
            pdbout = os.path.join(self.testfiles_dir,
                                  "{0}_cut.pdb".format(name))
            pdb_edit.select_residues(pdbin, pdbout, tokeep_idx=tokeep_idx)
            models.append(pdbout)

        homologs = True
        rtheseus = theseus.Theseus(work_dir=work_dir,
                                   theseus_exe=self.theseus_exe)
        rtheseus.superpose_models(models, homologs=homologs)
        var_by_res = rtheseus.var_by_res
        # Reference values below were obtained with theseus 3.1.1 on OSX 10.9.5
        ref = [
            (0, 243, 8.049061),
            (1, 244, 2.614031),
            (2, 245, 1.343609),
            (3, 246, 2.261761),
            (4, 247, 1.112115),
            (5, 248, 0.574936),
            (6, 249, 0.03114),
            (7, 250, 0.002894),
            (8, 251, 0.002314),
            (9, 252, 0.002174),
            (10, 253, 0.016252),
            (11, 254, 0.109965),
        ]

        self.assertEqual([x.idx for x in var_by_res], [x[0] for x in ref])
        self.assertEqual([x.resSeq for x in var_by_res], [x[1] for x in ref])
        for i, (t, r) in enumerate(
                zip([x.variance for x in var_by_res], [x[2] for x in ref])):
            self.assertTrue(
                abs(t - r) < 0.0001,
                "Mismatch for: {0} {1} {2}".format(i, t, r))
        self.assertTrue(
            all([
                os.path.isfile(os.path.join(work_dir, m))
                for m in rtheseus.aligned_models
            ]))
        for m in models:
            os.unlink(m)
        shutil.rmtree(work_dir)
Example #5
    def truncate_models(
        self,
        models,
        max_cluster_size=200,
        truncation_method=None,
        percent_truncation=None,
        percent_fixed_intervals=None,
        truncation_pruning=None,
        residue_scores=None,
        homologs=False,
        alignment_file=None,
        work_dir=None,
    ):
        """Generate a set of Truncation objects, referencing a set of truncated models generated from the supplied models"""
        truncations = self.calculate_truncations(
            models=models,
            truncation_method=truncation_method,
            percent_truncation=percent_truncation,
            percent_fixed_intervals=percent_fixed_intervals,
            truncation_pruning=truncation_pruning,
            residue_scores=residue_scores,
            alignment_file=alignment_file,
            homologs=homologs,
        )
        if truncations is None or len(truncations) < 1:
            logger.critical("Unable to truncate the ensembles - no viable truncations")
            return []
        # Loop through the Truncation objects, truncating the models based on the truncation data and adding
        # the truncated models to the Truncation.models attribute
        for truncation in truncations:
            truncation.directory = os.path.join(self.work_dir, 'tlevel_{0}'.format(truncation.level))
            os.mkdir(truncation.directory)
            logger.info('Truncating at: %s in directory %s', truncation.level, truncation.directory)
            truncation.models = []
            for infile in self.models:
                pdbout = ample_util.filename_append(infile, str(truncation.level), directory=truncation.directory)
                # Loop through PDB files and create new ones that only contain the residues left after truncation
                pdb_edit.select_residues(pdbin=infile, pdbout=pdbout, tokeep_idx=truncation.residues_idxs)
                truncation.models.append(pdbout)
        self.truncations = truncations
        return truncations
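
A hedged usage sketch follows. The object that owns truncate_models() is not shown in this excerpt, so `truncator` is a placeholder for it, and the keyword values ('percent', 5) are purely illustrative; only the attributes printed at the end (level, directory, models) are taken from the code above.

# Hypothetical call -- 'truncator' stands in for whatever object provides
# truncate_models(); models is a list of PDB file paths.
truncations = truncator.truncate_models(
    models=models,
    truncation_method='percent',   # illustrative value
    percent_truncation=5,          # illustrative value
    homologs=False,
)
for truncation in truncations:
    print(truncation.level, truncation.directory, len(truncation.models))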
Example #6
    def _mod_structures(self, model_aln, structure_aln, model_pdb,
                        structure_pdb):
        """

        Parameters
        ----------
        model_aln : str
           A string containing the aligned sequence of the model
        structure_aln : str
           A string containing the aligned sequence of the structure
        model_pdb : str
           The path to the model pdb file
        structure_pdb : str
           The path to the structure pdb file

        Returns
        -------
        model_pdb_ret : str
           The path to the modified model pdb file
        structure_pdb_ret : str
           The path to the modified structure pdb file

        """
        random_suffix = ''.join(
            random.SystemRandom().choice(string.ascii_lowercase +
                                         string.digits) for _ in range(10))

        model_name = os.path.basename(model_pdb).rsplit(".", 1)[0]
        model_pdb_ret = os.path.join(
            self.tmp_dir, "_".join([model_name, random_suffix, "mod.pdb"]))

        structure_name = os.path.basename(structure_pdb).rsplit(".", 1)[0]
        structure_pdb_ret = os.path.join(
            self.tmp_dir, "_".join([structure_name, random_suffix, "mod.pdb"]))

        if os.path.isfile(model_pdb_ret) or os.path.isfile(structure_pdb_ret):
            msg = "Comparison structures exist. Move, delete or rename before continuing"
            logger.critical(msg)
            raise RuntimeError(msg)

        _model_pdb_tmp_stage1 = ample_util.tmp_file_name(
            delete=False, directory=self.tmp_dir, suffix=".pdb")
        _model_pdb_tmp_stage2 = ample_util.tmp_file_name(
            delete=False, directory=self.tmp_dir, suffix=".pdb")

        _structure_pdb_tmp_stage1 = ample_util.tmp_file_name(
            delete=False, directory=self.tmp_dir, suffix=".pdb")
        _structure_pdb_tmp_stage2 = ample_util.tmp_file_name(
            delete=False, directory=self.tmp_dir, suffix=".pdb")

        model_gaps = self._find_gaps(model_aln)
        structure_gaps = self._find_gaps(structure_aln)

        pdb_edit.renumber_residues_gaps(model_pdb, _model_pdb_tmp_stage1,
                                        model_gaps)
        pdb_edit.renumber_residues_gaps(structure_pdb,
                                        _structure_pdb_tmp_stage1,
                                        structure_gaps)

        model_gaps_indices = [
            i + 1 for i, is_gap in enumerate(model_gaps) if is_gap
        ]
        structure_gaps_indices = [
            i + 1 for i, is_gap in enumerate(structure_gaps) if is_gap
        ]

        pdb_edit.select_residues(_model_pdb_tmp_stage1,
                                 _model_pdb_tmp_stage2,
                                 delete=structure_gaps_indices)
        pdb_edit.select_residues(_structure_pdb_tmp_stage1,
                                 _structure_pdb_tmp_stage2,
                                 delete=model_gaps_indices)

        pdb_edit.renumber_residues(_model_pdb_tmp_stage2, model_pdb_ret)
        pdb_edit.renumber_residues(_structure_pdb_tmp_stage2,
                                   structure_pdb_ret)

        _model_data = list(self._pdb_info(model_pdb_ret))
        _structure_data = list(self._pdb_info(structure_pdb_ret))

        # Alignments are not always identical, so require at least 90% of the residues to match
        identical = set(_model_data).intersection(set(_structure_data))
        if len(identical) / float(len(_model_data)) < 0.90:
            msg = "Differing residues in model and structure. Affected PDBs %s - %s\n%s\n%s"
            raise RuntimeError(
                msg % (model_name, structure_name, model_aln, structure_aln))

        for f in (_model_pdb_tmp_stage1, _model_pdb_tmp_stage2,
                  _structure_pdb_tmp_stage1, _structure_pdb_tmp_stage2):
            os.unlink(f)

        if not os.path.isfile(model_pdb_ret):
            raise RuntimeError("Modified model %s does not exist!" %
                               model_pdb_ret)
        if not os.path.isfile(structure_pdb_ret):
            raise RuntimeError("Modified reference %s does not exist!" %
                               structure_pdb_ret)

        return model_pdb_ret, structure_pdb_ret
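
The cross-deletion of gap positions above (each structure loses the residues that align to gaps in the other) can be checked on plain strings. This is a standalone sketch: find_gaps() below is a toy stand-in for self._find_gaps(), assumed to return one boolean per alignment position, True where that sequence has a gap.

# Toy aligned sequences (hypothetical).
model_aln = "MKV-TLS"
structure_aln = "MKVQT-S"

def find_gaps(aln):
    # Stand-in for self._find_gaps(): True at gap positions.
    return [c == '-' for c in aln]

model_gaps = find_gaps(model_aln)          # gap at alignment position 4 (1-based)
structure_gaps = find_gaps(structure_aln)  # gap at alignment position 6 (1-based)

# 1-based indices, matching the PDBs renumbered by renumber_residues_gaps().
model_gaps_indices = [i + 1 for i, g in enumerate(model_gaps) if g]          # [4]
structure_gaps_indices = [i + 1 for i, g in enumerate(structure_gaps) if g]  # [6]

# Each structure keeps the positions where it has a residue and the other
# sequence has no gap -- both end up with the same five aligned residues.
model_keep = [p for p in range(1, len(model_aln) + 1)
              if not model_gaps[p - 1] and p not in structure_gaps_indices]
structure_keep = [p for p in range(1, len(structure_aln) + 1)
                  if not structure_gaps[p - 1] and p not in model_gaps_indices]
print(model_keep)      # [1, 2, 3, 5, 7]
print(structure_keep)  # [1, 2, 3, 5, 7]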
Example #7
def model_core_from_fasta(models,
                          alignment_file,
                          work_dir=None,
                          case_sensitive=False):
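    """Cut each model down to the core residues defined by a multiple sequence alignment.

    Core positions are those where every sequence in the alignment has a residue (no gap);
    with case_sensitive=True, only positions where all residues are upper-case (i.e.
    structurally aligned) count as core. Truncated copies of the models are written to work_dir.
    """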
    if not os.path.isdir(work_dir):
        os.mkdir(work_dir)

    # Read in the alignment to get the sequences and headers
    align_seq = sequence_util.Sequence(fasta=alignment_file)

    # TODO: check that all aligned sequences are the same length

    # Get pdb names from alignment headers
    seq_names = [h[1:].strip() for h in align_seq.headers]

    # The alignment file may come from gesamt, in which case the names have the
    # chain name appended in brackets
    for i, s in enumerate(seq_names):
        x = re.search("\([a-zA-Z]*\)$", s)
        if x:
            seq_names[i] = s.replace(x.group(0), "")

    # Get array specifying which positions are core. If the positions all align, then there
    # will be a capital letter for the residue. Gaps are signified by "-" and non-structurally-
    # aligned residues by lower-case letters
    GAP = '-'
    # The case-sensitive check below cannot normally be used, as Theseus ignores
    # lower-case letters in the alignment
    if case_sensitive:
        core = [
            all([x in ample_util.one2three for x in t])
            for t in zip(*align_seq.sequences)
        ]
    else:
        core = [all([x != GAP for x in t]) for t in zip(*align_seq.sequences)]

    if not any(core):
        raise RuntimeError(
            "Cannot generate core for models: {0}".format(models))

    # For each sequence, get a list of which positions are core
    core_positions = []
    for seq in align_seq.sequences:
        p = []
        count = 0
        for i, pos in enumerate(seq):
            if pos != GAP:
                if core[i]:
                    p.append(count)
                count += 1
        core_positions.append(p)

    # TODO: check that the sequence lengths match the number of amino acids in the pdbs

    # Create dict mapping seq_names to core positions
    core_dict = dict((s, core_positions[i]) for i, s in enumerate(seq_names))

    # Cut the models down to core
    core_models = []
    for m in models:
        name = os.path.basename(m)
        pdbout = ample_util.filename_append(m, astr='core', directory=work_dir)
        pdb_edit.select_residues(m, pdbout, tokeep_idx=core_dict[name])
        core_models.append(pdbout)

    return core_models
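
The gesamt header clean-up above (stripping a chain identifier appended in brackets) can be checked in isolation. The header strings below are hypothetical:

import re

# Hypothetical headers after the leading '>' has been stripped; gesamt appends
# the chain name in brackets, plain fasta headers have no brackets.
seq_names = ['1ujb.pdb(A)', '2xyz.pdb(B)', 'model_1.pdb']

for i, s in enumerate(seq_names):
    x = re.search(r"\([a-zA-Z]*\)$", s)
    if x:
        seq_names[i] = s.replace(x.group(0), "")

print(seq_names)  # ['1ujb.pdb', '2xyz.pdb', 'model_1.pdb']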
Example #8
    def _mod_structures(self, model_aln, structure_aln, model_pdb,
                        structure_pdb):
        """

        Parameters
        ----------
        model_aln : str
           A string containing the aligned sequence of the model
        structure_aln : str
           A string containing the aligned sequence of the structure
        model_pdb : str
           The path to the model pdb file
        structure_pdb : str
           The path to the structure pdb file

        Returns
        -------
        model_pdb_ret : str
           The path to the modified model pdb file
        structure_pdb_ret : str
           The path to the modified structure pdb file

        """

        # ================================
        # File definitions
        # ================================

        # Create a directory to store the files
        work_dir_mod = os.path.join(self.work_dir, "tm_util_pdbs")
        if not os.path.isdir(work_dir_mod):
            os.mkdir(work_dir_mod)

        # Create a random file suffix to avoid overwriting files with duplicate names
        # Taken from http://stackoverflow.com/a/2257449
        random_suffix = ''.join(
            random.SystemRandom().choice(string.ascii_lowercase +
                                         string.digits) for _ in range(10))

        # File names and output files
        model_name = os.path.basename(model_pdb).rsplit(".", 1)[0]
        model_pdb_ret = os.path.join(
            work_dir_mod, "_".join([model_name, random_suffix, "mod.pdb"]))

        structure_name = os.path.basename(structure_pdb).rsplit(".", 1)[0]
        structure_pdb_ret = os.path.join(
            work_dir_mod, "_".join([structure_name, random_suffix, "mod.pdb"]))

        # Check that the files we are about to create do not already exist
        if os.path.isfile(model_pdb_ret) or os.path.isfile(structure_pdb_ret):
            msg = "Comparison structures exist. Move, delete or rename before continuing"
            logger.critical(msg)
            raise RuntimeError(msg)

        # Create temporary files
        _model_pdb_tmp_stage1 = ample_util.tmp_file_name(
            delete=False, directory=work_dir_mod, suffix=".pdb")
        _model_pdb_tmp_stage2 = ample_util.tmp_file_name(
            delete=False, directory=work_dir_mod, suffix=".pdb")

        _structure_pdb_tmp_stage1 = ample_util.tmp_file_name(
            delete=False, directory=work_dir_mod, suffix=".pdb")
        _structure_pdb_tmp_stage2 = ample_util.tmp_file_name(
            delete=False, directory=work_dir_mod, suffix=".pdb")

        # ==================================
        # File manipulation and modification
        # ==================================

        # Get the gap positions in both sequences
        model_gaps = self._find_gaps(model_aln)
        structure_gaps = self._find_gaps(structure_aln)

        # Renumber the pdb files - required in case there are any gaps
        pdb_edit.renumber_residues_gaps(model_pdb, _model_pdb_tmp_stage1,
                                        model_gaps)
        pdb_edit.renumber_residues_gaps(structure_pdb,
                                        _structure_pdb_tmp_stage1,
                                        structure_gaps)

        # Determine the gap indices
        model_gaps_indices = [
            i + 1 for i, is_gap in enumerate(model_gaps) if is_gap
        ]
        structure_gaps_indices = [
            i + 1 for i, is_gap in enumerate(structure_gaps) if is_gap
        ]

        # Use the gaps of the other sequence to even out the two structures
        pdb_edit.select_residues(_model_pdb_tmp_stage1,
                                 _model_pdb_tmp_stage2,
                                 delete=structure_gaps_indices)
        pdb_edit.select_residues(_structure_pdb_tmp_stage1,
                                 _structure_pdb_tmp_stage2,
                                 delete=model_gaps_indices)

        # Renumber the pdb files - required by TMscore binary
        pdb_edit.renumber_residues(_model_pdb_tmp_stage2, model_pdb_ret)
        pdb_edit.renumber_residues(_structure_pdb_tmp_stage2,
                                   structure_pdb_ret)

        # ==================================
        # Checks and validations
        # ==================================

        # Extract some information from each PDB structure file
        _model_data = list(self._pdb_info(model_pdb_ret))
        _structure_data = list(self._pdb_info(structure_pdb_ret))

        # Make sure our structures contain the same residues with correct indices
        if set(_model_data) != set(_structure_data):
            msg = "Residues in model and structure non-identical. Affected PDBs {0} - {1}".format(
                model_name, structure_name)
            logger.critical(msg)
            raise RuntimeError(msg)

        # Remove the temporary files
        for f in [
                _model_pdb_tmp_stage1, _model_pdb_tmp_stage2,
                _structure_pdb_tmp_stage1, _structure_pdb_tmp_stage2
        ]:
            os.unlink(f)

        return model_pdb_ret, structure_pdb_ret
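
This variant requires the residue records of the two renumbered PDBs to be strictly identical, whereas the earlier _mod_structures variant accepts a 90% overlap. The difference is easy to see on toy data; the (resSeq, resName) tuples below are hypothetical stand-ins for whatever hashable records self._pdb_info() yields:

# Hypothetical per-residue records with a single mismatch at residue 10.
model_data = [(i, 'ALA') for i in range(1, 11)]
structure_data = [(i, 'ALA') for i in range(1, 10)] + [(10, 'GLY')]

# Strict check used above: the residue sets must be identical.
print(set(model_data) == set(structure_data))  # False

# Tolerant check from the earlier variant: at least 90% of model residues
# must also appear in the structure -- one mismatch in ten still passes.
shared = set(model_data).intersection(structure_data)
print(len(shared) / float(len(model_data)) >= 0.90)  # True (0.9)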