def model_core_from_theseus(models, alignment_file, var_by_res, work_dir=None):
    """
    Only residues from the first protein are listed in the theseus output, and not even all of those.

    We assume the output is based on the original alignment, so that each residue in the first protein
    lines up with either a residue in one of the other proteins or a gap.

    So we need to go through the theseus data and, for each residue that is core, find the corresponding
    residues in the other proteins.

    We use the resSeq numbers to match the residues across the alignment.
    """
    if not os.path.isdir(work_dir):
        os.mkdir(work_dir)

    seqalign = sequence_util.Sequence(fasta=alignment_file)

    # We now need to add the list of pdbs, chains and resSeqs of the other models to the Sequence object
    for m in models:
        seqalign.add_pdb_data(m)

    # Sanity check that the names of the pdb files match those from the fasta header
    # Format is expected to be: '>1ujb.pdb(A)'
    names = [h[1:].split('(')[0] for h in seqalign.headers]
    if not seqalign.pdbs == names:
        raise RuntimeError(
            "headers and names of pdb files do not match!\n{0}\n{1}".format(seqalign.pdbs, names))

    # Get the name of the first pdb that the alignment is based on
    first = seqalign.pdbs[0]

    # Dictionary mapping model pdb to resSeqs that are core
    model2core = {}
    for p in seqalign.pdbs:
        model2core[p] = []  # initialise

    # Get list of core resSeqs in the first sequence
    model2core[first] = [x.resSeq for x in var_by_res if x.core]

    # Now go through the first sequence and get the resSeqs of the corresponding core for the other models
    pointer = 0  # Tracks where we are in the first sequence
    for i, resSeq in enumerate(seqalign.resseqs[0]):
        if model2core[first][pointer] == resSeq:
            # Core residue in first sequence so append the corresponding resSeqs for the other proteins
            for j, pdb in enumerate(seqalign.pdbs[1:]):
                model2core[pdb].append(seqalign.resseqs[j + 1][i])
            pointer += 1
            if pointer >= len(model2core[first]):
                break

    core_models = []
    for m in models:
        name = os.path.basename(m)
        pdbout = ample_util.filename_append(m, astr='core', directory=work_dir)
        pdb_edit.select_residues(m, pdbout, tokeep=model2core[name])
        core_models.append(pdbout)

    return core_models
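# ---------------------------------------------------------------------------
# Illustrative sketch (not taken from the source above): the resSeq-matching
# idea used in model_core_from_theseus, reduced to plain Python. The alignment
# columns, resSeq lists and core resSeqs below are invented toy data, and the
# function name is hypothetical.
# ---------------------------------------------------------------------------
def _demo_map_core_across_alignment():
    # Per-model resSeq for each alignment column (None marks a gap)
    resseqs = [
        [10, 11, 12, None, 13],  # first (reference) model
        [5, None, 6, 7, 8],      # second model
    ]
    # Core resSeqs reported by theseus for the first model only
    first_core = [11, 13]

    other_core = []
    pointer = 0  # tracks where we are in the first model's core list
    for i, resseq in enumerate(resseqs[0]):
        if pointer < len(first_core) and first_core[pointer] == resseq:
            # Column i is core, so record the matching residue in the other model
            other_core.append(resseqs[1][i])
            pointer += 1
    # other_core == [None, 8]: the first core column is a gap in the second model
    return other_core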
def model_core_from_fasta(models, alignment_file, work_dir=None, case_sensitive=False):
    if not os.path.isdir(work_dir):
        os.mkdir(work_dir)

    # Read in the alignment to get the aligned sequences
    align_seq = sequence_util.Sequence(fasta=alignment_file)
    # Check all alignments the same length

    # Get pdb names from alignment headers
    seq_names = [h[1:].strip() for h in align_seq.headers]

    # Need to check if the alignment file is from gesamt, in which case, the names have the
    # chain names in brackets appended
    for i, s in enumerate(seq_names):
        x = re.search(r"\([a-zA-Z]*\)$", s)
        if x:
            seq_names[i] = s.replace(x.group(0), "")

    # Get array specifying which positions are core. If the positions all align, then there
    # will be a capital letter for the residue. Gaps are signified by "-" and non-structurally-
    # aligned residues by lower-case letters
    GAP = '-'
    # Can't use below as Theseus ignores lower-case letters in the alignment
    if case_sensitive:
        core = [all([x in pdb_edit.one2three.keys() for x in t]) for t in zip(*align_seq.sequences)]
    else:
        core = [all([x != GAP for x in t]) for t in zip(*align_seq.sequences)]

    if not any(core):
        raise RuntimeError("Cannot generate core for models: {0}".format(models))

    # For each sequence, get a list of which positions are core
    core_positions = []
    for seq in align_seq.sequences:
        p = []
        count = 0
        for i, pos in enumerate(seq):
            if pos != GAP:
                if core[i]:
                    p.append(count)
                count += 1
        core_positions.append(p)

    # Should check lengths of sequences match the length of the aa in the pdbs

    # Create dict mapping seq_names to core positions
    core_dict = dict((s, core_positions[i]) for i, s in enumerate(seq_names))

    # Cut the models down to core
    core_models = []
    for m in models:
        name = os.path.basename(m)
        pdbout = ample_util.filename_append(m, astr='core', directory=work_dir)
        pdb_edit.select_residues(m, pdbout, tokeep_idx=core_dict[name])
        core_models.append(pdbout)

    return core_models
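# ---------------------------------------------------------------------------
# Illustrative sketch (not taken from the source above): how core alignment
# columns translate into per-sequence residue indices, mirroring the core and
# core_positions loops in model_core_from_fasta. The two toy aligned sequences
# and the function name are invented for demonstration only.
# ---------------------------------------------------------------------------
def _demo_core_positions():
    GAP = '-'
    sequences = ["AC-DE", "A-CDE"]  # toy alignment; all sequences are the same length
    # A column is core when no sequence has a gap in it
    core = [all(ch != GAP for ch in column) for column in zip(*sequences)]
    core_positions = []
    for seq in sequences:
        positions = []
        count = 0  # index of the residue within this (ungapped) sequence
        for i, ch in enumerate(seq):
            if ch != GAP:
                if core[i]:
                    positions.append(count)
                count += 1
        core_positions.append(positions)
    # core == [True, False, False, True, True]
    # core_positions == [[0, 2, 3], [0, 2, 3]]
    return core_positions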
def model_core_from_theseus(models, alignment_file, var_by_res, work_dir=None): """ Only residues from the first protein are listed in the theseus output, but then not even all of them We assume the output is based on the original alignment so that where each residue in the first protein lines up with either another residue in one of the other proteins or a gap SO - we need to go through the theseus data and for each residue that is core find the corresponding residues in the other proteins We use the resSeq numbers to match the residues across the alignment """ if not os.path.isdir(work_dir): os.mkdir(work_dir) seqalign = sequence_util.Sequence(fasta=alignment_file) # We now need to add the list of pdbs, chains and resSeqs of the other models to the Sequence object for m in models: seqalign.add_pdb_data(m) # Sanity check that the names of the pdb files match those from the fasta header # Format is expected to be: '>1ujb.pdb(A)' names = [ h[1:].split('(')[0] for h in seqalign.headers ] if not seqalign.pdbs == names: raise RuntimeError, "headers and names of pdb files do not match!\n{0}\n{1}".format(seqalign.pdbs, names) # Get the name of the first pdb that the alignment is based on first = seqalign.pdbs[0] # Dictionary mapping model pdb to resSeqs that are core model2core = {} for p in seqalign.pdbs: model2core[p] = [] # initialise # Get list of core resSeqs in the first sequence model2core[first] = [ x.resSeq for x in var_by_res if x.core ] # Now go through the first sequence and get the resSeqs of the corresponding core for the other models pointer = 0 # Tracks where we are in the first sequence for i, resSeq in enumerate(seqalign.resseqs[0]): if model2core[first][pointer] == resSeq: # Core residue in first sequence so append the corresponding resSeqs for the other proteins for j, pdb in enumerate(seqalign.pdbs[1:]): model2core[pdb].append(seqalign.resseqs[j+1][i]) pointer += 1 if pointer >= len(model2core[first]): break core_models = [] for m in models: name = os.path.basename(m) pdbout = ample_util.filename_append(m, astr='core', directory=work_dir) pdb_edit.select_residues(m, pdbout, tokeep=model2core[name]) core_models.append(pdbout) return core_models
def test_align_models_homo(self):
    work_dir = os.path.join(self.tests_dir, 'theseus_align_homo')
    if not os.path.isdir(work_dir):
        os.mkdir(work_dir)

    pdb_list = ['1D7M.pdb', '1GU8.pdb', '2UUI.pdb', '1K33.pdb', '1BYZ.pdb']
    models = []
    tokeep_idx = [i for i in range(12)]
    for pdb in pdb_list:
        pdbin = os.path.join(self.testfiles_dir, pdb)
        name = os.path.splitext(pdb)[0]
        pdbout = os.path.join(self.testfiles_dir, "{0}_cut.pdb".format(name))
        pdb_edit.select_residues(pdbin, pdbout, tokeep_idx=tokeep_idx)
        models.append(pdbout)

    homologs = True
    rtheseus = theseus.Theseus(work_dir=work_dir, theseus_exe=self.theseus_exe)
    rtheseus.superpose_models(models, homologs=homologs)
    var_by_res = rtheseus.var_by_res

    # Below with theseus 3.1.1 on osx 10.9.5
    ref = [
        (0, 243, 8.049061),
        (1, 244, 2.614031),
        (2, 245, 1.343609),
        (3, 246, 2.261761),
        (4, 247, 1.112115),
        (5, 248, 0.574936),
        (6, 249, 0.03114),
        (7, 250, 0.002894),
        (8, 251, 0.002314),
        (9, 252, 0.002174),
        (10, 253, 0.016252),
        (11, 254, 0.109965),
    ]

    self.assertEqual([x.idx for x in var_by_res], [x[0] for x in ref])
    self.assertEqual([x.resSeq for x in var_by_res], [x[1] for x in ref])
    for i, (t, r) in enumerate(zip([x.variance for x in var_by_res], [x[2] for x in ref])):
        self.assertTrue(abs(t - r) < 0.0001, "Mismatch for: {0} {1} {2}".format(i, t, r))
    self.assertTrue(all([os.path.isfile(os.path.join(work_dir, m)) for m in rtheseus.aligned_models]))

    for m in models:
        os.unlink(m)
    shutil.rmtree(work_dir)
def truncate_models(
    self,
    models,
    max_cluster_size=200,
    truncation_method=None,
    percent_truncation=None,
    percent_fixed_intervals=None,
    truncation_pruning=None,
    residue_scores=None,
    homologs=False,
    alignment_file=None,
    work_dir=None,
):
    """Generate a set of Truncation objects, referencing a set of truncated models generated from the supplied models"""
    truncations = self.calculate_truncations(
        models=models,
        truncation_method=truncation_method,
        percent_truncation=percent_truncation,
        percent_fixed_intervals=percent_fixed_intervals,
        truncation_pruning=truncation_pruning,
        residue_scores=residue_scores,
        alignment_file=alignment_file,
        homologs=homologs,
    )

    if truncations is None or len(truncations) < 1:
        logger.critical("Unable to truncate the ensembles - no viable truncations")
        return []

    # Loop through the Truncation objects, truncating the models based on the truncation data and adding
    # the truncated models to the Truncation.models attribute
    for truncation in truncations:
        truncation.directory = os.path.join(self.work_dir, 'tlevel_{0}'.format(truncation.level))
        os.mkdir(truncation.directory)
        logger.info('Truncating at: %s in directory %s', truncation.level, truncation.directory)
        truncation.models = []
        for infile in self.models:
            pdbout = ample_util.filename_append(infile, str(truncation.level), directory=truncation.directory)
            # Loop through PDB files and create new ones that only contain the residues left after truncation
            pdb_edit.select_residues(pdbin=infile, pdbout=pdbout, tokeep_idx=truncation.residues_idxs)
            truncation.models.append(pdbout)

    self.truncations = truncations
    return truncations
def truncate_models(self, models,
                    max_cluster_size=200,
                    truncation_method=None,
                    percent_truncation=None,
                    percent_fixed_intervals=None,
                    truncation_pruning=None,
                    residue_scores=None,
                    homologs=False,
                    alignment_file=None,
                    work_dir=None):
    """Generate a set of Truncation objects, referencing a set of truncated models generated from the supplied models"""
    truncations = self.calculate_truncations(
        models=models,
        truncation_method=truncation_method,
        percent_truncation=percent_truncation,
        percent_fixed_intervals=percent_fixed_intervals,
        truncation_pruning=truncation_pruning,
        residue_scores=residue_scores,
        alignment_file=alignment_file,
        homologs=homologs)

    if truncations is None or len(truncations) < 1:
        logger.critical("Unable to truncate the ensembles - no viable truncations")
        return []

    # Loop through the Truncation objects, truncating the models based on the truncation data and adding
    # the truncated models to the Truncation.models attribute
    for truncation in truncations:
        truncation.directory = os.path.join(self.work_dir, 'tlevel_{0}'.format(truncation.level))
        os.mkdir(truncation.directory)
        logger.info('Truncating at: %s in directory %s', truncation.level, truncation.directory)
        truncation.models = []
        for infile in self.models:
            pdbout = ample_util.filename_append(infile, str(truncation.level), directory=truncation.directory)
            # Loop through PDB files and create new ones that only contain the residues left after truncation
            pdb_edit.select_residues(pdbin=infile, pdbout=pdbout, tokeep_idx=truncation.residues_idxs)
            truncation.models.append(pdbout)

    self.truncations = truncations
    return truncations
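# ---------------------------------------------------------------------------
# Hypothetical sketch (not taken from the source above): the per-level
# bookkeeping that truncate_models performs, using plain data in place of
# Truncation objects and pdb_edit calls. The levels, residue indices and file
# names are assumptions chosen purely for illustration.
# ---------------------------------------------------------------------------
def _demo_truncation_layout(models=('model_1.pdb', 'model_2.pdb')):
    # Each truncation level keeps a shrinking set of residue indices
    levels = {95: list(range(19)), 50: list(range(10)), 20: list(range(4))}
    layout = {}
    for level, residues_idxs in sorted(levels.items(), reverse=True):
        directory = 'tlevel_{0}'.format(level)
        # One truncated copy of every model would be written into `directory`,
        # keeping only `residues_idxs` from each input PDB
        layout[directory] = ['{0}_{1}.pdb'.format(m.rsplit('.', 1)[0], level) for m in models]
    # layout['tlevel_50'] == ['model_1_50.pdb', 'model_2_50.pdb']
    return layout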
def _mod_structures(self, model_aln, structure_aln, model_pdb, structure_pdb): """ Parameters ---------- model_aln : str A string containing the aligned sequence of the model structure_aln : str A string containing the alignment sequence of the structure model_pdb : str The path to the model pdb file structure_pdb : str The path to the structure pdb file Returns ------- model_pdb_ret : str The path to the modified model pdb file structure_pdb_ret : str The path to the modified structure pdb file """ random_suffix = ''.join( random.SystemRandom().choice(string.ascii_lowercase + string.digits) for _ in range(10)) model_name = os.path.basename(model_pdb).rsplit(".", 1)[0] model_pdb_ret = os.path.join( self.tmp_dir, "_".join([model_name, random_suffix, "mod.pdb"])) structure_name = os.path.basename(structure_pdb).rsplit(".", 1)[0] structure_pdb_ret = os.path.join( self.tmp_dir, "_".join([structure_name, random_suffix, "mod.pdb"])) if os.path.isfile(model_pdb_ret) or os.path.isfile(structure_pdb_ret): msg = "Comparison structures exist. Move, delete or rename before continuing" logger.critical(msg) raise RuntimeError(msg) _model_pdb_tmp_stage1 = ample_util.tmp_file_name( delete=False, directory=self.tmp_dir, suffix=".pdb") _model_pdb_tmp_stage2 = ample_util.tmp_file_name( delete=False, directory=self.tmp_dir, suffix=".pdb") _structure_pdb_tmp_stage1 = ample_util.tmp_file_name( delete=False, directory=self.tmp_dir, suffix=".pdb") _structure_pdb_tmp_stage2 = ample_util.tmp_file_name( delete=False, directory=self.tmp_dir, suffix=".pdb") model_gaps = self._find_gaps(model_aln) structure_gaps = self._find_gaps(structure_aln) pdb_edit.renumber_residues_gaps(model_pdb, _model_pdb_tmp_stage1, model_gaps) pdb_edit.renumber_residues_gaps(structure_pdb, _structure_pdb_tmp_stage1, structure_gaps) model_gaps_indices = [ i + 1 for i, is_gap in enumerate(model_gaps) if is_gap ] structure_gaps_indices = [ i + 1 for i, is_gap in enumerate(structure_gaps) if is_gap ] pdb_edit.select_residues(_model_pdb_tmp_stage1, _model_pdb_tmp_stage2, delete=structure_gaps_indices) pdb_edit.select_residues(_structure_pdb_tmp_stage1, _structure_pdb_tmp_stage2, delete=model_gaps_indices) pdb_edit.renumber_residues(_model_pdb_tmp_stage2, model_pdb_ret) pdb_edit.renumber_residues(_structure_pdb_tmp_stage2, structure_pdb_ret) _model_data = list(self._pdb_info(model_pdb_ret)) _structure_data = list(self._pdb_info(structure_pdb_ret)) # Alignment not always identical, let's aim for 90% identical = set(_model_data).intersection(set(_structure_data)) if len(identical) / float(len(_model_data)) < 0.90: msg = "Differing residues in model and structure. Affected PDBs %s - %s\n%s\n%s" raise RuntimeError( msg % (model_name, structure_name, model_aln, structure_aln)) files = [ _model_pdb_tmp_stage1, _model_pdb_tmp_stage2, _structure_pdb_tmp_stage1, _structure_pdb_tmp_stage2 ] for i in range(4): os.unlink(files[i]) if not os.path.isfile(model_pdb_ret): raise RuntimeError("Modified model %s does not exist!" % model_pdb_ret) if not os.path.isfile(structure_pdb_ret): raise RuntimeError("Modified reference %s does not exist!" % structure_pdb_ret) return model_pdb_ret, structure_pdb_ret
def model_core_from_fasta(models, alignment_file, work_dir=None, case_sensitive=False):
    if not os.path.isdir(work_dir):
        os.mkdir(work_dir)

    # Read in the alignment to get the aligned sequences
    align_seq = sequence_util.Sequence(fasta=alignment_file)
    # Check all alignments the same length

    # Get pdb names from alignment headers
    seq_names = [h[1:].strip() for h in align_seq.headers]

    # Need to check if the alignment file is from gesamt, in which case, the names have the
    # chain names in brackets appended
    for i, s in enumerate(seq_names):
        x = re.search(r"\([a-zA-Z]*\)$", s)
        if x:
            seq_names[i] = s.replace(x.group(0), "")

    # Get array specifying which positions are core. If the positions all align, then there
    # will be a capital letter for the residue. Gaps are signified by "-" and non-structurally-
    # aligned residues by lower-case letters
    GAP = '-'
    # Can't use below as Theseus ignores lower-case letters in the alignment
    if case_sensitive:
        core = [all([x in ample_util.one2three.keys() for x in t]) for t in zip(*align_seq.sequences)]
    else:
        core = [all([x != GAP for x in t]) for t in zip(*align_seq.sequences)]

    if not any(core):
        raise RuntimeError("Cannot generate core for models: {0}".format(models))

    # For each sequence, get a list of which positions are core
    core_positions = []
    for seq in align_seq.sequences:
        p = []
        count = 0
        for i, pos in enumerate(seq):
            if pos != GAP:
                if core[i]:
                    p.append(count)
                count += 1
        core_positions.append(p)

    # Should check lengths of sequences match the length of the aa in the pdbs

    # Create dict mapping seq_names to core positions
    core_dict = dict((s, core_positions[i]) for i, s in enumerate(seq_names))

    # Cut the models down to core
    core_models = []
    for m in models:
        name = os.path.basename(m)
        pdbout = ample_util.filename_append(m, astr='core', directory=work_dir)
        pdb_edit.select_residues(m, pdbout, tokeep_idx=core_dict[name])
        core_models.append(pdbout)

    return core_models
def _mod_structures(self, model_aln, structure_aln, model_pdb, structure_pdb): """ Parameters ---------- model_aln : str A string containing the aligned sequence of the model structure_aln : str A string containing the alignment sequence of the structure model_pdb : str The path to the model pdb file structure_pdb : str The path to the structure pdb file Returns ------- model_pdb_ret : str The path to the modified model pdb file structure_pdb_ret : str The path to the modified structure pdb file """ # ================================ # File definitions # ================================ # Create a storage for the files work_dir_mod = os.path.join(self.work_dir, "tm_util_pdbs") if not os.path.isdir(work_dir_mod): os.mkdir(work_dir_mod) # Create a random file suffix to avoid overwriting file names if duplicate # Taken from http://stackoverflow.com/a/2257449 random_suffix = ''.join( random.SystemRandom().choice(string.ascii_lowercase + string.digits) for _ in range(10)) # File names and output files model_name = os.path.basename(model_pdb).rsplit(".", 1)[0] model_pdb_ret = os.path.join( work_dir_mod, "_".join([model_name, random_suffix, "mod.pdb"])) structure_name = os.path.basename(structure_pdb).rsplit(".", 1)[0] structure_pdb_ret = os.path.join( work_dir_mod, "_".join([structure_name, random_suffix, "mod.pdb"])) # Check if the files we are to create for comparison do not exist if os.path.isfile(model_pdb_ret) or os.path.isfile(structure_pdb_ret): msg = "Comparison structures exist. Move, delete or rename before continuing" logger.critical(msg) raise RuntimeError(msg) # Create temporary files _model_pdb_tmp_stage1 = ample_util.tmp_file_name( delete=False, directory=work_dir_mod, suffix=".pdb") _model_pdb_tmp_stage2 = ample_util.tmp_file_name( delete=False, directory=work_dir_mod, suffix=".pdb") _structure_pdb_tmp_stage1 = ample_util.tmp_file_name( delete=False, directory=work_dir_mod, suffix=".pdb") _structure_pdb_tmp_stage2 = ample_util.tmp_file_name( delete=False, directory=work_dir_mod, suffix=".pdb") # ================================== # File manipulation and modification # ================================== # Get the gap positions in both sequences model_gaps = self._find_gaps(model_aln) structure_gaps = self._find_gaps(structure_aln) # Renumber the pdb files - required in case there are any gaps pdb_edit.renumber_residues_gaps(model_pdb, _model_pdb_tmp_stage1, model_gaps) pdb_edit.renumber_residues_gaps(structure_pdb, _structure_pdb_tmp_stage1, structure_gaps) # Determine the gap indeces model_gaps_indeces = [ i + 1 for i, is_gap in enumerate(model_gaps) if is_gap ] structure_gaps_indeces = [ i + 1 for i, is_gap in enumerate(structure_gaps) if is_gap ] # Use gaps of other sequence to even out pdb_edit.select_residues(_model_pdb_tmp_stage1, _model_pdb_tmp_stage2, delete=structure_gaps_indeces) pdb_edit.select_residues(_structure_pdb_tmp_stage1, _structure_pdb_tmp_stage2, delete=model_gaps_indeces) # Renumber the pdb files - required by TMscore binary pdb_edit.renumber_residues(_model_pdb_tmp_stage2, model_pdb_ret) pdb_edit.renumber_residues(_structure_pdb_tmp_stage2, structure_pdb_ret) # ================================== # Checks and validations # ================================== # Extract some information from each PDB structure file _model_data = list(self._pdb_info(model_pdb_ret)) _structure_data = list(self._pdb_info(structure_pdb_ret)) # Make sure our structures contain the same residues with correct indeces if set(_model_data) != set(_structure_data): msg = "Residues in model 
and structure non-identical. Affected PDBs {0} - {1}".format( model_name, structure_name) logger.critical(msg) raise RuntimeError(msg) # Remove the temporary files for f in [ _model_pdb_tmp_stage1, _model_pdb_tmp_stage2, _structure_pdb_tmp_stage1, _structure_pdb_tmp_stage2 ]: os.unlink(f) return model_pdb_ret, structure_pdb_ret
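# ---------------------------------------------------------------------------
# Illustrative sketch (not taken from the source above): the gap bookkeeping
# used in _mod_structures, reduced to plain lists. The toy alignments and the
# function name are invented; in the real code the deletions are applied to
# PDB files via pdb_edit.select_residues after renumbering.
# ---------------------------------------------------------------------------
def _demo_even_out_by_gaps():
    model_aln = "AC-DE"
    structure_aln = "A-CDE"
    # _find_gaps-style flags: True where the aligned sequence has a gap
    model_gaps = [ch == '-' for ch in model_aln]
    structure_gaps = [ch == '-' for ch in structure_aln]
    # 1-based alignment positions that are gaps in each sequence
    model_gap_idx = [i + 1 for i, g in enumerate(model_gaps) if g]          # [3]
    structure_gap_idx = [i + 1 for i, g in enumerate(structure_gaps) if g]  # [2]
    # Alignment positions actually occupied by a residue in each structure
    model_res = [i + 1 for i, g in enumerate(model_gaps) if not g]          # [1, 2, 4, 5]
    structure_res = [i + 1 for i, g in enumerate(structure_gaps) if not g]  # [1, 3, 4, 5]
    # Deleting the *other* sequence's gap positions leaves both structures
    # with the same set of residues, ready for a residue-by-residue comparison
    model_kept = [r for r in model_res if r not in structure_gap_idx]       # [1, 4, 5]
    structure_kept = [r for r in structure_res if r not in model_gap_idx]   # [1, 4, 5]
    return model_kept, structure_kept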