def _OperationToMat4(operation):
    """Return the 4x4 matrix ``translation * rotation`` for *operation*.

    :param operation: object exposing ``rotation`` and ``translation``
                      attributes that can be pasted into a ``geom.Mat4``.
    :returns: the product of the translation matrix and the rotation matrix.
    """
    rot = geom.Mat4()
    rot.PasteRotation(operation.rotation)
    trans = geom.Mat4()
    trans.PasteTranslation(operation.translation)
    return trans * rot


def _PDBize(biounit, asu, seqres=None, min_polymer_size=10,
            transformation=False):
    """Build a PDBized entity for *biounit* from the asymmetric unit *asu*.

    For every chain interval of the biounit, the transformation matrices of
    the matching operation interval are assembled, the affected chains are
    selected from *asu* and both are handed to ``mol.alg.PDBize``.

    :param biounit: object providing ``GetChainList``,
                    ``GetChainIntervalList``, ``GetOperationsIntervalList``
                    and ``GetOperations``.
    :param asu: entity the chains are selected from.
    :param seqres: sequence list passed on to the PDBizer; an empty sequence
                   list is created when this is ``None`` or otherwise falsy.
    :param min_polymer_size: forwarded to ``mol.alg.PDBize``.
    :param transformation: forwarded to ``PDBize.Finish``; when true, the
                           matrix of the result's transform is returned too.
    :returns: the PDBized entity, or the tuple ``(entity, matrix)`` if
              *transformation* is true.
    """
    pdbizer = mol.alg.PDBize(min_polymer_size=min_polymer_size)

    chains = biounit.GetChainList()
    c_intvls = biounit.GetChainIntervalList()
    o_intvls = biounit.GetOperationsIntervalList()
    ss = seqres
    if not ss:
        ss = seq.CreateSequenceList()

    # Create the list of operations.
    # For cartesian products, operations are stored in a list, multiplied
    # with the next list of operations and re-stored... until all lists of
    # operations are multiplied in an all-against-all manner.
    operations = biounit.GetOperations()
    for i in range(len(c_intvls)):
        trans_matrices = geom.Mat4List()
        l_operations = operations[o_intvls[i][0]:o_intvls[i][1]]
        if len(l_operations) > 0:
            # Seed the product with the first list of operations.
            for op in l_operations[0]:
                trans_matrices.append(_OperationToMat4(op))
            # Multiply every accumulated matrix with every operation of
            # each following list. The new operation drives the outer loop
            # so the ordering of the resulting matrices is kept stable.
            for op_n in range(1, len(l_operations)):
                tmp_ops = geom.Mat4List()
                for op in l_operations[op_n]:
                    tr = _OperationToMat4(op)
                    for t_o in trans_matrices:
                        tmp_ops.append(t_o * tr)
                trans_matrices = tmp_ops
        # Select chains into a view as basis for each transformation.
        chain_names = chains[c_intvls[i][0]:c_intvls[i][1]]
        assu = asu.Select(
            'cname=' + ','.join(mol.QueryQuoteName(name)
                                for name in chain_names))
        pdbizer.Add(assu, trans_matrices, ss)

    pdb_bu = pdbizer.Finish(transformation)
    if transformation:
        return pdb_bu, pdb_bu.GetTransform().GetMatrix()
    return pdb_bu
def test_starts_from_last_water_rnum(self):
    """Add the same single-water view twice and check the numbering.

    Both waters are expected in one chain named "-" with the residue
    numbers 1A and 1B.
    """
    ent = mol.CreateEntity()
    editor = ent.EditXCS(mol.BUFFERED_EDIT)
    water_chain = editor.InsertChain("A")
    editor.SetChainType(water_chain, mol.CHAINTYPE_WATER)
    editor.AppendResidue(water_chain, "HOH")

    pdbizer = mol.alg.PDBize()
    single_transform = geom.Mat4List()
    single_transform.append(geom.Mat4())
    no_seqres = seq.CreateSequenceList()

    # The second Add() is what this test is about.
    for _ in range(2):
        pdbizer.Add(ent.Select(''), single_transform, no_seqres)
    result = pdbizer.Finish()

    chain_names = [chain.name for chain in result.chains]
    self.assertEqual(chain_names, ["-"])

    residue_numbers = [res.number for res in result.residues]
    expected_numbers = [mol.ResNum(1, 'A'), mol.ResNum(1, 'B')]
    self.assertEqual(residue_numbers, expected_numbers)
def testSeqListSlice(self):
    """Check that slicing a sequence list yields the expected sequences."""
    first = seq.CreateSequence('A', 'aaaa')
    second = seq.CreateSequence('B', 'bbbb')
    third = seq.CreateSequence('C', 'cccc')
    fourth = seq.CreateSequence('D', 'dddd')
    seq_list = seq.CreateSequenceList(first, second, third, fourth)

    def check_slice(sliced, expected):
        # Compare length first, then every entry by its string form.
        self.assertEqual(len(sliced), len(expected))
        for idx, wanted in enumerate(expected):
            self.assertEqual(str(sliced[idx]), str(wanted))

    check_slice(seq_list[1:], [second, third, fourth])
    check_slice(seq_list[:-1], [first, second, third])
    check_slice(seq_list[-1:], [fourth])
def test_numbers_water_molecules_with_ins_codes(self):
    """PDBize 27 waters and check number / insertion code of each one.

    The first 26 residues are expected as number 1 with insertion codes
    'A' to 'Z'; the 27th is expected as number 2 with insertion code 'A'.
    """
    ent = mol.CreateEntity()
    editor = ent.EditXCS(mol.BUFFERED_EDIT)
    water_chain = editor.InsertChain("A")
    editor.SetChainType(water_chain, mol.CHAINTYPE_WATER)
    for _ in range(27):
        editor.AppendResidue(water_chain, "HOH")

    pdbizer = mol.alg.PDBize()
    single_transform = geom.Mat4List()
    single_transform.append(geom.Mat4())
    no_seqres = seq.CreateSequenceList()
    pdbizer.Add(ent.Select(''), single_transform, no_seqres)
    result = pdbizer.Finish()

    chain_names = [chain.name for chain in result.chains]
    self.assertEqual(chain_names, ["-"])

    residues = result.residues
    first_code = ord('A')
    for idx in range(26):
        res_num = residues[idx].number
        self.assertEqual(res_num.num, 1)
        self.assertEqual(res_num.ins_code, chr(first_code + idx))

    # One residue past the 26 available insertion codes.
    wrapped = residues[26].number
    self.assertEqual(wrapped.num, 2)
    self.assertEqual(wrapped.ins_code, 'A')
def _ParseOutput(tmp_dir_name):
    """Parse the clustering result files found in *tmp_dir_name*.

    Three files are read from the directory:

    - ``headers.dmp``: one line per sequence, a numeric id followed by the
      sequence header (surrounding ``>`` characters are removed).
    - ``clusters.dmp``: the first line is skipped; every other line holds a
      numeric sequence id followed by the numeric id of the representative
      of the cluster that sequence belongs to.
    - ``fastadb.fasta``: the sequences, looked up by header.

    :param tmp_dir_name: directory containing the three files.
    :type tmp_dir_name: :class:`str`
    :returns: a :class:`list` with one ``cluster`` object per distinct
              representative, in order of first appearance in
              ``clusters.dmp``. Each one is built from the sequence list
              of the cluster and the header of its representative.
    """
    with open(os.path.join(tmp_dir_name, 'headers.dmp'), 'r') as f:
        header_data = f.readlines()

    with open(os.path.join(tmp_dir_name, 'clusters.dmp'), 'r') as f:
        cluster_data = f.readlines()

    sequences = io.LoadSequenceList(os.path.join(tmp_dir_name,
                                                 'fastadb.fasta'))

    # Map numeric id -> sequence header.
    header_mapper = dict()
    for line in header_data:
        fields = line.split()
        if not fields:
            # Tolerate blank lines (e.g. a trailing newline).
            continue
        header_mapper[int(fields[0])] = fields[1].strip('>')

    # Assign every header to its corresponding cluster, where the cluster id
    # is given by the id of the representative of the cluster. Dicts keep
    # insertion order, so clusters come out in order of first appearance.
    clusters = dict()
    for line in cluster_data[1:]:
        fields = line.split()
        if not fields:
            continue
        member_id = int(fields[0])
        representative_id = int(fields[1])
        if representative_id not in clusters:
            clusters[representative_id] = seq.CreateSequenceList()
        clusters[representative_id].AddSequence(
            sequences.FindSequence(header_mapper[member_id]))

    # Translate into final output.
    return [cluster(members, header_mapper[representative_id])
            for representative_id, members in clusters.items()]
def ClustalW(seq1, seq2=None, clustalw=None, keep_files=False, nopgap=False,
             clustalw_option_string=False):
    '''
    Runs a ClustalW multiple sequence alignment. The results are returned as a
    :class:`~ost.seq.AlignmentHandle` instance.

    There are two ways to use this function:

    - align exactly two sequences:

        :param seq1: sequence_one
        :type seq1: :class:`~ost.seq.SequenceHandle` or :class:`str`

        :param seq2: sequence_two
        :type seq2: :class:`~ost.seq.SequenceHandle` or :class:`str`

        The two sequences can be specified as two separate function
        parameters (`seq1`, `seq2`). The type of both parameters can be
        either :class:`~ost.seq.SequenceHandle` or :class:`str`, but must be
        the same for both parameters.

    - align two or more sequences:

        :param seq1: sequence_list
        :type seq1: :class:`~ost.seq.SequenceList`

        :param seq2: must be :class:`None`

        Two or more sequences can be specified by using a
        :class:`~ost.seq.SequenceList`. It is then passed as the first
        function parameter (`seq1`). The second parameter (`seq2`) must be
        :class:`None`.

    :param clustalw: path to ClustalW executable (used in
                     :func:`~ost.settings.Locate`)
    :type clustalw: :class:`str`
    :param nopgap: turn residue-specific gaps off
    :type nopgap: :class:`bool`
    :param clustalw_option_string: additional ClustalW flags (see
                                   http://www.clustal.org/download/clustalw_help.txt)
    :type clustalw_option_string: :class:`str`
    :param keep_files: do not delete temporary files
    :type keep_files: :class:`bool`

    :returns: the alignment, or :class:`None` (after logging an error) if the
              combination of `seq1` and `seq2` is not one of the two
              supported forms.
    :raises: :class:`ValueError` if sequence names are not unique.

    .. note ::

      - In the passed sequences ClustalW will convert lowercase to uppercase,
        and change all '.' to '-'. OST will convert any '?' to 'X' before
        aligning sequences with ClustalW.
      - If a :attr:`sequence name <ost.seq.SequenceHandle.name>` contains
        spaces, only the part before the space is considered as sequence
        name. To avoid surprises, you should remove spaces from the sequence
        name.
      - Sequence names must be unique (:class:`ValueError` exception raised
        otherwise).

    ClustalW will accept only IUB/IUPAC amino acid and nucleic acid codes:

    ======= ======================= ======= ============================
    Residue  Name                   Residue  Name
    ======= ======================= ======= ============================
       A    alanine                    P    proline
       B    aspartate or asparagine    Q    glutamine
       C    cysteine                   R    arginine
       D    aspartate                  S    serine
       E    glutamate                  T    threonine
       F    phenylalanine              U    selenocysteine
       G    glycine                    V    valine
       H    histidine                  W    tryptophan
       I    isoleucine                 Y    tyrosine
       K    lysine                     Z    glutamate or glutamine
       L    leucine                    X    any
       M    methionine                 \\*   translation stop
       N    asparagine                 \\-   gap of indeterminate length
    ======= ======================= ======= ============================
    '''
    clustalw_path = settings.Locate(('clustalw', 'clustalw2'),
                                    explicit_file_name=clustalw)

    if seq2 is not None:
        if (isinstance(seq1, seq.SequenceHandle)
                and isinstance(seq2, seq.SequenceHandle)):
            seq_list = seq.CreateSequenceList()
            seq_list.AddSequence(seq1)
            seq_list.AddSequence(seq2)
        elif isinstance(seq1, str) and isinstance(seq2, str):
            seqh1 = seq.CreateSequence("seq1", seq1)
            seqh2 = seq.CreateSequence("seq2", seq2)
            seq_list = seq.CreateSequenceList()
            seq_list.AddSequence(seqh1)
            seq_list.AddSequence(seqh2)
        else:
            LogError("WARNING: Specify at least two Sequences")
            return
    elif isinstance(seq1, seq.SequenceList):
        seq_list = seq1
    else:
        LogError(
            "WARNING: Specify either two SequenceHandles or one SequenceList")
        return

    # We cut out anything after a space to be consistent with ClustalW
    # behaviour.
    sequence_names = {s.GetName().split(' ')[0] for s in seq_list}
    if len(sequence_names) < len(seq_list):
        raise ValueError(
            "ClustalW can only process sequences with unique identifiers!")

    # Work on copies so the caller's sequences are not modified by the
    # '?' -> 'X' replacement.
    new_list = seq.CreateSequenceList()
    for s in seq_list:
        ss = s.Copy()
        for i, c in enumerate(ss):
            if c == '?':
                ss[i] = 'X'
        new_list.AddSequence(ss)
    seq_list = new_list

    temp_dir = utils.TempDirWithFiles((seq_list, ))
    try:
        out = os.path.join(temp_dir.dirname, 'out.fasta')
        command = '%s -infile="%s" -output=fasta -outfile="%s"' % (
            clustalw_path, temp_dir.files[0], out)
        if nopgap:
            command += " -nopgap"
        # Truthiness check: the default (False), None and '' all mean
        # "no extra options".
        if clustalw_option_string:
            # see useful flags:
            # http://toolkit.tuebingen.mpg.de/clustalw/help_params
            command = command + " " + clustalw_option_string

        # NOTE(review): the command is executed through the shell and
        # clustalw_option_string is appended verbatim; only pass trusted
        # option strings.
        subprocess.run(command, shell=True, stdout=subprocess.DEVNULL)

        aln = io.LoadAlignment(out)

        # Carry offsets and attached views over to the aligned sequences.
        for sequence in seq_list:
            full_name = sequence.GetName()
            # ClustalW only keeps the part of the name before the first
            # space, so accept that form as well. Uniqueness of the
            # truncated names was verified above.
            short_name = full_name.split(' ')[0]
            for seq_num, aln_seq in enumerate(aln.sequences):
                if aln_seq.GetName() in (full_name, short_name):
                    break
            else:
                # No matching row: do not touch an unrelated sequence.
                LogError("WARNING: Sequence '%s' not found in ClustalW "
                         "output" % full_name)
                continue
            aln.SetSequenceOffset(seq_num, sequence.offset)
            if sequence.HasAttachedView():
                aln.AttachView(seq_num, sequence.GetAttachedView().Copy())
    finally:
        # Also clean up when ClustalW or the alignment loading failed.
        if not keep_files:
            temp_dir.Cleanup()

    return aln