Example #1
0
def _OperationToMat4(op):
    """Return the 4x4 matrix ``translation * rotation`` for one operation.

    :param op: object exposing ``rotation`` and ``translation`` attributes
    :returns: the product of the translation matrix and the rotation matrix
    """
    rot = geom.Mat4()
    rot.PasteRotation(op.rotation)
    trans = geom.Mat4()
    trans.PasteTranslation(op.translation)
    return trans * rot


def _PDBize(biounit,
            asu,
            seqres=None,
            min_polymer_size=10,
            transformation=False):
    """Assemble a biounit from *asu* with :class:`mol.alg.PDBize`.

    For every chain interval of *biounit* the matching slice of operation
    lists is turned into transformation matrices. If an interval has more
    than one operation list, the lists are combined all-against-all: every
    matrix accumulated so far is multiplied with every matrix of the next
    list. The chains of the interval are then selected from *asu* and added
    to the PDBizer together with the resulting matrices.

    :param biounit: object providing ``GetChainList()``,
                    ``GetChainIntervalList()``,
                    ``GetOperationsIntervalList()`` and ``GetOperations()``
    :param asu: entity/view the chains are selected from (via ``Select``)
    :param seqres: sequence list handed to ``PDBize.Add``; an empty sequence
                   list is created when this is ``None`` or otherwise falsy
    :param min_polymer_size: forwarded to the ``mol.alg.PDBize`` constructor
    :param transformation: forwarded to ``PDBize.Finish``; if true, the
                           transformation matrix of the result is returned
                           as well
    :returns: the result of ``PDBize.Finish``, or a tuple of that result and
              ``result.GetTransform().GetMatrix()`` when *transformation* is
              true
    """
    pdbizer = mol.alg.PDBize(min_polymer_size=min_polymer_size)

    chains = biounit.GetChainList()
    c_intvls = biounit.GetChainIntervalList()
    o_intvls = biounit.GetOperationsIntervalList()
    ss = seqres if seqres else seq.CreateSequenceList()
    operations = biounit.GetOperations()

    # Chain intervals and operation intervals are indexed in lockstep.
    for i in range(len(c_intvls)):
        trans_matrices = geom.Mat4List()
        l_operations = operations[o_intvls[i][0]:o_intvls[i][1]]
        if len(l_operations) > 0:
            # Seed the product with the first list of operations ...
            for op in l_operations[0]:
                trans_matrices.append(_OperationToMat4(op))
            # ... then multiply in every further list. The outer loop runs
            # over the new operations and the inner loop over the matrices
            # accumulated so far, which fixes the order of the result.
            for op_n in range(1, len(l_operations)):
                tmp_ops = geom.Mat4List()
                for op in l_operations[op_n]:
                    tr = _OperationToMat4(op)
                    for t_o in trans_matrices:
                        tmp_ops.append(t_o * tr)
                trans_matrices = tmp_ops
        # select chains into a view as basis for each transformation
        chain_names = chains[c_intvls[i][0]:c_intvls[i][1]]
        query = 'cname=' + ','.join(
            mol.QueryQuoteName(name) for name in chain_names)
        assu = asu.Select(query)
        pdbizer.Add(assu, trans_matrices, ss)

    pdb_bu = pdbizer.Finish(transformation)
    if transformation:
        return pdb_bu, pdb_bu.GetTransform().GetMatrix()
    return pdb_bu
Example #2
0
 def test_starts_from_last_water_rnum(self):
     """Add the same one-water entity twice and check the water numbering.

     The result is expected to contain a single chain named "-" whose two
     residues carry the numbers 1A and 1B.
     """
     # Entity made of one water chain holding a single HOH residue.
     ent = mol.CreateEntity()
     editor = ent.EditXCS(mol.BUFFERED_EDIT)
     water_chain = editor.InsertChain("A")
     editor.SetChainType(water_chain, mol.CHAINTYPE_WATER)
     editor.AppendResidue(water_chain, "HOH")

     # One default-constructed matrix and an empty sequence list.
     pdbizer = mol.alg.PDBize()
     matrices = geom.Mat4List()
     matrices.append(geom.Mat4())
     seqres = seq.CreateSequenceList()

     # Feed the complete selection in twice.
     for _ in range(2):
         pdbizer.Add(ent.Select(''), matrices, seqres)
     result = pdbizer.Finish()

     chain_names = [chain.name for chain in result.chains]
     self.assertEqual(chain_names, ["-"])

     residue_numbers = [res.number for res in result.residues]
     expected_numbers = [mol.ResNum(1, 'A'), mol.ResNum(1, 'B')]
     self.assertEqual(residue_numbers, expected_numbers)
Example #3
0
 def testSeqListSlice(self):
     """Check slicing of a four-element sequence list.

     Exercises ``sl[1:]``, ``sl[:-1]`` and ``sl[-1:]`` and compares the
     length and the string form of every element with the expected
     sequences.
     """
     handles = [seq.CreateSequence(name, letters)
                for name, letters in (('A', 'aaaa'), ('B', 'bbbb'),
                                      ('C', 'cccc'), ('D', 'dddd'))]
     first, second, third, fourth = handles
     sl = seq.CreateSequenceList(*handles)

     # (slice to apply, sequences expected in the resulting list)
     cases = (
         (slice(1, None), (second, third, fourth)),
         (slice(None, -1), (first, second, third)),
         (slice(-1, None), (fourth,)),
     )
     for window, expected in cases:
         sliced = sl[window]
         self.assertEqual(len(sliced), len(expected))
         for pos, handle in enumerate(expected):
             self.assertEqual(str(sliced[pos]), str(handle))
Example #4
0
 def test_numbers_water_molecules_with_ins_codes(self):
     """Check the numbering of 27 waters after running them through PDBize.

     The first 26 residues are expected to have number 1 with insertion
     codes 'A' to 'Z'; the 27th is expected to be number 2 with insertion
     code 'A'. The result must consist of a single chain named "-".
     """
     water_count = 27
     letters_per_number = 26

     # Entity made of one water chain holding 27 HOH residues.
     ent = mol.CreateEntity()
     editor = ent.EditXCS(mol.BUFFERED_EDIT)
     water_chain = editor.InsertChain("A")
     editor.SetChainType(water_chain, mol.CHAINTYPE_WATER)
     for _ in range(water_count):
         editor.AppendResidue(water_chain, "HOH")

     # One default-constructed matrix and an empty sequence list.
     pdbizer = mol.alg.PDBize()
     matrices = geom.Mat4List()
     matrices.append(geom.Mat4())
     seqres = seq.CreateSequenceList()
     pdbizer.Add(ent.Select(''), matrices, seqres)
     result = pdbizer.Finish()

     chain_names = [chain.name for chain in result.chains]
     self.assertEqual(chain_names, ["-"])

     residues = result.residues
     for idx in range(water_count):
         # idx 0..25 -> (0, 0..25); idx 26 -> (1, 0)
         wraps, letter = divmod(idx, letters_per_number)
         number = residues[idx].number
         self.assertEqual(number.num, wraps + 1)
         self.assertEqual(number.ins_code, chr(ord('A') + letter))
Example #5
0
def _ParseOutput(tmp_dir_name):
    """Parse the result files in *tmp_dir_name* into a list of clusters.

    Three files are read from the directory:

    - ``headers.dmp``: one entry per line, a numeric id followed by a header;
      '>' characters at either end of the header are stripped
    - ``clusters.dmp``: after a first line that is skipped, one entry per
      line, the numeric id of a sequence followed by the numeric id of the
      representative of its cluster
    - ``fastadb.fasta``: the sequences, looked up by header

    Blank lines in the two ``.dmp`` files are ignored.

    :param tmp_dir_name: directory containing the three files
    :returns: list with one ``cluster`` object per representative, in order
              of first appearance in ``clusters.dmp``. Each is constructed
              from the sequence list of the cluster members and the header
              of the representative.
    """
    with open(os.path.join(tmp_dir_name, 'headers.dmp'), 'r') as f:
        header_lines = f.readlines()
    with open(os.path.join(tmp_dir_name, 'clusters.dmp'), 'r') as f:
        # the first line carries no id pair and is dropped
        cluster_lines = f.readlines()[1:]
    sequences = io.LoadSequenceList(os.path.join(tmp_dir_name,
                                                 'fastadb.fasta'))

    # numeric id -> header (without surrounding '>')
    header_mapper = dict()
    for line in header_lines:
        fields = line.split()
        if not fields:
            continue
        header_mapper[int(fields[0])] = fields[1].strip('>')

    # Assign every sequence to its cluster, where the cluster id is the id
    # of the representative. The dict keeps the representatives in order of
    # first appearance, so no separate de-duplication pass is required.
    clusters = dict()
    for line in cluster_lines:
        fields = line.split()
        if not fields:
            continue
        member_id = int(fields[0])
        representative_id = int(fields[1])
        if representative_id not in clusters:
            clusters[representative_id] = seq.CreateSequenceList()
        clusters[representative_id].AddSequence(
            sequences.FindSequence(header_mapper[member_id]))

    # translate into final output
    return [cluster(members, header_mapper[representative_id])
            for representative_id, members in clusters.items()]
Example #6
0
def ClustalW(seq1,
             seq2=None,
             clustalw=None,
             keep_files=False,
             nopgap=False,
             clustalw_option_string=False):
    '''
  Runs a ClustalW multiple sequence alignment. The results are returned as a
  :class:`~ost.seq.AlignmentHandle` instance.

  There are two ways to use this function:

   - align exactly two sequences:

      :param seq1: sequence_one
      :type seq1: :class:`~ost.seq.SequenceHandle` or :class:`str`

      :param seq2: sequence_two
      :type seq2: :class:`~ost.seq.SequenceHandle` or :class:`str`

      The two sequences can be specified as two separate function parameters
      (`seq1`, `seq2`). The type of both parameters can be either
      :class:`~ost.seq.SequenceHandle` or :class:`str`, but must be the same for
      both parameters.

   - align two or more sequences:

      :param seq1: sequence_list
      :type seq1: :class:`~ost.seq.SequenceList`

      :param seq2: must be :class:`None`

      Two or more sequences can be specified by using a
      :class:`~ost.seq.SequenceList`. It is then passed as the first function
      parameter (`seq1`). The second parameter (`seq2`) must be :class:`None`.


  :param clustalw: path to ClustalW executable (used in :func:`~ost.settings.Locate`)
  :type clustalw: :class:`str`
  :param nopgap: turn residue-specific gaps off
  :type nopgap: :class:`bool`
  :param clustalw_option_string: additional ClustalW flags (see http://www.clustal.org/download/clustalw_help.txt)
  :type clustalw_option_string: :class:`str`
  :param keep_files: do not delete temporary files
  :type keep_files: :class:`bool`

  :returns: the alignment, or :class:`None` (after logging an error) if the
            input sequences were not passed in one of the two supported ways
  :raises ValueError: if the sequence names are not unique

  .. note ::

    - In the passed sequences ClustalW will convert lowercase to uppercase, and
      change all '.' to '-'. OST will convert '?' to 'X' before aligning
      sequences with ClustalW.
    - If a :attr:`sequence name <ost.seq.SequenceHandle.name>` contains spaces,
      only the part before the space is considered as sequence name. To avoid
      surprises, you should remove spaces from the sequence name.
    - Sequence names must be unique (:class:`ValueError` exception raised
      otherwise).

  ClustalW will accept only IUB/IUPAC amino acid and nucleic acid codes:

  ======= ======================= ======= ============================
  Residue  Name                   Residue  Name
  ======= ======================= ======= ============================
     A    alanine                    P    proline
     B    aspartate or asparagine    Q    glutamine
     C    cystine                    R    arginine
     D    aspartate                  S    serine
     E    glutamate                  T    threonine
     F    phenylalanine              U    selenocysteine
     G    glycine                    V    valine
     H    histidine                  W    tryptophan
     I    isoleucine                 Y    tyrosine
     K    lysine                     Z    glutamate or glutamine
     L    leucine                    X    any
     M    methionine                 \\*   translation stop
     N    asparagine                 \\-   gap of indeterminate length
  ======= ======================= ======= ============================

  '''
    # Local import: only needed to split the user supplied option string.
    import shlex

    clustalw_path = settings.Locate(('clustalw', 'clustalw2'),
                                    explicit_file_name=clustalw)

    if seq2 is not None:
        if isinstance(seq1, seq.SequenceHandle) and isinstance(
                seq2, seq.SequenceHandle):
            seq_list = seq.CreateSequenceList()
            seq_list.AddSequence(seq1)
            seq_list.AddSequence(seq2)
        elif isinstance(seq1, str) and isinstance(seq2, str):
            seqh1 = seq.CreateSequence("seq1", seq1)
            seqh2 = seq.CreateSequence("seq2", seq2)
            seq_list = seq.CreateSequenceList()
            seq_list.AddSequence(seqh1)
            seq_list.AddSequence(seqh2)
        else:
            LogError("WARNING: Specify at least two Sequences")
            return
    elif isinstance(seq1, seq.SequenceList):
        seq_list = seq1
    else:
        LogError(
            "WARNING: Specify either two SequenceHandles or one SequenceList")
        return

    sequence_names = set()
    for s in seq_list:
        # we cut out anything after a space to be consistent with ClustalW behaviour
        sequence_names.add(s.GetName().split(' ')[0])
    if len(sequence_names) < len(seq_list):
        raise ValueError(
            "ClustalW can only process sequences with unique identifiers!")

    # Work on copies so the caller's sequences stay untouched when '?' is
    # replaced by 'X'.
    new_list = seq.CreateSequenceList()
    for s in seq_list:
        ss = s.Copy()
        for i, c in enumerate(ss):
            if c == '?':
                ss[i] = 'X'
        new_list.AddSequence(ss)

    seq_list = new_list

    temp_dir = utils.TempDirWithFiles((seq_list, ))
    try:
        out = os.path.join(temp_dir.dirname, 'out.fasta')
        # Pass the arguments as a list (no shell) so that spaces or shell
        # metacharacters in the executable or file paths cannot break or
        # alter the command.
        command = [
            clustalw_path,
            '-infile=%s' % temp_dir.files[0],
            '-output=fasta',
            '-outfile=%s' % out,
        ]
        if nopgap:
            command.append('-nopgap')
        if clustalw_option_string:
            # see useful flags: http://toolkit.tuebingen.mpg.de/clustalw/help_params
            command.extend(shlex.split(clustalw_option_string))

        subprocess.run(command, stdout=subprocess.DEVNULL)

        aln = io.LoadAlignment(out)
    finally:
        # Remove the temporary files even if ClustalW or the loading failed.
        if not keep_files:
            temp_dir.Cleanup()

    # Map the names found in the alignment to their index; the first
    # occurrence wins.
    aln_index = dict()
    for seq_num, aln_seq in enumerate(aln.sequences):
        aln_index.setdefault(aln_seq.GetName(), seq_num)

    # Transfer offsets and attached views from the input sequences.
    for sequence in seq_list:
        full_name = sequence.GetName()
        seq_num = aln_index.get(full_name)
        if seq_num is None:
            # ClustalW only keeps the part of the name before the first space
            seq_num = aln_index.get(full_name.split(' ')[0])
        if seq_num is None:
            # Do not attach the data to an unrelated sequence.
            LogError("WARNING: Sequence '%s' not found in ClustalW output" %
                     full_name)
            continue
        aln.SetSequenceOffset(seq_num, sequence.offset)
        if sequence.HasAttachedView():
            aln.AttachView(seq_num, sequence.GetAttachedView().Copy())

    return aln