def build_graph_from_secstruct(self, secstruct, domain, chainid=None, ignore_insertion_codes=False):
        """
        Build the list of nodes from the supplied PTSecStruct object.

        A PTNodeHelix or PTNodeStrand is created for every helix or strand
        whose first residue is in the supplied domain (and in the requested
        chain, if one is given).  The nodes are grouped per chain and sorted,
        chains with nothing that will be drawn are dropped, hydrogen bond
        and bridge edges are added, and finally the sequential numbering
        map is built.

        Parameters:
            secstruct - PTSecStruct (ptsecstruct.py) object to build from
            domain - PTDomain (ptdomain.py) object listing the segment(s)
                     that make up this domain (only one domain processed at a
                     time).
                     Only is_in_domain() and domainid are used here. Any SSE
                     that crosses a domain boundary must already have been
                     moved entirely into one domain; this is asserted.
            chainid - chain identifier to build graph for only this chain,
                      or None for all chains (default)
            ignore_insertion_codes - If True, a hack to make it work with
                      PMML (only) which does not report insertion codes
                      unlike DSSP and STRIDE

        Uses member data (write):
            secstruct - keeps a pointer to the supplied secstruct
            residue_list - list of ALL residues (not just this domain)
            pdb_resid_dict - dict of { (chainid, pdb_resid) : index in
                          residue_list }
            chain_dict - dict of { chainid : node_list } where node_list is
                          list of nodes in order, built in this function
            seqnum2node - dict of { seqnum : PTNode }, numbered from 1 and
                          NOT restarting for each chain

          (readonly):
            pdb_struct - The Bio.PDB parsed PDB struct (atomic co-ordinates)
                         for this protein.
            include_310_helices, include_pi_helices - if true, include
                         these kinds of helices.

        Raises exceptions:
           NoSSE_Exception if no helices or strands found in the domain
           AssertionError if an SSE spans two chains or ends outside the
                          domain

        Return value:
            None.

        """

        self.secstruct = secstruct

        helix_num = 1
        strand_num = 1

        num_helices_in_domain = 0
        num_strands_in_domain = 0

        #
        # Build dictionary mapping (chainid, pdb_resid) to index in residue_list
        # for ALL residues, not just those in this domain.
        #
        self.residue_list = self.get_residue_list(self.pdb_struct, PTDomain(None, None))
        self.pdb_resid_dict = {}
        for (seq_indx, residue) in enumerate(self.residue_list):
            resid_key = (
                ptsecstruct.pdb_chainid_to_stride_chainid(residue.get_full_id()[2]),
                biopdbresid_to_pdbresseq(residue.get_id(), ignore_insertion_codes),
            )
            self.pdb_resid_dict[resid_key] = seq_indx

        # Note that now we are only adding elements in the supplied domain,
        # so the so-called 'chains' may really be segments, i.e. subsequences
        # of chains (rest of chain may be in other domain(s)

        self.chain_dict = {}  # dict of {chainid : node_list}

        for (start_chainid, start_resnum, end_chainid, end_resnum, helixtype) in secstruct.helix_list:
            assert start_chainid == end_chainid  # helix must be same chain
            if chainid and chainid != start_chainid:
                continue  # chainid specified, skip ones not in that chain
            # will consider structures in domain if first residue is in domain
            if not domain.is_in_domain(start_chainid, get_int_icode(start_resnum)[0]):
                continue

            # NB counted before the helix kind is examined, so a pi or 310
            # helix that is skipped below still counts as a helix in the
            # domain for the purposes of the NoSSE_Exception check.
            num_helices_in_domain += 1
            if helixtype == "H":
                idprefix = "ALPHAHELIX_"
                htype = "ALPHA"
            elif helixtype == "I":
                if not self.include_pi_helices:
                    continue
                idprefix = "PIHELIX_"
                htype = "PI"
            elif helixtype == "G":
                if not self.include_310_helices:
                    continue
                idprefix = "310HELIX_"
                htype = "310"
            else:  # shouldn't happen
                sys.stderr.write("ERROR: bad helix type " + helixtype + "\n")
                # Skip it: there is no id prefix or type to build a node
                # with (using the previous helix's values would create a
                # mislabelled node with a duplicate number).
                continue

            # helix numbers are only consumed by helices that get a node
            this_helix_num = helix_num
            helix_num += 1

            ah_node = PTNodeHelix(
                htype,
                idprefix + start_chainid + "_" + str(this_helix_num),
                this_helix_num,
                start_resnum,
                end_resnum,
                start_chainid,
                domain.domainid,
                self.residue_list,
                self.pdb_resid_dict,
            )
            self.chain_dict.setdefault(start_chainid, []).append(ah_node)

            # we must already have handled the case of SSEs that cross
            # domain boundaries (by moving whole SSE to one of the domains)
            assert domain.is_in_domain(end_chainid, get_int_icode(end_resnum)[0])

        for (start_chainid, start_resnum, end_chainid, end_resnum) in secstruct.strand_list:
            assert start_chainid == end_chainid  # must be in same chain
            if chainid and chainid != start_chainid:
                continue  # chainid specified, skip ones not in that chain
            if not domain.is_in_domain(start_chainid, get_int_icode(start_resnum)[0]):
                continue

            num_strands_in_domain += 1
            bs_node = PTNodeStrand(
                "STRAND_" + start_chainid + "_" + str(strand_num),
                strand_num,
                start_resnum,
                end_resnum,
                start_chainid,
                domain.domainid,
                self.residue_list,
                self.pdb_resid_dict,
            )
            strand_num += 1
            chain_nodes = self.chain_dict.setdefault(start_chainid, [])

            # we must already have handled the case of SSEs that cross
            # domain boundaries (by moving whole SSE to one of the domains)
            assert domain.is_in_domain(end_chainid, get_int_icode(end_resnum)[0])
            chain_nodes.append(bs_node)

        # raise an exception if there are no SSEs at all in this domain
        if num_helices_in_domain == 0 and num_strands_in_domain == 0:
            raise NoSSE_Exception

        # list of chainids to delete from chain_dict (don't delete while
        # iterating over it)
        delete_chainid_list = []
        for (node_chainid, nodelist) in self.chain_dict.iteritems():
            # sort in order of start residue id ascending (all must be disjoint)
            nodelist.sort()

            if len(nodelist) < 1:
                # There are no SSEs in this chain, get rid of it.
                sys.stderr.write("WARNING: no SSEs in chain " + node_chainid + "; chain ignored\n")
                delete_chainid_list.append(node_chainid)
                continue

            # Check for chain with only SSEs that will not be drawn
            # (i.e. pi or 310 helices), and delete those too
            found_useful_node = False
            for ptnode in nodelist:
                if isinstance(ptnode, PTNodeStrand):
                    found_useful_node = True
                    break
                if isinstance(ptnode, PTNodeHelix):
                    node_type = ptnode.get_type()
                    if (
                        node_type == "ALPHA"
                        or (node_type == "310" and self.include_310_helices)
                        or (node_type == "PI" and self.include_pi_helices)
                    ):
                        found_useful_node = True
                        break
            if not found_useful_node:
                sys.stderr.write("WARNING: only pi or 310 helices in chain " + node_chainid + "; chain ignored\n")
                delete_chainid_list.append(node_chainid)

        # delete chains from chain_dict that were marked earlier for deletion
        for node_chainid in delete_chainid_list:
            del self.chain_dict[node_chainid]

        # -------------------------------------------------------------------

        # This is needed only for labelling sheets for HH and KK codes
        # (see dfs_strands() etc. below)

        # add edges for hydrogen bonds
        # uses secstruct and chainid member data
        # these are used for determining which side bridge partners are
        # on (and also for drawing a hydrogen bond graph if requested)
        self.add_hbond_edges_from_secstruct()

        # add edges for bridge partners
        # uses secstruct and chainid member data
        self.add_bridge_edges_from_secstruct()

        # ---------------------------------------------------------------------

        # for sequential numbering, we'll build this dictionary mapping
        # sequential number (note NOT restarting for each chain)
        # to PTNode
        # so that sequential numbers as used in ptgraph2 -b sequential
        # option.
        # this is a dictionary of { seqnum : PTNode }
        def is_numbered(node):
            # Termini never get a sequential number, and neither do
            # 310 / pi helices when that kind of helix is not included.
            if isinstance(node, PTNodeTerminus):
                return False
            if isinstance(node, PTNodeHelix):
                if node.get_type() == "310" and not self.include_310_helices:
                    return False
                if node.get_type() == "PI" and not self.include_pi_helices:
                    return False
            return True

        numbered_nodes = [node for node in self.iter_nodes() if is_numbered(node)]
        self.seqnum2node = {}
        for (seqnum, node) in enumerate(numbered_nodes):
            self.seqnum2node[seqnum + 1] = node  # start at 1 not 0
# Example #2
# 0
def fixup_crossdomain_sses(secstruct, domain_list):
    """
    Find any SSEs that span a domain boundary, and put each entirely
    in one domain.
    The domain is chosen as the one that contains most of the residues
    in the SSE (the domain containing the start of the SSE if the counts
    are equal, or if no other domain contains the end of the SSE).

    Parameters:
         secstruct - PTSecStruct (ptsecstruct.py) object describing SSEs
                     (its strand_list and helix_list are read)
         domain_list - list of PTDomain objects representing all the domains
                          in this protein.
                          (in/out) NOTE: may be modified: the segment spanned
                           by a crossing SSE is removed from every domain
                           and then added to the chosen domain.
    Return value: None.
    """
    # Strand tuples are used as they are; helix tuples carry a trailing
    # helix type which is not needed here.
    sse_list = (list(secstruct.strand_list) +
                [(start_chainid, start_resnum, end_chainid, end_resnum)
                 for (start_chainid, start_resnum, end_chainid, end_resnum,
                      helix_type)
                 in secstruct.helix_list])

    for (start_chainid, start_resnum, end_chainid, end_resnum) in sse_list:
        # FIXME: this is ignoring insertion codes etc., really
        # should convert to proper sequential residue sequence numbers
        # to do this
        start_resint = get_int_icode(start_resnum)[0]
        end_resint = get_int_icode(end_resnum)[0]

        for domain in domain_list:
            crosses_boundary = (
                domain.is_in_domain(start_chainid, start_resint)
                and not domain.is_in_domain(end_chainid, end_resint))
            if not crosses_boundary:
                continue

            # This really shouldn't happen, but does: domain
            # decomposition has determined that this SSE crosses
            # a domain boundary (really our SSE decisions don't
            # match whatever domain decomposition has done).
            # We'll have to assign the SSE to
            # a domain, and add the residues it spans into that
            # domain.

            # find domain2 as the other domain the SSE is also in;
            # it stays None if no other domain contains the end residue
            # (rather than silently using an unrelated domain)
            domain2 = None
            for other_domain in domain_list:
                if other_domain == domain:
                    continue
                if other_domain.is_in_domain(end_chainid, end_resint):
                    domain2 = other_domain
                    break

            # find sse_domain as the domain with more residues of the
            # SSE in it
            domain_res_count = 0
            domain2_res_count = 0
            for resint in range(start_resint, end_resint + 1):
                if domain.is_in_domain(start_chainid, resint):
                    domain_res_count += 1
                elif (domain2 is not None and
                      domain2.is_in_domain(start_chainid, resint)):
                    domain2_res_count += 1
                else:
                    sys.stderr.write('ERROR: SSE in more than 2 domains\n')
            if domain2_res_count > domain_res_count:
                sse_domain = domain2
            else:
                sse_domain = domain  # arbitrarily domain if equal count

            # first remove the segment from where it currently is
            seg = PTSegment(start_chainid, start_resint, end_resint)
            for dom in domain_list:
                dom.remove_segment(seg)

            sys.stderr.write('WARNING: SSE ' + start_chainid + ':' +
                             start_resnum + '-' + end_resnum +
                             ' crosses domain boundary.\n'
                             '  Put in domain ' + sse_domain.domainid +
                             ' (' + str(sse_domain) + ').\n')
            sse_domain.add_segment(seg)

            break  # no need to look at any more domains for this SSE
    def check_validity_and_fix(self):
        """
        Check for overlapping secondary structures. This happens for
        example in the PDB HELIX records for 1DLC.  In such a case we
        recover from it by moving the start of the later SSE forward,
        or the end of the earlier SSE back, so that the two records
        no longer overlap.

        Parameters:
          None
        Return value:
          True if OK (including when overlaps were found and fixed up),
          False if an overlap could not be fixed because the residue
          numbers that would have to be changed have insertion codes.
        Uses data members (READ/WRITE):
           helix_list, strand_list
            (start and end in helix and strand tuples may be modified;
            lists are sorted by increasing residue sequence number)
        """
        # Work on tuples of (chain, start, end, endchain, ssetype, htype)
        # so that helices and strands can be sorted together; ssetype
        # is 'H' for helix or 'E' for strand, htype is None for strands.
        helices = [ (chain, start, end, endchain, 'H', htype)
                    for (chain, start, endchain, end, htype)
                    in self.helix_list ]
        strands = [ (chain, start, end, endchain, 'E', None)
                    for (chain, start, endchain, end)
                    in self.strand_list ]
        sselist = helices + strands
        sselist.sort(cmp=tuplecmp)
        is_valid = True
        for i in xrange(1, len(sselist)):
            sse = sselist[i]
            prevsse = sselist[i-1]
            overlapping = (prevsse[0] == sse[0] and
                           pdb_res_seq_cmp(sse[1], prevsse[2]) <= 0)
            if not overlapping:
                continue

            sys.stderr.write('WARNING: PDB has overlapping SSE definitions'
                             ' ' + str(prevsse) + ' and ' + str(sse) + ': ')
            # remove overlap by shortening the longer one
            # FIXME: this is ignoring insertion codes etc., really
            # should convert to proper sequential residue sequence numbers
            # to do this
            prevsse_start = get_int_icode(prevsse[1])[0]
            (prevsse_end, prevsse_end_icode) = get_int_icode(prevsse[2])
            (sse_start, sse_start_icode) = get_int_icode(sse[1])
            sse_end = get_int_icode(sse[2])[0]
            if (prevsse_end_icode or sse_start_icode):
                sys.stderr.write('contains insertion codes, giving up\n')
                is_valid = False
                continue
            prevsse_len = prevsse_end - prevsse_start + 1
            sse_len = sse_end - sse_start + 1
            overlap = prevsse_end - sse_start + 1
            # Only the endpoint that is actually moved is rewritten; the
            # other residue identifiers are kept exactly as they were so
            # that any insertion codes on them are not lost.
            if sse_len > prevsse_len:
                sse_start += overlap
                sselist[i] = (sse[0], str(sse_start), sse[2],
                              sse[3], sse[4], sse[5])
            else:
                prevsse_end -= overlap
                sselist[i-1] = (prevsse[0], prevsse[1], str(prevsse_end),
                                prevsse[3], prevsse[4], prevsse[5])
            sys.stderr.write('changed to ' + str(sselist[i-1]) + ' and ' +
                             str(sselist[i]) + '\n')

        # rebuild the helix_list and strand_list with our modified tuples
        self.helix_list = [ (chain, start, endchain, end, htype)
                            for (chain, start, end, endchain, ssetype, htype)
                            in sselist if ssetype == 'H' ]
        self.strand_list = [ (chain, start, endchain, end)
                             for (chain, start, end, endchain, ssetype, htype)
                             in sselist if ssetype == 'E' ]
        return is_valid