def check_validity_and_fix(self):
    """
    Check for overlapping secondary structure elements (SSEs) and try
    to repair them.

    This happens for example in the PDB HELIX records for 1DLC.
    When two consecutive SSEs on the same chain overlap, the longer of
    the two is shortened by the size of the overlap (on a tie the
    earlier one is shortened) so that the pair no longer overlaps.

    Parameters:
        None

    Return value:
        True if OK (including when all overlaps found were fixed up
        here), False if an overlap could not be fixed because the
        residue numbers at the overlapping boundary have insertion codes.

    Uses data members (READ/WRITE):
        helix_list, strand_list (start and end in helix and strand
        tuples may be modified; lists are rebuilt in the order given by
        sorting all SSEs with tuplecmp)
    """
    # Put helices and strands into one common tuple layout
    # (chain, start, end, endchain, ssetype, htype) so that they can be
    # sorted and compared together.
    helices = [(chain, start, end, endchain, 'H', htype)
               for (chain, start, endchain, end, htype) in self.helix_list]
    strands = [(chain, start, end, endchain, 'E', None)
               for (chain, start, endchain, end) in self.strand_list]
    sselist = helices + strands
    sselist.sort(cmp=tuplecmp)

    is_valid = True
    for i in xrange(1, len(sselist)):
        # Always re-read from the list: the previous iteration may have
        # rewritten the entry that is now sselist[i - 1].
        sse = sselist[i]
        prevsse = sselist[i - 1]
        if not (prevsse[0] == sse[0] and
                pdb_res_seq_cmp(sse[1], prevsse[2]) <= 0):
            continue  # different chain, or no overlap

        sys.stderr.write('WARNING: PDB has overlapping SSE definitions'
                         ' ' + str(prevsse) + ' and ' + str(sse) + ': ')
        # FIXME: this is ignoring insertion codes etc., really
        # should convert to proper sequential residue sequence numbers
        # to do this
        (prevsse_start, prevsse_start_icode) = get_int_icode(prevsse[1])
        (prevsse_end, prevsse_end_icode) = get_int_icode(prevsse[2])
        (sse_start, sse_start_icode) = get_int_icode(sse[1])
        (sse_end, sse_end_icode) = get_int_icode(sse[2])
        if prevsse_end_icode or sse_start_icode:
            # cannot do integer arithmetic on the boundary residues
            sys.stderr.write('contains insertion codes, giving up\n')
            is_valid = False
            continue

        # remove overlap by shortening the longer of the two SSEs
        prevsse_len = prevsse_end - prevsse_start + 1
        sse_len = sse_end - sse_start + 1
        overlap = prevsse_end - sse_start + 1
        if sse_len > prevsse_len:
            sse_start += overlap
        else:
            prevsse_end -= overlap

        # The outer endpoints are not changed by the fix, so if they
        # have an insertion code keep the original string: rebuilding
        # them from the integer part alone would silently drop it.
        if prevsse_start_icode:
            new_prevsse_start = prevsse[1]
        else:
            new_prevsse_start = str(prevsse_start)
        if sse_end_icode:
            new_sse_end = sse[2]
        else:
            new_sse_end = str(sse_end)

        sselist[i] = (sse[0], str(sse_start), new_sse_end,
                      sse[3], sse[4], sse[5])
        sselist[i - 1] = (prevsse[0], new_prevsse_start, str(prevsse_end),
                          prevsse[3], prevsse[4], prevsse[5])
        sys.stderr.write('changed to ' + str(sselist[i - 1]) +
                         ' and ' + str(sselist[i]) + '\n')

    # rebuild the helix_list and strand_list with our modified tuples
    self.helix_list = [(chain, start, endchain, end, htype)
                       for (chain, start, end, endchain, ssetype, htype)
                       in sselist if ssetype == 'H']
    self.strand_list = [(chain, start, endchain, end)
                        for (chain, start, end, endchain, ssetype, htype)
                        in sselist if ssetype == 'E']
    return is_valid
def build_graph_from_secstruct(self, secstruct, domain, chainid=None,
                               ignore_insertion_codes=False):
    """
    Build the per-chain lists of SSE nodes from the supplied
    PTSecStruct object, then add the hydrogen bond and bridge edges and
    build the sequential numbering table.

    Parameters:
        secstruct - PTSecStruct (ptsecstruct.py) object to build from
        domain - PTDomain (ptdomain.py) object listing the segment(s)
                 that make up this domain (only one domain processed at
                 a time). An SSE is considered to be in the domain if
                 its first residue is. SSEs that cross a domain boundary
                 must already have been moved wholly into one domain
                 (this is asserted, not fixed, here).
        chainid - chain identifier to build graph for only this chain,
                  or None for all chains (default)
        ignore_insertion_codes - If True, a hack to make it work with
                 PMML (only) which does not report insertion codes
                 unlike DSSP and STRIDE

    Uses member data (write):
        secstruct - keeps a pointer to the supplied secstruct
        residue_list - residues for ALL of pdb_struct (not just domain)
        pdb_resid_dict - dict of { (chainid, pdb_resid) : index in
                         residue_list }
        chain_dict - dict of { chainid : node_list } where node_list is
                     list of nodes in order, built in this function
        seqnum2node - dict of { seqnum : PTNode }, numbered from 1 and
                      not restarting for each chain
      (readonly):
        pdb_struct - The Bio.PDB parsed PDB struct (atomic co-ordinates)
                     for this protein.
        include_310_helices, include_pi_helices - if true, include
                     these kinds of helices.

    Raises exceptions:
        NoSSE_Exception if no helices or strands found in the domain

    Return value:
        None.
    """
    self.secstruct = secstruct

    helix_num = 1
    strand_num = 1
    num_helices_in_domain = 0
    num_strands_in_domain = 0

    #
    # Build dictionary mapping (chainid, pdb_resid) to index in residue_list
    # for ALL residues, not just those in this domain.
    #
    self.residue_list = self.get_residue_list(self.pdb_struct,
                                              PTDomain(None, None))
    self.pdb_resid_dict = {}
    for (seq_indx, residue) in enumerate(self.residue_list):
        resid_key = (
            ptsecstruct.pdb_chainid_to_stride_chainid(
                residue.get_full_id()[2]),
            biopdbresid_to_pdbresseq(residue.get_id(),
                                     ignore_insertion_codes),
        )
        self.pdb_resid_dict[resid_key] = seq_indx

    # Note that now we are only adding elements in the supplied domain,
    # so the so-called 'chains' may really be segments, i.e. subsequences
    # of chains (rest of chain may be in other domain(s))
    self.chain_dict = {}  # dict of {chainid : node_list}

    for (start_chainid, start_resnum, end_chainid, end_resnum,
         helixtype) in secstruct.helix_list:
        assert start_chainid == end_chainid  # helix must be same chain
        if chainid and chainid != start_chainid:
            continue  # chainid specified, skip ones not in that chain
        # will consider structures in domain if first residue is in domain
        if not domain.is_in_domain(start_chainid,
                                   get_int_icode(start_resnum)[0]):
            continue
        # counted before the include_* filters below, so excluded
        # helices still count towards "domain has some SSE"
        num_helices_in_domain += 1
        if helixtype == "H":
            idprefix = "ALPHAHELIX_"
            htype = "ALPHA"
        elif helixtype == "I":
            if not self.include_pi_helices:
                continue
            idprefix = "PIHELIX_"
            htype = "PI"
        elif helixtype == "G":
            if not self.include_310_helices:
                continue
            idprefix = "310HELIX_"
            htype = "310"
        else:  # shouldn't happen
            sys.stderr.write("ERROR: bad helix type " + helixtype + "\n")
            # Skip it: without this the node below would be built from
            # unbound names (NameError) or from the values left over
            # from the previous helix.
            continue
        this_helix_num = helix_num
        helix_num += 1
        ah_node = PTNodeHelix(htype,
                              idprefix + start_chainid + "_" +
                              str(this_helix_num),
                              this_helix_num,
                              start_resnum, end_resnum, start_chainid,
                              domain.domainid,
                              self.residue_list, self.pdb_resid_dict)
        self.chain_dict.setdefault(start_chainid, []).append(ah_node)

        # we must already have handled the case of SSEs that cross
        # domain boundaries (by moving whole SSE to one of the domains)
        assert domain.is_in_domain(end_chainid,
                                   get_int_icode(end_resnum)[0])

    for (start_chainid, start_resnum, end_chainid, end_resnum) \
            in secstruct.strand_list:
        assert start_chainid == end_chainid  # must be in same chain
        if chainid and chainid != start_chainid:
            continue  # chainid specified, skip ones not in that chain
        if not domain.is_in_domain(start_chainid,
                                   get_int_icode(start_resnum)[0]):
            continue
        num_strands_in_domain += 1
        bs_node = PTNodeStrand("STRAND_" + start_chainid + "_" +
                               str(strand_num),
                               strand_num,
                               start_resnum, end_resnum, start_chainid,
                               domain.domainid,
                               self.residue_list,
                               self.pdb_resid_dict)
        strand_num += 1
        strand_nodes = self.chain_dict.setdefault(start_chainid, [])
        # we must already have handled the case of SSEs that cross
        # domain boundaries (by moving whole SSE to one of the domains)
        assert domain.is_in_domain(end_chainid,
                                   get_int_icode(end_resnum)[0])
        strand_nodes.append(bs_node)

    # raise an exception if there are no SSEs at all in this domain
    if num_helices_in_domain == 0 and num_strands_in_domain == 0:
        raise NoSSE_Exception

    delete_chainid_list = []  # list of chainids to delete from chain_dict
    # (loop variable deliberately not called chainid so that the
    # chainid parameter is not rebound)
    for (node_chainid, nodelist) in self.chain_dict.items():
        # sort in order of start residue id ascending (all must be disjoint)
        nodelist.sort()
        if len(nodelist) < 1:
            # There are no SSEs in this chain, get rid of it.
            sys.stderr.write('WARNING: no SSEs in chain ' + node_chainid +
                             '; chain ignored\n')
            delete_chainid_list.append(node_chainid)  # don't delete in loop
            continue
        # Check for chain with only SSEs that will not be drawn
        # (i.e. pi or 310 helices), and delete those too
        found_useful_node = False
        for ptnode in nodelist:
            if isinstance(ptnode, PTNodeStrand):
                found_useful_node = True
                break
            elif isinstance(ptnode, PTNodeHelix):
                if ptnode.get_type() == "ALPHA":
                    found_useful_node = True
                    break
                elif ((ptnode.get_type() == "310" and
                       self.include_310_helices) or
                      (ptnode.get_type() == "PI" and
                       self.include_pi_helices)):
                    found_useful_node = True
                    break
        if not found_useful_node:
            sys.stderr.write('WARNING: only pi or 310 helices in chain ' +
                             node_chainid + '; chain ignored\n')
            delete_chainid_list.append(node_chainid)

    # delete chains from chain_dict that were marked earlier for deletion
    for node_chainid in delete_chainid_list:
        self.chain_dict.pop(node_chainid)

    # -------------------------------------------------------------------
    # This is needed only for labelling sheets for HH and KK codes
    # (see dfs_strands() etc. below)

    # add edges for hydrogen bonds
    # uses secstruct and chainid member data
    # these are used for determining which side bridge partners are
    # on (and also for drawing a hydrogen bond graph if requested)
    self.add_hbond_edges_from_secstruct()

    # add edges for bridge partners
    # uses secstruct and chainid member data
    self.add_bridge_edges_from_secstruct()

    # -------------------------------------------------------------------
    # for sequential numbering, we'll build this dictionary mapping
    # sequential number (note NOT restarting for each chain)
    # to PTNode
    # so that sequential numbers as used in ptgraph2 -b sequential
    # option.
    # this is a dictionary of { seqnum : PTNode }
    # Terminus nodes and helices of an excluded kind get no number.
    numbered_nodes = [
        node for node in self.iter_nodes()
        if not (isinstance(node, PTNodeTerminus) or
                (isinstance(node, PTNodeHelix) and
                 ((node.get_type() == "310" and
                   not self.include_310_helices) or
                  (node.get_type() == "PI" and
                   not self.include_pi_helices))))
    ]
    self.seqnum2node = {}
    for (seqnum, node) in enumerate(numbered_nodes):
        self.seqnum2node[seqnum + 1] = node  # start at 1 not 0
def build_graph_from_secstruct(self, secstruct, domain, chainid=None,
                               ignore_insertion_codes=False):
    """
    Build the per-chain lists of SSE nodes from the supplied
    PTSecStruct object, then add the hydrogen bond and bridge edges and
    build the sequential numbering table.

    Parameters:
        secstruct - PTSecStruct (ptsecstruct.py) object to build from
        domain - PTDomain (ptdomain.py) object listing the segment(s)
                 that make up this domain (only one domain processed at
                 a time). An SSE is considered to be in the domain if
                 its first residue is. SSEs that cross a domain boundary
                 must already have been moved wholly into one domain
                 (this is asserted, not fixed, here).
        chainid - chain identifier to build graph for only this chain,
                  or None for all chains (default)
        ignore_insertion_codes - If True, a hack to make it work with
                 PMML (only) which does not report insertion codes
                 unlike DSSP and STRIDE

    Uses member data (write):
        secstruct - keeps a pointer to the supplied secstruct
        residue_list - residues for ALL of pdb_struct (not just domain)
        pdb_resid_dict - dict of { (chainid, pdb_resid) : index in
                         residue_list }
        chain_dict - dict of { chainid : node_list } where node_list is
                     list of nodes in order, built in this function
        seqnum2node - dict of { seqnum : PTNode }, numbered from 1 and
                      not restarting for each chain
      (readonly):
        pdb_struct - The Bio.PDB parsed PDB struct (atomic co-ordinates)
                     for this protein.
        include_310_helices, include_pi_helices - if true, include
                     these kinds of helices.

    Raises exceptions:
        NoSSE_Exception if no helices or strands found in the domain

    Return value:
        None.
    """
    self.secstruct = secstruct

    helix_num = 1
    strand_num = 1
    num_helices_in_domain = 0
    num_strands_in_domain = 0

    #
    # Build dictionary mapping (chainid, pdb_resid) to index in residue_list
    # for ALL residues, not just those in this domain.
    #
    self.residue_list = self.get_residue_list(self.pdb_struct,
                                              PTDomain(None, None))
    self.pdb_resid_dict = {}
    for (seq_indx, residue) in enumerate(self.residue_list):
        resid_key = (
            ptsecstruct.pdb_chainid_to_stride_chainid(
                residue.get_full_id()[2]),
            biopdbresid_to_pdbresseq(residue.get_id(),
                                     ignore_insertion_codes),
        )
        self.pdb_resid_dict[resid_key] = seq_indx

    # Note that now we are only adding elements in the supplied domain,
    # so the so-called 'chains' may really be segments, i.e. subsequences
    # of chains (rest of chain may be in other domain(s))
    self.chain_dict = {}  # dict of {chainid : node_list}

    for (start_chainid, start_resnum, end_chainid, end_resnum,
         helixtype) in secstruct.helix_list:
        assert start_chainid == end_chainid  # helix must be same chain
        if chainid and chainid != start_chainid:
            continue  # chainid specified, skip ones not in that chain
        # will consider structures in domain if first residue is in domain
        if not domain.is_in_domain(start_chainid,
                                   get_int_icode(start_resnum)[0]):
            continue
        # counted before the include_* filters below, so excluded
        # helices still count towards "domain has some SSE"
        num_helices_in_domain += 1
        if helixtype == "H":
            idprefix = "ALPHAHELIX_"
            htype = "ALPHA"
        elif helixtype == "I":
            if not self.include_pi_helices:
                continue
            idprefix = "PIHELIX_"
            htype = "PI"
        elif helixtype == "G":
            if not self.include_310_helices:
                continue
            idprefix = "310HELIX_"
            htype = "310"
        else:  # shouldn't happen
            sys.stderr.write("ERROR: bad helix type " + helixtype + "\n")
            # Skip it: without this the node below would be built from
            # unbound names (NameError) or from the values left over
            # from the previous helix.
            continue
        this_helix_num = helix_num
        helix_num += 1
        ah_node = PTNodeHelix(htype,
                              idprefix + start_chainid + "_" +
                              str(this_helix_num),
                              this_helix_num,
                              start_resnum, end_resnum, start_chainid,
                              domain.domainid,
                              self.residue_list, self.pdb_resid_dict)
        self.chain_dict.setdefault(start_chainid, []).append(ah_node)

        # we must already have handled the case of SSEs that cross
        # domain boundaries (by moving whole SSE to one of the domains)
        assert domain.is_in_domain(end_chainid,
                                   get_int_icode(end_resnum)[0])

    for (start_chainid, start_resnum, end_chainid, end_resnum) \
            in secstruct.strand_list:
        assert start_chainid == end_chainid  # must be in same chain
        if chainid and chainid != start_chainid:
            continue  # chainid specified, skip ones not in that chain
        if not domain.is_in_domain(start_chainid,
                                   get_int_icode(start_resnum)[0]):
            continue
        num_strands_in_domain += 1
        bs_node = PTNodeStrand("STRAND_" + start_chainid + "_" +
                               str(strand_num),
                               strand_num,
                               start_resnum, end_resnum, start_chainid,
                               domain.domainid,
                               self.residue_list,
                               self.pdb_resid_dict)
        strand_num += 1
        strand_nodes = self.chain_dict.setdefault(start_chainid, [])
        # we must already have handled the case of SSEs that cross
        # domain boundaries (by moving whole SSE to one of the domains)
        assert domain.is_in_domain(end_chainid,
                                   get_int_icode(end_resnum)[0])
        strand_nodes.append(bs_node)

    # raise an exception if there are no SSEs at all in this domain
    if num_helices_in_domain == 0 and num_strands_in_domain == 0:
        raise NoSSE_Exception

    delete_chainid_list = []  # list of chainids to delete from chain_dict
    # (loop variable deliberately not called chainid so that the
    # chainid parameter is not rebound)
    for (node_chainid, nodelist) in self.chain_dict.items():
        # sort in order of start residue id ascending (all must be disjoint)
        nodelist.sort()
        if len(nodelist) < 1:
            # There are no SSEs in this chain, get rid of it.
            sys.stderr.write("WARNING: no SSEs in chain " + node_chainid +
                             "; chain ignored\n")
            delete_chainid_list.append(node_chainid)  # don't delete in loop
            continue
        # Check for chain with only SSEs that will not be drawn
        # (i.e. pi or 310 helices), and delete those too
        found_useful_node = False
        for ptnode in nodelist:
            if isinstance(ptnode, PTNodeStrand):
                found_useful_node = True
                break
            elif isinstance(ptnode, PTNodeHelix):
                if ptnode.get_type() == "ALPHA":
                    found_useful_node = True
                    break
                elif ((ptnode.get_type() == "310" and
                       self.include_310_helices) or
                      (ptnode.get_type() == "PI" and
                       self.include_pi_helices)):
                    found_useful_node = True
                    break
        if not found_useful_node:
            sys.stderr.write("WARNING: only pi or 310 helices in chain " +
                             node_chainid + "; chain ignored\n")
            delete_chainid_list.append(node_chainid)

    # delete chains from chain_dict that were marked earlier for deletion
    for node_chainid in delete_chainid_list:
        self.chain_dict.pop(node_chainid)

    # -------------------------------------------------------------------
    # This is needed only for labelling sheets for HH and KK codes
    # (see dfs_strands() etc. below)

    # add edges for hydrogen bonds
    # uses secstruct and chainid member data
    # these are used for determining which side bridge partners are
    # on (and also for drawing a hydrogen bond graph if requested)
    self.add_hbond_edges_from_secstruct()

    # add edges for bridge partners
    # uses secstruct and chainid member data
    self.add_bridge_edges_from_secstruct()

    # -------------------------------------------------------------------
    # for sequential numbering, we'll build this dictionary mapping
    # sequential number (note NOT restarting for each chain)
    # to PTNode
    # so that sequential numbers as used in ptgraph2 -b sequential
    # option.
    # this is a dictionary of { seqnum : PTNode }
    # Terminus nodes and helices of an excluded kind get no number.
    numbered_nodes = [
        node for node in self.iter_nodes()
        if not (isinstance(node, PTNodeTerminus) or
                (isinstance(node, PTNodeHelix) and
                 ((node.get_type() == "310" and
                   not self.include_310_helices) or
                  (node.get_type() == "PI" and
                   not self.include_pi_helices))))
    ]
    self.seqnum2node = {}
    for (seqnum, node) in enumerate(numbered_nodes):
        self.seqnum2node[seqnum + 1] = node  # start at 1 not 0
def fixup_crossdomain_sses(secstruct, domain_list):
    """
    Find any SSEs that span a domain boundary, and put each entirely
    in one domain. The domain is chosen as the one that contains most
    of the residues in the SSE (the domain holding the start of the SSE
    wins a tie).

    Parameters:
        secstruct - PTSecStruct (ptsecstruct.py) object describing SSEs
        domain_list - list of PTDomain objects representing all
                      the domains in this protein.
                      (in/out) NOTE: may be modified: the segment spanned
                      by a cross-domain SSE is removed from every domain
                      and then added to the chosen one.
    Return value:
        None.
    """
    # All SSEs as (start_chainid, start_resnum, end_chainid, end_resnum);
    # the helix type is not needed here.
    sse_list = (
        [(start_chainid, start_resnum, end_chainid, end_resnum)
         for (start_chainid, start_resnum, end_chainid, end_resnum)
         in secstruct.strand_list] +
        [(start_chainid, start_resnum, end_chainid, end_resnum)
         for (start_chainid, start_resnum, end_chainid, end_resnum,
              helix_type)
         in secstruct.helix_list]
    )

    for (start_chainid, start_resnum, end_chainid, end_resnum) in sse_list:
        # FIXME: this is ignoring insertion codes etc., really
        # should convert to proper sequential residue sequence numbers
        # to do this
        start_resint = get_int_icode(start_resnum)[0]
        end_resint = get_int_icode(end_resnum)[0]

        for domain in domain_list:
            if not (domain.is_in_domain(start_chainid, start_resint) and
                    not domain.is_in_domain(end_chainid, end_resint)):
                continue

            # This really shouldn't happen, but does: domain
            # decomposition has determined that this SSE crosses
            # a domain boundary (really our SSE decisions don't
            # match whatever domain decomposition has done).
            # We'll have to assign the SSE to
            # a domain, and add the residues it spans into that
            # domain.

            # find domain2 as the other domain the SSE is also in.
            # It stays None when no other domain contains the end of
            # the SSE, rather than being left as whichever domain the
            # search happened to look at last.
            domain2 = None
            for candidate in domain_list:
                if candidate == domain:
                    continue
                if candidate.is_in_domain(end_chainid, end_resint):
                    domain2 = candidate
                    break

            # find sse_domain as the domain with more residues of the
            # SSE in it
            domain_res_count = 0
            domain2_res_count = 0
            for resint in range(start_resint, end_resint + 1):
                if domain.is_in_domain(start_chainid, resint):
                    domain_res_count += 1
                elif (domain2 is not None and
                      domain2.is_in_domain(start_chainid, resint)):
                    domain2_res_count += 1
                else:
                    sys.stderr.write('ERROR: SSE in more than 2 domains\n')
            if domain2_res_count > domain_res_count:
                sse_domain = domain2
            else:
                sse_domain = domain  # arbitrarily domain if equal count

            # first remove the segment from where it currently is
            seg = PTSegment(start_chainid, start_resint, end_resint)
            for dom in domain_list:
                dom.remove_segment(seg)

            # (written before add_segment, so str(sse_domain) shows the
            # domain as it is before the SSE's segment is added)
            sys.stderr.write('WARNING: SSE ' + start_chainid + ':' +
                             start_resnum + '-' + end_resnum +
                             ' crosses domain boundary.\n'
                             ' Put in domain ' + sse_domain.domainid +
                             ' (' + str(sse_domain) + ').\n')
            sse_domain.add_segment(seg)
            break  # no need to look at any more domains for this SSE
def check_validity_and_fix(self):
    """
    Check for overlapping secondary structure elements (SSEs) and try
    to repair them.

    This happens for example in the PDB HELIX records for 1DLC.
    When two consecutive SSEs on the same chain overlap, the longer of
    the two is shortened by the size of the overlap (on a tie the
    earlier one is shortened) so that the pair no longer overlaps.

    Parameters:
        None

    Return value:
        True if OK (including when all overlaps found were fixed up
        here), False if an overlap could not be fixed because the
        residue numbers at the overlapping boundary have insertion codes.

    Uses data members (READ/WRITE):
        helix_list, strand_list (start and end in helix and strand
        tuples may be modified; lists are rebuilt in the order given by
        sorting all SSEs with tuplecmp)
    """
    # Put helices and strands into one common tuple layout
    # (chain, start, end, endchain, ssetype, htype) so that they can be
    # sorted and compared together.
    helices = [(chain, start, end, endchain, 'H', htype)
               for (chain, start, endchain, end, htype) in self.helix_list]
    strands = [(chain, start, end, endchain, 'E', None)
               for (chain, start, endchain, end) in self.strand_list]
    sselist = helices + strands
    sselist.sort(cmp=tuplecmp)

    is_valid = True
    for i in xrange(1, len(sselist)):
        # Always re-read from the list: the previous iteration may have
        # rewritten the entry that is now sselist[i-1].
        sse = sselist[i]
        prevsse = sselist[i-1]
        if not (prevsse[0] == sse[0] and
                pdb_res_seq_cmp(sse[1], prevsse[2]) <= 0):
            continue  # different chain, or no overlap

        sys.stderr.write('WARNING: PDB has overlapping SSE definitions'
                         ' ' + str(prevsse) + ' and ' + str(sse) + ': ')
        # FIXME: this is ignoring insertion codes etc., really
        # should convert to proper sequential residue sequence numbers
        # to do this
        (prevsse_start, prevsse_start_icode) = get_int_icode(prevsse[1])
        (prevsse_end, prevsse_end_icode) = get_int_icode(prevsse[2])
        (sse_start, sse_start_icode) = get_int_icode(sse[1])
        (sse_end, sse_end_icode) = get_int_icode(sse[2])
        if prevsse_end_icode or sse_start_icode:
            # cannot do integer arithmetic on the boundary residues
            sys.stderr.write('contains insertion codes, giving up\n')
            is_valid = False
            continue

        # remove overlap by shortening the longer of the two SSEs
        prevsse_len = prevsse_end - prevsse_start + 1
        sse_len = sse_end - sse_start + 1
        overlap = prevsse_end - sse_start + 1
        if sse_len > prevsse_len:
            sse_start += overlap
        else:
            prevsse_end -= overlap

        # The outer endpoints are not changed by the fix, so if they
        # have an insertion code keep the original string: rebuilding
        # them from the integer part alone would silently drop it.
        if prevsse_start_icode:
            new_prevsse_start = prevsse[1]
        else:
            new_prevsse_start = str(prevsse_start)
        if sse_end_icode:
            new_sse_end = sse[2]
        else:
            new_sse_end = str(sse_end)

        sselist[i] = (sse[0], str(sse_start), new_sse_end,
                      sse[3], sse[4], sse[5])
        sselist[i-1] = (prevsse[0], new_prevsse_start, str(prevsse_end),
                        prevsse[3], prevsse[4], prevsse[5])
        sys.stderr.write('changed to ' + str(sselist[i-1]) +
                         ' and ' + str(sselist[i]) + '\n')

    # rebuild the helix_list and strand_list with our modified tuples
    self.helix_list = [(chain, start, endchain, end, htype)
                       for (chain, start, end, endchain, ssetype, htype)
                       in sselist if ssetype == 'H']
    self.strand_list = [(chain, start, endchain, end)
                        for (chain, start, end, endchain, ssetype, htype)
                        in sselist if ssetype == 'E']
    return is_valid