def _get_header(self, header_coords_trailer):
    """Get the header of the PDB file, return the rest (PRIVATE).

    Scans the lines in order until the first coordinate-section record
    (ATOM, HETATM or MODEL) is seen.  Every line before it is passed to
    ``_parse_pdb_header_list``; that line and everything after it is
    returned unparsed.

    While scanning, the structure builder's line counter is updated to the
    1-based number of the line being examined, and ``self.line_counter`` is
    left at the 0-based index where the scan stopped.

    If no coordinate record is found, the scan stops on the last line, so
    the last line is returned as the sole element of the remainder.

    :param header_coords_trailer: list of lines of the PDB file
    :returns: tuple ``(header_dict, coords_trailer)``
    """
    structure_builder = self.structure_builder
    # Pre-set the index so an empty input does not leave it unbound below.
    i = 0
    for i, line in enumerate(header_coords_trailer):
        structure_builder.set_line_counter(i + 1)
        # PDB record names fill the first six columns and are space padded,
        # so the literals compared against the slice must be six characters.
        if line[0:6] in ("ATOM  ", "HETATM", "MODEL "):
            break
    header = header_coords_trailer[0:i]
    # Return the rest of the coords+trailer for further processing
    self.line_counter = i
    coords_trailer = header_coords_trailer[i:]
    header_dict = _parse_pdb_header_list(header)
    return header_dict, coords_trailer
def _get_header(self, header_coords_trailer):
    """Get the header of the PDB file, return the rest (PRIVATE).

    Walks the lines until the first coordinate-section record (ATOM,
    HETATM or MODEL).  The lines before that record are handed to
    ``_parse_pdb_header_list``; the record line and all following lines
    are returned untouched for later processing.

    Side effects: the structure builder's line counter is set to the
    1-based number of each line examined, and ``self.line_counter`` is set
    to the 0-based index at which scanning stopped.

    When no coordinate record is present the loop ends on the final line,
    which is therefore returned as the only element of the remainder.

    :param header_coords_trailer: list of lines of the PDB file
    :returns: tuple ``(header_dict, coords_trailer)``
    """
    structure_builder = self.structure_builder
    i = 0  # keeps ``i`` defined when the input list is empty
    for i in range(0, len(header_coords_trailer)):
        structure_builder.set_line_counter(i + 1)
        line = header_coords_trailer[i]
        record_type = line[0:6]
        # Record names are six columns wide; "ATOM" is padded with two
        # spaces so that it can equal the six-character slice.
        if record_type in ("ATOM  ", "HETATM", "MODEL "):
            break
    header = header_coords_trailer[0:i]
    # Return the rest of the coords+trailer for further processing
    self.line_counter = i
    coords_trailer = header_coords_trailer[i:]
    header_dict = _parse_pdb_header_list(header)
    return header_dict, coords_trailer
def read_PIC(file: TextIO, verbose: bool = False) -> Structure:
    """Load Protein Internal Coordinate (.pic) data from file.

    PIC file format:
        - comment lines start with #
        - (optional) PDB HEADER record
           - idcode and deposition date recommended but optional
           - deposition date in PDB format or as changed by Biopython
        - (optional) PDB TITLE record
        - repeat:
           - Biopython Residue Full ID - sets residue IDs of returned structure
           - (optional) PDB N, CA, C ATOM records for chain start
           - (optional) PIC Hedra records for residue
           - (optional) PIC Dihedra records for residue
           - (optional) BFAC records listing AtomKeys and b-factors

    An improvement would define relative positions for HOH (water) entries.

    N.B. dihedron (i-1)C-N-CA-CB is ignored in assembly if O exists.

    C-beta is by default placed using O-C-CA-CB, but O is missing
    in some PDB file residues, which means the sidechain cannot be
    placed.  The alternate CB path (i-1)C-N-CA-CB is provided to
    circumvent this, but if this is needed then it must be adjusted in
    conjunction with PHI ((i-1)C-N-CA-C) as they overlap.  (i-1)C-N-CA-CB
    is included by default in .pic files for consistency and informational
    (e.g. statistics gathering) purposes, as otherwise the dihedron would
    only appear in the few cases it is needed for.

    Lines are dispatched on their prefix: ``#``, ``HEADER``, ``TITLE``,
    ``(`` (residue ID), ``ATOM``, ``BFAC:``; anything else is tried
    against ``Edron.edron_re``.  Parsing stops and None is returned on a
    malformed residue ID line, an ATOM line that does not belong to the
    current residue, (di/h)edron data before any residue, or a non-blank
    line that matches nothing.

    :param Bio.File file: file name or handle
    :param bool verbose: complain when lines not as expected
    :returns: Biopython Structure object, Residues with .internal_coord
        attributes but no coordinates except for chain start N, CA, C atoms
        if supplied, **OR** None on parse fail (silent unless verbose=True)
    """
    pdb_hdr_re = re.compile(
        r"^HEADER\s{4}(?P<cf>.{1,40})"
        r"(?:\s+(?P<dd>\d\d\d\d-\d\d-\d\d|\d\d-\w\w\w-\d\d))?"
        r"(?:\s+(?P<id>[0-9A-Z]{4}))?\s*$")
    # commented-out variant of the residue ID pattern (biop_id_re) below:
    # ^\('(?P<pid>\w*)',\s(?P<mdl>\d+),\s'(?P<chn>\w)',\s\('(?P<het>\s|[\w-]+)',\s(?P<pos>\d+),\s'(?P<icode>\s|\w)'\)\)\s(?P<res>[A-Z]{3})\s(\[(?P<segid>[a-zA-z\s]{4})\])?\s*$
    pdb_ttl_re = re.compile(r"^TITLE\s{5}(?P<ttl>.+)\s*$")
    # residue full-ID line; the optional bracketed segid is group 9
    biop_id_re = re.compile(r"^\('(?P<pid>[^\s]*)',\s(?P<mdl>\d+),\s"
                            r"'(?P<chn>\s|\w)',\s\('(?P<het>\s|[\w\s-]+)"
                            r"',\s(?P<pos>-?\d+),\s'(?P<icode>\s|\w)'\)\)"
                            r"\s+(?P<res>[\w]{1,3})"
                            r"(\s\[(?P<segid>[a-zA-z\s]+)\])?"
                            r"\s*$")
    pdb_atm_re = re.compile(r"^ATOM\s\s(?:\s*(?P<ser>\d+))\s(?P<atm>[\w\s]{4})"
                            r"(?P<alc>\w|\s)(?P<res>[\w]{3})\s(?P<chn>.)"
                            r"(?P<pos>[\s\-\d]{4})(?P<icode>[A-Za-z\s])\s\s\s"
                            r"(?P<x>[\s\-\d\.]{8})(?P<y>[\s\-\d\.]{8})"
                            r"(?P<z>[\s\-\d\.]{8})(?P<occ>[\s\d\.]{6})"
                            r"(?P<tfac>[\s\d\.]{6})\s{6}"
                            r"(?P<segid>[a-zA-z\s]{4})(?P<elm>.{2})"
                            r"(?P<chg>.{2})?\s*$")
    # one mandatory and up to four optional "<key> <number>" pairs per line
    bfac_re = re.compile(r"^BFAC:\s([^\s]+\s+[\-\d\.]+)"
                         r"\s*([^\s]+\s+[\-\d\.]+)?"
                         r"\s*([^\s]+\s+[\-\d\.]+)?"
                         r"\s*([^\s]+\s+[\-\d\.]+)?"
                         r"\s*([^\s]+\s+[\-\d\.]+)?")
    # splits a single pair captured by bfac_re into key and value
    bfac2_re = re.compile(r"([^\s]+)\s+([\-\d\.]+)")
    struct_builder = StructureBuilder()
    # init empty header dict
    # - could use to parse HEADER and TITLE lines except
    # deposition_date format changed from original PDB header
    header_dict = _parse_pdb_header_list([])
    # last initialised value at each level; compared with each residue line
    curr_SMCS = [None, None, None, None]  # struct model chain seg
    # builder initialiser for each level, indexed in step with curr_SMCS
    SMCS_init = [
        struct_builder.init_structure,
        struct_builder.init_model,
        struct_builder.init_chain,
        struct_builder.init_seg,
    ]
    # residue that ATOM / BFAC / edron lines currently apply to
    sb_res = None
    with as_handle(file, mode="r") as handle:
        for aline in handle.readlines():
            if aline.startswith("#"):
                pass  # skip comment lines
            elif aline.startswith("HEADER "):
                m = pdb_hdr_re.match(aline)
                if m:
                    header_dict["head"] = m.group("cf")  # classification
                    header_dict["idcode"] = m.group("id")
                    header_dict["deposition_date"] = m.group("dd")
                elif verbose:
                    print("Reading pic file", file, "HEADER parse fail: ", aline)
            elif aline.startswith("TITLE "):
                m = pdb_ttl_re.match(aline)
                if m:
                    header_dict["name"] = m.group("ttl").strip()
                    # print('TTL: ', m.group('ttl').strip())
                elif verbose:
                    print("Reading pic file", file, "TITLE parse fail:, ", aline)
            elif aline.startswith("("):  # Biopython ID line for Residue
                m = biop_id_re.match(aline)
                if m:
                    # check SMCS = Structure, Model, Chain, SegID
                    segid = m.group(9)
                    if segid is None:
                        segid = " "
                    this_SMCS = [ m.group(1), int(m.group(2)), m.group(3), segid ]
                    if curr_SMCS != this_SMCS:
                        # init new SMCS level as needed
                        for i in range(4):
                            if curr_SMCS[i] != this_SMCS[i]:
                                SMCS_init[i](this_SMCS[i])
                                curr_SMCS[i] = this_SMCS[i]
                                if 0 == i:
                                    # 0 = init structure so add header
                                    struct_builder.set_header(header_dict)
                                elif 1 == i:
                                    # new model means new chain and new segid
                                    # (clearing them forces re-init on the
                                    # following loop iterations)
                                    curr_SMCS[2] = curr_SMCS[3] = None
                    struct_builder.init_residue(
                        m.group("res"),
                        m.group("het"),
                        int(m.group("pos")),
                        m.group("icode"),
                    )
                    sb_res = struct_builder.residue
                    if 2 == sb_res.is_disordered():
                        # pick the first child without an internal_coord yet
                        for r in sb_res.child_dict.values():
                            if not r.internal_coord:
                                sb_res = r
                                break
                    sb_res.internal_coord = IC_Residue(sb_res)
                    # print('res id:', m.groupdict())
                    # print(report_IC(struct_builder.get_structure()))
                else:
                    if verbose:
                        print("Reading pic file", file, "residue ID parse fail: ", aline)
                    return None
            elif aline.startswith("ATOM "):
                m = pdb_atm_re.match(aline)
                if m:
                    if sb_res is None:
                        # ATOM without res spec already loaded, not a pic file
                        if verbose:
                            print(
                                "Reading pic file",
                                file,
                                "ATOM without residue configured:, ",
                                aline,
                            )
                        return None
                    # ATOM must name the residue set by the last ID line
                    if sb_res.resname != m.group("res") or sb_res.id[1] != int(
                            m.group("pos")):
                        if verbose:
                            print(
                                "Reading pic file",
                                file,
                                "ATOM not in configured residue (",
                                sb_res.resname,
                                str(sb_res.id),
                                "):",
                                aline,
                            )
                        return None
                    coord = numpy.array(
                        (float(m.group("x")), float(
                            m.group("y")), float(m.group("z"))),
                        "f",
                    )
                    struct_builder.init_atom(
                        m.group("atm").strip(),
                        coord,
                        float(m.group("tfac")),
                        float(m.group("occ")),
                        m.group("alc"),
                        m.group("atm"),
                        int(m.group("ser")),
                        m.group("elm").strip(),
                    )
                    # print('atom: ', m.groupdict())
                # an ATOM line that fails the pattern is silently ignored
                # elif verbose:
                #     print("Reading pic file", file, "ATOM parse fail:", aline)
            elif aline.startswith("BFAC: "):
                m = bfac_re.match(aline)
                if m:
                    for bfac_pair in m.groups():
                        if bfac_pair is not None:
                            m2 = bfac2_re.match(bfac_pair)
                            # pairs seen before any residue are dropped
                            if m2 and sb_res is not None and sb_res.internal_coord:
                                rp = sb_res.internal_coord
                                rp.bfactors[m2.group(1)] = float(m2.group(2))
                # else:
                #     print('Reading pic file', file, 'B-factor line fail: ', aline)
            else:
                # anything else should be a hedron / dihedron record
                m = Edron.edron_re.match(aline)
                if m and sb_res is not None:
                    sb_res.internal_coord.load_PIC(m.groupdict())
                elif m:
                    # NOTE(review): this message is printed even when
                    # verbose is False, unlike the other failure paths
                    print(
                        "PIC file: ",
                        file,
                        " error: no residue info before reading (di/h)edron data: ",
                        aline,
                    )
                    return None
                elif aline.strip():
                    # non-blank line that matched nothing
                    if verbose:
                        print("Reading PIC file", file, "parse fail on: .", aline, ".")
                    return None

    struct = struct_builder.get_structure()
    # attach an IC_Chain to every chain and prepare its residues
    for chn in struct.get_chains():
        chnp = chn.internal_coord = IC_Chain(chn)
        # done in IC_Chain init : chnp.set_residues()
        chnp.link_residues()
        chnp.init_edra()

    # print(report_PIC(struct_builder.get_structure()))
    return struct
def read_PIC(file):
    """Load Protein Internal Coordinate (PIC) data from file.

    PIC file format:
        # comment lines start with #
        (optional) PDB HEADER record
           - idcode and deposition date recommended but optional
           - deposition date in PDB format or as changed by Biopython
        (optional) PDB TITLE record
        repeat:
           Biopython Residue Full ID - sets ID of returned structure
           (optional) PDB ATOM records for chain start N, CA, C
           PIC Hedra records for residue
           PIC Dihedra records for residue

    Lines are dispatched on their prefix: ``#``, ``HEADER``, ``TITLE``,
    ``(`` (residue ID), ``ATOM``, ``BFAC:``; anything else is tried
    against ``Edron.edron_re``.  Lines that fail their pattern are reported
    with ``print``.

    :param Bio.File file: file name or handle
    :returns: Biopython Structure object, Residues with .internal_coord
        attributes but no coordinates except for chain start N, CA, C atoms
        if supplied, or None when an ATOM record precedes any residue ID or
        a non-blank line matches no known record
    :raises Exception: when an ATOM record names a different residue than
        the most recent residue ID line
    """
    pdb_hdr_re = re.compile(
        r'^HEADER\s{4}(?P<cf>.{1,40})'
        r'(?:\s+(?P<dd>\d\d\d\d-\d\d-\d\d|\d\d-\w\w\w-\d\d))?'
        r'(?:\s+(?P<id>[0-9A-Z]{4}))?\s*$',
    )
    # commented-out variant of the residue ID pattern (biop_id_re) below:
    # ^\('(?P<pid>\w*)',\s(?P<mdl>\d+),\s'(?P<chn>\w)',\s\('(?P<het>\s|[\w-]+)',\s(?P<pos>\d+),\s'(?P<icode>\s|\w)'\)\)\s(?P<res>[A-Z]{3})\s(\[(?P<segid>[a-zA-z\s]{4})\])?\s*$
    pdb_ttl_re = re.compile(r'^TITLE\s{5}(?P<ttl>.+)\s*$')
    # residue full-ID line; the optional bracketed segid is group 9
    biop_id_re = re.compile(r"^\('(?P<pid>\w*)',\s(?P<mdl>\d+),\s"
                            r"'(?P<chn>\s|\w)',\s\('(?P<het>\s|[\w\s-]+)"
                            r"',\s(?P<pos>-?\d+),\s'(?P<icode>\s|\w)'\)\)"
                            r'\s+(?P<res>[\w]{1,3})'
                            r'(\s\[(?P<segid>[a-zA-z\s]+)\])?'
                            r'\s*$')
    pdb_atm_re = re.compile(r'^ATOM\s\s(?:\s*(?P<ser>\d+))\s(?P<atm>[\w\s]{4})'
                            r'(?P<alc>\w|\s)(?P<res>[\w]{3})\s(?P<chn>.)'
                            r'(?P<pos>[\s\-\d]{4})(?P<icode>[A-Za-z\s])\s\s\s'
                            r'(?P<x>[\s\-\d\.]{8})(?P<y>[\s\-\d\.]{8})'
                            r'(?P<z>[\s\-\d\.]{8})(?P<occ>[\s\d\.]{6})'
                            r'(?P<tfac>[\s\d\.]{6})\s{6}'
                            r'(?P<segid>[a-zA-z\s]{4})(?P<elm>.{2})'
                            r'(?P<chg>.{2})?\s*$')
    # one mandatory and up to four optional "<key> <number>" pairs per line
    bfac_re = re.compile(r'^BFAC:\s([^\s]+\s+[\-\d\.]+)'
                         r'\s*([^\s]+\s+[\-\d\.]+)?'
                         r'\s*([^\s]+\s+[\-\d\.]+)?'
                         r'\s*([^\s]+\s+[\-\d\.]+)?'
                         r'\s*([^\s]+\s+[\-\d\.]+)?')
    # splits a single pair captured by bfac_re into key and value
    bfac2_re = re.compile(r'([^\s]+)\s+([\-\d\.]+)')
    struct_builder = StructureBuilder()
    # init empty header dict
    # - could use to parse HEADER and TITLE lines except
    # deposition_date format changed from original PDB header
    header_dict = _parse_pdb_header_list([])
    # last initialised value at each level; compared with each residue line
    curr_SMCS = [None, None, None, None]  # struct model chain seg
    # builder initialiser for each level, indexed in step with curr_SMCS
    SMCS_init = [
        struct_builder.init_structure, struct_builder.init_model,
        struct_builder.init_chain, struct_builder.init_seg
    ]
    # residue that ATOM / BFAC / edron lines currently apply to
    sb_res = None
    with as_handle(file, mode='r') as handle:
        for aline in handle.readlines():
            if aline.startswith('#'):
                pass  # skip comment lines
            elif aline.startswith('HEADER '):
                m = pdb_hdr_re.match(aline)
                if m:
                    header_dict['head'] = m.group('cf')  # classification
                    header_dict['idcode'] = m.group('id')
                    header_dict['deposition_date'] = m.group('dd')
                else:
                    print('Reading pic file', file, 'HEADER fail: ', aline)
                    pass
            elif aline.startswith('TITLE '):
                m = pdb_ttl_re.match(aline)
                if m:
                    header_dict['name'] = m.group('ttl').strip()
                    # print('TTL: ', m.group('ttl').strip())
                else:
                    print('Reading pic file', file, 'TITLE fail:, ', aline)
            elif aline.startswith('('):  # Biopython ID line for Residue
                m = biop_id_re.match(aline)
                if m:
                    # check SMCS = Structure, Model, Chain, SegID
                    segid = m.group(9)
                    if segid is None:
                        segid = ' '
                    this_SMCS = [
                        m.group(1), int(m.group(2)), m.group(3), segid
                    ]
                    if curr_SMCS != this_SMCS:
                        # init new SMCS level as needed
                        for i in range(4):
                            if curr_SMCS[i] != this_SMCS[i]:
                                SMCS_init[i](this_SMCS[i])
                                curr_SMCS[i] = this_SMCS[i]
                                if 0 == i:
                                    # 0 = init structure so add header
                                    struct_builder.set_header(header_dict)
                                elif 1 == i:
                                    # new model means new chain and new segid
                                    # (clearing them forces re-init on the
                                    # following loop iterations)
                                    curr_SMCS[2] = curr_SMCS[3] = None
                    struct_builder.init_residue(m.group('res'), m.group('het'),
                                                int(m.group('pos')),
                                                m.group('icode'))
                    sb_res = struct_builder.residue
                    if 2 == sb_res.is_disordered():
                        # pick the first child lacking an internal_coord
                        for r in sb_res.child_dict.values():
                            if not hasattr(r, 'internal_coord'):
                                sb_res = r
                                break
                    sb_res.internal_coord = IC_Residue(sb_res)
                    # print('res id:', m.groupdict())
                    # print(report_PIC(struct_builder.get_structure()))
                else:
                    # reported only; parsing continues with the next line
                    print('Reading pic file', file, 'residue fail: ', aline)
            elif aline.startswith('ATOM '):
                m = pdb_atm_re.match(aline)
                if m:
                    if sb_res is None:
                        # ATOM without res spec already loaded, not a pic file
                        print('no sb_res - not pic file', aline)
                        return None
                    if (sb_res.resname != m.group('res')
                            or sb_res.id[1] != int(m.group('pos'))):
                        # TODO: better exception here?
                        # NOTE(review): the other failure paths return None
                        # or just print; this is the only one that raises
                        raise Exception(
                            'pic ATOM read confusion: %s %s %s' %
                            (sb_res.resname, str(sb_res.id), aline))
                    coord = numpy.array(
                        (float(m.group('x')), float(
                            m.group('y')), float(m.group('z'))), "f")
                    struct_builder.init_atom(
                        m.group('atm').strip(), coord, float(m.group('tfac')),
                        float(m.group('occ')), m.group('alc'), m.group('atm'),
                        int(m.group('ser')), m.group('elm').strip())
                    # print('atom: ', m.groupdict())
                else:
                    print('Reading pic file', file, 'ATOM fail: ', aline)
            elif aline.startswith('BFAC: '):
                m = bfac_re.match(aline)
                if m:
                    for bfac_pair in m.groups():
                        if bfac_pair is not None:
                            m2 = bfac2_re.match(bfac_pair)
                            # pairs seen before any residue are dropped
                            if (m2 and sb_res is not None
                                    and hasattr(sb_res, 'internal_coord')):
                                rp = sb_res.internal_coord
                                rp.bfactors[m2.group(1)] = float(m2.group(2))
            else:
                # anything else should be a hedron / dihedron record
                m = Edron.edron_re.match(aline)
                if m:
                    # NOTE(review): sb_res is still None if no residue ID
                    # line has been read yet, which would raise
                    # AttributeError here - confirm whether a guard is needed
                    sb_res.internal_coord.load_PIC(m.groupdict())
                elif aline.strip():
                    # non-blank line that matched nothing
                    print('Reading PIC file', file, 'parse fail on: .', aline,
                          '.')
                    return None

    struct = struct_builder.get_structure()
    # attach an IC_Chain to every chain and prepare its residues
    for chn in struct.get_chains():
        chnp = chn.internal_coord = IC_Chain(chn)
        # done in IC_Chain init : chnp.set_residues()
        chnp.link_residues()
        chnp.render_dihedra()

    # print(report_PIC(struct_builder.get_structure()))
    return struct
def read_PIC(
    file: TextIO,
    verbose: bool = False,
    quick: bool = False,
    defaults: bool = False,
) -> Structure:
    """Load Protein Internal Coordinate (.pic) data from file.

    PIC file format:
        - comment lines start with #
        - (optional) PDB HEADER record
           - idcode and deposition date recommended but optional
           - deposition date in PDB format or as changed by Biopython
        - (optional) PDB TITLE record
        - repeat:
           - Biopython Residue Full ID - sets residue IDs of returned structure
           - (optional) PDB N, CA, C ATOM records for chain start
           - (optional) PIC Hedra records for residue
           - (optional) PIC Dihedra records for residue
           - (optional) BFAC records listing AtomKeys and b-factors

    An improvement would define relative positions for HOH (water) entries.

    Defaults will be supplied for any value if defaults=True.  Default values
    are supplied in ic_data.py, but structures degrade quickly with any
    deviation from true coordinates.  Experiment with
    :data:`Bio.PDB.internal_coords.IC_Residue.pic_flags` options to
    :func:`write_PIC` to verify this.

    N.B. dihedron (i-1)C-N-CA-CB is ignored in assembly if O exists.

    C-beta is by default placed using O-C-CA-CB, but O is missing
    in some PDB file residues, which means the sidechain cannot be
    placed.  The alternate CB path (i-1)C-N-CA-CB is provided to
    circumvent this, but if this is needed then it must be adjusted in
    conjunction with PHI ((i-1)C-N-CA-C) as they overlap (see :meth:`.bond_set`
    and :meth:`.bond_rotate` to handle this automatically).

    Hedra / dihedra values and b-factors are accumulated per chain in local
    dictionaries and handed to the chain's ``_hedraDict2chain`` when the
    chain changes or the input ends.

    :param Bio.File file: :func:`.as_handle` file name or handle
    :param bool verbose: complain when lines not as expected
    :param bool quick: don't check residues for all dihedra (no default values)
    :param bool defaults: create di/hedra as needed from reference database.
        Amide proton created if 'H' is in IC_Residue.accept_atoms
    :returns: Biopython Structure object, Residues with .internal_coord
        attributes but no coordinates except for chain start N, CA, C atoms if
        supplied, **OR** None on parse fail (silent unless verbose=True)
    """
    # whether amide proton dihedra are expected (used by dihedra_check)
    proton = "H" in IC_Residue.accept_atoms

    pdb_hdr_re = re.compile(
        r"^HEADER\s{4}(?P<cf>.{1,40})"
        r"(?:\s+(?P<dd>\d\d\d\d-\d\d-\d\d|\d\d-\w\w\w-\d\d))?"
        r"(?:\s+(?P<id>[0-9A-Z]{4}))?\s*$")
    pdb_ttl_re = re.compile(r"^TITLE\s{5}(?P<ttl>.+)\s*$")
    # residue full-ID line; the optional bracketed segid is group 9
    biop_id_re = re.compile(r"^\('(?P<pid>[^\s]*)',\s(?P<mdl>\d+),\s"
                            r"'(?P<chn>\s|\w)',\s\('(?P<het>\s|[\w\s-]+)"
                            r"',\s(?P<pos>-?\d+),\s'(?P<icode>\s|\w)'\)\)"
                            r"\s+(?P<res>[\w]{1,3})"
                            r"(\s\[(?P<segid>[a-zA-z\s]+)\])?"
                            r"\s*$")
    pdb_atm_re = re.compile(r"^ATOM\s\s(?:\s*(?P<ser>\d+))\s(?P<atm>[\w\s]{4})"
                            r"(?P<alc>\w|\s)(?P<res>[\w]{3})\s(?P<chn>.)"
                            r"(?P<pos>[\s\-\d]{4})(?P<icode>[A-Za-z\s])\s\s\s"
                            r"(?P<x>[\s\-\d\.]{8})(?P<y>[\s\-\d\.]{8})"
                            r"(?P<z>[\s\-\d\.]{8})(?P<occ>[\s\d\.]{6})"
                            r"(?P<tfac>[\s\d\.]{6})\s{6}"
                            r"(?P<segid>[a-zA-z\s]{4})(?P<elm>.{2})"
                            r"(?P<chg>.{2})?\s*$")
    # one mandatory and up to four optional "<key> <number>" pairs per line
    bfac_re = re.compile(r"^BFAC:\s([^\s]+\s+[\-\d\.]+)"
                         r"\s*([^\s]+\s+[\-\d\.]+)?"
                         r"\s*([^\s]+\s+[\-\d\.]+)?"
                         r"\s*([^\s]+\s+[\-\d\.]+)?"
                         r"\s*([^\s]+\s+[\-\d\.]+)?")
    # splits a single pair captured by bfac_re into key and value
    bfac2_re = re.compile(r"([^\s]+)\s+([\-\d\.]+)")
    struct_builder = StructureBuilder()
    # init empty header dict
    # - could use to parse HEADER and TITLE lines except
    # deposition_date format changed from original PDB header
    header_dict = _parse_pdb_header_list([])
    # last initialised value at each level; compared with each residue line
    curr_SMCS = [None, None, None, None]  # struct model chain seg
    # builder initialiser for each level, indexed in step with curr_SMCS
    SMCS_init = [
        struct_builder.init_structure,
        struct_builder.init_model,
        struct_builder.init_chain,
        struct_builder.init_seg,
    ]

    # Parser state shared with the nested helpers below.  The helpers read
    # these names from this enclosing scope, so rebinding them in the main
    # loop (e.g. on chain change) is visible to the helpers.
    sb_res = None  # residue most recently created by the builder
    rkl = None  # (position, insertion code, residue code) of sb_res
    sb_chain = None  # chain most recently created by the builder
    sbcic = None  # IC_Chain of sb_chain
    sbric = None  # IC_Residue of sb_res

    akc = {}  # AtomKey string -> AtomKey, see akcache()
    hl12 = {}  # hedron key -> first length
    ha = {}  # hedron key -> angle
    hl23 = {}  # hedron key -> second length
    da = {}  # dihedron key -> angle
    bfacs = {}  # key string from BFAC line -> b-factor

    # AtomKeys seen in an edron whose residue is not the current one
    orphan_aks = set()  # []

    tr = []  # this residue
    pr = []  # previous residue

    def akcache(akstr: str) -> AtomKey:
        """Maintain dictionary of AtomKeys seen while reading this PIC file."""
        # akstr: full AtomKey string read from .pic file, includes residue info
        try:
            return akc[akstr]
        except (KeyError):
            ak = akc[akstr] = AtomKey(akstr)
            return ak

    def link_residues(ppr: List[Residue], pr: List[Residue]) -> None:
        """Set next and prev links between i-1 and i-2 residues."""
        # every pairing of an entry in ppr with an entry in pr is linked;
        # the id[0] == " " tests exclude hetero residues from the links
        for p_r in pr:
            pric = p_r.internal_coord
            for p_p_r in ppr:
                ppric = p_p_r.internal_coord
                if p_r.id[0] == " ":  # not heteroatoms
                    if pric not in ppric.rnext:
                        ppric.rnext.append(pric)
                if p_p_r.id[0] == " ":
                    if ppric not in pric.rprev:
                        pric.rprev.append(ppric)

    def process_hedron(
        a1: str,
        a2: str,
        a3: str,
        l12: str,
        ang: str,
        l23: str,
        ric: IC_Residue,
    ) -> Tuple:
        """Create Hedron on current (sbcic) Chain.internal_coord.

        Returns the AtomKey triple used as the hedron key, or None when
        any of the three atoms is not in IC_Residue.accept_atoms.
        """
        ek = (akcache(a1), akcache(a2), akcache(a3))
        atmNdx = AtomKey.fields.atm
        accpt = IC_Residue.accept_atoms
        if not all(ek[i].akl[atmNdx] in accpt for i in range(3)):
            return
        hl12[ek] = float(l12)
        ha[ek] = float(ang)
        hl23[ek] = float(l23)
        # the same Hedron object is registered on the chain and the residue
        sbcic.hedra[ek] = ric.hedra[ek] = h = Hedron(ek)
        h.cic = sbcic
        ak_add(ek, ric)
        return ek

    def default_hedron(ek: Tuple, ric: IC_Residue) -> None:
        """Create Hedron based on same rdh_class hedra in ref database.

        Adds Hedron to current Chain.internal_coord, see ic_data for default
        values and reference database source.
        """
        aks = []
        hkey = None

        atmNdx = AtomKey.fields.atm
        resNdx = AtomKey.fields.resname
        resPos = AtomKey.fields.respos
        aks = [ek[i].akl for i in range(3)]

        atpl = tuple([aks[i][atmNdx] for i in range(3)])
        res = aks[0][resNdx]
        if (aks[0][resPos] != aks[2][resPos]  # hedra crosses amide bond so not reversed
                or atpl == ("N", "CA", "C")  # or chain start tau
                or atpl in ic_data_backbone  # or found forward hedron in ic_data
                or (res not in ["A", "G"] and atpl in ic_data_sidechains[res])):
            hkey = ek
            # lookup key is residue code + atom name for each of the 3 atoms
            rhcl = [aks[i][resNdx] + aks[i][atmNdx] for i in range(3)]
            try:
                dflts = hedra_defaults["".join(rhcl)][0]
            except KeyError:
                # retry with "X" in place of the residue code of the atom(s)
                # at the other residue position
                if aks[0][resPos] == aks[1][resPos]:
                    rhcl = [aks[i][resNdx] + aks[i][atmNdx] for i in range(2)]
                    rhc = "".join(rhcl) + "X" + aks[2][atmNdx]
                else:
                    rhcl = [
                        aks[i][resNdx] + aks[i][atmNdx] for i in range(1, 3)
                    ]
                    rhc = "X" + aks[0][atmNdx] + "".join(rhcl)
                dflts = hedra_defaults[rhc][0]
        else:
            # must be reversed or fail
            hkey = ek[::-1]
            rhcl = [aks[i][resNdx] + aks[i][atmNdx] for i in range(2, -1, -1)]
            dflts = hedra_defaults["".join(rhcl)][0]

        process_hedron(
            str(hkey[0]),
            str(hkey[1]),
            str(hkey[2]),
            dflts[0],
            dflts[1],
            dflts[2],
            ric,
        )

        if verbose:
            print(f" default for {ek}")

    def hedra_check(dk: str, ric: IC_Residue) -> None:
        """Confirm both hedra present for dihedron key, use default if set."""
        # first hedron: atoms 0-2 of the dihedron key, in either direction
        if dk[0:3] not in sbcic.hedra and dk[2::-1] not in sbcic.hedra:
            if defaults:
                default_hedron(dk[0:3], ric)
            else:
                print(f"{dk} missing h1")
        # second hedron: atoms 1-3 of the dihedron key, in either direction
        if dk[1:4] not in sbcic.hedra and dk[3:0:-1] not in sbcic.hedra:
            if defaults:
                default_hedron(dk[1:4], ric)
            else:
                print(f"{dk} missing h2")

    def process_dihedron(a1: str, a2: str, a3: str, a4: str, dangle: str,
                         ric: IC_Residue) -> Set:
        """Create Dihedron on current Chain.internal_coord.

        Returns the AtomKey 4-tuple used as the dihedron key, or None when
        any of the four atoms is not in IC_Residue.accept_atoms.
        """
        ek = (
            akcache(a1),
            akcache(a2),
            akcache(a3),
            akcache(a4),
        )
        atmNdx = AtomKey.fields.atm
        accpt = IC_Residue.accept_atoms
        if not all(ek[i].akl[atmNdx] in accpt for i in range(4)):
            return
        da[ek] = float(dangle)
        # the same Dihedron object is registered on the chain and the residue
        sbcic.dihedra[ek] = ric.dihedra[ek] = d = Dihedron(ek)
        d.cic = sbcic
        if not quick:
            hedra_check(ek, ric)
        ak_add(ek, ric)
        return ek

    def default_dihedron(ek: List, ric: IC_Residue) -> None:
        """Create Dihedron based on same residue class dihedra in ref database.

        Adds Dihedron to current Chain.internal_coord, see ic_data for default
        values and reference database source.
        """
        atmNdx = AtomKey.fields.atm
        resNdx = AtomKey.fields.resname
        resPos = AtomKey.fields.respos

        # dclass: atom names concatenated; rdclass: residue code + atom name
        rdclass = ""
        dclass = ""
        for ak in ek:
            dclass += ak.akl[atmNdx]
            rdclass += ak.akl[resNdx] + ak.akl[atmNdx]
        # backbone dihedra spanning two residues: replace the neighbouring
        # residue's code with "X" in the lookup key
        if dclass == "NCACN":
            rdclass = rdclass[0:7] + "XN"
        elif dclass == "CACNCA":
            rdclass = "XCAXC" + rdclass[5:]
        elif dclass == "CNCAC":
            rdclass = "XC" + rdclass[2:]
        if rdclass in dihedra_primary_defaults:
            process_dihedron(
                str(ek[0]),
                str(ek[1]),
                str(ek[2]),
                str(ek[3]),
                dihedra_primary_defaults[rdclass][0],
                ric,
            )

            if verbose:
                print(f" default for {ek}")

        elif rdclass in dihedra_secondary_defaults:
            # secondary angle = value of its primary angle + stored offset;
            # paKey is the dihedron key of that primary angle
            primAngle, offset = dihedra_secondary_defaults[rdclass]
            rname = ek[2].akl[resNdx]
            rnum = int(ek[2].akl[resPos])
            paKey = None
            if primAngle == ("N", "CA", "C", "N") and ek[0].ric.rnext != []:
                paKey = [
                    AtomKey((rnum, None, rname, primAngle[x], None, None))
                    for x in range(3)
                ]
                rnext = ek[0].ric.rnext
                paKey.append(
                    AtomKey((
                        rnext[0].rbase[0],
                        None,
                        rnext[0].rbase[2],
                        "N",
                        None,
                        None,
                    )))
                paKey = tuple(paKey)
            elif primAngle == ("CA", "C", "N", "CA"):
                # NOTE(review): `pr` here is the enclosing list of previous
                # residues, which has no `.akl` attribute; AtomKey is also
                # called with positional fields rather than a tuple as in
                # the other branches, and `paKey` is a list, which has no
                # `.add` method.  This branch looks like it would raise if
                # reached - confirm.
                prname = pr.akl[0][resNdx]
                prnum = pr.akl[0][resPos]
                paKey = [
                    AtomKey(prnum, None, prname, primAngle[x], None, None)
                    for x in range(0, 2)
                ]
                paKey.add([
                    AtomKey((rnum, None, rname, primAngle[x], None, None))
                    for x in range(2, 4)
                ])
                paKey = tuple(paKey)
            else:
                paKey = tuple(
                    AtomKey((rnum, None, rname, atm, None, None))
                    for atm in primAngle)

            if paKey in da:
                process_dihedron(
                    str(ek[0]),
                    str(ek[1]),
                    str(ek[2]),
                    str(ek[3]),
                    da[paKey] + dihedra_secondary_defaults[rdclass][1],
                    ric,
                )

                if verbose:
                    print(f" secondary default for {ek}")

            elif rdclass in dihedra_secondary_xoxt_defaults:
                if primAngle == ("C", "N", "CA", "C"):  # primary for alt cb
                    # no way to trigger alt cb with default=True
                    # because will generate default N-CA-C-O
                    # NOTE(review): same `pr.akl` / positional AtomKey /
                    # `paKey.add` concerns as in the branch above - confirm.
                    prname = pr.akl[0][resNdx]
                    prnum = pr.akl[0][resPos]
                    paKey = [
                        AtomKey(prnum, None, prname, primAngle[0], None, None)
                    ]
                    paKey.add([
                        AtomKey((rnum, None, rname, primAngle[x], None, None))
                        for x in range(1, 4)
                    ])
                    paKey = tuple(paKey)
                else:
                    primAngle, offset = dihedra_secondary_xoxt_defaults[
                        rdclass]
                    rname = ek[2].akl[resNdx]
                    rnum = int(ek[2].akl[resPos])
                    paKey = tuple(
                        AtomKey((rnum, None, rname, atm, None, None))
                        for atm in primAngle)

                if paKey in da:
                    process_dihedron(
                        str(ek[0]),
                        str(ek[1]),
                        str(ek[2]),
                        str(ek[3]),
                        da[paKey] + offset,
                        ric,
                    )

                    if verbose:
                        print(f" oxt default for {ek}")

            else:
                print(f"missing primary angle {paKey} {primAngle} to "
                      f"generate {rnum}{rname} {rdclass}")
        else:
            print(
                f"missing {ek} -> {rdclass} ({dclass}) not found in primary or"
                " secondary defaults")

    def dihedra_check(ric: IC_Residue) -> None:
        """Look for required dihedra in residue, generate defaults if set."""

        # rnext should be set
        def ake_recurse(akList: List) -> List:
            """Build combinatorics of AtomKey lists."""
            # akList is a list of lists of alternatives; the result is every
            # way of choosing one entry from each inner list, in order
            car = akList[0]
            if len(akList) > 1:
                retList = []
                for ak in car:
                    cdr = akList[1:]
                    rslt = ake_recurse(cdr)
                    for r in rslt:
                        r.insert(0, ak)
                        retList.append(r)
                return retList
            else:
                if len(car) == 1:
                    return [list(car)]
                else:
                    retList = [[ak] for ak in car]
                    return retList

        def ak_expand(eLst: List) -> List:
            """Expand AtomKey list with altlocs, all combinatorics."""
            retList = []
            for edron in eLst:
                newList = []
                for ak in edron:
                    rslt = ak.ric.split_akl([ak])
                    rlst = [r[0] for r in rslt]
                    if rlst != []:
                        newList.append(rlst)
                    else:
                        # nothing returned by split_akl: keep the key as is
                        newList.append([ak])
                rslt = ake_recurse(newList)
                for r in rslt:
                    retList.append(r)
            return retList

        # dihedra_check processing starts here
        # generate the list of dihedra this residue should have
        chkLst = []
        sN, sCA, sC = AtomKey(ric, "N"), AtomKey(ric, "CA"), AtomKey(ric, "C")
        sO, sCB, sH = AtomKey(ric, "O"), AtomKey(ric, "CB"), AtomKey(ric, "H")
        if ric.rnext != []:
            for rn in ric.rnext:
                nN, nCA, nC = (
                    AtomKey(rn, "N"),
                    AtomKey(rn, "CA"),
                    AtomKey(rn, "C"),
                )
                # intermediate residue, need psi, phi, omg
                chkLst.append((sN, sCA, sC, nN))  # psi
                chkLst.append((sCA, sC, nN, nCA))  # omg i+1
                chkLst.append((sC, nN, nCA, nC))  # phi i+1
        else:
            chkLst.append((sN, sCA, sC, AtomKey(ric, "OXT")))  # psi
            # rn is only used in the verbose "missing" message below
            rn = "(no rnext)"

        chkLst.append((sN, sCA, sC, sO))  # locate backbone O
        if ric.lc != "G":
            chkLst.append((sO, sC, sCA, sCB))  # locate CB
        if ric.rprev != [] and ric.lc != "P" and proton:
            chkLst.append((sC, sCA, sN, sH))  # amide proton

        try:
            for edron in ic_data_sidechains[ric.lc]:
                if len(edron) > 3:  # dihedra only
                    # skip entries where any element starts with "H"
                    if all(not atm[0] == "H" for atm in edron):
                        akl = [AtomKey(ric, atm) for atm in edron[0:4]]
                        chkLst.append(akl)
        except KeyError:
            # residue code has no sidechain table entry
            pass

        # now compare generated list to ric.dihedra, get defaults if set.
        chkLst = ak_expand(chkLst)
        altloc_ndx = AtomKey.fields.altloc

        for dk in chkLst:
            if tuple(dk) in ric.dihedra:
                pass
            elif sH in dk:
                pass  # ignore missing hydrogens
            elif all(atm.akl[altloc_ndx] is None for atm in dk):
                if defaults:
                    default_dihedron(dk, ric)
                else:
                    if verbose:
                        print(f"{ric}-{rn} missing {dk}")
            else:
                # print(f"skip {ek}")
                pass  # ignore missing combinatoric of altloc atoms
                # need more here?

    def ak_add(ek: set, ric: IC_Residue) -> None:
        """Allocate edron key AtomKeys to current residue as appropriate.

        A hedron or dihedron may span a backbone amide bond, this routine
        allocates atoms in the (h/di)edron to the ric residue or saves them
        for a residue yet to be processed.

        :param set ek: AtomKeys in edron
        :param IC_Residue ric: current residue to assign AtomKeys to
        """
        res = ric.residue
        # (position, insertion code or None, residue code), compared with
        # the first three fields of each AtomKey
        reskl = (
            str(res.id[1]),
            (None if res.id[2] == " " else res.id[2]),
            ric.lc,
        )
        for ak in ek:
            if ak.ric is None:
                sbcic.akset.add(ak)
                if ak.akl[0:3] == reskl:
                    ak.ric = ric
                    ric.ak_set.add(ak)
                else:
                    # belongs to another residue; resolved when that
                    # residue's ID line is read
                    orphan_aks.add(ak)

    def finish_chain() -> None:
        """Do last rnext, rprev links and process chain edra data."""
        link_residues(pr, tr)
        # check/confirm completeness
        if not quick:
            for r in pr:
                dihedra_check(r.internal_coord)
            for r in tr:
                dihedra_check(r.internal_coord)

        if ha != {}:
            # hedra dictionaries are passed in sorted key order
            sha = {k: ha[k] for k in sorted(ha)}
            shl12 = {k: hl12[k] for k in sorted(hl12)}
            shl23 = {k: hl23[k] for k in sorted(hl23)}
            sbcic._hedraDict2chain(shl12, sha, shl23, da, bfacs)

    # read_PIC processing starts here:
    with as_handle(file, mode="r") as handle:
        for line in handle.readlines():
            if line.startswith("#"):
                pass  # skip comment lines
            elif line.startswith("HEADER "):
                m = pdb_hdr_re.match(line)
                if m:
                    header_dict["head"] = m.group("cf")  # classification
                    header_dict["idcode"] = m.group("id")
                    header_dict["deposition_date"] = m.group("dd")
                elif verbose:
                    print("Reading pic file", file, "HEADER parse fail: ", line)
            elif line.startswith("TITLE "):
                m = pdb_ttl_re.match(line)
                if m:
                    header_dict["name"] = m.group("ttl").strip()
                    # print('TTL: ', m.group('ttl').strip())
                elif verbose:
                    print("Reading pic file", file, "TITLE parse fail:, ", line)
            elif line.startswith("("):  # Biopython ID line for Residue
                m = biop_id_re.match(line)
                if m:
                    # check SMCS = Structure, Model, Chain, SegID
                    segid = m.group(9)
                    if segid is None:
                        segid = " "
                    this_SMCS = [
                        m.group(1),
                        int(m.group(2)),
                        m.group(3),
                        segid,
                    ]
                    if curr_SMCS != this_SMCS:
                        if curr_SMCS[:3] != this_SMCS[:3] and ha != {}:
                            # chain change so process current chain data
                            finish_chain()

                            akc = {}  # atomkey cache, used by akcache()
                            hl12 = {}  # hedra key -> len12
                            ha = {}  # -> hedra angle
                            hl23 = {}  # -> len23
                            da = {}  # dihedra key -> angle value
                            bfacs = {}  # atomkey string -> b-factor
                        # init new Biopython SMCS level as needed
                        for i in range(4):
                            if curr_SMCS[i] != this_SMCS[i]:
                                SMCS_init[i](this_SMCS[i])
                                curr_SMCS[i] = this_SMCS[i]
                                if i == 0:
                                    # 0 = init structure so add header
                                    struct_builder.set_header(header_dict)
                                elif i == 1:
                                    # new model means new chain and new segid
                                    curr_SMCS[2] = curr_SMCS[3] = None
                                elif i == 2:
                                    # new chain so init internal_coord
                                    sb_chain = struct_builder.chain
                                    sbcic = sb_chain.internal_coord = IC_Chain(
                                        sb_chain)

                    struct_builder.init_residue(
                        m.group("res"),
                        m.group("het"),
                        int(m.group("pos")),
                        m.group("icode"),
                    )

                    sb_res = struct_builder.residue
                    if sb_res.id[0] != " ":  # skip hetatm
                        continue
                    if 2 == sb_res.is_disordered():
                        # pick the first child without an internal_coord yet
                        for r in sb_res.child_dict.values():
                            if not r.internal_coord:
                                sb_res = r
                                break
                        # added to disordered res
                        tr.append(sb_res)
                    else:
                        # new res so fix up previous residue as feasible
                        link_residues(pr, tr)

                        if not quick:
                            for r in pr:
                                # create di/hedra if default for residue i-1
                                # just linked
                                dihedra_check(r.internal_coord)

                        # shift the window: current becomes previous
                        pr = tr
                        tr = [sb_res]

                    sbric = sb_res.internal_coord = IC_Residue(
                        sb_res)  # no atoms so no rak
                    sbric.cic = sbcic
                    rkl = (
                        str(sb_res.id[1]),
                        (None if sb_res.id[2] == " " else sb_res.id[2]),
                        sbric.lc,
                    )
                    sbcic.ordered_aa_ic_list.append(sbric)

                    # update AtomKeys w/o IC_Residue references, in case
                    # chain ends before di/hedra sees them (2XHE test case)
                    for ak in orphan_aks:
                        if ak.akl[0:3] == rkl:
                            ak.ric = sbric
                            sbric.ak_set.add(ak)
                            # may need altoc support here
                    # keep only the keys that are still unassigned
                    orphan_aks = set(
                        filter(lambda ak: ak.ric is None, orphan_aks))

                else:
                    if verbose:
                        print(
                            "Reading pic file",
                            file,
                            "residue ID parse fail: ",
                            line,
                        )
                    return None
            elif line.startswith("ATOM "):
                m = pdb_atm_re.match(line)
                if m:
                    if sb_res is None:
                        # ATOM without res spec already loaded, not a pic file
                        if verbose:
                            print(
                                "Reading pic file",
                                file,
                                "ATOM without residue configured:, ",
                                line,
                            )
                        return None
                    # ATOM must name the residue set by the last ID line
                    if sb_res.resname != m.group("res") or sb_res.id[1] != int(
                            m.group("pos")):
                        if verbose:
                            print(
                                "Reading pic file",
                                file,
                                "ATOM not in configured residue (",
                                sb_res.resname,
                                str(sb_res.id),
                                "):",
                                line,
                            )
                        return None
                    coord = numpy.array(
                        (
                            float(m.group("x")),
                            float(m.group("y")),
                            float(m.group("z")),
                        ),
                        "f",
                    )
                    struct_builder.init_atom(
                        m.group("atm").strip(),
                        coord,
                        float(m.group("tfac")),
                        float(m.group("occ")),
                        m.group("alc"),
                        m.group("atm"),
                        int(m.group("ser")),
                        m.group("elm").strip(),
                    )

                    # reset because prev does not link to this residue
                    # (chainBreak)
                    pr = []

            elif line.startswith("BFAC: "):
                m = bfac_re.match(line)
                if m:
                    for bfac_pair in m.groups():
                        if bfac_pair is not None:
                            m2 = bfac2_re.match(bfac_pair)
                            bfacs[m2.group(1)] = float(m2.group(2))
                # else:
                #     print f"Reading pic file {file} B-factor fail: {line}"
            else:
                # anything else should be a hedron / dihedron record
                m = Edron.edron_re.match(line)
                if m and sb_res is not None:
                    # a record without a fourth atom is a hedron
                    if m["a4"] is None:
                        process_hedron(
                            m["a1"],
                            m["a2"],
                            m["a3"],
                            m["len12"],
                            m["angle"],
                            m["len23"],
                            sb_res.internal_coord,
                        )
                    else:
                        process_dihedron(
                            m["a1"],
                            m["a2"],
                            m["a3"],
                            m["a4"],
                            float(m["dihedral"]),
                            sb_res.internal_coord,
                        )

                elif m:
                    # NOTE(review): this message is printed even when
                    # verbose is False, unlike the other failure paths
                    print(
                        "PIC file: ",
                        file,
                        " error: no residue info before reading (di/h)edron: ",
                        line,
                    )
                    return None
                elif line.strip():
                    # non-blank line that matched nothing
                    if verbose:
                        print(
                            "Reading PIC file",
                            file,
                            "parse fail on: .",
                            line,
                            ".",
                        )
                    return None

    # reached end of input
    finish_chain()

    # print(report_PIC(struct_builder.get_structure()))
    return struct_builder.get_structure()