def _build_structure(self, structure_id, filehandle):
    """Build a structure from the atom records of an mmCIF text stream.

    Only the ``_atom_site.`` and ``_atom_site_anisotrop.`` categories are
    read. Every data row is split on whitespace (no quote handling) and the
    resulting columns drive the calls made on ``self._structure_builder``.

    Arguments:
     - structure_id - identifier handed to ``init_structure``
     - filehandle - iterable of text lines in mmCIF format

    Raises ``PDBConstructionException`` for a non-integer model number or a
    non-numeric B factor / occupancy, ``KeyError`` when a required
    ``_atom_site`` column is absent and ``ValueError`` for non-numeric
    coordinates or sequence numbers.
    """
    # two special chars as placeholders in the mmCIF format
    # for item values that cannot be explicitly assigned
    # see: pdbx/mmcif syntax web page
    _unassigned = {".", "?"}

    # Read only _atom_site. and atom_site_anisotrop entries
    read_atom, read_aniso = False, False
    _fields, _records = [], []
    _anisof, _anisors = [], []
    for line in filehandle:
        stripped = line.strip()
        if not stripped:
            # A blank row would split into zero tokens and make the
            # zip(*rows) transposition below yield no columns at all.
            continue
        if line.startswith("_atom_site."):
            read_atom = True
            _fields.append(stripped)
        elif line.startswith("_atom_site_anisotrop."):
            read_aniso = True
            _anisof.append(stripped)
        elif read_atom and line.startswith("#"):
            read_atom = False
        elif read_aniso and line.startswith("#"):
            read_aniso = False
        elif read_atom:
            _records.append(stripped)
        elif read_aniso:
            _anisors.append(stripped)

    # Dumping the shlex module here since this particular
    # category should be rather straightforward.
    # Quite a performance boost..
    _record_tbl = zip(*map(str.split, _records))
    _anisob_tbl = zip(*map(str.split, _anisors))

    # Map each field name to the tuple of values found in its column.
    mmcif_dict = dict(zip(_fields, _record_tbl))
    mmcif_dict.update(dict(zip(_anisof, _anisob_tbl)))

    # Build structure object
    atom_id_list = mmcif_dict["_atom_site.label_atom_id"]
    residue_id_list = mmcif_dict["_atom_site.label_comp_id"]
    try:
        element_list = mmcif_dict["_atom_site.type_symbol"]
    except KeyError:
        element_list = None
    chain_id_list = mmcif_dict["_atom_site.auth_asym_id"]
    x_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_x"]]
    y_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_y"]]
    z_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_z"]]
    alt_list = mmcif_dict["_atom_site.label_alt_id"]
    icode_list = mmcif_dict["_atom_site.pdbx_PDB_ins_code"]
    b_factor_list = mmcif_dict["_atom_site.B_iso_or_equiv"]
    occupancy_list = mmcif_dict["_atom_site.occupancy"]
    fieldname_list = mmcif_dict["_atom_site.group_PDB"]
    try:
        serial_list = [int(n) for n in mmcif_dict["_atom_site.pdbx_PDB_model_num"]]
    except KeyError:
        # No model number column
        serial_list = None
    except ValueError:
        # Invalid model number (malformed file)
        raise PDBConstructionException("Invalid model number")
    try:
        aniso_u11 = mmcif_dict["_atom_site_anisotrop.U[1][1]"]
        aniso_u12 = mmcif_dict["_atom_site_anisotrop.U[1][2]"]
        aniso_u13 = mmcif_dict["_atom_site_anisotrop.U[1][3]"]
        aniso_u22 = mmcif_dict["_atom_site_anisotrop.U[2][2]"]
        aniso_u23 = mmcif_dict["_atom_site_anisotrop.U[2][3]"]
        aniso_u33 = mmcif_dict["_atom_site_anisotrop.U[3][3]"]
        aniso_flag = 1
    except KeyError:
        # no anisotropic B factors
        aniso_flag = 0

    # if auth_seq_id is present, we use this.
    # Otherwise label_seq_id is used.
    if "_atom_site.auth_seq_id" in mmcif_dict:
        seq_id_list = mmcif_dict["_atom_site.auth_seq_id"]
    else:
        seq_id_list = mmcif_dict["_atom_site.label_seq_id"]

    # Now loop over atoms and build the structure
    current_chain_id = None
    current_residue_id = None
    current_resname = None
    structure_builder = self._structure_builder
    structure_builder.init_structure(structure_id)
    structure_builder.init_seg(" ")

    # Historically, Biopython PDB parser uses model_id to mean array index
    # so serial_id means the Model ID specified in the file
    current_model_id = -1
    current_serial_id = -1
    for i in range(len(atom_id_list)):
        # set the line_counter for 'ATOM' lines only and not
        # as a global line counter found in the PDBParser()
        # this number should match the '_atom_site.id' index in the MMCIF
        structure_builder.set_line_counter(i)

        x = x_list[i]
        y = y_list[i]
        z = z_list[i]
        resname = residue_id_list[i]
        chainid = chain_id_list[i]
        altloc = alt_list[i]
        if altloc in _unassigned:
            altloc = " "
        int_resseq = int(seq_id_list[i])
        icode = icode_list[i]
        if icode in _unassigned:
            icode = " "
        # Remove occasional " from quoted atom names (e.g. xNA)
        name = atom_id_list[i].strip('"')

        # occupancy & B factor
        try:
            tempfactor = float(b_factor_list[i])
        except ValueError:
            raise PDBConstructionException("Invalid or missing B factor")
        try:
            occupancy = float(occupancy_list[i])
        except ValueError:
            raise PDBConstructionException("Invalid or missing occupancy")

        fieldname = fieldname_list[i]
        if fieldname == "HETATM":
            hetatm_flag = "H"
        else:
            hetatm_flag = " "

        resseq = (hetatm_flag, int_resseq, icode)

        if serial_list is not None:
            # model column exists; use it
            serial_id = serial_list[i]
            if current_serial_id != serial_id:
                # if serial changes, update it and start new model
                current_serial_id = serial_id
                current_model_id += 1
                structure_builder.init_model(current_model_id, current_serial_id)
                current_chain_id = None
                current_residue_id = None
                current_resname = None
        elif current_model_id < 0:
            # no explicit model column; open the single model exactly once,
            # using index 0 (the previous code re-initialised a model with
            # id -1 for every atom).
            current_model_id = 0
            structure_builder.init_model(current_model_id)

        if current_chain_id != chainid:
            current_chain_id = chainid
            structure_builder.init_chain(current_chain_id)
            current_residue_id = None
            current_resname = None

        if current_residue_id != resseq or current_resname != resname:
            current_residue_id = resseq
            current_resname = resname
            structure_builder.init_residue(resname, hetatm_flag, int_resseq, icode)

        coord = numpy.array((x, y, z), "f")
        element = element_list[i] if element_list else None
        structure_builder.init_atom(
            name, coord, tempfactor, occupancy, altloc, name, element=element
        )
        # Anisotropic rows are matched to atoms purely by position.
        if aniso_flag == 1 and i < len(aniso_u11):
            u = (
                aniso_u11[i],
                aniso_u12[i],
                aniso_u13[i],
                aniso_u22[i],
                aniso_u23[i],
                aniso_u33[i],
            )
            mapped_anisou = [float(_) for _ in u]
            anisou_array = numpy.array(mapped_anisou, "f")
            structure_builder.set_anisou(anisou_array)
def _build_structure(self, structure_id):
    """Build a structure from the already parsed ``self._mmcif_dict``.

    The ``_atom_site`` columns of the dictionary are walked row by row and
    translated into ``init_model`` / ``init_chain`` / ``init_residue`` /
    ``init_atom`` calls on ``self._structure_builder``. Afterwards the unit
    cell and space group are forwarded with ``set_symmetry`` when all of
    the ``_cell`` and ``_symmetry`` items can be read.

    Arguments:
     - structure_id - identifier handed to ``init_structure``

    Raises ``PDBConstructionException`` for a non-integer model number or a
    non-numeric B factor / occupancy, ``KeyError`` when a required
    ``_atom_site`` column is absent and ``ValueError`` for non-numeric
    coordinates or sequence numbers.
    """
    # two special chars as placeholders in the mmCIF format
    # for item values that cannot be explicitly assigned
    # see: pdbx/mmcif syntax web page
    _unassigned = {".", "?"}

    mmcif_dict = self._mmcif_dict

    atom_id_list = mmcif_dict["_atom_site.label_atom_id"]
    residue_id_list = mmcif_dict["_atom_site.label_comp_id"]
    try:
        element_list = mmcif_dict["_atom_site.type_symbol"]
    except KeyError:
        element_list = None
    chain_id_list = mmcif_dict["_atom_site.auth_asym_id"]
    x_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_x"]]
    y_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_y"]]
    z_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_z"]]
    alt_list = mmcif_dict["_atom_site.label_alt_id"]
    icode_list = mmcif_dict["_atom_site.pdbx_PDB_ins_code"]
    b_factor_list = mmcif_dict["_atom_site.B_iso_or_equiv"]
    occupancy_list = mmcif_dict["_atom_site.occupancy"]
    fieldname_list = mmcif_dict["_atom_site.group_PDB"]
    try:
        serial_list = [int(n) for n in mmcif_dict["_atom_site.pdbx_PDB_model_num"]]
    except KeyError:
        # No model number column
        serial_list = None
    except ValueError:
        # Invalid model number (malformed file)
        raise PDBConstructionException("Invalid model number")
    try:
        aniso_u11 = mmcif_dict["_atom_site_anisotrop.U[1][1]"]
        aniso_u12 = mmcif_dict["_atom_site_anisotrop.U[1][2]"]
        aniso_u13 = mmcif_dict["_atom_site_anisotrop.U[1][3]"]
        aniso_u22 = mmcif_dict["_atom_site_anisotrop.U[2][2]"]
        aniso_u23 = mmcif_dict["_atom_site_anisotrop.U[2][3]"]
        aniso_u33 = mmcif_dict["_atom_site_anisotrop.U[3][3]"]
        aniso_flag = 1
    except KeyError:
        # no anisotropic B factors
        aniso_flag = 0

    # if auth_seq_id is present, we use this.
    # Otherwise label_seq_id is used.
    if "_atom_site.auth_seq_id" in mmcif_dict:
        seq_id_list = mmcif_dict["_atom_site.auth_seq_id"]
    else:
        seq_id_list = mmcif_dict["_atom_site.label_seq_id"]

    # Now loop over atoms and build the structure
    current_chain_id = None
    current_residue_id = None
    current_resname = None
    structure_builder = self._structure_builder
    structure_builder.init_structure(structure_id)
    structure_builder.init_seg(" ")

    # Historically, Biopython PDB parser uses model_id to mean array index
    # so serial_id means the Model ID specified in the file
    current_model_id = -1
    current_serial_id = -1
    for i in range(len(atom_id_list)):
        # set the line_counter for 'ATOM' lines only and not
        # as a global line counter found in the PDBParser()
        # this number should match the '_atom_site.id' index in the MMCIF
        structure_builder.set_line_counter(i)

        x = x_list[i]
        y = y_list[i]
        z = z_list[i]
        resname = residue_id_list[i]
        chainid = chain_id_list[i]
        altloc = alt_list[i]
        if altloc in _unassigned:
            altloc = " "
        int_resseq = int(seq_id_list[i])
        icode = icode_list[i]
        if icode in _unassigned:
            icode = " "
        name = atom_id_list[i]

        # occupancy & B factor
        try:
            tempfactor = float(b_factor_list[i])
        except ValueError:
            raise PDBConstructionException("Invalid or missing B factor")
        try:
            occupancy = float(occupancy_list[i])
        except ValueError:
            raise PDBConstructionException("Invalid or missing occupancy")

        fieldname = fieldname_list[i]
        if fieldname == "HETATM":
            if resname == "HOH" or resname == "WAT":
                hetatm_flag = "W"
            else:
                hetatm_flag = "H"
        else:
            hetatm_flag = " "

        resseq = (hetatm_flag, int_resseq, icode)

        if serial_list is not None:
            # model column exists; use it
            serial_id = serial_list[i]
            if current_serial_id != serial_id:
                # if serial changes, update it and start new model
                current_serial_id = serial_id
                current_model_id += 1
                structure_builder.init_model(current_model_id, current_serial_id)
                current_chain_id = None
                current_residue_id = None
                current_resname = None
        elif current_model_id < 0:
            # no explicit model column; open the single model exactly once,
            # using index 0 (the previous code re-initialised a model with
            # id -1 for every atom).
            current_model_id = 0
            structure_builder.init_model(current_model_id)

        if current_chain_id != chainid:
            current_chain_id = chainid
            structure_builder.init_chain(current_chain_id)
            current_residue_id = None
            current_resname = None

        if current_residue_id != resseq or current_resname != resname:
            current_residue_id = resseq
            current_resname = resname
            structure_builder.init_residue(resname, hetatm_flag, int_resseq, icode)

        coord = numpy.array((x, y, z), "f")
        element = element_list[i].upper() if element_list else None
        structure_builder.init_atom(
            name, coord, tempfactor, occupancy, altloc, name, element=element
        )
        # Anisotropic rows are matched to atoms purely by position.
        if aniso_flag == 1 and i < len(aniso_u11):
            u = (
                aniso_u11[i],
                aniso_u12[i],
                aniso_u13[i],
                aniso_u22[i],
                aniso_u23[i],
                aniso_u33[i],
            )
            mapped_anisou = [float(_) for _ in u]
            anisou_array = numpy.array(mapped_anisou, "f")
            structure_builder.set_anisou(anisou_array)

    # Now try to set the cell
    try:
        a = float(mmcif_dict["_cell.length_a"][0])
        b = float(mmcif_dict["_cell.length_b"][0])
        c = float(mmcif_dict["_cell.length_c"][0])
        alpha = float(mmcif_dict["_cell.angle_alpha"][0])
        beta = float(mmcif_dict["_cell.angle_beta"][0])
        gamma = float(mmcif_dict["_cell.angle_gamma"][0])
        cell = numpy.array((a, b, c, alpha, beta, gamma), "f")
        spacegroup = mmcif_dict["_symmetry.space_group_name_H-M"][0]
        # Drop one surrounding pair of quotes, but only when it is really
        # there: slicing [1:-1] unconditionally would eat the first and
        # last character of a value that was stored unquoted.
        if (
            len(spacegroup) >= 2
            and spacegroup[0] == spacegroup[-1]
            and spacegroup[0] in "'\""
        ):
            spacegroup = spacegroup[1:-1]
        structure_builder.set_symmetry(spacegroup, cell)
    except Exception:
        # Deliberately best-effort: no (usable) cell found, so just ignore
        pass
def init_residue(self, resname, field, resseq, icode):
    """Create a new Residue object, or reuse/extend a duplicate one.

    Arguments:
     - resname - string, e.g. "ASN"
     - field - hetero flag, "W" for waters, "H" for hetero residues,
       otherwise blank.
     - resseq - int, sequence identifier
     - icode - string, insertion code

    The result is stored in ``self.residue``. When a non-hetero residue
    with the same id is already present in ``self.chain`` a
    ``PDBConstructionWarning`` is issued and the residue is treated as a
    point mutation: it is selected in, added to, or wrapped into a
    DisorderedResidue.

    Raises ``PDBConstructionException`` when a plain duplicate residue with
    a different name is not completely disordered (``self.residue`` is set
    to None first).
    """
    if field == "H":
        # The hetero field consists of H_ + the residue name (e.g. H_FUC)
        field = "H_" + resname
    res_id = (field, resseq, icode)

    if field == " " and self.chain.has_id(res_id):
        # There already is a residue with the id (field, resseq, icode).
        # This only makes sense in the case of a point mutation.
        warnings.warn(
            "WARNING: Residue ('%s', %i, '%s') "
            "redefined at line %i."
            % (field, resseq, icode, self.line_counter),
            PDBConstructionWarning)
        duplicate_residue = self.chain[res_id]

        if duplicate_residue.is_disordered() == 2:
            # The residue in the chain is a DisorderedResidue object.
            # So just add the last Residue object.
            if duplicate_residue.disordered_has_id(resname):
                # The residue was already made
                duplicate_residue.disordered_select(resname)
            else:
                # Make a new residue and add it to the already
                # present DisorderedResidue
                new_residue = Residue(res_id, resname, self.segid)
                duplicate_residue.disordered_add(new_residue)
            self.residue = duplicate_residue
            # Always stop here: falling through would create a second
            # Residue and try to add it to the chain under an id that is
            # already taken.
            return

        if resname == duplicate_residue.resname:
            warnings.warn(
                "WARNING: Residue ('%s', %i, '%s','%s')"
                " already defined with the same name at line %i."
                % (field, resseq, icode, resname, self.line_counter),
                PDBConstructionWarning)
            self.residue = duplicate_residue
            return

        # Make a new DisorderedResidue object and put all
        # the Residue objects with the id (field, resseq, icode) in it.
        # These residues each should have non-blank altlocs for all their atoms.
        # If not, the PDB file probably contains an error.
        if not self._is_completely_disordered(duplicate_residue):
            # if this exception is ignored, a residue will be missing
            self.residue = None
            raise PDBConstructionException(
                "Blank altlocs in duplicate residue %s ('%s', %i, '%s')"
                % (resname, field, resseq, icode))
        self.chain.detach_child(res_id)
        new_residue = Residue(res_id, resname, self.segid)
        disordered_residue = DisorderedResidue(res_id)
        self.chain.add(disordered_residue)
        disordered_residue.disordered_add(duplicate_residue)
        disordered_residue.disordered_add(new_residue)
        self.residue = disordered_residue
        return

    # Ordinary case: a brand new residue for this chain.
    self.residue = Residue(res_id, resname, self.segid)
    self.chain.add(self.residue)
def _parse_coordinates(self, coords_trailer):
    """Parse the atomic data in the PDB file (PRIVATE).

    Arguments:
     - coords_trailer - list of lines starting at the coordinate section

    Each line is dispatched on its 6-character record name and forwarded
    to ``self.structure_builder``. Parsing stops at the first END or CONECT
    record; ``self.line_counter`` is advanced by the number of lines
    consumed (blank lines included) and the unconsumed lines, starting with
    that END/CONECT record, are returned. An empty list is returned when
    the input runs out first.

    Raises ``PDBConstructionException`` when the coordinates of an atom
    cannot be parsed. Other problems are routed through
    ``self._handle_PDB_exception``.
    """
    # Record names occupy columns 1-6, left-justified and blank-padded,
    # so every literal compared with line[0:6] must be six characters wide.
    allowed_records = {
        "ATOM  ",
        "HETATM",
        "MODEL ",
        "ENDMDL",
        "TER   ",
        "ANISOU",
        # These are older 2.3 format specs:
        "SIGATM",
        "SIGUIJ",
        # bookkeeping records after coordinates:
        "MASTER",
    }

    structure_builder = self.structure_builder
    current_model_id = 0
    # Flag we have an open model
    model_open = 0
    current_chain_id = None
    current_segid = None
    current_residue_id = None
    current_resname = None

    # The enumerate index doubles as the local line counter, so skipped
    # (blank) lines can no longer make the counter lag behind.
    for local_line_counter, raw_line in enumerate(coords_trailer):
        line = raw_line.rstrip("\n")
        record_type = line[0:6]
        global_line_counter = self.line_counter + local_line_counter + 1
        structure_builder.set_line_counter(global_line_counter)
        if not line.strip():
            continue  # skip empty lines
        elif record_type == "ATOM  " or record_type == "HETATM":
            # Initialize the Model - there was no explicit MODEL record
            if not model_open:
                structure_builder.init_model(current_model_id)
                current_model_id += 1
                model_open = 1
            fullname = line[12:16]
            # get rid of whitespace in atom names
            split_list = fullname.split()
            if len(split_list) != 1:
                # atom name has internal spaces, e.g. " N B ", so
                # we do not strip spaces
                name = fullname
            else:
                # atom name is like " CA ", so we can strip spaces
                name = split_list[0]
            altloc = line[16]
            resname = line[17:20]
            chainid = line[21]
            try:
                serial_number = int(line[6:11])
            except Exception:
                serial_number = 0
            resseq = int(line[22:26].split()[0])  # sequence identifier
            icode = line[26]  # insertion code
            if record_type == "HETATM":  # hetero atom flag
                if resname == "HOH" or resname == "WAT":
                    hetero_flag = "W"
                else:
                    hetero_flag = "H"
            else:
                hetero_flag = " "
            residue_id = (hetero_flag, resseq, icode)
            # atomic coordinates
            try:
                x = float(line[30:38])
                y = float(line[38:46])
                z = float(line[46:54])
            except Exception:
                # Should we allow parsing to continue in permissive mode?
                # If so, what coordinates should we default to? Easier to abort!
                raise PDBConstructionException(
                    "Invalid or missing coordinate(s) at line %i."
                    % global_line_counter) from None
            coord = numpy.array((x, y, z), "f")

            # occupancy & B factor
            if not self.is_pqr:
                try:
                    occupancy = float(line[54:60])
                except Exception:
                    self._handle_PDB_exception(
                        "Invalid or missing occupancy", global_line_counter)
                    occupancy = None  # Rather than arbitrary zero or one
                if occupancy is not None and occupancy < 0:
                    # TODO - Should this be an error in strict mode?
                    # This uses fixed text so the warning occurs once only:
                    warnings.warn(
                        "Negative occupancy in one or more atoms",
                        PDBConstructionWarning,
                    )
                try:
                    bfactor = float(line[60:66])
                except Exception:
                    self._handle_PDB_exception(
                        "Invalid or missing B factor", global_line_counter)
                    bfactor = 0.0  # PDB uses a default of zero if missing
            else:
                # Attempt to parse charge and radius fields
                try:
                    pqr_charge = float(line[54:62])
                except Exception:
                    self._handle_PDB_exception("Invalid or missing charge",
                                               global_line_counter)
                    pqr_charge = None  # Rather than arbitrary zero or one
                try:
                    radius = float(line[62:70])
                except Exception:
                    self._handle_PDB_exception("Invalid or missing radius",
                                               global_line_counter)
                    radius = None
                if radius is not None and radius < 0:
                    # Reported through the exception handler; the radius is
                    # discarded if parsing is allowed to continue.
                    message = "Negative atom radius"
                    self._handle_PDB_exception(message, global_line_counter)
                    radius = None

            segid = line[72:76]
            element = line[76:78].strip().upper()
            if current_segid != segid:
                current_segid = segid
                structure_builder.init_seg(current_segid)
            if current_chain_id != chainid:
                current_chain_id = chainid
                structure_builder.init_chain(current_chain_id)
                current_residue_id = residue_id
                current_resname = resname
                try:
                    structure_builder.init_residue(resname, hetero_flag,
                                                   resseq, icode)
                except PDBConstructionException as message:
                    self._handle_PDB_exception(message, global_line_counter)
            elif current_residue_id != residue_id or current_resname != resname:
                current_residue_id = residue_id
                current_resname = resname
                try:
                    structure_builder.init_residue(resname, hetero_flag,
                                                   resseq, icode)
                except PDBConstructionException as message:
                    self._handle_PDB_exception(message, global_line_counter)

            if not self.is_pqr:
                # init atom with pdb fields
                try:
                    structure_builder.init_atom(
                        name,
                        coord,
                        bfactor,
                        occupancy,
                        altloc,
                        fullname,
                        serial_number,
                        element,
                    )
                except PDBConstructionException as message:
                    self._handle_PDB_exception(message, global_line_counter)
            else:
                try:
                    structure_builder.init_atom(
                        name,
                        coord,
                        pqr_charge,
                        radius,
                        altloc,
                        fullname,
                        serial_number,
                        element,
                        pqr_charge,
                        radius,
                        self.is_pqr,
                    )
                except PDBConstructionException as message:
                    self._handle_PDB_exception(message, global_line_counter)
        elif record_type == "ANISOU":
            # Six 7-column fields starting at column 29; the third one
            # (U33) begins at index 42, not 43.
            anisou = [
                float(x)
                for x in (
                    line[28:35],
                    line[35:42],
                    line[42:49],
                    line[49:56],
                    line[56:63],
                    line[63:70],
                )
            ]
            # U's are scaled by 10^4
            anisou_array = (numpy.array(anisou, "f") / 10000.0).astype("f")
            structure_builder.set_anisou(anisou_array)
        elif record_type == "MODEL ":
            try:
                serial_num = int(line[10:14])
            except Exception:
                self._handle_PDB_exception(
                    "Invalid or missing model serial number",
                    global_line_counter)
                serial_num = 0
            structure_builder.init_model(current_model_id, serial_num)
            current_model_id += 1
            model_open = 1
            current_chain_id = None
            current_residue_id = None
        elif record_type == "END   " or record_type == "CONECT":
            # End of atomic data, return the trailer
            self.line_counter += local_line_counter
            return coords_trailer[local_line_counter:]
        elif record_type == "ENDMDL":
            model_open = 0
            current_chain_id = None
            current_residue_id = None
        elif record_type == "SIGUIJ":
            # standard deviation of anisotropic B factor
            siguij = [
                float(x)
                for x in (
                    line[28:35],
                    line[35:42],
                    line[42:49],
                    line[49:56],
                    line[56:63],
                    line[63:70],
                )
            ]
            # U sigma's are scaled by 10^4
            siguij_array = (numpy.array(siguij, "f") / 10000.0).astype("f")
            structure_builder.set_siguij(siguij_array)
        elif record_type == "SIGATM":
            # standard deviation of atomic positions; same column layout
            # as the ATOM coordinates, so sigma-y spans indices 38-45.
            sigatm = [
                float(x)
                for x in (
                    line[30:38],
                    line[38:46],
                    line[46:54],
                    line[54:60],
                    line[60:66],
                )
            ]
            sigatm_array = numpy.array(sigatm, "f")
            structure_builder.set_sigatm(sigatm_array)
        elif record_type not in allowed_records:
            warnings.warn(
                "Ignoring unrecognized record '{}' at line {}".format(
                    record_type, global_line_counter),
                PDBConstructionWarning,
            )

    # EOF (does not end in END or CONECT)
    self.line_counter = self.line_counter + len(coords_trailer)
    return []
def _build_structure(self, structure_id):
    """Build a structure from the already parsed ``self._mmcif_dict``.

    The ``_atom_site`` columns are walked row by row and translated into
    ``init_model`` / ``init_chain`` / ``init_residue`` / ``init_atom``
    calls on ``self._structure_builder``. Residue numbers are passed
    through ``self._get_icode`` to obtain the insertion code and integer
    sequence number. Finally the unit cell and space group are forwarded
    with ``set_symmetry`` when they can be read.

    Arguments:
     - structure_id - identifier handed to ``init_structure``

    Raises ``PDBConstructionException`` for a non-integer model number or a
    non-numeric B factor / occupancy, and ``KeyError`` when a required
    ``_atom_site`` column is absent.
    """
    mmcif_dict = self._mmcif_dict
    atom_id_list = mmcif_dict["_atom_site.label_atom_id"]
    residue_id_list = mmcif_dict["_atom_site.label_comp_id"]
    try:
        element_list = mmcif_dict["_atom_site.type_symbol"]
    except KeyError:
        element_list = None
    chain_id_list = mmcif_dict["_atom_site.label_asym_id"]
    # Real lists (not lazy map objects) because the columns are indexed.
    x_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_x"]]
    y_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_y"]]
    z_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_z"]]
    alt_list = mmcif_dict["_atom_site.label_alt_id"]
    b_factor_list = mmcif_dict["_atom_site.B_iso_or_equiv"]
    occupancy_list = mmcif_dict["_atom_site.occupancy"]
    fieldname_list = mmcif_dict["_atom_site.group_PDB"]
    try:
        serial_list = [int(n) for n in mmcif_dict["_atom_site.pdbx_PDB_model_num"]]
    except KeyError:
        # No model number column
        serial_list = None
    except ValueError:
        # Invalid model number (malformed file)
        raise PDBConstructionException("Invalid model number")
    try:
        aniso_u11 = mmcif_dict["_atom_site.aniso_U[1][1]"]
        aniso_u12 = mmcif_dict["_atom_site.aniso_U[1][2]"]
        aniso_u13 = mmcif_dict["_atom_site.aniso_U[1][3]"]
        aniso_u22 = mmcif_dict["_atom_site.aniso_U[2][2]"]
        aniso_u23 = mmcif_dict["_atom_site.aniso_U[2][3]"]
        aniso_u33 = mmcif_dict["_atom_site.aniso_U[3][3]"]
        aniso_flag = 1
    except KeyError:
        # no anisotropic B factors
        aniso_flag = 0

    # if auth_seq_id is present, we use this.
    # Otherwise label_seq_id is used.
    if "_atom_site.auth_seq_id" in mmcif_dict:
        seq_id_list = mmcif_dict["_atom_site.auth_seq_id"]
    else:
        seq_id_list = mmcif_dict["_atom_site.label_seq_id"]

    # Now loop over atoms and build the structure
    current_chain_id = None
    current_residue_id = None
    structure_builder = self._structure_builder
    structure_builder.init_structure(structure_id)
    structure_builder.init_seg(" ")

    # Historically, Biopython PDB parser uses model_id to mean array index
    # so serial_id means the Model ID specified in the file
    current_model_id = 0
    # None (rather than 0) so that any first serial number opens a model.
    current_serial_id = None
    model_open = False
    for i in range(len(atom_id_list)):
        x = x_list[i]
        y = y_list[i]
        z = z_list[i]
        resname = residue_id_list[i]
        chainid = chain_id_list[i]
        altloc = alt_list[i]
        if altloc == ".":
            altloc = " "
        resseq = seq_id_list[i]
        name = atom_id_list[i]

        # occupancy & B factor
        try:
            tempfactor = float(b_factor_list[i])
        except ValueError:
            raise PDBConstructionException("Invalid or missing B factor")
        try:
            occupancy = float(occupancy_list[i])
        except ValueError:
            raise PDBConstructionException("Invalid or missing occupancy")

        fieldname = fieldname_list[i]
        if fieldname == "HETATM":
            hetatm_flag = "H"
        else:
            hetatm_flag = " "

        if serial_list is not None:
            # model column exists; use it
            serial_id = serial_list[i]
            if current_serial_id != serial_id:
                # if serial changes, update it and start new model
                current_serial_id = serial_id
                structure_builder.init_model(current_model_id, current_serial_id)
                current_model_id += 1
                # Force a fresh chain and residue inside the new model,
                # even when their ids repeat those of the previous model.
                current_chain_id = None
                current_residue_id = None
        elif not model_open:
            # no explicit model column; initialize the single model once
            structure_builder.init_model(current_model_id)
            model_open = True

        if current_chain_id != chainid:
            current_chain_id = chainid
            structure_builder.init_chain(current_chain_id)
            current_residue_id = resseq
            icode, int_resseq = self._get_icode(resseq)
            structure_builder.init_residue(resname, hetatm_flag, int_resseq, icode)
        elif current_residue_id != resseq:
            current_residue_id = resseq
            icode, int_resseq = self._get_icode(resseq)
            structure_builder.init_residue(resname, hetatm_flag, int_resseq, icode)

        coord = numpy.array((x, y, z), "f")
        element = element_list[i] if element_list else None
        structure_builder.init_atom(
            name, coord, tempfactor, occupancy, altloc, name, element=element
        )
        if aniso_flag == 1:
            u = (
                aniso_u11[i],
                aniso_u12[i],
                aniso_u13[i],
                aniso_u22[i],
                aniso_u23[i],
                aniso_u33[i],
            )
            mapped_anisou = [float(value) for value in u]
            anisou_array = numpy.array(mapped_anisou, "f")
            structure_builder.set_anisou(anisou_array)

    # Now try to set the cell
    try:
        a = float(mmcif_dict["_cell.length_a"])
        b = float(mmcif_dict["_cell.length_b"])
        c = float(mmcif_dict["_cell.length_c"])
        alpha = float(mmcif_dict["_cell.angle_alpha"])
        beta = float(mmcif_dict["_cell.angle_beta"])
        gamma = float(mmcif_dict["_cell.angle_gamma"])
        cell = numpy.array((a, b, c, alpha, beta, gamma), "f")
        spacegroup = mmcif_dict["_symmetry.space_group_name_H-M"]
        # Drop one surrounding pair of quotes, but only when it is really
        # there, so an unquoted value keeps its first and last character.
        if (
            len(spacegroup) >= 2
            and spacegroup[0] == spacegroup[-1]
            and spacegroup[0] in "'\""
        ):
            spacegroup = spacegroup[1:-1]
        structure_builder.set_symmetry(spacegroup, cell)
    except Exception:
        # Deliberately best-effort: no (usable) cell found, so just ignore
        pass
def _parse_coordinates(self, coords_trailer):
    """Parse the ATOM and HETATM records of the PDB file.

    Arguments:
     - coords_trailer - list of lines starting at the coordinate section

    Every ATOM/HETATM line is forwarded to ``self.structure_builder``;
    all other record types are skipped. This variant does not look for
    END/CONECT, does not update ``self.line_counter`` and returns nothing.

    Raises ``PDBConstructionException`` when the coordinates of an atom
    cannot be parsed. Other problems are routed through
    ``self._handle_PDB_exception``.
    """
    structure_builder = self.structure_builder
    current_model_id = 0
    # Flag we have an open model
    model_open = 0
    current_chain_id = None
    current_segid = None
    current_residue_id = None
    current_resname = None
    # The enumerate index is the local line counter, so each line reports
    # its own position instead of always the first one.
    for local_line_counter, line in enumerate(coords_trailer):
        record_type = line[0:6]
        global_line_counter = self.line_counter + local_line_counter + 1
        structure_builder.set_line_counter(global_line_counter)
        # Record names are six columns wide, blank-padded on the right.
        if record_type != "ATOM  " and record_type != "HETATM":
            continue
        # Initialize the Model - there was no explicit MODEL record
        if not model_open:
            structure_builder.init_model(current_model_id)
            current_model_id += 1
            model_open = 1
        fullname = line[12:16]
        # get rid of whitespace in atom names
        split_list = fullname.split()
        if len(split_list) != 1:
            # atom name has internal spaces, e.g. " N B ", so
            # we do not strip spaces
            name = fullname
        else:
            # atom name is like " CA ", so we can strip spaces
            name = split_list[0]
        altloc = line[16:17]
        resname = line[17:20]
        chainid = line[21:22]
        try:
            serial_number = int(line[6:11])
        except Exception:
            serial_number = 0
        resseq = int(line[22:26].split()[0])  # sequence identifier
        icode = line[26:27]  # insertion code
        if record_type == "HETATM":  # hetero atom flag
            if resname == "HOH" or resname == "WAT":
                hetero_flag = "W"
            else:
                hetero_flag = "H"
        else:
            hetero_flag = " "
        residue_id = (hetero_flag, resseq, icode)
        # atomic coordinates
        try:
            x = float(line[30:38])
            y = float(line[38:46])
            z = float(line[46:54])
        except Exception:
            # Should we allow parsing to continue in permissive mode?
            # If so what coordinates should we default to? Easier to abort!
            raise PDBConstructionException(
                "Invalid or missing coordinate(s) at line %i."
                % global_line_counter)
        coord = numpy.array((x, y, z), "f")
        # occupancy & B factor
        try:
            occupancy = float(line[54:60])
        except Exception:
            self._handle_PDB_exception("Invalid or missing occupancy",
                                       global_line_counter)
            occupancy = 0.0  # Is one or zero a good default?
        try:
            bfactor = float(line[60:66])
        except Exception:
            self._handle_PDB_exception("Invalid or missing B factor",
                                       global_line_counter)
            bfactor = 0.0  # The PDB use a default of zero if the data is missing
        segid = line[72:76]
        element = line[76:78].strip()
        if current_segid != segid:
            current_segid = segid
            structure_builder.init_seg(current_segid)
        if current_chain_id != chainid:
            current_chain_id = chainid
            structure_builder.init_chain(current_chain_id)
            current_residue_id = residue_id
            current_resname = resname
            try:
                structure_builder.init_residue(resname, hetero_flag, resseq, icode)
            except PDBConstructionException as message:
                self._handle_PDB_exception(message, global_line_counter)
        elif current_residue_id != residue_id or current_resname != resname:
            current_residue_id = residue_id
            current_resname = resname
            try:
                structure_builder.init_residue(resname, hetero_flag, resseq, icode)
            except PDBConstructionException as message:
                self._handle_PDB_exception(message, global_line_counter)
        # init atom
        try:
            structure_builder.init_atom(name, coord, bfactor, occupancy, altloc,
                                        fullname, serial_number, element)
        except PDBConstructionException as message:
            self._handle_PDB_exception(message, global_line_counter)
def _parse_coordinates(self, coords_trailer):
    """Parse the atomic data in the PDB file.

    Arguments:
     - coords_trailer - list of lines starting at the coordinate section

    Each line is dispatched on its 6-character record name and forwarded
    to ``self.structure_builder``. Parsing stops at the first END or CONECT
    record; ``self.line_counter`` is advanced by the number of lines
    consumed and the unconsumed lines, starting with that END/CONECT
    record, are returned. An empty list is returned when the input runs
    out first.

    Raises ``PDBConstructionException`` when the coordinates of an atom
    cannot be parsed. Other problems are routed through
    ``self._handle_PDB_exception``.
    """
    structure_builder = self.structure_builder
    current_model_id = 0
    # Flag we have an open model
    model_open = 0
    current_chain_id = None
    current_segid = None
    current_residue_id = None
    current_resname = None
    # Record names are six columns wide, blank-padded on the right, so
    # every literal compared with line[0:6] must be six characters long.
    for local_line_counter, line in enumerate(coords_trailer):
        record_type = line[0:6]
        global_line_counter = self.line_counter + local_line_counter + 1
        structure_builder.set_line_counter(global_line_counter)
        if record_type == "ATOM  " or record_type == "HETATM":
            # Initialize the Model - there was no explicit MODEL record
            if not model_open:
                structure_builder.init_model(current_model_id)
                current_model_id += 1
                model_open = 1
            fullname = line[12:16]
            # get rid of whitespace in atom names
            split_list = fullname.split()
            if len(split_list) != 1:
                # atom name has internal spaces, e.g. " N B ", so
                # we do not strip spaces
                name = fullname
            else:
                # atom name is like " CA ", so we can strip spaces
                name = split_list[0]
            altloc = line[16]
            resname = line[17:20]
            chainid = line[21]
            try:
                serial_number = int(line[6:11])
            except Exception:
                serial_number = 0
            resseq = int(line[22:26].split()[0])  # sequence identifier
            icode = line[26]  # insertion code
            if record_type == "HETATM":  # hetero atom flag
                if resname == "HOH" or resname == "WAT":
                    hetero_flag = "W"
                else:
                    hetero_flag = "H"
            else:
                hetero_flag = " "
            residue_id = (hetero_flag, resseq, icode)
            # atomic coordinates
            try:
                x = float(line[30:38])
                y = float(line[38:46])
                z = float(line[46:54])
            except Exception:
                # Should we allow parsing to continue in permissive mode?
                # If so, what coordinates should we default to? Easier to abort!
                raise PDBConstructionException(
                    "Invalid or missing coordinate(s) at line %i."
                    % global_line_counter)
            coord = numpy.array((x, y, z), "f")
            # occupancy & B factor
            try:
                occupancy = float(line[54:60])
            except Exception:
                self._handle_PDB_exception("Invalid or missing occupancy",
                                           global_line_counter)
                occupancy = None  # Rather than arbitrary zero or one
            try:
                bfactor = float(line[60:66])
            except Exception:
                self._handle_PDB_exception("Invalid or missing B factor",
                                           global_line_counter)
                bfactor = 0.0  # The PDB use a default of zero if the data is missing
            segid = line[72:76]
            element = line[76:78].strip()
            if current_segid != segid:
                current_segid = segid
                structure_builder.init_seg(current_segid)
            if current_chain_id != chainid:
                current_chain_id = chainid
                structure_builder.init_chain(current_chain_id)
                current_residue_id = residue_id
                current_resname = resname
                try:
                    structure_builder.init_residue(resname, hetero_flag,
                                                   resseq, icode)
                except PDBConstructionException as message:
                    self._handle_PDB_exception(message, global_line_counter)
            elif current_residue_id != residue_id or current_resname != resname:
                current_residue_id = residue_id
                current_resname = resname
                try:
                    structure_builder.init_residue(resname, hetero_flag,
                                                   resseq, icode)
                except PDBConstructionException as message:
                    self._handle_PDB_exception(message, global_line_counter)
            # init atom
            try:
                structure_builder.init_atom(name, coord, bfactor, occupancy,
                                            altloc, fullname, serial_number,
                                            element)
            except PDBConstructionException as message:
                self._handle_PDB_exception(message, global_line_counter)
        elif record_type == "ANISOU":
            # Six 7-column fields starting at column 29; the third one
            # (U33) begins at index 42, not 43. A list (not a lazy map
            # object) is built so numpy receives a proper sequence.
            anisou = [
                float(x)
                for x in (
                    line[28:35],
                    line[35:42],
                    line[42:49],
                    line[49:56],
                    line[56:63],
                    line[63:70],
                )
            ]
            # U's are scaled by 10^4
            anisou_array = (numpy.array(anisou, "f") / 10000.0).astype("f")
            structure_builder.set_anisou(anisou_array)
        elif record_type == "MODEL ":
            try:
                serial_num = int(line[10:14])
            except Exception:
                self._handle_PDB_exception(
                    "Invalid or missing model serial number",
                    global_line_counter)
                serial_num = 0
            structure_builder.init_model(current_model_id, serial_num)
            current_model_id += 1
            model_open = 1
            current_chain_id = None
            current_residue_id = None
        elif record_type == "END   " or record_type == "CONECT":
            # End of atomic data, return the trailer
            self.line_counter += local_line_counter
            return coords_trailer[local_line_counter:]
        elif record_type == "ENDMDL":
            model_open = 0
            current_chain_id = None
            current_residue_id = None
        elif record_type == "SIGUIJ":
            # standard deviation of anisotropic B factor
            siguij = [
                float(x)
                for x in (
                    line[28:35],
                    line[35:42],
                    line[42:49],
                    line[49:56],
                    line[56:63],
                    line[63:70],
                )
            ]
            # U sigma's are scaled by 10^4
            siguij_array = (numpy.array(siguij, "f") / 10000.0).astype("f")
            structure_builder.set_siguij(siguij_array)
        elif record_type == "SIGATM":
            # standard deviation of atomic positions; same column layout
            # as the ATOM coordinates, so sigma-y spans indices 38-45.
            sigatm = [
                float(x)
                for x in (
                    line[30:38],
                    line[38:46],
                    line[46:54],
                    line[54:60],
                    line[60:66],
                )
            ]
            sigatm_array = numpy.array(sigatm, "f")
            structure_builder.set_sigatm(sigatm_array)
    # EOF (does not end in END or CONECT)
    self.line_counter = self.line_counter + len(coords_trailer)
    return []