def test_read_write_data_file(tmp_path):
    """Data file read/write test."""
    input_path = Path(DATA_DIR) / Path("1kip.cif")
    with open(input_path, "rt") as input_file:
        data_list = pdbx.load(input_file)
    output_path = Path(tmp_path) / Path("testOutputDataFile.cif")
    # Use a distinct name for the file handle; the original rebound
    # output_path to the open file, which made the dump call misleading.
    with open(output_path, "wt") as output_file:
        pdbx.dump(data_list, output_file)
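# A possible extension of the test above (not in the original suite): dump
# the parsed containers, reload them, and check that the same number of
# data containers comes back. Uses only pdbx.load/pdbx.dump as above.
def test_round_trip_container_count(tmp_path):
    """Sketch: write the parsed data back out, reload, compare counts."""
    input_path = Path(DATA_DIR) / Path("1kip.cif")
    with open(input_path, "rt") as input_file:
        data_list = pdbx.load(input_file)
    output_path = Path(tmp_path) / Path("roundTrip.cif")
    with open(output_path, "wt") as output_file:
        pdbx.dump(data_list, output_file)
    with open(output_path, "rt") as check_file:
        assert len(pdbx.load(check_file)) == len(data_list)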
def spatial_feature_to_DB(self, pdbID, chain):
    tableName = '_'.join([pdbID, chain])
    if self.sql_table_exist(tableName):
        print(f"{tableName} is already in the DB")
        return
    if not self.PDB_downloader(pdbID):
        return
    with open(self.workingRoot + pdbID + '.cif', 'r') as f:
        cifData = pdbx.load(f)[0]
    # Build a DataFrame from the atom_site category. Note that this
    # reaches into private attributes of the pdbx DataCategory object.
    atom_site = cifData.get_object('atom_site')
    atomList = pd.DataFrame(atom_site._row_list,
                            columns=atom_site._attribute_name_list)
    # TODO: resolution is not available in this mmCIF category;
    # leave space here for reading it from another category later.
    atomList = atomList[atomList['auth_asym_id'] == chain]
    if atomList.shape[0] < 1:
        print(f'chain {chain} not found in {pdbID} cif file')
        return
    # Keep only the first model. Doing this once, before the residue
    # loop, is equivalent to the original per-iteration filter.
    atomList = atomList[atomList["pdbx_PDB_model_num"] == '1']
    # Pre-allocated record array; assumes at most 2000 residues per chain.
    res_info_chain = np.empty(2000, dtype=[
        ('pdbID', 'S5'),
        ('chain', 'S2'),
        ('type', 'S4'),
        ('pdbNum', 'int16'),
        ('authNum', 'int16'),
        ('center', 'float16,float16,float16'),
        ('direction', 'float16,float16,float16'),
        ('caPosition', 'float16,float16,float16')])
    allPDBNum = atomList['label_seq_id'].unique()
    i = 0
    for PDBNum in allPDBNum:
        oneRes = atomList[atomList["label_seq_id"] == PDBNum]
        resType = oneRes['label_comp_id'].iloc[0]
        if resType not in aminoAcidCodes:
            continue
        try:  # in case some atoms are missing from the record
            ca = oneRes[oneRes['label_atom_id'] == 'CA']
            caPosition = (float(ca['Cartn_x'].iloc[0]),
                          float(ca['Cartn_y'].iloc[0]),
                          float(ca['Cartn_z'].iloc[0]))
            if resType != 'GLY':
                # Centroid of the atoms selected by the module-level
                # backBone list.
                rGroup = oneRes[oneRes["label_atom_id"].isin(backBone)]
                center = (pd.to_numeric(rGroup['Cartn_x']).mean(),
                          pd.to_numeric(rGroup['Cartn_y']).mean(),
                          pd.to_numeric(rGroup['Cartn_z']).mean())
            else:
                # Glycine has no side chain, so use the centroid of its
                # backbone C, N and O atoms instead.
                center = tuple(
                    (float(oneRes[oneRes['label_atom_id'] == 'C'][ax].iloc[0])
                     + float(oneRes[oneRes['label_atom_id'] == 'N'][ax].iloc[0])
                     + float(oneRes[oneRes['label_atom_id'] == 'O'][ax].iloc[0])) / 3
                    for ax in ('Cartn_x', 'Cartn_y', 'Cartn_z'))
            # Vector from the centroid to the CA atom. The original GLY
            # branch computed this from the un-averaged coordinate sums;
            # both branches now use the centroid consistently.
            cToRGroup = np.subtract(caPosition, center)
            res_info_chain[i] = (pdbID, chain, resType, PDBNum,
                                 oneRes["auth_seq_id"].iloc[0],
                                 center, tuple(cToRGroup), caPosition)
            i += 1
        except Exception as e:
            print(e)
    res_info_chain = res_info_chain[0:i]
    self.sql_create_table(tableName)
    conn = self.sql_connection()
    cursor = conn.cursor()
    try:
        # All-against-all pass over the chain: store the distance between
        # residue centers and the angle between residue 1's direction
        # vector and the vector from residue 1's CA to residue 2's center.
        for res1 in range(i):
            for res2 in range(i):
                coordinatesSubtract = np.subtract(
                    tuple(res_info_chain[res1]["center"]),
                    tuple(res_info_chain[res2]["center"]))
                distance = np.sqrt(np.sum(coordinatesSubtract**2))
                vector1 = np.array(tuple(res_info_chain[res1]["direction"]))
                vector2 = np.subtract(
                    tuple(res_info_chain[res2]['center']),
                    tuple(res_info_chain[res1]['caPosition']))
                if distance > 0.0001:  # exclude zero distance, e.g. self to self
                    # The 0.99 factor keeps the cosine strictly inside
                    # [-1, 1] so float16 rounding cannot push it outside
                    # the arccos domain.
                    angle = 180 * np.arccos(
                        0.99 * np.dot(vector1, vector2)
                        / (np.linalg.norm(vector1)
                           * np.linalg.norm(vector2))) / np.pi
                else:
                    angle = 0
                # Values are interpolated directly into the SQL string, as
                # in the original code; they all come from the parsed
                # structure rather than external user input.
                sqlCommand = f'''
                    INSERT INTO `{tableName}`
                    (pdbID, chain, pdbNum1, authNum1, resType1,
                     pdbNum2, authNum2, resType2, distance, angle)
                    VALUES
                    ('{pdbID}', '{chain}',
                     '{res_info_chain[res1]['pdbNum']}',
                     '{res_info_chain[res1]['authNum']}',
                     '{res_info_chain[res1]['type'].decode('UTF-8')}',
                     '{res_info_chain[res2]['pdbNum']}',
                     '{res_info_chain[res2]['authNum']}',
                     '{res_info_chain[res2]['type'].decode('UTF-8')}',
                     {distance}, {angle})'''
                cursor.execute(sqlCommand)
    except Exception as e:
        print(e)
    conn.commit()
    cursor.close()
    conn.close()
    print(f'{pdbID} {chain} done')
    return
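# The pairwise loop above reduces to a small piece of vector geometry. The
# standalone sketch below (illustrative names and made-up coordinates; not
# part of the class) shows the same distance/angle computation, using
# np.clip in place of the 0.99 scaling to guard the arccos domain.
import numpy as np

def pair_distance_angle(center1, direction1, ca1, center2):
    """Distance between two residue centers and the angle (degrees) between
    residue 1's direction vector and the CA1-to-center2 vector."""
    distance = np.linalg.norm(np.subtract(center1, center2))
    if distance <= 1e-4:  # zero distance, e.g. a residue paired with itself
        return distance, 0.0
    v1 = np.asarray(direction1, dtype=float)
    v2 = np.subtract(center2, ca1)
    cosine = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
    angle = np.degrees(np.arccos(np.clip(cosine, -1.0, 1.0)))
    return distance, angle

# Toy case: two centers 5 Å apart along x; the direction vector points
# straight at the second center, so the angle is 0.
print(pair_distance_angle((0, 0, 0), (1, 0, 0), (1, 0, 0), (5, 0, 0)))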
def read_cif(cif_file):
    """Parse CIF-format data into an array of Atom objects.

    .. todo:: Manage several blocks of data.

    :param cif_file: open file-like object
    :type cif_file: file
    :return: (a list of parsed PDBx/CIF record objects, a list of
        record names that couldn't be parsed)
    :rtype: (list, [str])
    """
    pdblist = []  # list of parsed records (as objects)
    errlist = []  # list of record names that couldn't be parsed
    if cif_file is None:
        return pdblist, errlist
    pdbdata = pdbx.load(cif_file)
    if len(pdbdata) > 0:
        for block in pdbdata:
            head_pdb, head_err = header(block)
            title_pdb, title_err = title(block)
            cmpnd_pdb, cmpnd_err = compnd(block)
            src_pdb, src_err = source(block)
            key_pdb, key_err = keywds(block)
            ex_pdb, ex_err = expdata(block)
            aut_pdb, aut_err = author(block)
            ssb_pdb, ssb_err = ssbond(block)
            cis_pdb, cis_err = cispep(block)
            cry_pdb, cry_err = cryst1(block)
            or_pdb, or_err = origxn(block)
            sc_pdb, sc_err = scalen(block)
            ato_pdb, ato_err = atom_site(block)
            con_pdb, con_err = conect(block)
            # These assignments replace, rather than extend, the results of
            # any previous block, so only the last data block is kept (see
            # the todo above).
            pdblist = (
                head_pdb + title_pdb + cmpnd_pdb + src_pdb + key_pdb
                + ex_pdb + aut_pdb + ssb_pdb + cis_pdb + cry_pdb
                + or_pdb + sc_pdb + ato_pdb + con_pdb
            )
            errlist = (
                head_err + title_err + cmpnd_err + src_err + key_err
                + ex_err + aut_err + ssb_err + cis_err + cry_err
                + or_err + sc_err + ato_err + con_err
            )
    else:
        _LOGGER.error("Unknown error while reading CIF file.")
    return pdblist, errlist
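# For context, a minimal sketch of calling read_cif; the file name is a
# placeholder, and the per-record parsers (header, atom_site, ...) are
# assumed to be defined alongside it in this module.
from pathlib import Path

cif_path = Path("example.cif")  # any local PDBx/CIF file
with open(cif_path, "rt") as cif_file:
    records, errors = read_cif(cif_file)
print(f"parsed {len(records)} records; {len(errors)} record types failed")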
def parse_cif_file(self, cif_file):
    """Parse CIF file into PDB entry.

    :param file cif_file: CIF file to parse (file object ready for reading)
    """
    containers = pdbx.load(cif_file)
    if len(containers) > 1:
        errstr = f"Found {len(containers)} data containers instead of 1."
        raise ValueError(errstr)
    container = containers[0]
    header = annotation.Header()
    if header.parse_cif(container):
        self._header = header
    obsolete = annotation.Obsolete()
    if obsolete.parse_cif(container):
        self._obsolete = obsolete
    self.parse_cif_title(container)
    caveat = annotation.Caveat()
    if caveat.parse_cif(container):
        self._caveat = caveat
    compound = annotation.Compound()
    if compound.parse_cif(container):
        self._compound = compound
    source = annotation.Source()
    if source.parse_cif(container):
        self._source = source
    keywords = annotation.Keywords()
    if keywords.parse_cif(container):
        self._keyword = keywords
    exp_data = annotation.ExperimentalData()
    if exp_data.parse_cif(container):
        self._experimental_data = exp_data
    model_type = annotation.ModelType()
    if model_type.parse_cif(container):
        self._model_type = model_type
    authors = annotation.Author()
    if authors.parse_cif(container):
        self._author = authors
    rev_data = annotation.RevisionData()
    if rev_data.parse_cif(container):
        self._revision_data = rev_data
    supersedes = annotation.Supersedes()
    if supersedes.parse_cif(container):
        self._supersedes = supersedes
    journals = annotation.Journal.parse_cif(container)
    self._journal = journals
    _LOGGER.warning("Not parsing REMARK records from CIF.")
    db_refs = primary.DatabaseReference.parse_cif(container)
    self._database_reference = db_refs
    sequence_diffs = primary.SequenceDifferences.parse_cif(container)
    self._sequence_difference = sequence_diffs
    sequence_residues = primary.SequenceResidues()
    if sequence_residues.parse_cif(container):
        self._sequence_residue = sequence_residues
    modified_residues = primary.ModifiedResidue.parse_cif(container)
    if modified_residues:
        self._modified_residue = modified_residues
    heterogens = heterogen.Heterogen.parse_cif(container)
    if heterogens:
        self._heterogen = heterogens
    het_name = heterogen.HeterogenName()
    if het_name.parse_cif(container):
        self._heterogen_name = het_name
    het_syn = heterogen.HeterogenSynonym()
    if het_syn.parse_cif(container):
        self._heterogen_synonym = het_syn
    _LOGGER.warning("Not parsing FORMULA records from CIF.")
    helices = secondary.Helix.parse_cif(container)
    if helices:
        self._helix = helices
    _LOGGER.warning("Not parsing SHEET records from CIF.")
    disulfides = secondary.DisulfideBond.parse_cif(container)
    if disulfides:
        self._disulfide_bond = disulfides
    links = secondary.Link.parse_cif(container)
    if links:
        self._link = links
    cis_peps = secondary.CisPeptide.parse_cif(container)
    if cis_peps:
        self._cis_peptide = cis_peps
    site = annotation.Site()
    if site.parse_cif(container):
        self._site = site
    unit_cell = crystallography.UnitCell()
    if unit_cell.parse_cif(container):
        self._unit_cell = unit_cell
    transforms = crystallography.OriginalTransform.parse_cif(container)
    if transforms:
        self._orig_transform = transforms
    transforms = crystallography.FractionalTransform.parse_cif(container)
    if transforms:
        self._frac_transform = transforms
    transforms = crystallography.NoncrystalTransform.parse_cif(container)
    if transforms:
        self._noncrystal_transform = transforms
    models = coordinates.Model.parse_cif(container)
    if models:
        self._models = models
    # Attach per-model coordinate records, keyed by model serial number.
    atoms = coordinates.Atom.parse_cif(container)
    het_atoms = coordinates.HeterogenAtom.parse_cif(container)
    temp_factors = coordinates.TemperatureFactor.parse_cif(container)
    for model in models:
        model_num = model.serial
        try:
            model.records += atoms[model_num]
        except KeyError:
            _LOGGER.debug(f"No ATOM records for model {model_num}.")
        try:
            model.records += het_atoms[model_num]
        except KeyError:
            _LOGGER.debug(f"No HETATM records for model {model_num}.")
        try:
            model.records += temp_factors[model_num]
        except KeyError:
            _LOGGER.debug(f"No ANISOU records for model {model_num}.")
    self._model = models
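# Usage sketch, assuming this method belongs to an entry class with a
# no-argument constructor (Entry is a hypothetical name here; the file
# name is a placeholder).
entry = Entry()
with open("example.cif", "rt") as cif_file:
    entry.parse_cif_file(cif_file)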