def __init__(self, sqlQuery):
    """Run a PDBj Mine SQL query and cache the identifiers it returns.

    The query result is fetched with ``pdbjMineDataset.get_dataset``.
    Depending on which identifier column the result contains, the
    instance is configured as follows:

    * ``structureId`` column: ``pdbIds`` and ``structureIds`` both hold
      the values of that column and ``chainLevel`` is ``False``.
    * ``structureChainId`` column: ``chainLevel`` is ``True``,
      ``structureIds`` holds the first four characters of every chain id
      and ``pdbIds`` holds the chain ids followed by those prefixes.
      When both columns are present this branch takes precedence.

    Parameters
    ----------
    sqlQuery : str
        SQL query passed to ``pdbjMineDataset.get_dataset``.

    Raises
    ------
    Exception
        If ``get_dataset`` returns ``None``.
    """
    self.chainLevel = False
    self.pdbIds = []
    # Defined up front so the attribute always exists, even when the result
    # contains neither identifier column.
    self.structureIds = []

    dataset = pdbjMineDataset.get_dataset(sqlQuery)
    if dataset is None:
        raise Exception(
            "Dataset empty. Either provide an sql query or a dataset")

    columns = dataset.columns

    # Entry-level result: there is a structureId column.
    if 'structureId' in columns:
        self.chainLevel = False
        self.pdbIds = [
            row[0] for row in dataset.select('structureId').collect()
        ]
        self.structureIds = self.pdbIds

    # Chain-level result: there is a structureChainId column.
    if 'structureChainId' in columns:
        self.chainLevel = True
        ids = [row[0] for row in dataset.select('structureChainId').collect()]
        # The first four characters of each chain id are kept as the
        # structure-level id.
        ids_sub = [chain_id[:4] for chain_id in ids]
        self.pdbIds = ids + ids_sub
        self.structureIds = ids_sub
def test1(self):
    """Check that chain 101M.A occurs exactly once in the SIFTS sample.

    Fetches the first 10 rows of ``sifts.pdb_chain_uniprot`` through
    ``pdbjMineDataset.get_dataset`` and counts the rows whose
    ``structureChainId`` equals ``101M.A``.
    """
    sql = "SELECT * FROM sifts.pdb_chain_uniprot LIMIT 10"
    ds = pdbjMineDataset.get_dataset(sql)
    count = ds.filter("structureChainId == '101M.A'").count()
    # assertEqual reports the actual count on failure; assertTrue would
    # only say "False is not true".
    self.assertEqual(count, 1)
def __get_entity_to_chain_id():
    """Return a DataFrame that maps entity ids to individual chain ids.

    The ``entity_poly`` table is queried through
    ``pdbjMineDataset.get_dataset``; its comma-separated
    ``pdbx_strand_id`` value is expanded into one row per chain, and a
    ``pdbChainId`` column of the form ``<structureId>.<chainId>`` is added.

    Returns
    -------
    DataFrame
        Columns ``entity_id``, ``structureId`` and ``pdbChainId``.
    """
    # NOTE(review): the code below reads a ``structureId`` column although
    # the query selects ``pdbid``; presumably get_dataset renames it.
    # Confirm against pdbjMineDataset.
    query = "SELECT pdbid, entity_id, pdbx_strand_id FROM entity_poly"
    entity_poly: DataFrame = pdbjMineDataset.get_dataset(query)

    # One-to-many relationship: 'A,B' -> [A, B] -> one row per chain id.
    strand_ids = split(entity_poly.pdbx_strand_id, ",")
    per_chain = entity_poly.withColumn("chainId", explode(strand_ids))

    # Build the combined chain identifier, e.g. 1XYZ + A -> 1XYZ.A
    chain_label = concat_ws(".", per_chain.structureId, per_chain.chainId)
    labelled = per_chain.withColumn("pdbChainId", chain_label)

    return labelled.select("entity_id", "structureId", "pdbChainId")
def test1(self):
    """Check that ``brief_summary`` holds more than 100000 rows.

    Runs a ``count(*)`` query through ``pdbjMineDataset.get_dataset`` and
    reads the count from the first column of the first result row.
    """
    sql = "select count(*) from brief_summary"
    search = pdbjMineDataset.get_dataset(sql)
    # assertGreater includes the actual count in the failure message.
    self.assertGreater(search.head()[0], 100000)
# In[21]:

# Keep only the entries returned by a PDBj Mine keyword search for 'porin'.
sql = "select pdbid from keyword_search('porin')"
porin_filter = PdbjMineSearch(sql)
pdb = pdb.filter(porin_filter)

matching_keys = pdb.keys().collect()
print(matching_keys)
print("\n")
print(f"Number of entries matching query: {pdb.count()}")


# ## Apply a SQL search on PDBj and get a dataset

# In[14]:

# Same keyword search, but returned as a dataset with extra columns and
# ordered by descending hit score.
sql = "select pdbid, resolution, biol_species, db_uniprot, db_pfam, hit_score from keyword_search('porin') order by hit_score desc"
dataset = pdbjMineDataset.get_dataset(sql)
dataset.show(10)


# ## Terminate Spark Context

# In[15]:

sc.stop()
# ## Show one example per drug molecule

# In[7]:

# One row per common drug name, restricted to the columns of interest.
ligands = ligands.dropDuplicates(["Commonname"])
display_columns = [
    "structureChainId",
    "ligandId",
    "DrugBankID",
    "Commonname",
    "ligandMolecularWeight",
    "ligandFormula",
    "InChIKey",
    "ligandSmiles",
]
ligands = ligands.select(*display_columns)
ligands.sort("Commonname").toPandas().head(10)


# ## Query structures with 2.7.11.1 EC number using PDBjMine

# In[8]:

enzymeQuery = "SELECT * FROM sifts.pdb_chain_enzyme WHERE ec_number = '2.7.11.1'"
enzyme = pdbjMineDataset.get_dataset(enzymeQuery)

print(f"First 10 results for query: {enzymeQuery}")
enzyme.show(10)


# ## Join ligand dataset with PDBjMine dataset with structureChainId

# In[9]:

# Inner join on equal structureChainId values in both datasets.
same_chain = ligands.structureChainId == enzyme.structureChainId
ligands = ligands.join(enzyme, same_chain)

print(f"Total number of structures: {ligands.count()}")

df = ligands.toPandas()
df.head()
# ## Configure Spark

# In[2]:

# Local Spark session using all available cores ("local[*]").
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("SIFTSDataDemo")
    .getOrCreate()
)


# ## Get PDB entry to PubMed Id mappings

# In[4]:

pubmedQuery = "SELECT * FROM sifts.pdb_pubmed LIMIT 10"
pubmed = pdbjMineDataset.get_dataset(pubmedQuery)

print(f"First 10 results for query: {pubmedQuery}")
pubmed.show(10)


# ## Get PDB chain to InterPro mappings

# In[9]:

interproQuery = "SELECT * FROM sifts.pdb_chain_interpro LIMIT 10"
interpro = pdbjMineDataset.get_dataset(interproQuery)

print(f"First 10 results for query: {interproQuery}")
interpro.show(10)
# ## Query PDBj Mine
#
# Query the following fields from the citation category using PDBj's Mine 2 web service:
# * journal_abbrev
# * pdbx_database_id_PubMed
# * year
#
# Note: mixed case column names must be wrapped in double quotes, which are
# escaped with a backslash inside the Python string.

# In[3]:

sqlQuery = "SELECT pdbid, journal_abbrev, \"pdbx_database_id_PubMed\", year from citation WHERE id = 'primary'"
ds = pdbjMineDataset.get_dataset(sqlQuery)


# ## Show first 10 results from query

# In[4]:

# truncate=False prints full column values instead of shortening them.
ds.show(10, truncate=False)


# ## Filter out unpublished entries
#
# Unpublished entries contain the word "published" (presumably as in
# "To be published") in various upper/lower case combinations, so every row
# whose journal abbreviation contains it is dropped.

# In[5]:

not_unpublished = "UPPER(journal_abbrev) NOT LIKE '%PUBLISHED%'"
ds = ds.filter(not_unpublished)