Esempio n. 1
0
    def __init__(self, sqlQuery):
        """Run ``sqlQuery`` through ``pdbjMineDataset`` and cache the ids found.

        Attributes set on the instance:

        * ``chainLevel`` -- True when the result has a ``structureChainId``
          column, False otherwise.
        * ``pdbIds`` -- values of ``structureId``; for chain-level results,
          the ``structureChainId`` values followed by their 4-character
          prefixes.
        * ``structureIds`` -- values of ``structureId``; for chain-level
          results, the 4-character prefixes of ``structureChainId``.

        If both columns are present the chain-level values win, because
        that branch runs last.

        Args:
            sqlQuery: SQL text passed unchanged to
                ``pdbjMineDataset.get_dataset``.

        Raises:
            Exception: if ``get_dataset`` returns ``None``.
        """
        self.chainLevel = False
        self.pdbIds = []
        # Always defined, so a result without either id column does not
        # leave the instance missing this attribute.
        self.structureIds = []

        dataset = pdbjMineDataset.get_dataset(sqlQuery)

        # Identity test: "== None" would dispatch to the object's __eq__.
        if dataset is None:
            raise Exception(
                "Dataset empty. Either provide an sql query or a dataset")

        columns = dataset.columns

        # Entry-level ids
        if 'structureId' in columns:
            self.pdbIds = [
                row[0] for row in dataset.select('structureId').collect()
            ]
            self.structureIds = self.pdbIds

        # Chain-level ids; the first four characters of each value are
        # used as the structure id.
        if 'structureChainId' in columns:
            self.chainLevel = True
            chain_ids = [
                row[0] for row in dataset.select('structureChainId').collect()
            ]
            entry_ids = [chain_id[:4] for chain_id in chain_ids]
            self.pdbIds = chain_ids + entry_ids
            self.structureIds = entry_ids
Esempio n. 2
0
    def test1(self):
        """Check that exactly one fetched row has structureChainId '101M.A'.

        The query fetches 10 rows from ``sifts.pdb_chain_uniprot`` through
        ``pdbjMineDataset.get_dataset``.
        """
        sql = "SELECT * FROM sifts.pdb_chain_uniprot LIMIT 10"
        ds = pdbjMineDataset.get_dataset(sql)

        count = ds.filter("structureChainId == '101M.A'").count()
        # assertEqual reports the actual count when the check fails,
        # unlike assertTrue on a pre-evaluated comparison.
        self.assertEqual(count, 1)
Esempio n. 3
0
def __get_entity_to_chain_id():
    """Return one row per (entity, chain) taken from the ``entity_poly`` table.

    The comma-separated ``pdbx_strand_id`` value is split and exploded into
    a ``chainId`` column, and ``pdbChainId`` is built by joining
    ``structureId`` and ``chainId`` with a dot (e.g. 1XYZ + A -> 1XYZ.A).

    Returns:
        The dataset restricted to ``entity_id``, ``structureId`` and
        ``pdbChainId``.
    """
    sql = "SELECT pdbid, entity_id, pdbx_strand_id FROM entity_poly"
    table: DataFrame = pdbjMineDataset.get_dataset(sql)

    # One-to-many expansion: 'A,B' -> [A, B] -> a separate row per chain.
    strand_ids = split(table["pdbx_strand_id"], ",")
    table = table.withColumn("chainId", explode(strand_ids))

    # NOTE(review): the query selects ``pdbid`` but the code reads
    # ``structureId``; get_dataset presumably renames it -- confirm.
    chain_key = concat_ws(".", table["structureId"], table["chainId"])
    table = table.withColumn("pdbChainId", chain_key)

    return table.select("entity_id", "structureId", "pdbChainId")
Esempio n. 4
0
    def test1(self):
        """Check that ``brief_summary`` reports more than 100000 rows."""
        sql = "select count(*) from brief_summary"

        search = pdbjMineDataset.get_dataset(sql)

        # The count is the first field of the first returned row.
        # assertGreater prints both operands when the check fails.
        self.assertGreater(search.head()[0], 100000)
Esempio n. 5
0
# In[21]:


# SQL sent to PDBj Mine: PDB ids returned by a keyword search for 'porin'.
sql = "select pdbid from keyword_search('porin')"

# Keep only the elements of `pdb` accepted by the PdbjMineSearch filter.
# NOTE(review): `pdb` is created before this excerpt; presumably an RDD of
# structures keyed by PDB id -- confirm against the preceding cells.
pdb = pdb.filter(PdbjMineSearch(sql))
print(pdb.keys().collect())
print("\n")
print(f"Number of entries matching query: {pdb.count()}")


# ## Apply a SQL search on PDBj and get a dataset

# In[14]:


# Same keyword search, now returning several columns ordered by hit_score
# (highest first).
sql = "select pdbid, resolution, biol_species, db_uniprot, db_pfam, hit_score from keyword_search('porin') order by hit_score desc"

# Run the query through pdbjMineDataset and print the first 10 rows.
dataset = pdbjMineDataset.get_dataset(sql)
dataset.show(10)


# ## Terminate Spark Context

# In[15]:


# `sc` is defined before this excerpt; nothing after this cell can use it.
sc.stop()

Esempio n. 6
0
# ## Show one example per drug molecule

# In[7]:

# Keep a single row per distinct "Commonname" value, then narrow the dataset
# to the columns listed below.
# NOTE(review): `ligands` is created before this excerpt.
ligands = ligands.dropDuplicates(["Commonname"])
ligands = ligands.select("structureChainId", "ligandId", "DrugBankID",
                         "Commonname", "ligandMolecularWeight",
                         "ligandFormula", "InChIKey", "ligandSmiles")
# First 10 rows in "Commonname" order, converted to pandas for display.
ligands.sort("Commonname").toPandas().head(10)

# ## Query structures with 2.7.11.1 EC number using PDBjMine

# In[8]:

# All columns of sifts.pdb_chain_enzyme for rows with EC number 2.7.11.1.
enzymeQuery = "SELECT * FROM sifts.pdb_chain_enzyme WHERE ec_number = '2.7.11.1'"
enzyme = pdbjMineDataset.get_dataset(enzymeQuery)

print(f"First 10 results for query: {enzymeQuery}")
enzyme.show(10)

# ## Join ligand dataset with PDBjMine dataset with structureChainId

# In[9]:

# Join on equal structureChainId values in both datasets.
# NOTE(review): assuming both are Spark DataFrames, joining on a column
# expression keeps the structureChainId column from each side, so the result
# (and `df` below) carries that column name twice -- confirm this is intended.
ligands = ligands.join(enzyme,
                       ligands.structureChainId == enzyme.structureChainId)
print(f"Total number of structures: {ligands.count()}")

# Convert the joined result to pandas and preview its first rows.
df = ligands.toPandas()
df.head()
# ## Configure Spark

# In[2]:


# Get or create a Spark session named "SIFTSDataDemo" with master "local[*]".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("SIFTSDataDemo")
    .getOrCreate()
)


# ## Get PDB entry to PubMed Id mappings

# In[4]:


# Fetch 10 rows of the sifts.pdb_pubmed table through pdbjMineDataset,
# then echo the query text and print the rows.
pubmedQuery = "SELECT * FROM sifts.pdb_pubmed LIMIT 10"
pubmed = pdbjMineDataset.get_dataset(pubmedQuery)
print(f"First 10 results for query: {pubmedQuery}")
pubmed.show(10)


# ## Get PDB chain to InterPro mappings

# In[9]:


# Same pattern for the sifts.pdb_chain_interpro table.
interproQuery = "SELECT * FROM sifts.pdb_chain_interpro LIMIT 10"
interpro = pdbjMineDataset.get_dataset(interproQuery)
print(f"First 10 results for query: {interproQuery}")
interpro.show(10)

# ## Query PDBj Mine
#
# Query the following fields from the \citation category using PDBj's Mine 2 web service:
#  * journal_abbrev
#  * pdbx_database_id_PubMed
#  * year
#
# Note: mixed case column names must be quoted and escaped with \
#

# In[3]:

# Only rows whose id is 'primary' are requested. The mixed-case column
# pdbx_database_id_PubMed is wrapped in escaped double quotes, as noted above.
sqlQuery = "SELECT pdbid, journal_abbrev, \"pdbx_database_id_PubMed\", year from citation WHERE id = 'primary'"

ds = pdbjMineDataset.get_dataset(sqlQuery)

# ## Show first 10 results from query

# In[4]:

# NOTE(review): assuming `ds` is a Spark DataFrame, the second positional
# argument is `truncate`; False prints full cell values.
ds.show(10, False)

# ## Filter out unpublished entries
#
# Published entries contain the word "published" in various upper/lower case combinations

# In[5]:

# Drop rows whose upper-cased journal_abbrev contains "PUBLISHED";
# upper-casing makes the match case-insensitive.
ds = ds.filter("UPPER(journal_abbrev) NOT LIKE '%PUBLISHED%'")