Example #1
0
    def test6(self):
        """Chain-level SecondaryStructure filter keeps 2C7M.A, drops 2C7M.B."""
        chains = self.pdb.flatMap(StructureToPolymerChains())
        filtered = chains.filter(
            SecondaryStructure(0.70, 0.75, 0.00, 0.40, 0.25, 0.50))
        chain_ids = filtered.keys().collect()

        self.assertIn('2C7M.A', chain_ids)
        self.assertNotIn('2C7M.B', chain_ids)
Example #2
0
    def test1(self):
        """Secondary-structure dataset over L-protein chains has 5 rows."""
        chains = (self.pdb
                  .filter(ContainsLProteinChain())
                  .flatMap(StructureToPolymerChains())
                  .filter(ContainsLProteinChain()))

        dataset = secondaryStructureExtractor.get_dataset(chains)

        self.assertEqual(dataset.count(), 5)
Example #3
0
    def test2(self):
        """After BLAST 40% clustering and chain split, only chain-level IDs remain."""
        clustered = self.pdb.filter(BlastCluster(40))
        chains = clustered.flatMap(StructureToPolymerChains())
        chain_ids = chains.keys().collect()

        self.assertNotIn('1O06', chain_ids)
        self.assertIn('1O06.A', chain_ids)
        self.assertNotIn('2ONX', chain_ids)
    def test1(self):
        """DSSP Q8/Q3 codes of the first chain contain the expected symbol counts."""
        chains = self.pdb.flatMap(StructureToPolymerChains())

        sec_struct = secondaryStructureExtractor.get_dataset(chains)

        first_row = sec_struct.first()
        q8 = first_row["dsspQ8Code"]
        q3 = first_row["dsspQ3Code"]

        # str.count is the direct equivalent of the len(split(c)) - 1 idiom.
        self.assertEqual(q8.count('X'), 38)  # 'X' appears 38 times
        self.assertEqual(q8.count('C'), 24)  # 'C' appears 24 times
        self.assertEqual(q3.count('C'), 44)  # 'C' appears 44 times
Example #5
0
    def test3(self):
        """SIFTS enzyme query (EC 2.7.11.1) selects exactly the expected chains."""
        sql = "SELECT e.pdbid, e.chain FROM sifts.pdb_chain_enzyme AS e WHERE e.ec_number = '2.7.11.1'"
        chains = (self.pdb
                  .flatMap(StructureToPolymerChains())
                  .filter(PdbjMineSearch(sql)))

        chain_ids = chains.keys().collect()

        for expected in ("5JDE.A", "5JDE.B", "5CU4.A", "5L6W.L", "5UFU.A"):
            self.assertIn(expected, chain_ids)
        for unexpected in ("5L6W.C", "5UFU.B", "5UFU.C",
                           "5IHB.A", "5IHB.B", "5IHB.C", "5IHB.D"):
            self.assertNotIn(unexpected, chain_ids)
Example #6
0
    def test3(self):
        """Chain-level PDBj Mine query for human EC 2.7.11.1 kinase chains.

        Two defects fixed in the SQL string:
        * the entity_src_gen join compared ``entity_poly.entity_id`` with
          itself (always true), so the organism filter matched if ANY entity
          of the same PDB entry was human rather than the entity owning the
          chain under test — now joins on ``entity_src_gen.entity_id``;
        * the organism had been corrupted to ``'H**o sapiens'``, which can
          never match — restored to ``'Homo sapiens'``.
        """
        sql = "select distinct concat(entity_poly.pdbid, '.', unnest(string_to_array(entity_poly.pdbx_strand_id, ','))) as \"structureChainId\" from entity_poly join entity_src_gen on entity_src_gen.pdbid=entity_poly.pdbid and entity_src_gen.entity_id=entity_poly.entity_id join entity on entity.pdbid=entity_poly.pdbid and entity.id=entity_poly.entity_id where pdbx_ec='2.7.11.1' and pdbx_gene_src_scientific_name='Homo sapiens'"

        pdb = self.pdb.flatMap(StructureToPolymerChains()) \
                      .filter(PdbjMine(sql, pdbidField="structureChainId", chainLevel=True))

        matches = pdb.keys().collect()

        # Human serine/threonine-kinase chains must be present ...
        self.assertTrue("5JDE.A" in matches)
        self.assertTrue("5JDE.B" in matches)
        self.assertTrue("5CU4.A" in matches)
        self.assertTrue("5L6W.L" in matches)
        # ... while non-matching chains/entries must be absent.
        self.assertFalse("5L6W.C" in matches)
        self.assertFalse("5UFU.A" in matches)
        self.assertFalse("5UFU.B" in matches)
        self.assertFalse("5UFU.C" in matches)
        self.assertFalse("5IHB.A" in matches)
        self.assertFalse("5IHB.B" in matches)
        self.assertFalse("5IHB.C" in matches)
        self.assertFalse("5IHB.D" in matches)
Example #7
0
    def test2(self):
        """Chain-level custom-report query for human EC 2.7.11.1 entries.

        This test runs a chain-level query and compares chain-level results.
        Fixes a corrupted runtime string: the organism value had been mangled
        to ``'H**o sapiens'``, which can never match any source organism —
        restored to ``'Homo sapiens'`` as the assertions below require.
        """
        pdb_2 = self.pdb.flatMap(StructureToPolymerChains())

        # Request the two report columns the WHERE clause filters on.
        whereClause = "WHERE ecNo='2.7.11.1' AND source='Homo sapiens'"
        fields = ["ecNo", "source"]
        pdb_2 = pdb_2.filter(CustomReportQuery(whereClause, fields))
        results_2 = pdb_2.keys().collect()

        self.assertTrue('5JDE.A' in results_2)
        self.assertTrue('5JDE.B' in results_2)
        self.assertTrue('5CU4.A' in results_2)
        self.assertTrue('5L6W.L' in results_2)
        self.assertFalse('5L6W.C' in results_2)
        self.assertFalse('5UFU.A' in results_2)
        self.assertFalse('5UFU.B' in results_2)
        self.assertFalse('5UFU.C' in results_2)
        self.assertFalse('5IHB.A' in results_2)
        self.assertFalse('5IHB.B' in results_2)
        self.assertFalse('5IHB.C' in results_2)
        self.assertFalse('5IHB.D' in results_2)
Example #8
0
    def test4(self):
        """Advanced RCSB enzyme-classification query restricted to chain level."""
        query = ("<orgPdbQuery>"
                 "<queryType>org.pdb.query.simple.EnzymeClassificationQuery</queryType>"
                 "<Enzyme_Classification>2.7.11.1</Enzyme_Classification>"
                 "</orgPdbQuery>")

        hits = (self.pdb
                .flatMap(StructureToPolymerChains())
                .filter(AdvancedQuery(query)))
        chain_ids = hits.keys().collect()

        for present in ('5L6W.L', '5KHU.Q',
                        '1F3M.A', '1F3M.B', '1F3M.C', '1F3M.D'):
            self.assertIn(present, chain_ids)
        for absent in ('1PEN.A', '1OCZ.A', '2ONX.A',
                       '5L6W.C', '5KHU.A', '5KHU.B'):
            self.assertNotIn(absent, chain_ids)

#  ## Read MMTF Hadoop sequence file and 
#  
#  Create a non-redundant set(<=20% seq. identity) of L-protein chains

# In[3]:


# Build a non-redundant (<=20% sequence identity, <=2.0 A) deterministic
# 10% sample of L-protein chains from the reduced MMTF sample.
path = "../../resources/mmtf_reduced_sample/"
sequenceIdentity = 20
resolution = 2.0
fraction = 0.1
seed = 123

pdb = (mmtfReader
       .read_sequence_file(path, sc)
       .flatMap(StructureToPolymerChains())
       .filter(Pisces(sequenceIdentity, resolution))
       .filter(ContainsLProteinChain())
       .sample(False, fraction, seed))


# ## Get content

# In[4]:


# Cut every chain into secondary-structure segments of 11 residues.
segmentLength = 11
segments = secondaryStructureSegmentExtractor.get_dataset(pdb, segmentLength)
data = segments.cache()
print(f"original data   : {data.count()}")


# ## Drop Q3 and sequence duplicates

# In[5]:
    def test1(self):
        """Flat-mapping the test structures yields exactly 10 polymer chains."""
        chains = self.pdb.flatMap(StructureToPolymerChains())
        chain_ids = chains.keys().collect()

        self.assertEqual(len(chain_ids), 10)
# Location of the reduced MMTF Hadoop sequence-file sample.
path = "../../resources/mmtf_reduced_sample/"

# Read all structures into a PairRDD of (pdbId, structure).
pdb = mmtfReader.read_sequence_file(path, sc)

# ## flat map structure to polymer chains, filter by polymer composition and count
#
# ### Supported polymer composition type:
#
# ** polymerComposition.AMINO_ACIDS_20  **= ["ALA","ARG","ASN","ASP","CYS","GLN","GLU","GLY","HIS","ILE","LEU","LYS","MET","PHE","PRO","SER","THR","TRP","TYR","VAL"]
#
# ** polymerComposition.AMINO_ACIDS_22 **= ["ALA","ARG","ASN","ASP","CYS","GLN","GLU","GLY","HIS","ILE","LEU","LYS","MET","PHE","PRO","SER","THR","TRP","TYR","VAL","SEC","PYL"]
#
# ** polymerComposition.DNA_STD_NUCLEOTIDES **= ["DA","DC","DG","DT"]
#
# ** polymerComposition.RNA_STD_NUCLEOTIDES **= ["A","C","G","U"]
#

# In[4]:

# Split into chains (excluding non-polymers, using chain IDs) and count
# those built solely from the 20 standard amino acids.
count = (pdb.flatMap(StructureToPolymerChains(False, True))
            .filter(PolymerComposition(PolymerComposition.AMINO_ACIDS_20))
            .count())

print(f"Chains with standard amino acids: {count}")

# ## Terminate Spark

# In[5]:

sc.stop()
    "SecondaryStructureElementsWord2VecEncoderDemo")

sc = SparkContext(conf=conf)

#  ## Read MMTF Hadoop sequence file and
#
#  Create a non-redundant set(<=20% seq. identity) of L-protein chains

# In[3]:

# Deterministic 5% sample of L-protein chains from the reduced sample set.
path = "../../resources/mmtf_reduced_sample/"
fraction = 0.05
seed = 123

structures = mmtfReader.read_sequence_file(path, sc)
chains = structures.flatMap(StructureToPolymerChains(False, True))
pdb = chains.filter(ContainsLProteinChain()).sample(False, fraction, seed)

# ## Extract Element "H" from Secondary Structure

# In[4]:

label = "H"
data = secondaryStructureElementExtractor.get_dataset(pdb, label).cache()
print(f"original data   : {data.count()}")
data.show(10, False)

# ## Word2Vec encoded feature Vector

# In[6]:
sc = SparkContext(conf=conf)

# ## Read MMTF Hadoop sequence file
#
# Create a non-redundant set (<=40% seq. identity) of L-protein chains.
# Pisces is applied twice: once on whole entries, then again on the
# individual chains produced by the flat-map.

# In[15]:

path = "../../resources/mmtf_reduced_sample/"
sequenceIdentity = 40
resolution = 2.0

pdb = (mmtfReader.read_sequence_file(path, sc)
       .filter(Pisces(sequenceIdentity, resolution))
       .flatMap(StructureToPolymerChains())
       .filter(Pisces(sequenceIdentity, resolution))
       .filter(ContainsLProteinChain()))

# ## Get secondary structure content

# In[16]:

data = secondaryStructureExtractor.get_dataset(pdb)

# ## Classify chains by secondary structure type

# In[17]:

# Fractional secondary-structure thresholds used by the classification below.
minThreshold = 0.05
maxThreshold = 0.15
    def test1(self):
        """Polymer-sequence dataset over the chain-split sample has 5 rows."""
        chains = self.pdb.flatMap(StructureToPolymerChains())
        sequences = polymerSequenceExtractor.get_dataset(chains)

        self.assertEqual(sequences.count(), 5)
Example #15
0
# Local Spark setup for the DSSP demo.
conf = SparkConf().setMaster("local[*]").setAppName("DSSPDemo")
sc = SparkContext(conf=conf)

# ## Download single protein (1STP)

# In[11]:

pdbIds = ["1STP"]
pdb = mmtfReader.download_mmtf_files(pdbIds, sc).cache()

# ## Flatmap to polymer chains

# In[12]:

pdb = pdb.flatMap(StructureToPolymerChains())

# ## Extract Secondary Structures

# In[13]:

# Build the secondary-structure dataset and show its schema plus two rows.
ds = secondaryStructureExtractor.get_dataset(pdb)
ds.printSchema()
ds.show(2, False)

# ## Terminate Spark

# In[14]:

sc.stop()
Example #16
0
sc = SparkContext(conf=conf)

# ## Read MMTF Hadoop sequence file and
#
# Create a non-redundant set (<=20% seq. identity) of L-protein chains

# In[3]:

path = "../../resources/mmtf_reduced_sample/"
sequenceIdentity = 20
resolution = 2.0
fraction = 0.1
seed = 123

# Deterministic 10% sample of non-redundant L-protein chains.
pdb = (mmtfReader.read_sequence_file(path, sc)
       .flatMap(StructureToPolymerChains())
       .filter(Pisces(sequenceIdentity, resolution))
       .filter(ContainsLProteinChain())
       .sample(False, fraction, seed))

# ## Get content

# In[4]:

segmentLength = 11
data = secondaryStructureSegmentExtractor.get_dataset(pdb, segmentLength).cache()
print(f"original data   : {data.count()}")

# ## Drop Q3 and sequence duplicates
Example #17
0
    "secondaryStructureElementDemo")
sc = SparkContext(conf=conf)

# ## Download protein (1STP)
#
# ### Note: Need to use SparkContext as parameter to download Mmtf files

# In[12]:

# Download 1STP once and cache it for the processing stages below.
pdb = mmtfReader.download_mmtf_files(['1STP'], sc).cache()

# ## Map protein to polymer chains and apply LProteinChain filter

# In[13]:

pdb = pdb.flatMap(StructureToPolymerChains()).filter(ContainsLProteinChain())

# ## Extract secondary structure element 'E'

# In[14]:

# Elements of type 'E' (strand) with a minimum length of 6 residues.
ds = secondaryStructureElementExtractor.get_dataset(pdb, 'E', 6)
ds.show(50, False)

# ## Terminate Spark

# In[15]:

sc.stop()
Example #18
0
    def test1(self):
        """First 25-residue segment carries the expected sequence."""
        chains = self.pdb.flatMap(StructureToPolymerChains())

        segments = secondaryStructureSegmentExtractor.get_dataset(chains, 25)

        self.assertEqual(segments.head()[1], "DPSKDSKAQVSAAEAGITGTWYNQL")
Example #19
0
# ## Configure Spark

# In[2]:

conf = SparkConf().setMaster("local[*]").setAppName("polypeptideCahinStats")
sc = SparkContext(conf=conf)

# ## Read in mmtf files, flatMap to polymer chains, filter by polymer composition, and get number of groups

# In[4]:

path = "../../resources/mmtf_full_sample/"

# One RDD element per standard-amino-acid chain: its group (residue) count.
chainLengths = (mmtfReader.read_sequence_file(path, sc)
                .flatMap(StructureToPolymerChains(False, True))
                .filter(PolymerComposition(PolymerComposition.AMINO_ACIDS_20))
                .map(lambda pair: pair[1].num_groups)
                .cache())

# ## Print out poly-peptide chain statistics

# In[5]:

print(f"Total number of chains: {chainLengths.count()}")
print(f"Total number of groups: {chainLengths.sum()}")
print(f"Min chain length: {chainLengths.min()}")
print(f"Mean chain length: {chainLengths.mean()}")
print(f"Max chain length: {chainLengths.max()}")

# ## Terminate Spark
Example #20
0

path = "../../resources/mmtf_full_sample/"

pdb = mmtfReader.read_sequence_file(path, sc)

# ## Filter by representative protein chains at 40% sequence identity

# In[7]:

sequenceIdentity = 40
resolution = 2.0

# Pisces runs twice: on whole entries first, then on individual chains.
pdb = (pdb.filter(Pisces(sequenceIdentity, resolution))
          .flatMap(StructureToPolymerChains())
          .filter(Pisces(sequenceIdentity, resolution))
          .filter(PolymerComposition(PolymerComposition.AMINO_ACIDS_20)))

# ## Show top 10 structures

# In[8]:

pdb.top(10)


# ## Save representative set

# In[9]:

Example #21
0
# ## Configure Spark Context

# In[2]:

conf = SparkConf().setMaster("local[*]").setAppName(
    "SequenceSimilaritySearchDemo")
sc = SparkContext(conf=conf)

# ## Read PDB in MMTF format, split into polymer chain, search by sequence similarity, and print sequence found

# In[6]:

path = "../../resources/mmtf_reduced_sample/"

# BLAST search filter: keep chains similar to the query sequence.
similarity = SequenceSimilarity(
    sequence="NLVQFGVMIEKMTGKSALQYNDYGCYCGIGGSHWPVDQ",
    searchTool=SequenceSimilarity.BLAST,
    eValueCutoff=0.001,
    sequenceIdentityCutoff=40,
    maskLowComplexity=True)

pdb = (mmtfReader.read_sequence_file(path, sc)
       .flatMap(StructureToPolymerChains())
       .filter(similarity)
       .collect())

for pdbId, structure in pdb:
    print(f"{pdbId} :     {structure.entity_list[0]['sequence']}")

# ## Terminate Spark Context

# In[7]:

sc.stop()
# ## Configure Spark Context

# In[3]:

conf = SparkConf().setMaster("local[*]").setAppName("KinaseDemo")
sc = SparkContext(conf=conf)

# ## Query for human protein-serine/threonine kinases using SIFTS data

# In[4]:

# FIX: the organism name had been corrupted to 'H**o sapiens', which would
# match nothing and silently empty the result set; restored 'Homo sapiens'.
sql = ("SELECT t.pdbid, t.chain FROM sifts.pdb_chain_taxonomy AS t  "
       "JOIN sifts.pdb_chain_enzyme AS e ON (t.pdbid = e.pdbid AND t.chain = e.chain) "
       "WHERE t.scientific_name = 'Homo sapiens' AND e.ec_number = '2.7.11.1'")

# ## Read PDB and filter by author

# In[6]:

path = "../../resources/mmtf_reduced_sample/"

# Split entries into chains and keep those matching the SIFTS query above.
pdb = mmtfReader.read_sequence_file(path, sc).flatMap(
    StructureToPolymerChains()).filter(PdbjMineSearch(sql))

print(f"Number of entries matching query: {pdb.count()}")

# ## Terminate Spark Context

# In[7]:

sc.stop()