Exemple #1
0
    def test1(self):
        """Check the row count of the secondary-structure dataset for L-protein chains.

        Filters ``self.pdb`` with ``ContainsLProteinChain``, flat-maps the result
        with ``StructureToPolymerChains``, filters again with
        ``ContainsLProteinChain``, and passes the outcome to
        ``secondaryStructureExtractor.get_dataset``. The resulting dataset is
        expected to contain exactly 5 rows.
        """
        # NOTE(review): the filter is applied both before and after the flatMap,
        # presumably once on whole structures and once on individual chains;
        # confirm against the filter's implementation.
        pdb = self.pdb.filter(ContainsLProteinChain()) \
                      .flatMap(StructureToPolymerChains()) \
                      .filter(ContainsLProteinChain())

        seq = secondaryStructureExtractor.get_dataset(pdb)

        # assertEqual reports the actual count on failure, unlike assertTrue(a == b).
        self.assertEqual(seq.count(), 5)
    def test1(self):
        """Check DSSP code counts in the first row of the secondary-structure dataset.

        Flat-maps ``self.pdb`` with ``StructureToPolymerChains``, builds the
        dataset via ``secondaryStructureExtractor.get_dataset``, and verifies
        how often selected characters occur in the ``dsspQ8Code`` and
        ``dsspQ3Code`` values of the first row.
        """
        pdb = self.pdb.flatMap(StructureToPolymerChains())

        secStruct = secondaryStructureExtractor.get_dataset(pdb)

        # Fetch the first row once instead of calling first() for each column.
        first_row = secStruct.first()
        dsspQ8 = first_row["dsspQ8Code"]
        dsspQ3 = first_row["dsspQ3Code"]

        # str.count gives the number of occurrences directly; assertEqual
        # shows the actual count when the expectation is not met.
        self.assertEqual(dsspQ8.count('X'), 38)  # 'X' appears 38 times
        self.assertEqual(dsspQ8.count('C'), 24)  # 'C' appears 24 times
        self.assertEqual(dsspQ3.count('C'), 44)  # 'C' appears 44 times
sc = SparkContext(conf=conf)

# ## Download a single protein structure (1STP)

# In[11]:

pdbIds = ["1STP"]

# Download the requested entries, then call cache() on the result.
pdb = mmtfReader.download_mmtf_files(pdbIds, sc)
pdb = pdb.cache()

# ## Flat-map the structures to polymer chains

# In[12]:

pdb = pdb.flatMap(StructureToPolymerChains())

# ## Extract secondary structures into a dataset

# In[13]:

ds = secondaryStructureExtractor.get_dataset(pdb)

# Print the schema, then show the first two rows without truncation.
ds.printSchema()
ds.show(n=2, truncate=False)

# ## Shut down Spark

# In[14]:

sc.stop()