def test1(self):
    """Check the row count of the secondary-structure dataset.

    ``self.pdb`` is filtered with ``ContainsLProteinChain``, flat-mapped
    through ``StructureToPolymerChains`` and filtered with
    ``ContainsLProteinChain`` a second time.  The result is handed to
    ``secondaryStructureExtractor.get_dataset`` and the returned dataset
    is expected to contain exactly 5 rows.
    """
    # A parenthesized chain replaces the backslash continuations, which
    # break as soon as anything follows the backslash on the same line.
    pdb = (
        self.pdb.filter(ContainsLProteinChain())
        .flatMap(StructureToPolymerChains())
        .filter(ContainsLProteinChain())
    )
    seq = secondaryStructureExtractor.get_dataset(pdb)

    # assertEqual reports the actual count on failure; assertTrue on a
    # comparison would only say "False is not true".
    self.assertEqual(seq.count(), 5)
def test1(self):
    """Check character counts in the codes of the first dataset row.

    ``self.pdb`` is flat-mapped through ``StructureToPolymerChains`` and
    passed to ``secondaryStructureExtractor.get_dataset``.  From the first
    row of the result, the ``dsspQ8Code`` value must contain 'X' 38 times
    and 'C' 24 times, and the ``dsspQ3Code`` value must contain 'C' 44
    times.
    """
    pdb = self.pdb.flatMap(StructureToPolymerChains())
    secStruct = secondaryStructureExtractor.get_dataset(pdb)

    # Fetch the first row once so both codes are read from the same row
    # and the dataset is not evaluated a second time.
    first_row = secStruct.first()
    dsspQ8 = first_row["dsspQ8Code"]
    dsspQ3 = first_row["dsspQ3Code"]

    # str.count gives the number of occurrences directly (equivalent to
    # len(s.split(ch)) - 1 for a single character); assertEqual shows the
    # actual count when an expectation is not met.
    self.assertEqual(dsspQ8.count('X'), 38)  # 'X' appears 38 times
    self.assertEqual(dsspQ8.count('C'), 24)  # 'C' appears 24 times
    self.assertEqual(dsspQ3.count('C'), 44)  # 'C' appears 44 times
sc = SparkContext(conf=conf)

# The pipeline runs inside try/finally so that the SparkContext is stopped
# even when one of the steps below raises.
try:
    # ## Download single protein (1STP)

    # In[11]:

    pdbIds = ["1STP"]
    pdb = mmtfReader.download_mmtf_files(pdbIds, sc).cache()

    # ## Flatmap to polymer chains

    # In[12]:

    pdb = pdb.flatMap(StructureToPolymerChains())

    # ## Extract Secondary Structures

    # In[13]:

    ds = secondaryStructureExtractor.get_dataset(pdb)
    ds.printSchema()
    # NOTE(review): presumably a Spark DataFrame, in which case this prints
    # 2 rows with truncation disabled -- confirm against get_dataset.
    ds.show(2, False)
finally:
    # ## Terminate Spark

    # In[14]:

    sc.stop()