def test1(self):
    """Check the Pisces(20, 2.0) filter applied to whole structures.

    The filtered key list must contain '5X42' and '4R4X' and must not
    contain '2ONX' or '1JLP'.
    """
    pdb_1 = self.pdb.filter(Pisces(20, 2.0))
    results_1 = pdb_1.keys().collect()

    # assertIn/assertNotIn report the missing/unexpected key and the
    # collection on failure, unlike assertTrue('x' in y).
    self.assertIn('5X42', results_1)
    self.assertIn('4R4X', results_1)
    self.assertNotIn('2ONX', results_1)
    self.assertNotIn('1JLP', results_1)
def test2(self):
    """Check the Pisces(20, 2.0) filter applied to individual polymer chains.

    Structures are first flat-mapped with StructureToPolymerChains, then
    filtered. The resulting keys must contain '5X42.B' and '4R4X.A' and
    must not contain '5X42.A', '2ONX.A' or '1JLP.A'.
    """
    pdb_2 = self.pdb.flatMap(StructureToPolymerChains())
    pdb_2 = pdb_2.filter(Pisces(20, 2.0))
    results_2 = pdb_2.keys().collect()

    # assertIn/assertNotIn report the missing/unexpected key and the
    # collection on failure, unlike assertTrue('x' in y).
    self.assertIn('5X42.B', results_2)
    self.assertIn('4R4X.A', results_2)
    self.assertNotIn('5X42.A', results_2)
    self.assertNotIn('2ONX.A', results_2)
    self.assertNotIn('1JLP.A', results_2)
# Create variables
APP_NAME = "MMTF_Spark"
path = "../../resources/mmtf_full_sample/"

# Configure Spark (built one setter at a time; each setter returns the conf)
conf = SparkConf()
conf = conf.setAppName(APP_NAME)
conf = conf.setMaster("local[*]")
sc = SparkContext(conf=conf)


# ## Read PDB and create PISCES non-redundant set

# In[14]:

# Read the sequence file and keep only entries passing the PISCES filter
pdb = mmtfReader.read_sequence_file(path, sc).filter(
    Pisces(sequenceIdentity=30, resolution=2.5)
)


# ## Setup criteria for metal interactions

# In[15]:

# Chemical component codes of metals in different oxidation states
metals = {
    "V", "CR",
    "MN", "MN3",
    "FE", "FE2",
    "CO", "3CO",
    "NI", "3NI",
    "CU", "CU1", "CU3",
    "ZN",
    "MO", "4MO", "6MO",
}

interactions_filter = InteractionFilter(
    distanceCutoff=3.0,
    minInteractions=4,
    maxInteractions=6,
)
interactions_filter.set_query_groups(True, metals)

# Exclude non-polar interactions
interactions_filter.set_target_elements(False, ['H', 'C', 'P'])
# ## Read MMTF Hadoop sequence file and
#
# Create a non-redundant set (<=20% seq. identity) of L-protein chains

# In[3]:

path = "../../resources/mmtf_reduced_sample/"
sequenceIdentity = 20
resolution = 2.0
fraction = 0.1
seed = 123

# One step per line: split into chains, PISCES-filter, keep L-protein
# chains, then sample without replacement using a fixed seed.
pdb = (
    mmtfReader.read_sequence_file(path, sc)
    .flatMap(StructureToPolymerChains())
    .filter(Pisces(sequenceIdentity, resolution))
    .filter(ContainsLProteinChain())
    .sample(False, fraction, seed)
)


# ## Get content

# In[4]:

segmentLength = 11
data = secondaryStructureSegmentExtractor.get_dataset(pdb, segmentLength)
data = data.cache()
print(f"original data : {data.count()}")


# ## Drop Q3 and sequence duplicates

# In[5]:
path = "../../resources/mmtf_full_sample/"

pdb = mmtfReader.read_sequence_file(path, sc)


# ## Filter by representative protein chains at 40% sequence identity

# In[7]:

sequenceIdentity = 40
resolution = 2.0

# The PISCES filter is applied twice: once before the structures are split
# into polymer chains and once again on the resulting chains.
pdb = pdb.filter(Pisces(sequenceIdentity, resolution))
pdb = pdb.flatMap(StructureToPolymerChains())
pdb = pdb.filter(Pisces(sequenceIdentity, resolution))
pdb = pdb.filter(PolymerComposition(PolymerComposition.AMINO_ACIDS_20))


# ## Show top 10 structures

# In[8]:

pdb.top(10)


# ## Save representative set

# In[9]:
# ## Read MMTF Hadoop sequence file and
#
# Create a non-redundant set (<=20% seq. identity) of L-protein chains

# In[3]:

path = "../../resources/mmtf_reduced_sample/"
sequenceIdentity = 20
resolution = 2.0
fraction = 0.1
seed = 123

# Build the data set step by step, rebinding `pdb` after each transformation.
pdb = mmtfReader.read_sequence_file(path, sc)
pdb = pdb.flatMap(StructureToPolymerChains())
pdb = pdb.filter(Pisces(sequenceIdentity, resolution))
pdb = pdb.filter(ContainsLProteinChain())
pdb = pdb.sample(False, fraction, seed)  # without replacement, fixed seed


# ## Get content

# In[4]:

segmentLength = 11
data = secondaryStructureSegmentExtractor.get_dataset(pdb, segmentLength)
data = data.cache()
print(f"original data : {data.count()}")


# ## Drop Q3 and sequence duplicates

# In[5]:
# ## Configure Spark Context # In[18]: conf = SparkConf().setMaster("local[*]").setAppName("MachineLearningDemo") sc = SparkContext(conf=conf) # ## Read MMTF File and create a non-redundant set (<=40% seq. identity) of L-protein chains # In[19]: pdb = mmtfReader.read_sequence_file('../../resources/mmtf_reduced_sample/', sc).flatMap( StructureToPolymerChains()).filter( Pisces(sequenceIdentity=40, resolution=3.0)) # ## Get secondary structure content # In[20]: data = secondaryStructureExtractor.get_dataset(pdb) # ## Define add_protein_fold_type function # In[21]: def add_protein_fold_type(data, minThreshold, maxThreshold): ''' Adds a column "foldType" with three major secondary structure class:
"ProteinFoldDatasetCreatorDemo") sc = SparkContext(conf=conf) # ## Read MMTF Hadoop sequence file # # Create non-redundant set (<=40% seq. identity) of L-protein chains # In[15]: path = "../../resources/mmtf_reduced_sample/" sequenceIdentity = 40 resolution = 2.0 pdb = mmtfReader.read_sequence_file(path, sc).filter( Pisces(sequenceIdentity, resolution)).flatMap(StructureToPolymerChains()).filter( Pisces(sequenceIdentity, resolution)).filter(ContainsLProteinChain()) # ## Get secondary structure content # In[16]: data = secondaryStructureExtractor.get_dataset(pdb) # ## Classify chains by secondary structure type # In[17]: minThreshold = 0.05 maxThreshold = 0.15
resolution = 2.5
minInteractions = 4
maxInteractions = 6
distanceCutoff = 3.0

# Chemical component codes of metals in different oxidation states
metals = {
    "V", "CR",
    "MN", "MN3",
    "FE", "FE2",
    "CO", "3CO",
    "NI", "3NI",
    "CU", "CU1", "CU3",
    "ZN",
    "MO", "4MO", "6MO",
}


# ## Read PDB and create PISCES non-redundant set

# In[12]:

# Read the sequence file and apply the PISCES filter in a single expression
pdb = mmtfReader.read_sequence_file(path, sc).filter(
    Pisces(sequenceIdentity=sequenceIdentityCutoff, resolution=resolution)
)


# ## Setup criteria for metal interactions

# In[13]:

interactions_filter = InteractionFilter()
interactions_filter.set_distance_cutoff(distanceCutoff)
interactions_filter.set_min_interactions(minInteractions)
interactions_filter.set_max_interactions(maxInteractions)
interactions_filter.set_query_groups(True, metals)

# Exclude non-polar interactions
interactions_filter.set_target_elements(False, ['H', 'C', 'P'])
# ## Read PDB in MMTF format

# In[3]:

path = "../../resources/mmtf_full_sample/"

pdb = mmtfReader.read_sequence_file(path, sc)


# ## Use only representative structures

# In[4]:

seqId = 40
resolution = 2.0

pdb = pdb.filter(Pisces(sequenceIdentity=seqId, resolution=resolution))


# ## Extract proteins with Zn interactions

# In[5]:

finder = groupInteractionExtractor("ZN", 3)
interactions = finder.get_dataset(pdb)
interactions = interactions.cache()


# ## List the top 10 residue types that interact with Zn

# In[6]:

interactions.printSchema()