def test1(self):
    # Keep only what passes ContainsLProteinChain, split via
    # StructureToPolymerChains, then apply ContainsLProteinChain again to the
    # flat-mapped result.
    pdb = (
        self.pdb.filter(ContainsLProteinChain())
        .flatMap(StructureToPolymerChains())
        .filter(ContainsLProteinChain())
    )

    seq = secondaryStructureExtractor.get_dataset(pdb)

    # assertEqual reports the actual count on failure, which
    # assertTrue(count == 5) does not.
    self.assertEqual(seq.count(), 5)
def test1(self):
    # Apply ContainsLProteinChain with its default arguments to self.pdb and
    # check which keys survive the filter.
    pdb_1 = self.pdb.filter(ContainsLProteinChain())
    results_1 = pdb_1.keys().collect()

    # assertIn / assertNotIn name the missing or unexpected key on failure.
    self.assertIn('2ONX', results_1)
    self.assertNotIn('1JLP', results_1)
    self.assertIn('5X6H', results_1)
    self.assertNotIn('5L2G', results_1)
    self.assertNotIn('2MK1', results_1)
def test2(self):
    # Apply ContainsLProteinChain(exclusive=True) to self.pdb and check which
    # keys survive the filter; only '2ONX' is expected to remain.
    pdb_2 = self.pdb.filter(ContainsLProteinChain(exclusive=True))
    results_2 = pdb_2.keys().collect()

    # assertIn / assertNotIn name the missing or unexpected key on failure.
    self.assertIn('2ONX', results_2)
    self.assertNotIn('1JLP', results_2)
    self.assertNotIn('5X6H', results_2)
    self.assertNotIn('5L2G', results_2)
    self.assertNotIn('2MK1', results_2)
def test3(self):
    # Flat-map self.pdb with StructureToPolymerChains first, then apply
    # ContainsLProteinChain; the collected keys have the form '<id>.<chain>'.
    pdb_3 = self.pdb.flatMap(StructureToPolymerChains())
    pdb_3 = pdb_3.filter(ContainsLProteinChain())
    results_3 = pdb_3.keys().collect()

    # assertIn / assertNotIn name the missing or unexpected key on failure.
    self.assertIn('2ONX.A', results_3)
    self.assertNotIn('1JLP.A', results_3)
    self.assertIn('5X6H.B', results_3)
    self.assertNotIn('5L2G.A', results_3)
    self.assertNotIn('5L2G.B', results_3)
    self.assertNotIn('2MK1.A', results_3)
# ## Read MMTF Hadoop sequence file and
#
# Create a non-redundant set (<=20% seq. identity) of L-protein chains

# In[3]:


path = "../../resources/mmtf_reduced_sample/"
sequenceIdentity = 20
resolution = 2.0
fraction = 0.1
seed = 123

# The pipeline is spelled out one step at a time so each stage can be
# inspected on its own; the order of the steps matches the chained form.
structures = mmtfReader.read_sequence_file(path, sc)
polymer_chains = structures.flatMap(StructureToPolymerChains())
pisces_subset = polymer_chains.filter(Pisces(sequenceIdentity, resolution))
l_protein_chains = pisces_subset.filter(ContainsLProteinChain())
pdb = l_protein_chains.sample(False, fraction, seed)


# ## Get content

# In[4]:


segmentLength = 11
segments = secondaryStructureSegmentExtractor.get_dataset(pdb, segmentLength)
data = segments.cache()  # cached because the dataset is reused after the count below
original_count = data.count()
print(f"original data : {original_count}")


# ## Drop Q3 and sequence duplicates

# In[5]:
# ## Read in MMTF Files

# In[3]:


path = "../../resources/mmtf_reduced_sample/"

pdb = mmtfReader.read_sequence_file(path, sc)


# ## Filter entries that contain both a DNA chain and an L-protein chain
#
# 1) Retain PDB entries that pass the ContainsLProteinChain filter
# 2) Of those, retain PDB entries that also pass the ContainsDnaChain filter
#
# Note: neither filter is passed `exclusive=True` here.

# In[4]:


structures = pdb.filter(ContainsLProteinChain()).filter(ContainsDnaChain())


# ## Count number of entries

# In[5]:


count = structures.count()

print(f"Number of entries that contain L-protein and L-DNA: {count}")


# ## Visualize Structures

# In[7]:


structure_names = structures.keys().collect()
view_structure(structure_names)
sc = SparkContext(conf=conf)


# ## Read MMTF Hadoop sequence file and
#
# Create a random sample of L-protein polymer chains

# In[3]:


path = "../../resources/mmtf_reduced_sample/"
fraction = 0.05
seed = 123

# Each stage of the pipeline gets its own name; the order of the steps is the
# same as in a single chained expression.
structures = mmtfReader.read_sequence_file(path, sc)
# NOTE(review): the positional flags (False, True) are passed through
# unchanged -- confirm their meaning against StructureToPolymerChains.
polymer_chains = structures.flatMap(StructureToPolymerChains(False, True))
l_protein_chains = polymer_chains.filter(ContainsLProteinChain())
pdb = l_protein_chains.sample(False, fraction, seed)


# ## Extract Element "H" from Secondary Structure

# In[4]:


label = "H"
elements = secondaryStructureElementExtractor.get_dataset(pdb, label)
data = elements.cache()  # cached because it is counted and then shown
original_count = data.count()
print(f"original data : {original_count}")
data.show(10, False)


# ## Word2Vec encoded feature Vector

# In[6]:
# ## Read MMTF Hadoop sequence file and
#
# Create a non-redundant set (<=20% seq. identity) of L-protein chains

# In[3]:


path = "../../resources/mmtf_reduced_sample/"
sequenceIdentity = 20
resolution = 2.0
fraction = 0.1
seed = 123

# The chain is wrapped in parentheses so that the lines starting with "."
# are a valid implicit continuation.
pdb = (
    mmtfReader
    .read_sequence_file(path, sc)
    .flatMap(StructureToPolymerChains())
    .filter(Pisces(sequenceIdentity, resolution))
    .filter(ContainsLProteinChain())
    .sample(False, fraction, seed)
)


# ## Get content

# In[4]:


segmentLength = 11
# Cached because the dataset is reused after the count below.
data = secondaryStructureSegmentExtractor.get_dataset(pdb, segmentLength).cache()
print(f"original data : {data.count()}")


# ## Drop Q3 and sequence duplicates

# In[5]:
"secondaryStructureElementDemo")  # tail of a statement that begins above this excerpt
sc = SparkContext(conf=conf)


# ## Download protein (1STP)
#
# ### Note: the SparkContext must be passed as a parameter to download MMTF files

# In[12]:


# The downloaded structures are cached before the transformations below.
pdb = mmtfReader.download_mmtf_files(['1STP'], sc).cache()


# ## Map protein to polymer chains and apply the ContainsLProteinChain filter

# In[13]:


pdb = pdb.flatMap(StructureToPolymerChains()).filter(ContainsLProteinChain())


# ## Extract secondary structure element 'E'

# In[14]:


# NOTE(review): the third argument (6) is passed through as-is; presumably a
# minimum element length -- confirm against secondaryStructureElementExtractor.
ds = secondaryStructureElementExtractor.get_dataset(pdb, 'E', 6)

ds.show(50, False)


# ## Terminate Spark

# In[15]:


sc.stop()
# input parameters
# These values drive the resolution filter and the interaction criteria
# below; change them here rather than editing the calls.
resolution = 2.0
minInteractions = 2
maxInteractions = 4
distanceCutoff = 3.0
bFactorCutoff = 1.645
# NOTE(review): includeWaters is not referenced in this section -- confirm it
# is consumed further down.
includeWaters = True


# ## Read PDB and filter by resolution and only include proteins

# In[3]:


pdb = mmtfReader.read_sequence_file(path, sc)
pdb = pdb.filter(Resolution(minResolution=0.0, maxResolution=resolution)).filter(
    ContainsLProteinChain(exclusive=True))


# ## Setup criteria for water interactions

# In[4]:


interactions_filter = InteractionFilter()
interactions_filter.set_distance_cutoff(distanceCutoff)
interactions_filter.set_normalized_b_factor_cutoff(bFactorCutoff)
interactions_filter.set_min_interactions(minInteractions)
interactions_filter.set_max_interactions(maxInteractions)
interactions_filter.set_query_groups(True, ["HOH"])
interactions_filter.set_query_elements(True, "O")  # Only use water oxygen
interactions_filter.set_target_elements(True, ["O", "N", "S"])


# ## Exclude "uninteresting" ligands