# In[2]:

conf = SparkConf().setMaster("local[*]").setAppName("FilterByReleaseDate")
sc = SparkContext(conf=conf)

# ## Read in MMTF Files, filter and count

# In[3]:

path = "../../resources/mmtf_reduced_sample/"

structures = mmtfReader.read_sequence_file(path, sc).filter(
    ReleaseDate("2000-01-28", "2017-02-28"))
print(f"Number of structures released between 2000-01-28 and 2017-02-28: {structures.count()}")

# ## Visualize Structures

# In[4]:

structure_names = structures.keys().collect()
view_structure(structure_names, style='line')

# ## Terminate Spark

# In[5]:

sc.stop()
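# ## (Aside) Custom filters with plain lambdas
#
# mmtf-pyspark filters such as ReleaseDate are callables applied through the
# standard RDD filter(), so a plain lambda over the (pdbId, structure) pairs
# works the same way. This is a minimal sketch, not part of the original
# notebook; run it before the Terminate Spark cell, while `structures` and
# the SparkContext are still live.

# In[ ]:

# Keep only entries whose PDB ID starts with "1" (a hypothetical criterion)
subset = structures.filter(lambda entry: entry[0].startswith("1"))
print(f"Entries with PDB IDs starting with '1': {subset.count()}")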
# ## Filter proteins that contain a DNA chain and an L-protein chain
#
# 1) Retain PDB entries that exclusively contain L-peptide chains
# 2) Retain PDB entries that exclusively contain L-DNA

# In[4]:

structures = pdb.filter(ContainsLProteinChain()).filter(ContainsDnaChain())

# ## Count number of entries

# In[5]:

count = structures.count()
print(f"Number of entries that contain L-protein and L-DNA: {count}")

# ## Visualize Structures

# In[7]:

structure_names = structures.keys().collect()
view_structure(structure_names)

# ## Terminate Spark

# In[6]:

sc.stop()
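# ## (Aside) Chained filters are a logical AND
#
# Each successive .filter() call narrows the RDD, so the chain above keeps
# only entries that pass both ContainsLProteinChain and ContainsDnaChain,
# regardless of the order in which the filters are applied. A minimal
# sketch, assuming the same imports and an active SparkContext (run before
# sc.stop()):

# In[ ]:

protein_then_dna = pdb.filter(ContainsLProteinChain()).filter(ContainsDnaChain())
dna_then_protein = pdb.filter(ContainsDnaChain()).filter(ContainsLProteinChain())

# Both orders select the same set of entries
assert sorted(protein_then_dna.keys().collect()) == \
       sorted(dna_then_protein.keys().collect())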
# * ExperimentalMethods.SOLUTION_NMR
# * ExperimentalMethods.SOLUTION_SCATTERING
# * ExperimentalMethods.THEORETICAL_MODEL
# * ExperimentalMethods.X_RAY_DIFFRACTION

# In[4]:

pdb = pdb.filter(
    ExperimentalMethods(ExperimentalMethods.NEUTRON_DIFFRACTION,
                        ExperimentalMethods.X_RAY_DIFFRACTION))

# ## Print out entries

# In[5]:

filtered_structures = pdb.keys().collect()
print(filtered_structures)

# ## Visualize 3D structures of filtered structures

# In[6]:

view_structure(filtered_structures)

# ## Terminate Spark

# In[7]:

sc.stop()
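# ## (Aside) Filtering on a single method
#
# ExperimentalMethods accepts one or more of the constants listed above. A
# minimal sketch restricting to solution NMR entries; it assumes `pdb` still
# holds the unfiltered RDD and the SparkContext is active (run before
# sc.stop()):

# In[ ]:

from mmtfPyspark.filters import ExperimentalMethods

nmr = pdb.filter(ExperimentalMethods(ExperimentalMethods.SOLUTION_NMR))
print(f"Solution NMR entries: {nmr.count()}")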
structures = structures.filter(NotFilter(ContainsDnaChain()))

# ## Count number of entries

# In[10]:

count = structures.count()
print(f"PDB entries without DNA chains: {count}")

# ## Visualize Structures

# In[11]:

view_structure(structures.keys().collect())

# ## Terminate Spark

# In[7]:

sc.stop()
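# ## (Aside) Composing NotFilter with other filters
#
# NotFilter inverts any wrapped filter, and the result chains like any other
# filter. A minimal sketch keeping L-protein entries while excluding DNA; it
# assumes `structures` holds the unfiltered RDD, ContainsLProteinChain is
# imported (as in the earlier notebook), and the SparkContext is still
# active:

# In[ ]:

protein_no_dna = structures.filter(ContainsLProteinChain()) \
                           .filter(NotFilter(ContainsDnaChain()))
print(f"L-protein entries without DNA: {protein_no_dna.count()}")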
conf = SparkConf().setMaster("local[*]").setAppName("ReadLocalMMTFReduced")
sc = SparkContext(conf=conf)

# ## Read in local Hadoop Sequence Files and count number of entries

# In[3]:

path = "../../resources/mmtf_reduced_sample/"

pdb = mmtfReader.read_sequence_file(path, sc)
count = pdb.count()
print(f'number of pdb entries read: {count}')

# ## Visualize Structures

# In[5]:

structures = pdb.keys().collect()
view_structure(structures, style='sphere')

# ## Terminate Spark

# In[6]:

sc.stop()
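# ## (Aside) Fetching entries by PDB ID
#
# Besides reading a local Hadoop Sequence File, mmtfReader can download
# individual entries from RCSB by ID via download_mmtf_files. A minimal
# sketch with hypothetical example IDs; it assumes network access and an
# active SparkContext (run before sc.stop()):

# In[ ]:

from mmtfPyspark.io import mmtfReader

pdb_ids = ['4HHB', '1AQ1']  # hypothetical example IDs
downloaded = mmtfReader.download_mmtf_files(pdb_ids, sc)
print(f'number of downloaded entries: {downloaded.count()}')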
# ## Retain high resolution X-ray structures

# In[4]:

pdb = pdb.filter(ExperimentalMethods(ExperimentalMethods.X_RAY_DIFFRACTION)) \
         .filter(Resolution(0, 2.0)) \
         .filter(RFree(0, 2.0))
print(f'number of pdb entries left: {pdb.count()}')

# ## Visualize Structures

# In[5]:

structures = pdb.keys().collect()
view_structure(structures)

# ## Save this subset in a Hadoop Sequence File

# In[7]:

write_path = "./mmtf_subset_xray"

# Reduce RDD to 8 partitions
pdb = pdb.coalesce(8)
mmtfWriter.write_sequence_file(write_path, sc, pdb)

# ## Terminate Spark

# In[8]:

sc.stop()
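# ## (Aside) Why coalesce before writing
#
# Spark writes one output part file per RDD partition, so coalescing to a
# small partition count keeps the saved subset in a few files rather than
# many tiny ones. A minimal sketch inspecting the partition counts; it
# assumes `pdb` and the SparkContext are still live (run before sc.stop()):

# In[ ]:

print(f'partitions before coalesce: {pdb.getNumPartitions()}')
print(f'partitions after coalesce : {pdb.coalesce(8).getNumPartitions()}')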
path = "../../resources/mmtf_full_sample/" fraction = 0.5 seed = 123 pdb = mmtfReader.read_sequence_file(path, sc, fraction=fraction, seed=seed) count = pdb.count() print(f'number of pdb entries read : {count}') # ## Visualize Structures # In[5]: structures = pdb.keys().collect() view_structure(structures, style='stick') # ## Save this subset in a Hadoop Sequence File # In[4]: write_path = "./mmtf_subset" mmtfWriter.write_sequence_file(write_path, sc, pdb) # ## Terminate Spark # In[5]: sc.stop()
# ## Filter by deposition date

# In[4]:

pdb = pdb.filter(DepositionDate('1999-02-26', '1999-02-28'))

# ## Count number of entries

# In[5]:

count = pdb.count()
print(f"Number of structures deposited between 1999-02-26 and 1999-02-28: {count}")

# ## View 3D structures

# In[6]:

pdbIds = pdb.keys().collect()
view_structure(pdbIds)

# ## Terminate Spark

# In[7]:

sc.stop()
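# ## (Aside) Combining date filters
#
# DepositionDate composes with ReleaseDate (used in the first notebook) to
# bracket both events. A minimal sketch, assuming both filters are imported,
# `pdb` holds the unfiltered RDD, and the SparkContext is active (run
# before sc.stop()):

# In[ ]:

from mmtfPyspark.filters import DepositionDate, ReleaseDate

windowed = pdb.filter(DepositionDate('1999-01-01', '1999-12-31')) \
              .filter(ReleaseDate('1999-01-01', '2000-12-31'))
print(f"Deposited in 1999 and released by end of 2000: {windowed.count()}")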