def setUp(self):
    """Spin up a local Spark context and download the structures under test."""
    cfg = SparkConf()
    cfg = cfg.setMaster("local[*]")
    cfg = cfg.setAppName('polymerSequenceExtractorTest')
    self.sc = SparkContext(conf=cfg)
    entries = ["1STP", "4HHB"]
    self.pdb = downloadMmtfFiles(entries, self.sc)
def setUp(self):
    """Create a local Spark context and fetch the PISCES filter test set."""
    cfg = SparkConf().setAppName('piscesTest').setMaster("local[*]")
    self.sc = SparkContext(conf=cfg)
    # "4R4X.A" and "5X42.B" should pass filter
    entries = ["5X42", "4R4X", "2ONX", "1JLP"]
    self.pdb = downloadMmtfFiles(entries, self.sc)
def setUp(self):
    """Create a local Spark context and download the chain-type test structures."""
    cfg = SparkConf().setAppName('containsDProteinChainTest').setMaster("local[*]")
    self.sc = SparkContext(conf=cfg)
    entries = ['2ONX', '1JLP', '5X6H', '5L2G', '2MK1', '2V5W', '5XDP', '5GOD']
    self.pdb = downloadMmtfFiles(entries, self.sc)
def setUp(self):
    """Spin up a local Spark context and download the single test structure."""
    cfg = SparkConf()
    cfg = cfg.setMaster("local[*]")
    cfg = cfg.setAppName('secondaryStructureSegmentExtractorTest')
    self.sc = SparkContext(conf=cfg)
    self.pdb = downloadMmtfFiles(["1STP"], self.sc)
def setUp(self):
    """Create a local Spark context and fetch the advanced-query test set."""
    cfg = SparkConf().setAppName('advancedQueryTest').setMaster("local[*]")
    self.sc = SparkContext(conf=cfg)
    entries = ["5JDE", "5CU4", "5L6W", "5UFU", "5IHB"]
    self.pdb = downloadMmtfFiles(entries, self.sc)
def test2(self):
    """The bioassembly of 5NV3 should yield exactly 12 protein dimers."""
    self.pdb = downloadMmtfFiles(["5NV3"], self.sc)
    dimers = self.pdb \
        .flatMap(structureToBioassembly()) \
        .flatMap(structureToProteinDimers(8, 20, False, True))
    self.assertTrue(dimers.count() == 12)
def setUp(self):
    """Create a local Spark context and download the structures used by these tests."""
    entries = ["2ONX", "5VLN", "5VAI", "5JXV", "5K7N", "3PDM",
               "5MNX", "5I1R", "5MON", "5LCB", "3J07"]
    cfg = SparkConf().setAppName('containsDProteinChainTest').setMaster("local[*]")
    self.sc = SparkContext(conf=cfg)
    self.pdb = downloadMmtfFiles(entries, self.sc)
def setUp(self):
    """Create a local Spark context and download the secondary-structure test set."""
    cfg = SparkConf().setAppName('testrFreeFilter').setMaster("local[*]")
    self.sc = SparkContext(conf=cfg)
    # 1AIE: all alpha protein 20 alpha out of 31 = 0.645 helical
    # 1E0N: all beta protein, NMR structure with 10 models, 13 beta out of 27 = 0.481 sheet
    # 1EM7: alpha + beta, 14 alpha + 23 beta out of 56 = 0.25 helical and 0.411 sheet
    # 2C7M: 2 chains, alpha + beta (DSSP in MMTF doesn't match DSSP on RCSB PDB website)
    entries = ["1AIE", "1E0N", "1EM7", "2C7M"]
    self.pdb = downloadMmtfFiles(entries, self.sc)
def setUp(self):
    """Create a Spark context, download structures, and expand alternative locations."""
    cfg = SparkConf()
    cfg = cfg.setMaster("local[*]")
    cfg = cfg.setAppName('testContainsAlternativeLocations')
    self.sc = SparkContext(conf=cfg)
    # 4QXX: has alternative location ids
    # 2ONX: has no alternative location ids
    raw = downloadMmtfFiles(['4QXX', '2ONX'], self.sc)
    self.pdb = raw.map(lambda kv: (kv[0], kv[1].set_alt_loc_list()))
def setUp(self):
    """Create a local Spark context and download the sequence-motif test structures."""
    cfg = SparkConf().setAppName('containsDProteinChainTest').setMaster("local[*]")
    self.sc = SparkContext(conf=cfg)
    # 5KE8: contains Zinc finger motif
    # 1JLP: does not contain Zinc finger motif
    # 5VAI: contains Walker P loop
    entries = ['5KE8', '1JLP', '5VAI']
    self.pdb = downloadMmtfFiles(entries, self.sc)
def setUp(self):
    """Create a local Spark context and download the rFree-filter test structures."""
    cfg = SparkConf().setAppName('testrFreeFilter').setMaster("local[*]")
    self.sc = SparkContext(conf=cfg)
    # 2ONX: 0.202 rfree x-ray resolution
    # 2OLX: 0.235 rfree x-ray resolution
    # 3REC: n/a NMR structure
    # 1LU3: n/a EM structure
    entries = ['2ONX', '2OLX', '3REC', '1LU3']
    self.pdb = downloadMmtfFiles(entries, self.sc)
def setUp(self):
    """Create a local Spark context and download the deposition-date test structures."""
    cfg = SparkConf().setAppName('testrFreeFilter').setMaster("local[*]")
    self.sc = SparkContext(conf=cfg)
    # 4MYA: deposited on 2013-09-27
    # 1O6Y: deposited on 2002-10-21
    # 3VCO: deposited on 2012-01-04
    # 5N0Y: deposited on 2017-02-03
    entries = ['1O6Y', '4MYA', '3VCO', '5N0Y']
    self.pdb = downloadMmtfFiles(entries, self.sc)
def setUp(self):
    """Create a local Spark context and download the release-date test structures."""
    # 1O6Y: released on 2003-01-30
    # 4MYA: released on 2014-01-01
    # 3VCO: released on 2013-03-06
    # 5N0Y: released on 2017-05-24
    entries = ['1O6Y', '4MYA', '3VCO', '5N0Y']
    cfg = SparkConf().setAppName('testrFreeFilter').setMaster("local[*]")
    self.sc = SparkContext(conf=cfg)
    self.pdb = downloadMmtfFiles(entries, self.sc)
def setUp(self):
    """Create a local Spark context and download the polymer-composition test set."""
    cfg = SparkConf()
    cfg = cfg.setMaster("local[*]")
    cfg = cfg.setAppName('testContainsAlternativeLocations')
    self.sc = SparkContext(conf=cfg)
    # 2ONX: only L-protein chain
    # 1JLP: single L-protein chains with non-polymer capping group (NH2)
    # 5X6H: L-protein and non-std. DNA chain
    # 5L2G: DNA chain
    # 2MK1: D-saccharide
    entries = ['2ONX', '1JLP', '5X6H', '5L2G', '2MK1']
    self.pdb = downloadMmtfFiles(entries, self.sc)
def test4(self):
    """The bioassembly of 1BZ5 should yield exactly 5 protein dimers.

    Expected dimer pairs:
      C5-B4, C6-B3, D7-A2, D8-A1, E10-E9
    """
    pdbIds = ["1BZ5"]
    self.pdb = downloadMmtfFiles(pdbIds, self.sc)
    pdb_4 = self.pdb.flatMap(structureToBioassembly()) \
        .flatMap(structureToProteinDimers(9, 20, False, True))
    # Fix: the assertion previously referenced the undefined name `pdb_3`
    # (copied from test3), which raised NameError instead of checking the
    # dimer count of the RDD actually built above (`pdb_4`).
    self.assertTrue(pdb_4.count() == 5)
def setUp(self):
    """Create a local Spark context and download the chain-type test structures."""
    cfg = SparkConf()
    cfg = cfg.setMaster("local[*]")
    cfg = cfg.setAppName('containsDProteinChainTest')
    self.sc = SparkContext(conf=cfg)
    # 2ONX: only L-protein chain
    # 1JLP: single L-protein chains with non-polymer capping group (NH2)
    # 5X6H: L-protein and L-DNA chain
    # 5L2G: L-DNA chain
    # 2MK1: As of V5 of PDBx/mmCIF, saccharides seem to be represented as monomers,
    #       instead of polysaccharides, so none of these tests returns true anymore.
    entries = ['2ONX', '1JLP', '5X6H', '5L2G', '2MK1']
    self.pdb = downloadMmtfFiles(entries, self.sc)
def setUp(self):
    """Create a local Spark context and download the advanced-query test set."""
    cfg = SparkConf().setAppName('advancedQueryTest').setMaster("local[*]")
    self.sc = SparkContext(conf=cfg)
    # 1PEN wildtype query 100 matches: 1PEN:1
    # 1OCZ two entities wildtype query 100 matches: 1OCZ:1, 1OCZ:2
    # 2ONX structure result for author query
    # 5L6W two chains: chain L is EC 2.7.11.1, chain chain C is not EC 2.7.11.1
    # 5KHU many chains, chain Q is EC 2.7.11.1
    # 1F3M entity 1: chains A,B, entity 2: chains B,C, all chains are EC 2.7.11.1
    entries = ["1PEN", "1OCZ", "2ONX", "5L6W", "5KHU", "1F3M"]
    self.pdb = downloadMmtfFiles(entries, self.sc)
def setUp(self):
    """Create a local Spark context and download the polymer-type test structures."""
    cfg = SparkConf().setAppName('testrFreeFilter').setMaster("local[*]")
    self.sc = SparkContext(conf=cfg)
    # 2ONX: only L-protein chain
    # 1JLP: single L-protein chains with non-polymer capping group (NH2)
    # 5X6H: L-protein and non-std. DNA chain
    # 5L2G: DNA chain
    # 2MK1: D-saccharide
    # 5UZT: RNA chain (with std. nucleotides)
    # 1AA6: contains SEC, selenocysteine (21st amino acid)
    # 1NTH: contains PYL, pyrrolysine (22nd amino acid)
    entries = ['2ONX', '1JLP', '5X6H', '5L2G', '2MK1', '5UZT', '1AA6', '1NTH']
    self.pdb = downloadMmtfFiles(entries, self.sc)
def setUp(self):
    """Create a local Spark context and download the mixed polymer test structures."""
    cfg = SparkConf()
    cfg = cfg.setMaster("local[*]")
    cfg = cfg.setAppName('containsDProteinChainTest')
    self.sc = SparkContext(conf=cfg)
    # 2ONX: only L-protein chain
    # 1JLP: single L-protein chains with non-polymer capping group (NH2)
    # 5X6H: L-protein and DNA chain
    # 5L2G: DNA chain
    # 2MK1: D-saccharide
    # 5UX0: 2 L-protein, 2 RNA, 2 DNA chains
    # 2NCQ: 2 RNA chains
    entries = ['2ONX', '1JLP', '5X6H', '5L2G', '2MK1', '5UX0', '2NCQ']
    self.pdb = downloadMmtfFiles(entries, self.sc)
def setUp(self):
    """Create a local Spark context and download structures with a known chain total."""
    cfg = SparkConf()
    cfg = cfg.setMaster("local[*]")
    cfg = cfg.setAppName('testContainsAlternativeLocations')
    self.sc = SparkContext(conf=cfg)
    # 1STP: 1 L-protein chain:
    # 4HHB: 4 polymer chains
    # 1JLP: 1 L-protein chains with non-polymer capping group (NH2)
    # 5X6H: 1 L-protein and 1 DNA chain
    # 5L2G: 2 DNA chain
    # 2MK1: 0 polymer chains
    # --------------------
    # tot: 10 chains
    entries = ["1STP", "4HHB", "1JLP", "5X6H", "5L2G", "2MK1"]
    self.pdb = downloadMmtfFiles(entries, self.sc)
def setUp(self):
    """Create a local Spark context and download the nucleotide/amino-acid test set."""
    cfg = SparkConf()
    cfg = cfg.setMaster("local[*]")
    cfg = cfg.setAppName('testContainsAlternativeLocations')
    self.sc = SparkContext(conf=cfg)
    # 2ONX: only L-protein chain
    # 1JLP: single L-protein chains with non-polymer capping group (NH2)
    # 5X6H: L-protein and DNA chain (with std. nucleotides)
    # 5L2G: DNA chain (with non-std. nucleotide)
    # 2MK1: D-saccharide
    # 5UZT: RNA chain (with std. nucleotides)
    # 1AA6: contains SEC, selenocysteine (21st amino acid)
    # 1NTH: contains PYL, pyrrolysine (22nd amino acid)
    entries = ["2ONX", "1JLP", "5X6H", "5L2G", "2MK1", "5UZT", "1AA6", "1NTH"]
    self.pdb = downloadMmtfFiles(entries, self.sc)
def test3(self):
    """The bioassembly of 4GIS should yield exactly 9 protein dimers.

    Expected dimer pairs:
      A3-A2, A4-A1, B5-A1, B6-A2, B6-B5, B7-A3, B7-A4, B8-A4, B8-B7
    """
    self.pdb = downloadMmtfFiles(["4GIS"], self.sc)
    pdb_3 = self.pdb \
        .flatMap(structureToBioassembly()) \
        .flatMap(structureToProteinDimers(8, 20, False, True))
    self.assertTrue(pdb_3.count() == 9)
def setUp(self):
    """Create a local Spark context and download the BLAST-clusters test structures."""
    cfg = SparkConf().setAppName('blastClustersTest').setMaster("local[*]")
    self.sc = SparkContext(conf=cfg)
    entries = ["1O06", "2ONX"]
    self.pdb = downloadMmtfFiles(entries, self.sc)
def main(argv):
    """Download two example MMTF structures and print their complete data
    hierarchy (metadata, crystallographic data, chains, groups, atoms, and
    bioassemblies) for each structure via a Spark foreach.

    argv: command-line arguments; accepts -p/--path <path_to_mmtf>.
          NOTE(review): the parsed `path` is assigned but never used below —
          confirm whether a file-reader branch was intended.
    """
    # Configure a local Spark context with generous executor resources.
    conf = SparkConf().setAppName(APP_NAME)
    conf = conf.setMaster("local[*]")
    conf = conf.set("spark.executor.memory", "64g")
    conf = conf.set("spark.driver.cores","32")
    sc = SparkContext(conf=conf)
    # Get command line input
    try :
        opts,args = getopt.getopt(argv,"p:",["--path="])
    except getopt.GetoptError:
        print("traverse.py -p <path_to_mmtf>")
        sys.exit()
    for opt,arg in opts:
        if opt in ["-p","--path"]:
            path = arg  # NOTE(review): captured but unused
    # Download the two demo structures instead of reading from `path`.
    pdbIds = ['1AQ1','5GOD']
    pdb = downloadMmtfFiles(pdbIds,sc)

    def getChainToEntityIndex(structure):
        # Build a chain-index -> entity-index lookup from the entity list.
        entityChainIndex = [0] * structure.num_chains
        for i in range(0, len(structure.entity_list)):
            for j in structure.entity_list[i]["chainIndexList"]:
                entityChainIndex[j] = i
        return entityChainIndex

    def listOfBytesToString(listOfBytes):
        # Decode each UTF-8 byte string in the list to a Python str.
        newList = []
        for i in range(0, len(listOfBytes)):
            newList.append(listOfBytes[i].decode("utf-8"))
        return newList

    def listToString(temp):
        # Render any sequence as "[a, b, c]".
        return("[" + ", ".join(map(str, temp)) + "]")

    def printMmtfInfo(structure):
        # Print MMTF producer and version header.
        print("*** MMMTF INFO ***")
        print("MmtfProducer : " + structure.mmtf_producer)
        print("MmtfVersion : " + structure.mmtf_version)
        print();

    def printMetadata(structure):
        # Print structure-level metadata (id, title, dates, method, R-factors).
        print("*** METADATA ***")
        print("StructureId : " + structure.structure_id)
        print("Title : " + structure.title)
        print("Deposition date : " + structure.deposition_date)
        print("Release date : " + structure.release_date)
        print("Experimental method(s): " + listToString(listOfBytesToString(structure.experimental_methods)))
        print("Resolution : " + str(structure.resolution))
        print("Rfree : " + str("%0.2f" % structure.r_free))
        print("Rwork : " + str("%0.2f" % structure.r_work))
        print()

    def printCrystallographicData(structure):
        # Print the space group and unit cell dimensions.
        print("*** CRYSTALLOGRAPHIC DATA ***")
        print("Space group : " + structure.space_group.decode('utf-8'))
        print("Unit cell dimensions : " + listToString(["%0.2f" % i for i in structure.unit_cell]))
        print()

    def printBioAssemblyData(structure):
        # Print each bioassembly with its transformations and affected chains.
        print("*** BIOASSEMBLY DATA ***")
        print("Number bioassemblies: " + str(len(structure.bio_assembly)))
        for i in range(0, len(structure.bio_assembly)):
            print("bioassembly: " + structure.bio_assembly[i][b"name"].decode('utf-8'))
            transformations = structure.bio_assembly[i][b"transformList"]
            print(" Number transformations: " + str(len(transformations)))
            for j in range(0, len(transformations)):
                print(" transformation: " + str(j))
                print(" chains: " + str(transformations[j][b"chainIndexList"]))
                print(" rotTransMatrix: " + str(transformations[j][b"matrix"]))

    def traverse(structure):
        # Walk the model -> chain -> group -> atom hierarchy, counting as we go,
        # and verify the final counters against the structure totals by printing both.
        print("*** STRUCTURE DATA ***")
        print("Number of models: " + str(structure.num_models))
        print("Number of chains: " + str(structure.num_chains))
        print("Number of groups: " + str(structure.num_groups))
        print("Number of atoms : " + str(structure.num_atoms))
        chainIndex = 0
        groupIndex = 0
        atomIndex = 0
        for i in range(0, structure.num_models):
            for j in range(0, structure.chains_per_model[i]):
                for k in range(0, structure.groups_per_chain[chainIndex]):
                    groupType = structure.group_type_list[groupIndex]
                    for m in range(0, (len(structure.group_list[groupType]["atomNameList"]))):
                        atomIndex = atomIndex + 1
                    groupIndex = groupIndex + 1
                chainIndex = chainIndex + 1
        print("chainIndex: " + str(chainIndex))
        print("groupIndex: " + str(groupIndex))
        print("atomIndex : " + str(atomIndex))
        print()

    def printChainInfo(structure):
        # Print name, id, and group count for every chain in every model.
        print("*** CHAIN DATA ***")
        print("Number of chains: " + str(structure.num_chains))
        chainIndex = 0
        for i in range(0, structure.num_models):
            print("model: " + str(i+1))
            for j in range(0, structure.chains_per_model[i]):
                chainName = structure.chain_name_list[chainIndex]
                chainId = structure.chain_id_list[chainIndex]
                groups = structure.groups_per_chain[chainIndex]
                print("chainName: " + chainName + ", chainId: " + chainId + ", groups: " + str(groups))
                chainIndex = chainIndex + 1
        print()

    def printChainGroupInfo(structure):
        # Print per-chain data plus per-group details (name, codes, DSSP code).
        print("*** CHAIN AND GROUP DATA ***")
        chainIndex = 0
        groupIndex = 0
        for i in range(0, structure.num_models):
            print("model: " + str(i+1))
            for j in range(0, structure.chains_per_model[i]):
                chainName = structure.chain_name_list[chainIndex]
                chainId = structure.chain_id_list[chainIndex]
                groups = structure.groups_per_chain[chainIndex]
                print("chainName: " + chainName + ", chainId: " + chainId + ", groups: " + str(groups))
                for k in range(0, structure.groups_per_chain[chainIndex]):
                    groupId = structure.group_id_list[groupIndex]
                    insertionCode = structure.ins_code_list[groupIndex]
                    secStruct = structure.sec_struct_list[groupIndex]
                    seqIndex = structure.sequence_index_list[groupIndex]
                    groupType = structure.group_type_list[groupIndex]
                    groupName = structure.group_list[groupType]["groupName"]
                    chemCompType = structure.group_list[groupType]["chemCompType"]
                    oneLetterCode = structure.group_list[groupType]["singleLetterCode"]
                    numAtoms = len(structure.group_list[groupType]["atomNameList"])
                    numBonds = len(structure.group_list[groupType]["bondOrderList"])
                    print(" groupName : " + groupName)
                    print(" oneLetterCode : " + oneLetterCode)
                    print(" seq. index : " + str(seqIndex))
                    print(" numAtoms : " + str(numAtoms))
                    print(" numBonds : " + str(numBonds))
                    print(" chemCompType : " + chemCompType)
                    print(" groupId : " + str(groupId))
                    print(" insertionCode : " + insertionCode)
                    print(" DSSP secStruct.: " + dsspSecondaryStructure.getDsspCode(secStruct).getOneLetterCode())
                    print()
                    groupIndex = groupIndex + 1
                chainIndex = chainIndex + 1
        print()

    def printChainEntityGroupAtomInfo(structure):
        # Deepest dump: chain + entity info, then every group and every atom
        # (id, alt-loc, coordinates, occupancy, B-factor, element).
        print("*** CHAIN ENTITY GROUP ATOM DATA ***")
        chainToEntityIndex = getChainToEntityIndex(structure)
        chainIndex = 0
        groupIndex = 0
        atomIndex = 0
        for i in range(0, structure.num_models):
            print("model: " + str(i+1))
            for j in range(0, structure.chains_per_model[i]):
                chainName = structure.chain_name_list[chainIndex]
                chainId = structure.chain_id_list[chainIndex]
                groups = structure.groups_per_chain[chainIndex]
                print("chainName: " + chainName + ", chainId: " + chainId + ", groups: " + str(groups))
                entityType = structure.entity_list[chainToEntityIndex[chainIndex]]["type"]
                entityDescription = structure.entity_list[chainToEntityIndex[chainIndex]]["description"]
                entitySequence = structure.entity_list[chainToEntityIndex[chainIndex]]["sequence"]
                print("entity type : " + entityType);
                print("entity description : " + entityDescription);
                print("entity sequence : " + entitySequence);
                for k in range(0, structure.groups_per_chain[chainIndex]):
                    groupId = structure.group_id_list[groupIndex]
                    insertionCode = structure.ins_code_list[groupIndex]
                    secStruct = structure.sec_struct_list[groupIndex]
                    seqIndex = structure.sequence_index_list[groupIndex]
                    groupType = structure.group_type_list[groupIndex]
                    groupName = structure.group_list[groupType]["groupName"]
                    chemCompType = structure.group_list[groupType]["chemCompType"]
                    oneLetterCode = structure.group_list[groupType]["singleLetterCode"]
                    numAtoms = len(structure.group_list[groupType]["atomNameList"])
                    numBonds = len(structure.group_list[groupType]["bondOrderList"])
                    print(" groupName : " + groupName)
                    print(" oneLetterCode : " + oneLetterCode)
                    print(" seq. index : " + str(seqIndex))
                    print(" numAtoms : " + str(numAtoms))
                    print(" numBonds : " + str(numBonds))
                    print(" chemCompType : " + chemCompType)
                    print(" groupId : " + str(groupId))
                    print(" insertionCode : " + insertionCode)
                    print(" DSSP secStruct.: " + dsspSecondaryStructure.getDsspCode(secStruct).getOneLetterCode())
                    print(" Atoms : ")
                    for m in range(0, (len(structure.group_list[groupType]["atomNameList"]))):
                        atomId = structure.atom_id_list[atomIndex]
                        altLocId = structure.alt_loc_list[atomIndex]
                        x = structure.x_coord_list[atomIndex]
                        y = structure.y_coord_list[atomIndex]
                        z = structure.z_coord_list[atomIndex]
                        occupancy = structure.occupancy_list[atomIndex]
                        bFactor = structure.b_factor_list[atomIndex]
                        atomName = structure.group_list[groupType]["atomNameList"][m]
                        element = structure.group_list[groupType]["elementList"][m]
                        print(" " + str(atomId) + "\t" + atomName + "\t" + altLocId + "\t" + str(x) + "\t" + str(y) + "\t" + str(z) + "\t" + str(occupancy) + "\t" + str(bFactor) + "\t" + element)
                        atomIndex = atomIndex + 1
                    groupIndex = groupIndex + 1
                chainIndex = chainIndex + 1
        print()

    def TraverseStructureHierarchy(structure):
        # Expand alternative locations first, then print every section.
        structure = structure.set_alt_loc_list()
        print(structure.entity_list)
        printMmtfInfo(structure)
        printMetadata(structure)
        printCrystallographicData(structure)
        traverse(structure)
        printChainInfo(structure)
        printChainGroupInfo(structure)
        printChainEntityGroupAtomInfo(structure)
        printBioAssemblyData(structure)

    # Run the full dump on each (pdbId, structure) pair on the workers.
    pdb.foreach(lambda t: TraverseStructureHierarchy(t[1]))
def setUp(self):
    """Create a local Spark context and download the wildtype-query test structures."""
    cfg = SparkConf().setAppName('wildTypeTest').setMaster("local[*]")
    self.sc = SparkContext(conf=cfg)
    entries = ["1PEN", "1OCZ", "2ONX"]
    self.pdb = downloadMmtfFiles(entries, self.sc)