Ejemplo n.º 1
0
    def setUp(self):
        """Start a local Spark context and load the structures for the tests.

        The context is kept on ``self.sc``; the value returned by
        ``downloadMmtfFiles`` for the listed PDB ids is kept on ``self.pdb``.
        """
        spark_conf = SparkConf()
        spark_conf = spark_conf.setMaster("local[*]")
        spark_conf = spark_conf.setAppName('polymerSequenceExtractorTest')
        self.sc = SparkContext(conf=spark_conf)

        self.pdb = downloadMmtfFiles(["1STP", "4HHB"], self.sc)
Ejemplo n.º 2
0
    def setUp(self):
        """Create the Spark context (``self.sc``) and load the entries (``self.pdb``)."""
        spark_conf = SparkConf()
        spark_conf = spark_conf.setMaster("local[*]")
        spark_conf = spark_conf.setAppName('piscesTest')
        self.sc = SparkContext(conf=spark_conf)

        # Of these entries, "4R4X.A" and "5X42.B" should pass the filter.
        entry_ids = ["5X42", "4R4X", "2ONX", "1JLP"]
        self.pdb = downloadMmtfFiles(entry_ids, self.sc)
Ejemplo n.º 3
0
    def setUp(self):
        """Build a local Spark context and fetch the entries used by the tests.

        Sets ``self.sc`` (the context) and ``self.pdb`` (the result of
        ``downloadMmtfFiles`` for the ids below).
        """
        spark_conf = SparkConf()
        spark_conf = spark_conf.setMaster("local[*]")
        spark_conf = spark_conf.setAppName('containsDProteinChainTest')
        self.sc = SparkContext(conf=spark_conf)

        entry_ids = [
            '2ONX', '1JLP', '5X6H', '5L2G',
            '2MK1', '2V5W', '5XDP', '5GOD',
        ]
        self.pdb = downloadMmtfFiles(entry_ids, self.sc)
    def setUp(self):
        """Open a local Spark context and load entry 1STP into ``self.pdb``."""
        spark_conf = SparkConf()
        spark_conf = spark_conf.setMaster("local[*]")
        spark_conf = spark_conf.setAppName(
            'secondaryStructureSegmentExtractorTest')
        self.sc = SparkContext(conf=spark_conf)

        self.pdb = downloadMmtfFiles(["1STP"], self.sc)
Ejemplo n.º 5
0
    def setUp(self):
        """Prepare ``self.sc`` (local Spark context) and ``self.pdb`` (loaded entries)."""
        spark_conf = SparkConf()
        spark_conf = spark_conf.setMaster("local[*]")
        spark_conf = spark_conf.setAppName('advancedQueryTest')
        self.sc = SparkContext(conf=spark_conf)

        entry_ids = ["5JDE", "5CU4", "5L6W", "5UFU", "5IHB"]
        self.pdb = downloadMmtfFiles(entry_ids, self.sc)
    def test2(self):
        """Check that entry 5NV3 yields 12 elements after the dimer mapping.

        Replaces ``self.pdb`` with the result of ``downloadMmtfFiles`` for
        5NV3, applies ``structureToBioassembly`` and then
        ``structureToProteinDimers`` via ``flatMap``, and compares the count.
        """
        # NOTE(review): the meaning of the positional arguments
        # (8, 20, False, True) is defined by structureToProteinDimers, which
        # is not visible here.
        pdbIds = ["5NV3"]
        self.pdb = downloadMmtfFiles(pdbIds, self.sc)

        pdb_2 = self.pdb.flatMap(structureToBioassembly()) \
                        .flatMap(structureToProteinDimers(8,20,False, True))

        # assertEqual reports the actual count when the check fails, whereas
        # assertTrue(a == b) only reports "False is not true".
        self.assertEqual(pdb_2.count(), 12)
Ejemplo n.º 7
0
 def setUp(self):
     """Start a local Spark context and load the entries used by the tests.

     Stores the context on ``self.sc`` and the result of
     ``downloadMmtfFiles`` on ``self.pdb``.
     """
     entry_ids = [
         "2ONX", "5VLN", "5VAI", "5JXV", "5K7N", "3PDM", "5MNX", "5I1R",
         "5MON", "5LCB", "3J07"
     ]
     spark_conf = SparkConf()
     spark_conf = spark_conf.setMaster("local[*]")
     spark_conf = spark_conf.setAppName('containsDProteinChainTest')
     self.sc = SparkContext(conf=spark_conf)
     self.pdb = downloadMmtfFiles(entry_ids, self.sc)
Ejemplo n.º 8
0
    def setUp(self):
        """Create ``self.sc`` (local Spark context) and load four entries into ``self.pdb``."""
        spark_conf = SparkConf()
        spark_conf = spark_conf.setMaster("local[*]")
        spark_conf = spark_conf.setAppName('testrFreeFilter')
        self.sc = SparkContext(conf=spark_conf)

        # Entries and their expected secondary-structure content:
        #   1AIE: all alpha protein, 20 alpha out of 31 = 0.645 helical
        #   1E0N: all beta protein, NMR structure with 10 models,
        #         13 beta out of 27 = 0.481 sheet
        #   1EM7: alpha + beta, 14 alpha + 23 beta out of 56
        #         = 0.25 helical and 0.411 sheet
        #   2C7M: 2 chains, alpha + beta (DSSP in MMTF doesn't match DSSP
        #         on the RCSB PDB website)
        entry_ids = ["1AIE", "1E0N", "1EM7", "2C7M"]
        self.pdb = downloadMmtfFiles(entry_ids, self.sc)
Ejemplo n.º 9
0
    def setUp(self):
        """Start Spark, load two entries and call ``set_alt_loc_list`` on each.

        ``self.pdb`` ends up holding ``(key, value.set_alt_loc_list())`` pairs
        derived from what ``downloadMmtfFiles`` returned.
        """
        spark_conf = SparkConf()
        spark_conf = spark_conf.setMaster("local[*]")
        spark_conf = spark_conf.setAppName('testContainsAlternativeLocations')
        self.sc = SparkContext(conf=spark_conf)

        # 4QXX has alternative location ids, 2ONX has none.
        entry_ids = ['4QXX', '2ONX']
        self.pdb = downloadMmtfFiles(entry_ids, self.sc)
        self.pdb = self.pdb.map(
            lambda pair: (pair[0], pair[1].set_alt_loc_list()))
    def setUp(self):
        """Create the Spark context on ``self.sc`` and load the entries into ``self.pdb``."""
        spark_conf = SparkConf()
        spark_conf = spark_conf.setMaster("local[*]")
        spark_conf = spark_conf.setAppName('containsDProteinChainTest')
        self.sc = SparkContext(conf=spark_conf)

        # Entries:
        #   5KE8: contains Zinc finger motif
        #   1JLP: does not contain Zinc finger motif
        #   5VAI: contains Walker P loop
        entry_ids = ['5KE8', '1JLP', '5VAI']
        self.pdb = downloadMmtfFiles(entry_ids, self.sc)
Ejemplo n.º 11
0
    def setUp(self):
        """Start a local Spark context and load entries with differing R-free data."""
        spark_conf = SparkConf()
        spark_conf = spark_conf.setMaster("local[*]")
        spark_conf = spark_conf.setAppName('testrFreeFilter')
        self.sc = SparkContext(conf=spark_conf)

        # Entries:
        #   2ONX: 0.202 rfree, x-ray resolution
        #   2OLX: 0.235 rfree, x-ray resolution
        #   3REC: n/a, NMR structure
        #   1LU3: n/a, EM structure
        entry_ids = ['2ONX', '2OLX', '3REC', '1LU3']
        self.pdb = downloadMmtfFiles(entry_ids, self.sc)
    def setUp(self):
        """Start a local Spark context and load entries with known deposition dates."""
        spark_conf = SparkConf()
        spark_conf = spark_conf.setMaster("local[*]")
        spark_conf = spark_conf.setAppName('testrFreeFilter')
        self.sc = SparkContext(conf=spark_conf)

        # Deposition dates:
        #   4MYA: 2013-09-27
        #   1O6Y: 2002-10-21
        #   3VCO: 2012-01-04
        #   5N0Y: 2017-02-03
        entry_ids = ['1O6Y', '4MYA', '3VCO', '5N0Y']
        self.pdb = downloadMmtfFiles(entry_ids, self.sc)
Ejemplo n.º 13
0
    def setUp(self):
        """Start a local Spark context and load entries with known release dates."""
        # Release dates:
        #   1O6Y: 2003-01-30
        #   4MYA: 2014-01-01
        #   3VCO: 2013-03-06
        #   5N0Y: 2017-05-24
        entry_ids = ['1O6Y', '4MYA', '3VCO', '5N0Y']

        spark_conf = SparkConf()
        spark_conf = spark_conf.setMaster("local[*]")
        spark_conf = spark_conf.setAppName('testrFreeFilter')
        self.sc = SparkContext(conf=spark_conf)
        self.pdb = downloadMmtfFiles(entry_ids, self.sc)
Ejemplo n.º 14
0
    def setUp(self):
        """Create ``self.sc`` and load entries covering several polymer types."""
        spark_conf = SparkConf()
        spark_conf = spark_conf.setMaster("local[*]")
        spark_conf = spark_conf.setAppName('testContainsAlternativeLocations')
        self.sc = SparkContext(conf=spark_conf)

        # Entries:
        #   2ONX: only L-protein chain
        #   1JLP: single L-protein chains with non-polymer capping group (NH2)
        #   5X6H: L-protein and non-std. DNA chain
        #   5L2G: DNA chain
        #   2MK1: D-saccharide
        entry_ids = ['2ONX', '1JLP', '5X6H', '5L2G', '2MK1']
        self.pdb = downloadMmtfFiles(entry_ids, self.sc)
    def test4(self):
        """Check that entry 1BZ5 yields 5 elements after the dimer mapping.

        Replaces ``self.pdb`` with the result of ``downloadMmtfFiles`` for
        1BZ5, applies ``structureToBioassembly`` and then
        ``structureToProteinDimers`` via ``flatMap``, and compares the count.
        """
        pdbIds = ["1BZ5"]
        # Expected pairs:
        # C5-B4
        # C6-B3
        # D7-A2
        # D8-A1
        # E10-E9
        self.pdb = downloadMmtfFiles(pdbIds, self.sc)

        # NOTE(review): the meaning of the positional arguments
        # (9, 20, False, True) is defined by structureToProteinDimers, which
        # is not visible here.
        pdb_4 = self.pdb.flatMap(structureToBioassembly()) \
                        .flatMap(structureToProteinDimers(9,20,False, True))

        # The original asserted on ``pdb_3``, a name that does not exist in
        # this method; the value computed above is ``pdb_4``.
        self.assertEqual(pdb_4.count(), 5)
    def setUp(self):
        """Create ``self.sc`` and load entries covering several polymer types."""
        spark_conf = SparkConf()
        spark_conf = spark_conf.setMaster("local[*]")
        spark_conf = spark_conf.setAppName('containsDProteinChainTest')
        self.sc = SparkContext(conf=spark_conf)

        # Entries:
        #   2ONX: only L-protein chain
        #   1JLP: single L-protein chains with non-polymer capping group (NH2)
        #   5X6H: L-protein and L-DNA chain
        #   5L2G: L-DNA chain
        #   2MK1: As of V5 of PDBx/mmCIF, saccharides seem to be represented
        #         as monomers instead of polysaccharides, so none of these
        #         tests returns true anymore.
        entry_ids = ['2ONX', '1JLP', '5X6H', '5L2G', '2MK1']
        self.pdb = downloadMmtfFiles(entry_ids, self.sc)
Ejemplo n.º 17
0
    def setUp(self):
        """Start a local Spark context and load the entries used by the query tests."""
        spark_conf = SparkConf()
        spark_conf = spark_conf.setMaster("local[*]")
        spark_conf = spark_conf.setAppName('advancedQueryTest')
        self.sc = SparkContext(conf=spark_conf)

        # Entries:
        #   1PEN: wildtype query 100 matches: 1PEN:1
        #   1OCZ: two entities, wildtype query 100 matches: 1OCZ:1, 1OCZ:2
        #   2ONX: structure result for author query
        #   5L6W: two chains: chain L is EC 2.7.11.1, chain C is not
        #   5KHU: many chains, chain Q is EC 2.7.11.1
        #   1F3M: entity 1: chains A,B, entity 2: chains B,C,
        #         all chains are EC 2.7.11.1
        entry_ids = ["1PEN", "1OCZ", "2ONX", "5L6W", "5KHU", "1F3M"]
        self.pdb = downloadMmtfFiles(entry_ids, self.sc)
Ejemplo n.º 18
0
    def setUp(self):
        """Create ``self.sc`` and load entries covering several polymer types."""
        spark_conf = SparkConf()
        spark_conf = spark_conf.setMaster("local[*]")
        spark_conf = spark_conf.setAppName('testrFreeFilter')
        self.sc = SparkContext(conf=spark_conf)

        # Entries:
        #   2ONX: only L-protein chain
        #   1JLP: single L-protein chains with non-polymer capping group (NH2)
        #   5X6H: L-protein and non-std. DNA chain
        #   5L2G: DNA chain
        #   2MK1: D-saccharide
        #   5UZT: RNA chain (with std. nucleotides)
        #   1AA6: contains SEC, selenocysteine (21st amino acid)
        #   1NTH: contains PYL, pyrrolysine (22nd amino acid)
        entry_ids = [
            '2ONX', '1JLP', '5X6H', '5L2G',
            '2MK1', '5UZT', '1AA6', '1NTH',
        ]
        self.pdb = downloadMmtfFiles(entry_ids, self.sc)
Ejemplo n.º 19
0
    def setUp(self):
        """Create ``self.sc`` and load entries covering several polymer types."""
        spark_conf = SparkConf()
        spark_conf = spark_conf.setMaster("local[*]")
        spark_conf = spark_conf.setAppName('containsDProteinChainTest')
        self.sc = SparkContext(conf=spark_conf)

        # Entries:
        #   2ONX: only L-protein chain
        #   1JLP: single L-protein chains with non-polymer capping group (NH2)
        #   5X6H: L-protein and DNA chain
        #   5L2G: DNA chain
        #   2MK1: D-saccharide
        #   5UX0: 2 L-protein, 2 RNA, 2 DNA chains
        #   2NCQ: 2 RNA chains
        entry_ids = ['2ONX', '1JLP', '5X6H', '5L2G', '2MK1', '5UX0', '2NCQ']
        self.pdb = downloadMmtfFiles(entry_ids, self.sc)
    def setUp(self):
        """Start a local Spark context and load entries with known chain counts."""
        spark_conf = SparkConf()
        spark_conf = spark_conf.setMaster("local[*]")
        spark_conf = spark_conf.setAppName('testContainsAlternativeLocations')
        self.sc = SparkContext(conf=spark_conf)

        # Polymer chains per entry:
        #   1STP: 1 L-protein chain
        #   4HHB: 4 polymer chains
        #   1JLP: 1 L-protein chains with non-polymer capping group (NH2)
        #   5X6H: 1 L-protein and 1 DNA chain
        #   5L2G: 2 DNA chain
        #   2MK1: 0 polymer chains
        #   --------------------
        #   tot: 10 chains
        entry_ids = ["1STP", "4HHB", "1JLP", "5X6H", "5L2G", "2MK1"]
        self.pdb = downloadMmtfFiles(entry_ids, self.sc)
    def setUp(self):
        """Create ``self.sc`` and load entries covering several polymer types."""
        spark_conf = SparkConf()
        spark_conf = spark_conf.setMaster("local[*]")
        spark_conf = spark_conf.setAppName('testContainsAlternativeLocations')
        self.sc = SparkContext(conf=spark_conf)

        # Entries:
        #   2ONX: only L-protein chain
        #   1JLP: single L-protein chains with non-polymer capping group (NH2)
        #   5X6H: L-protein and DNA chain (with std. nucleotides)
        #   5L2G: DNA chain (with non-std. nucleotide)
        #   2MK1: D-saccharide
        #   5UZT: RNA chain (with std. nucleotides)
        #   1AA6: contains SEC, selenocysteine (21st amino acid)
        #   1NTH: contains PYL, pyrrolysine (22nd amino acid)
        entry_ids = ["2ONX", "1JLP", "5X6H", "5L2G",
                     "2MK1", "5UZT", "1AA6", "1NTH"]
        self.pdb = downloadMmtfFiles(entry_ids, self.sc)
    def test3(self):
        """Check that entry 4GIS yields 9 elements after the dimer mapping.

        Replaces ``self.pdb`` with the result of ``downloadMmtfFiles`` for
        4GIS, applies ``structureToBioassembly`` and then
        ``structureToProteinDimers`` via ``flatMap``, and compares the count.
        """
        pdbIds = ["4GIS"]
        # Expected pairs:
        # A3-A2
        # A4-A1
        # B5-A1
        # B6-A2
        # B6-B5
        # B7-A3
        # B7-A4
        # B8-A4
        # B8-B7
        self.pdb = downloadMmtfFiles(pdbIds, self.sc)

        # NOTE(review): the meaning of the positional arguments
        # (8, 20, False, True) is defined by structureToProteinDimers, which
        # is not visible here.
        pdb_3 = self.pdb.flatMap(structureToBioassembly()) \
                        .flatMap(structureToProteinDimers(8,20,False, True))

        # assertEqual reports the actual count when the check fails, whereas
        # assertTrue(a == b) only reports "False is not true".
        self.assertEqual(pdb_3.count(), 9)
Ejemplo n.º 23
0
    def setUp(self):
        """Start a local Spark context (``self.sc``) and load two entries (``self.pdb``)."""
        spark_conf = SparkConf()
        spark_conf = spark_conf.setMaster("local[*]")
        spark_conf = spark_conf.setAppName('blastClustersTest')
        self.sc = SparkContext(conf=spark_conf)

        self.pdb = downloadMmtfFiles(["1O06", "2ONX"], self.sc)
Ejemplo n.º 24
0
def main(argv):
    """Print a hierarchical dump of two MMTF structures.

    Parses ``argv`` for ``-p``/``--path``, starts a local Spark context,
    obtains entries 1AQ1 and 5GOD through ``downloadMmtfFiles`` and, for each
    one, prints producer info, metadata, crystallographic data, index totals,
    chain, group and atom details, and bioassembly data.

    Args:
        argv: Command-line arguments without the program name.

    Raises:
        SystemExit: With status 2, after printing a usage line, when ``argv``
            cannot be parsed.
    """
    # Parse the command line before starting Spark so that a usage error does
    # not leave a Spark context behind.
    try:
        # getopt expects long option names WITHOUT the leading "--"; the
        # previous "--path=" spelling meant "--path" was never recognised.
        opts, args = getopt.getopt(argv, "p:", ["path="])
    except getopt.GetoptError:
        print("traverse.py -p <path_to_mmtf>")
        sys.exit(2)  # non-zero status: this is a usage error

    # NOTE(review): the parsed path is not used below (the structures are
    # downloaded instead); it is still parsed so the option stays accepted.
    path = None
    for opt, arg in opts:
        if opt in ("-p", "--path"):
            path = arg

    # NOTE(review): bio_assembly is indexed with bytes keys (b"name") while
    # entity_list and group_list are indexed with str keys; the key types are
    # kept exactly as in the original. Confirm against the structure class.

    def getChainToEntityIndex(structure):
        """Return a list mapping each chain index to the index of its entity."""
        entityChainIndex = [0] * structure.num_chains
        for entityIndex, entity in enumerate(structure.entity_list):
            for chainIndex in entity["chainIndexList"]:
                entityChainIndex[chainIndex] = entityIndex
        return entityChainIndex

    def listOfBytesToString(listOfBytes):
        """Decode every element of ``listOfBytes`` as UTF-8."""
        return [item.decode("utf-8") for item in listOfBytes]

    def listToString(temp):
        """Format ``temp`` as ``[a, b, c]`` using ``str`` on each element."""
        return "[" + ", ".join(map(str, temp)) + "]"

    def printMmtfInfo(structure):
        """Print the MMTF producer and version."""
        print("*** MMMTF INFO ***")
        print("MmtfProducer    : " + structure.mmtf_producer)
        print("MmtfVersion     : " + structure.mmtf_version)
        print()

    def printMetadata(structure):
        """Print id, title, dates, methods, resolution and R factors."""
        print("*** METADATA ***")
        print("StructureId           : " + structure.structure_id)
        print("Title                 : " + structure.title)
        print("Deposition date       : " + structure.deposition_date)
        print("Release date          : " + structure.release_date)
        print("Experimental method(s): " + listToString(listOfBytesToString(structure.experimental_methods)))
        print("Resolution            : " + str(structure.resolution))
        print("Rfree                 : " + "%0.2f" % structure.r_free)
        print("Rwork                 : " + "%0.2f" % structure.r_work)
        print()

    def printCrystallographicData(structure):
        """Print the space group and unit cell dimensions."""
        print("*** CRYSTALLOGRAPHIC DATA ***")
        print("Space group           : " + structure.space_group.decode('utf-8'))
        print("Unit cell dimensions  : " + listToString(["%0.2f" % i for i in structure.unit_cell]))
        print()

    def printBioAssemblyData(structure):
        """Print every bioassembly with its transformations."""
        print("*** BIOASSEMBLY DATA ***")
        print("Number bioassemblies: " + str(len(structure.bio_assembly)))
        for assembly in structure.bio_assembly:
            print("bioassembly: " + assembly[b"name"].decode('utf-8'))
            transformations = assembly[b"transformList"]
            print("  Number transformations: " + str(len(transformations)))
            for j, transformation in enumerate(transformations):
                print("    transformation: " + str(j))
                print("    chains:         " + str(transformation[b"chainIndexList"]))
                print("    rotTransMatrix: " + str(transformation[b"matrix"]))

    def traverse(structure):
        """Walk models, chains and groups and print the resulting index totals."""
        print("*** STRUCTURE DATA ***")
        print("Number of models: " + str(structure.num_models))
        print("Number of chains: " + str(structure.num_chains))
        print("Number of groups: " + str(structure.num_groups))
        print("Number of atoms : " + str(structure.num_atoms))
        chainIndex = 0
        groupIndex = 0
        atomIndex = 0
        for modelIndex in range(structure.num_models):
            for _chain in range(structure.chains_per_model[modelIndex]):
                for _group in range(structure.groups_per_chain[chainIndex]):
                    groupType = structure.group_type_list[groupIndex]
                    # One atom per entry in the group's atom name list.
                    atomIndex += len(structure.group_list[groupType]["atomNameList"])
                    groupIndex += 1
                chainIndex += 1
        print("chainIndex: " + str(chainIndex))
        print("groupIndex: " + str(groupIndex))
        print("atomIndex : " + str(atomIndex))
        print()

    def printChainHeader(structure, chainIndex):
        """Print the name, id and group count of the chain at ``chainIndex``."""
        chainName = structure.chain_name_list[chainIndex]
        chainId = structure.chain_id_list[chainIndex]
        groups = structure.groups_per_chain[chainIndex]
        print("chainName: " + chainName + ", chainId: " + chainId + ", groups: " + str(groups))

    def printGroupSummary(structure, groupIndex):
        """Print the per-group fields and return the group's type record."""
        # Read everything first so nothing is printed if a lookup fails.
        groupId = structure.group_id_list[groupIndex]
        insertionCode = structure.ins_code_list[groupIndex]
        secStruct = structure.sec_struct_list[groupIndex]
        seqIndex = structure.sequence_index_list[groupIndex]

        group = structure.group_list[structure.group_type_list[groupIndex]]
        groupName = group["groupName"]
        chemCompType = group["chemCompType"]
        oneLetterCode = group["singleLetterCode"]
        numAtoms = len(group["atomNameList"])
        numBonds = len(group["bondOrderList"])

        print("   groupName      : " + groupName)
        print("   oneLetterCode  : " + oneLetterCode)
        print("   seq. index     : " + str(seqIndex))
        print("   numAtoms       : " + str(numAtoms))
        print("   numBonds       : " + str(numBonds))
        print("   chemCompType   : " + chemCompType)
        print("   groupId        : " + str(groupId))
        print("   insertionCode  : " + insertionCode)
        print("   DSSP secStruct.: " + dsspSecondaryStructure.getDsspCode(secStruct).getOneLetterCode())
        return group

    def printChainInfo(structure):
        """Print one header line per chain, grouped by model."""
        print("*** CHAIN DATA ***")
        print("Number of chains: " + str(structure.num_chains))
        chainIndex = 0
        for modelIndex in range(structure.num_models):
            print("model: " + str(modelIndex + 1))
            for _chain in range(structure.chains_per_model[modelIndex]):
                printChainHeader(structure, chainIndex)
                chainIndex += 1
        print()

    def printChainGroupInfo(structure):
        """Print every chain header followed by a summary of each of its groups."""
        print("*** CHAIN AND GROUP DATA ***")
        chainIndex = 0
        groupIndex = 0
        for modelIndex in range(structure.num_models):
            print("model: " + str(modelIndex + 1))
            for _chain in range(structure.chains_per_model[modelIndex]):
                printChainHeader(structure, chainIndex)
                for _group in range(structure.groups_per_chain[chainIndex]):
                    printGroupSummary(structure, groupIndex)
                    print()
                    groupIndex += 1
                chainIndex += 1
        print()

    def printChainEntityGroupAtomInfo(structure):
        """Print chains with their entity data, group summaries and atom rows."""
        print("*** CHAIN ENTITY GROUP ATOM DATA ***")
        chainToEntityIndex = getChainToEntityIndex(structure)
        chainIndex = 0
        groupIndex = 0
        atomIndex = 0
        for modelIndex in range(structure.num_models):
            print("model: " + str(modelIndex + 1))
            for _chain in range(structure.chains_per_model[modelIndex]):
                printChainHeader(structure, chainIndex)

                entity = structure.entity_list[chainToEntityIndex[chainIndex]]
                entityType = entity["type"]
                entityDescription = entity["description"]
                entitySequence = entity["sequence"]
                print("entity type          : " + entityType)
                print("entity description   : " + entityDescription)
                print("entity sequence      : " + entitySequence)

                for _group in range(structure.groups_per_chain[chainIndex]):
                    group = printGroupSummary(structure, groupIndex)
                    print("   Atoms          : ")

                    for m, atomName in enumerate(group["atomNameList"]):
                        atomId = structure.atom_id_list[atomIndex]
                        altLocId = structure.alt_loc_list[atomIndex]
                        x = structure.x_coord_list[atomIndex]
                        y = structure.y_coord_list[atomIndex]
                        z = structure.z_coord_list[atomIndex]
                        occupancy = structure.occupancy_list[atomIndex]
                        bFactor = structure.b_factor_list[atomIndex]
                        element = group["elementList"][m]

                        print("      " + str(atomId) + "\t" + atomName + "\t" + altLocId +
                            "\t" + str(x) + "\t" + str(y) + "\t" + str(z) +
                            "\t" + str(occupancy) + "\t" + str(bFactor) + "\t" + element)
                        atomIndex += 1

                    groupIndex += 1
                chainIndex += 1
        print()

    def TraverseStructureHierarchy(structure):
        """Run every printer above on one structure."""
        # set_alt_loc_list() is called first, as in setUp-style callers that
        # need alt_loc_list populated before the atom rows are printed.
        structure = structure.set_alt_loc_list()
        print(structure.entity_list)
        printMmtfInfo(structure)
        printMetadata(structure)
        printCrystallographicData(structure)
        traverse(structure)
        printChainInfo(structure)
        printChainGroupInfo(structure)
        printChainEntityGroupAtomInfo(structure)
        printBioAssemblyData(structure)

    # Configure Spark
    conf = SparkConf().setAppName(APP_NAME)
    conf = conf.setMaster("local[*]")
    conf = conf.set("spark.executor.memory", "64g")
    conf = conf.set("spark.driver.cores", "32")
    sc = SparkContext(conf=conf)

    try:
        # Mmtf sequence file reader
        pdbIds = ['1AQ1', '5GOD']
        pdb = downloadMmtfFiles(pdbIds, sc)

        # Each element is indexed as a pair; the second item is the structure.
        pdb.foreach(lambda t: TraverseStructureHierarchy(t[1]))
    finally:
        # Release the Spark context even if the traversal fails.
        sc.stop()
Ejemplo n.º 25
0
    def setUp(self):
        """Start a local Spark context (``self.sc``) and load three entries (``self.pdb``)."""
        spark_conf = SparkConf()
        spark_conf = spark_conf.setMaster("local[*]")
        spark_conf = spark_conf.setAppName('wildTypeTest')
        self.sc = SparkContext(conf=spark_conf)

        self.pdb = downloadMmtfFiles(["1PEN", "1OCZ", "2ONX"], self.sc)