Example #1
def main():
    '''
    Demo to create a feature vector for protein fold classification.
    In this demo we try to classify a protein chain as either an
    all alpha or all beta protein based on protein sequence. We use
    n-grams and Word2Vec representation of the protein sequence as a
    feature vector.
    '''

    start = time.time()

    conf = SparkConf() \
            .setMaster("local[*]") \
            .setAppName("featuresDemo")
    sc = SparkContext(conf=conf)
    path = "../../resources/mmtf_reduced_sample/"

    # Read MMTF Hadoop sequence file and create a non-redundant set (<=40% seq. identity)
    # of L-protein chains
    sequenceIdentity = 40
    resolution = 2.0

    pdb = MmtfReader \
            .readSequenceFile(path, sc) \
            .filter(pisces(sequenceIdentity, resolution)) \
            .flatMap(structureToPolymerChains()) \
            .filter(pisces(sequenceIdentity, resolution)) \
            .filter(containsLProteinChain())

    # Get secondary structure content
    data = secondaryStructureExtractor.getDataset(pdb)

    # classify chains by secondary structure type
    minThreshold = 0.05
    maxThreshold = 0.15
    data = addProteinFoldType(data, minThreshold, maxThreshold)

    # add Word2Vec encoded feature vector
    encoder = proteinSequenceEncoder(data)
    n = 2  # Create 2-grams
    windowSize = 25  # window of 25 amino acid residues for Word2Vec
    vectorSize = 50  # dimension of feature vector
    data = encoder.overlappingNgramWord2VecEncode(n, windowSize,
                                                  vectorSize).cache()

    data.printSchema()
    data.show(25)

    # keep only a subset of relevant fields for further processing
    data = data.select("structureChainId", "alpha", "beta", "coil", "foldType",
                       "features")

    data.write.mode("overwrite").format("parquet").save(
        "/home/marshuang80/PDB/data/demo.parquet")

    end = time.time()

    print("Time: %f  sec." % (end - start))

    sc.stop()
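The demo above stops after writing the feature set to Parquet. As a follow-up, here is a minimal sketch of how that file could be fed into a Spark ML classifier. It is a generic pyspark.ml example, not part of mmtfPyspark: the Parquet path is the one hard-coded above (adjust it to your machine), and it assumes the "features" and "foldType" columns produced by the demo.

from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

spark = SparkSession.builder \
    .master("local[*]") \
    .appName("foldTypeClassifier") \
    .getOrCreate()

# path from the demo above; adjust to your machine
data = spark.read.parquet("/home/marshuang80/PDB/data/demo.parquet")

# map the string label "foldType" to a numeric "label" column
data = StringIndexer(inputCol="foldType", outputCol="label") \
    .fit(data).transform(data)

train, test = data.randomSplit([0.8, 0.2], seed=123)

model = LogisticRegression(featuresCol="features", labelCol="label").fit(train)
predictions = model.transform(test)

accuracy = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction",
    metricName="accuracy").evaluate(predictions)
print("Test accuracy: %f" % accuracy)

spark.stop()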
Example #2
def main():
    '''
    This class creates a dataset of sequence segments derived from a
    non-redundant set. The dataset contains the sequence segment, the DSSP Q8
    and DSSP Q3 code of the center residue in a sequence segment, and a one-hot
    encoding of the sequence segment.
    The dataset is saved in a file specified by the user.
    '''

    start = time.time()

    conf = SparkConf() \
            .setMaster("local[*]") \
            .setAppName("SecondaryStructureOneHotEncoderDemo")
    sc = SparkContext(conf=conf)

    # Read MMTF Hadoop sequence file and create a non-redundant set
    # (<=20% seq. identity) of L-protein chains

    path = "../../resources/mmtf_reduced_sample/"

    sequenceIdentity = 20
    resolution = 2.0
    fraction = 0.1
    seed = 123

    # Alternatively, read a specific set of structures with:
    # .downloadMmtfFiles(["2ONX", "1JLP", "5X6H", "5L2G", "2MK1"], sc)

    pdb = MmtfReader \
            .readSequenceFile(path, sc) \
            .flatMap(structureToPolymerChains()) \
            .filter(pisces(sequenceIdentity, resolution)) \
            .filter(containsLProteinChain()) \
            .sample(False, fraction, seed)

    segmentLength = 11
    data = secondaryStructureSegmentExtractor.getDataset(
        pdb, segmentLength).cache()
    print(f"original data   : {data.count()}")

    data = data.dropDuplicates(["labelQ3", "sequence"]).cache()
    print(f"- duplicate Q3/seq  : {data.count()}")

    data = data.dropDuplicates(["sequence"])
    print(f"- duplicate seq  : {data.count()}")

    encoder = proteinSequenceEncoder(data)
    data = encoder.oneHotEncode()

    data.printSchema()
    data.show(25, False)

    end = time.time()

    print("Time: %f  sec." % (end - start))

    sc.stop()
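The docstring says the dataset is saved to a file specified by the user, but that step is missing from the listing. A minimal sketch of what it could look like, placed just before sc.stop(); the output path is a placeholder, and the write call mirrors the Parquet write in Example #1. The same pattern applies to the Word2Vec segment demo further down.

    # hypothetical output path; the original demo takes this from the user
    outputPath = "/path/to/secondary_structure_one_hot.parquet"
    data.write.mode("overwrite").format("parquet").save(outputPath)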
Example #3
    def test1(self):
        pdb = self.pdb.filter(containsLProteinChain()) \
                      .flatMap(structureToPolymerChains()) \
                      .filter(containsLProteinChain())

        seq = secondaryStructureExtractor.getDataset(pdb)

        self.assertTrue(seq.count() == 5)
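The test snippets on this page use a self.pdb fixture whose setup is not shown. Below is a hypothetical setUp/tearDown sketch for a test like the one above, reusing the downloadMmtfFiles reader seen in the other examples; the class name, import path, and PDB IDs are placeholders, not the original fixture.

import unittest
from pyspark import SparkConf, SparkContext
# import path assumed from the mmtfPyspark package layout; adjust to your version
from mmtfPyspark.io import MmtfReader


class SecondaryStructureExtractorTest(unittest.TestCase):

    def setUp(self):
        conf = SparkConf().setMaster("local[*]") \
                          .setAppName("secondaryStructureExtractorTest")
        self.sc = SparkContext(conf=conf)
        # placeholder PDB IDs; the original fixture's entries are not shown
        self.pdb = MmtfReader.downloadMmtfFiles(["1STP", "4HHB"], self.sc)

    def tearDown(self):
        self.sc.stop()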
Example #4
    def test6(self):
        pdb_6 = self.pdb.flatMap(structureToPolymerChains())
        # keep chains whose helix, sheet, and coil fractions fall within the
        # given (min, max) ranges
        pdb_6 = pdb_6.filter(
            secondaryStructure(0.70, 0.75, 0.00, 0.40, 0.25, 0.50))
        results_6 = pdb_6.keys().collect()

        self.assertTrue('2C7M.A' in results_6)
        self.assertFalse('2C7M.B' in results_6)
Example #5
    def test2(self):

        # filter entries using the precalculated 40% sequence identity clusters
        pdb_2 = self.pdb.filter(blastCluster(40))
        pdb_2 = pdb_2.flatMap(structureToPolymerChains())
        results_2 = pdb_2.keys().collect()

        self.assertFalse('1O06' in results_2)
        self.assertTrue('1O06.A' in results_2)
        self.assertFalse('2ONX' in results_2)
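The blastCluster filter tested above can stand in for pisces in the demo pipelines to build a non-redundant chain set from precalculated sequence clusters. A minimal sketch, assuming the same SparkContext, imports, and sample path as the demos above; as in the test, the filter is applied at the entry level before splitting into chains.

pdb = MmtfReader \
        .readSequenceFile("../../resources/mmtf_reduced_sample/", sc) \
        .filter(blastCluster(40)) \
        .flatMap(structureToPolymerChains()) \
        .filter(containsLProteinChain())

print(f"chains in the 40% sequence identity set: {pdb.count()}")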
Example #6
def main():
    '''
    This class creates a dataset of sequence segments derived from a
    non-redundant set. The dataset contains the sequence segment, the DSSP
    Q8 and DSSP Q3 code of the center residue in a sequence segment, and a
    Word2Vec encoding of the sequence segment.
    The dataset is saved in a JSON file specified by the user.
    '''

    start = time.time()

    conf = SparkConf() \
            .setMaster("local[*]") \
            .setAppName("secondaryStructureWord2VecEncodeDemo")
    sc = SparkContext(conf=conf)

    # Read MMTF Hadoop sequence file and create a non-redundant set
    # (<=20% seq. identity) of L-protein chains

    path = "../../resources/mmtf_reduced_sample/"

    sequenceIdentity = 20
    resolution = 2.0
    fraction = 0.1
    seed = 123

    pdb = MmtfReader \
            .readSequenceFile(path, sc) \
            .flatMap(structureToPolymerChains()) \
            .filter(pisces(sequenceIdentity, resolution)) \
            .filter(containsLProteinChain()) \
            .sample(False, fraction, seed)

    segmentLength = 11
    data = secondaryStructureSegmentExtractor.getDataset(
        pdb, segmentLength).cache()

    # add Word2Vec encoded feature vector
    encoder = proteinSequenceEncoder(data)
    n = 2
    windowSize = (segmentLength - 1) // 2
    vectorSize = 50
    data = encoder.overlappingNgramWord2VecEncode(n, windowSize, vectorSize)

    data.printSchema()
    data.show(25, False)

    end = time.time()

    print("Time: %f  sec." % (end - start))

    sc.stop()
Example #7
    def test2(self):
        # This test runs a chain-level query and compares chain-level results
        pdb_2 = self.pdb.flatMap(structureToPolymerChains())

        # entries with EC number 2.7.11.1 (Ser/Thr protein kinase) from Homo sapiens
        whereClause = "WHERE ecNo='2.7.11.1' AND source='Homo sapiens'"
        fields = ["ecNo", "source"]
        pdb_2 = pdb_2.filter(customReportQuery(whereClause, fields))
        results_2 = pdb_2.keys().collect()

        self.assertTrue('5JDE.A' in results_2)
        self.assertTrue('5JDE.B' in results_2)
        self.assertTrue('5CU4.A' in results_2)
        self.assertTrue('5L6W.L' in results_2)
        self.assertFalse('5L6W.C' in results_2)
        self.assertFalse('5UFU.A' in results_2)
        self.assertFalse('5UFU.B' in results_2)
        self.assertFalse('5UFU.C' in results_2)
        self.assertFalse('5IHB.A' in results_2)
        self.assertFalse('5IHB.B' in results_2)
        self.assertFalse('5IHB.C' in results_2)
        self.assertFalse('5IHB.D' in results_2)
Example #8
def main():
    start = time.time()

    conf = SparkConf().setMaster("local[*]") \
                      .setAppName("secondaryStructureElementDemo")
    sc = SparkContext(conf=conf)

    pdb = MmtfReader.downloadMmtfFiles(["1STP"], sc).cache()

    pdb = pdb.flatMap(structureToPolymerChains()) \
             .filter(containsLProteinChain())

    # "E": extract beta-strand (extended) secondary structure elements
    ds = secondaryStructureElementExtractor.getDataset(pdb, "E", 6)

    ds.show(50, False)

    end = time.time()

    print("Time: %f  sec." %(end-start))

    sc.stop()
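For comparison, the same extractor call can pull alpha-helical elements by swapping the DSSP code; a minimal sketch continuing from the demo above (it would go before sc.stop()).

    # "H" is the DSSP code for alpha helix; same call shape as the "E" example
    ds_helix = secondaryStructureElementExtractor.getDataset(pdb, "H", 6)
    ds_helix.show(50, False)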
Example #9
    def test4(self):
        query = "<orgPdbQuery>" + \
                "<queryType>org.pdb.query.simple.EnzymeClassificationQuery</queryType>" + \
                "<Enzyme_Classification>2.7.11.1</Enzyme_Classification>" + \
                "</orgPdbQuery>"

        pdb_4 = self.pdb.flatMap(structureToPolymerChains()) \
                        .filter(advancedQuery(query))
        results_4 = pdb_4.keys().collect()

        self.assertFalse('1PEN.A' in results_4)
        self.assertFalse('1OCZ.A' in results_4)
        self.assertFalse('2ONX.A' in results_4)
        self.assertTrue('5L6W.L' in results_4)
        self.assertFalse('5L6W.C' in results_4)
        self.assertFalse('5KHU.A' in results_4)
        self.assertFalse('5KHU.B' in results_4)
        self.assertTrue('5KHU.Q' in results_4)
        self.assertTrue('1F3M.A' in results_4)
        self.assertTrue('1F3M.B' in results_4)
        self.assertTrue('1F3M.C' in results_4)
        self.assertTrue('1F3M.D' in results_4)
Example #10
    def test1(self):
        pdb = self.pdb.flatMap(structureToPolymerChains())
        seq = polymerSequenceExtractor.getDataset(pdb)

        self.assertTrue(seq.count() == 5)
Example #11
    def test1(self):
        pdb_1 = self.pdb.flatMap(structureToPolymerChains())
        results_1 = pdb_1.keys().collect()

        self.assertTrue(len(results_1) == 10)
Example #12
    def test1(self):
        pdb = self.pdb.flatMap(structureToPolymerChains())

        seq = secondaryStructureSegmentExtractor.getDataset(pdb, 25)

        self.assertTrue("DPSKDSKAQVSAAEAGITGTWYNQL" == seq.head()[1])