import time

from pyspark import SparkConf, SparkContext

# The imports below assume the early camelCase mmtfPyspark API used throughout
# this file; addProteinFoldType is a helper assumed to be defined elsewhere in
# this module.
from mmtfPyspark.io import MmtfReader
from mmtfPyspark.filters import pisces, containsLProteinChain
from mmtfPyspark.mappers import structureToPolymerChains
from mmtfPyspark.datasets import secondaryStructureExtractor, \
    secondaryStructureSegmentExtractor
from mmtfPyspark.ml import proteinSequenceEncoder


def main():
    '''Demo to create a feature vector for protein fold classification.

    In this demo we try to classify a protein chain as either an all-alpha
    or an all-beta protein based on its sequence. We use n-grams and a
    Word2Vec representation of the protein sequence as the feature vector.
    '''
    start = time.time()

    conf = SparkConf() \
        .setMaster("local[*]") \
        .setAppName("featuresDemo")
    sc = SparkContext(conf=conf)

    path = "../../resources/mmtf_reduced_sample/"

    # Read an MMTF Hadoop sequence file and create a non-redundant set
    # (<= 40% sequence identity) of L-protein chains. The pisces filter is
    # applied twice: first to whole structures, then to the individual chains
    # produced by the flatMap.
    sequenceIdentity = 40
    resolution = 2.0

    pdb = MmtfReader \
        .readSequenceFile(path, sc) \
        .filter(pisces(sequenceIdentity, resolution)) \
        .flatMap(structureToPolymerChains()) \
        .filter(pisces(sequenceIdentity, resolution)) \
        .filter(containsLProteinChain())

    # get secondary structure content
    data = secondaryStructureExtractor.getDataset(pdb)

    # classify chains by secondary structure type
    minThreshold = 0.05
    maxThreshold = 0.15
    data = addProteinFoldType(data, minThreshold, maxThreshold)

    # add a Word2Vec encoded feature vector
    encoder = proteinSequenceEncoder(data)
    n = 2            # create 2-grams
    windowSize = 25  # 25-residue window size for Word2Vec
    vectorSize = 50  # dimension of the feature vector
    data = encoder.overlappingNgramWord2VecEncode(
        n, windowSize, vectorSize).cache()

    data.printSchema()
    data.show(25)

    # keep only a subset of relevant fields for further processing
    data = data.select(
        "structureChainId", "alpha", "beta", "coil", "foldType", "features")

    data.write.mode("overwrite").format("parquet").save(
        "/home/marshuang80/PDB/data/demo.parquet")

    end = time.time()
    print("Time: %f sec." % (end - start))

    sc.stop()
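# For reference, a minimal sketch of the overlapping n-gram split that the
# encoder above performs internally before Word2Vec training. The helper name
# overlapping_ngrams is hypothetical, not part of the mmtfPyspark API.
def overlapping_ngrams(sequence, n=2):
    '''Return the overlapping n-grams ("words") of a sequence.'''
    return [sequence[i:i + n] for i in range(len(sequence) - n + 1)]

# Example: overlapping_ngrams("MTEYK", 2) -> ['MT', 'TE', 'EY', 'YK'].
# Spark's Word2Vec then treats each n-gram as a word and each chain as a
# sentence, and its transform() averages the learned word vectors into one
# fixed-length feature vector per chain.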
def main():
    '''This script creates a dataset of sequence segments derived from a
    non-redundant set. The dataset contains the sequence segment, the DSSP
    Q8 and DSSP Q3 code of the center residue in a sequence segment, and a
    one-hot encoding of the sequence segment. The dataset is saved in a
    file specified by the user.
    '''
    start = time.time()

    conf = SparkConf() \
        .setMaster("local[*]") \
        .setAppName("SecondaryStructureOneHotEncoderDemo")
    sc = SparkContext(conf=conf)

    # Read an MMTF Hadoop sequence file and create a non-redundant set
    # (<= 20% sequence identity) of L-protein chains
    path = "../../resources/mmtf_reduced_sample/"
    sequenceIdentity = 20
    resolution = 2.0
    fraction = 0.1
    seed = 123

    # alternative input for testing:
    # .downloadMmtfFiles(['2ONX', '1JLP', '5X6H', '5L2G', '2MK1'], sc) \
    pdb = MmtfReader \
        .readSequenceFile(path, sc) \
        .flatMap(structureToPolymerChains()) \
        .filter(pisces(sequenceIdentity, resolution)) \
        .filter(containsLProteinChain()) \
        .sample(False, fraction, seed)

    segmentLength = 11
    data = secondaryStructureSegmentExtractor.getDataset(
        pdb, segmentLength).cache()
    print(f"original data      : {data.count()}")

    # drop duplicate segments to avoid redundancy in the training set
    data = data.dropDuplicates(["labelQ3", "sequence"]).cache()
    print(f"- duplicate Q3/seq : {data.count()}")

    data = data.dropDuplicates(["sequence"])
    print(f"- duplicate seq    : {data.count()}")

    encoder = proteinSequenceEncoder(data)
    data = encoder.oneHotEncode()

    data.printSchema()
    data.show(25, False)

    end = time.time()
    print("Time: %f sec." % (end - start))

    sc.stop()
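# For reference, a minimal sketch of one-hot encoding over a fixed 20-letter
# amino acid alphabet. The alphabet order and the helper one_hot_encode are
# illustrative assumptions; the encoder above builds its encoding internally.
AMINO_ACIDS = "ACDEFGHIKLMNPQRSTVWY"

def one_hot_encode(segment):
    '''Flatten per-residue one-hot vectors into a single feature vector.'''
    features = []
    for residue in segment:
        vec = [0] * len(AMINO_ACIDS)
        if residue in AMINO_ACIDS:  # unknown residues stay all-zero
            vec[AMINO_ACIDS.index(residue)] = 1
        features.extend(vec)
    return features

# An 11-residue segment yields an 11 * 20 = 220-dimensional vector with
# exactly one nonzero entry per standard residue.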
def test1(self):
    # pisces filter applied to full structures: keys are PDB IDs
    pdb_1 = self.pdb.filter(pisces(20, 2.0))
    results_1 = pdb_1.keys().collect()

    self.assertTrue('5X42' in results_1)
    self.assertTrue('4R4X' in results_1)
    self.assertFalse('2ONX' in results_1)
    self.assertFalse('1JLP' in results_1)
def test2(self):
    # pisces filter applied after splitting structures into polymer chains:
    # keys are <pdbId>.<chainId>
    pdb_2 = self.pdb.flatMap(structureToPolymerChains())
    pdb_2 = pdb_2.filter(pisces(20, 2.0))
    results_2 = pdb_2.keys().collect()

    self.assertTrue('5X42.B' in results_2)
    self.assertTrue('4R4X.A' in results_2)
    self.assertFalse('5X42.A' in results_2)
    self.assertFalse('2ONX.A' in results_2)
    self.assertFalse('1JLP.A' in results_2)
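# test1 and test2 above reference a self.pdb fixture that is not shown. A
# minimal sketch of a setUp/tearDown pair that could provide it, reusing the
# downloadMmtfFiles reader seen (commented out) in the demo above; the exact
# PDB ID list is an assumption chosen to cover the asserted entries.
def setUp(self):
    conf = SparkConf().setMaster("local[*]").setAppName("piscesTest")
    self.sc = SparkContext(conf=conf)
    self.pdb = MmtfReader.downloadMmtfFiles(
        ['5X42', '4R4X', '2ONX', '1JLP'], self.sc)

def tearDown(self):
    self.sc.stop()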
def main():
    '''This script creates a dataset of sequence segments derived from a
    non-redundant set. The dataset contains the sequence segment, the DSSP
    Q8 and DSSP Q3 code of the center residue in a sequence segment, and a
    Word2Vec encoding of the sequence segment. The data is saved in a JSON
    file specified by the user.
    '''
    start = time.time()

    conf = SparkConf() \
        .setMaster("local[*]") \
        .setAppName("secondaryStructureWord2VecEncodeDemo")
    sc = SparkContext(conf=conf)

    # Read an MMTF Hadoop sequence file and create a non-redundant set
    # (<= 20% sequence identity) of L-protein chains
    path = "../../resources/mmtf_reduced_sample/"
    sequenceIdentity = 20
    resolution = 2.0
    fraction = 0.1
    seed = 123

    pdb = MmtfReader \
        .readSequenceFile(path, sc) \
        .flatMap(structureToPolymerChains()) \
        .filter(pisces(sequenceIdentity, resolution)) \
        .filter(containsLProteinChain()) \
        .sample(False, fraction, seed)

    segmentLength = 11
    data = secondaryStructureSegmentExtractor.getDataset(
        pdb, segmentLength).cache()

    # add a Word2Vec encoded feature vector
    encoder = proteinSequenceEncoder(data)
    n = 2
    windowSize = (segmentLength - 1) // 2  # context window of 5 for an 11-residue segment
    vectorSize = 50
    data = encoder.overlappingNgramWord2VecEncode(n, windowSize, vectorSize)

    data.printSchema()
    data.show(25, False)

    end = time.time()
    print("Time: %f sec." % (end - start))

    sc.stop()
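# The docstring above promises a JSON file, but main() never writes one. A
# minimal sketch using Spark's standard DataFrameWriter; write_json and the
# user-supplied output path are assumptions, not part of the demo.
def write_json(data, path):
    '''Persist the encoded dataset as a single JSON file at the given path.'''
    data.coalesce(1).write.mode("overwrite").format("json").save(path)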