Example 1
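These snippets show only each demo's main() function. The imports below are a hedged reconstruction assuming the early mmtfPyspark (mmtf-pyspark) module layout; exact paths may differ in your installed version:

# Assumed imports for all three examples (hedged; verify against your
# mmtfPyspark version, since module paths changed across releases).
import time

from pyspark import SparkConf, SparkContext

from mmtfPyspark.io import MmtfReader
from mmtfPyspark.webfilters import pisces
from mmtfPyspark.filters import containsLProteinChain
from mmtfPyspark.mappers import structureToPolymerChains
from mmtfPyspark.datasets import secondaryStructureExtractor, \
                                 secondaryStructureSegmentExtractor
from mmtfPyspark.ml import proteinSequenceEncoder
# addProteinFoldType (Example 1) is a helper defined elsewhere in that demo
# script; it labels each chain by fold class (e.g. alpha, beta, other) based
# on its secondary-structure content and the min/max thresholds.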
def main():
    '''
    Demo to create a feature vector for protein fold classification.
    In this demo we try to classify a protein chain as either an
    all alpha or all beta protein based on protein sequence. We use
    n-grams and Word2Vec representation of the protein sequence as a
    feature vector.
    '''

    start = time.time()

    conf = SparkConf() \
            .setMaster("local[*]") \
            .setAppName("featuresDemo")
    sc = SparkContext(conf=conf)
    path = "../../resources/mmtf_reduced_sample/"

    # Read MMTF Hadoop sequence file and create a non-redundant set (<=40% seq. identity)
    # of L-protein chains
    sequenceIdentity = 40
    resolution = 2.0

    pdb = MmtfReader \
            .readSequenceFile(path, sc) \
            .filter(pisces(sequenceIdentity, resolution)) \
            .flatMap(structureToPolymerChains()) \
            .filter(pisces(sequenceIdentity, resolution)) \
            .filter(containsLProteinChain())

    # Get secondary structure content
    data = secondaryStructureExtractor.getDataset(pdb)

    # classify chains by secondary structure type
    minThreshold = 0.05
    maxThreshold = 0.15
    data = addProteinFoldType(data, minThreshold, maxThreshold)

    # add Word2Vec encoded feature vector
    encoder = proteinSequenceEncoder(data)
    n = 2  # Create 2-grams
    windowSize = 25  # 25-residue window size for Word2Vec
    vectorSize = 50  # dimension of feature vector
    data = encoder.overlappingNgramWord2VecEncode(n, windowSize,
                                                  vectorSize).cache()

    data.printSchema()
    data.show(25)

    # keep only a subset of relevant fields for further processing
    data = data.select("structureChainId", "alpha", "beta", "coil", "foldType",
                       "features")

    data.write.mode("overwrite").format("parquet").save(
        "/home/marshuang80/PDB/data/demo.parquet")

    end = time.time()

    print("Time: %f  sec." % (end - start))

    sc.stop()
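The demo ends by persisting the labeled, encoded dataset as Parquet. A minimal read-back sketch for sanity checking, assuming the same hard-coded path (standard PySpark DataFrame API):

# Hedged sketch: reload the saved dataset and inspect the class balance.
# The path is the one hard-coded above; adjust it for your machine.
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .master("local[*]") \
    .appName("inspectFoldDataset") \
    .getOrCreate()

df = spark.read.parquet("/home/marshuang80/PDB/data/demo.parquet")
df.groupBy("foldType").count().show()  # rows per fold class
spark.stop()

Checking the foldType distribution before training helps spot class imbalance between the alpha and beta labels.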
Example 2
def main():
    '''
    This demo creates a dataset of sequence segments derived from a
    non-redundant set. The dataset contains the sequence segment, the DSSP Q8
    and DSSP Q3 code of the center residue in a sequence segment, and a
    one-hot encoding of the sequence segment.
    The dataset is saved in a file specified by the user.
    '''

    start = time.time()

    conf = SparkConf() \
            .setMaster("local[*]") \
            .setAppName("SecondaryStructureOneHotEncoderDemo")
    sc = SparkContext(conf=conf)

    # Read MMTF Hadoop sequence file and create a non-redundant set
    # (<=20% seq. identity) of L-protein chains

    path = "../../resources/mmtf_reduced_sample/"

    sequenceIdentity = 20
    resolution = 2.0
    fraction = 0.1
    seed = 123

    pdb = MmtfReader \
            .readSequenceFile(path, sc) \
            .flatMap(structureToPolymerChains()) \
            .filter(pisces(sequenceIdentity, resolution)) \
            .filter(containsLProteinChain()) \
            .sample(False, fraction, seed)

    segmentLength = 11
    data = secondaryStructureSegmentExtractor.getDataset(
        pdb, segmentLength).cache()
    print(f"original data   : {data.count()}")

    data = data.dropDuplicates(["labelQ3", "sequence"]).cache()
    print(f"- duplicate Q3/seq  : {data.count()}")

    data = data.dropDuplicates(["sequence"])
    print(f"- duplicate seq  : {data.count()}")

    encoder = proteinSequenceEncoder(data)
    data = encoder.oneHotEncode()

    data.printSchema()
    data.show(25, False)

    end = time.time()

    print("Time: %f  sec." % (end - start))

    sc.stop()
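The docstring says the dataset is saved to a user-specified file, but the snippet stops before writing. A minimal sketch of that missing step (out_path is a placeholder, not from the original code):

# Hedged sketch: persist the one-hot encoded dataset before sc.stop().
# out_path is a placeholder; any local or HDFS destination works.
out_path = "/tmp/secondary_structure_one_hot.parquet"
data.write.mode("overwrite").format("parquet").save(out_path)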
Example 3
def main():
    '''
    This demo creates a dataset of sequence segments derived from a
    non-redundant set. The dataset contains the sequence segment, the DSSP
    Q8 and DSSP Q3 code of the center residue in a sequence segment, and a
    Word2Vec encoding of the sequence segment.
    The data is saved in a JSON file specified by the user.
    '''

    start = time.time()

    conf = SparkConf() \
            .setMaster("local[*]") \
            .setAppName("secondaryStructureWord2VecEncodeDemo")
    sc = SparkContext(conf=conf)

    # Read MMTF Hadoop sequence file and create a non-redundant set
    # (<=20% seq. identity) of L-protein chains

    path = "../../resources/mmtf_reduced_sample/"

    sequenceIdentity = 20
    resolution = 2.0
    fraction = 0.1
    seed = 123

    pdb = MmtfReader \
            .readSequenceFile(path, sc) \
            .flatMap(structureToPolymerChains()) \
            .filter(pisces(sequenceIdentity, resolution)) \
            .filter(containsLProteinChain()) \
            .sample(False, fraction, seed)

    segmentLength = 11
    data = secondaryStructureSegmentExtractor.getDataset(
        pdb, segmentLength).cache()

    # add Word2Vec encoded feature vector
    encoder = proteinSequenceEncoder(data)
    n = 2
    windowSize = (segmentLength - 1) // 2
    vectorSize = 50
    data = encoder.overlappingNgramWord2VecEncode(n, windowSize, vectorSize)

    data.printSchema()
    data.show(25, False)

    end = time.time()

    print("Time: %f  sec." % (end - start))

    sc.stop()
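Here too the docstring promises a JSON file that the snippet never writes. A hedged sketch of that step (out_path is a placeholder); coalesce(1) merges the output into a single part file:

# Hedged sketch: write the Word2Vec-encoded segments as JSON, as the
# docstring describes. out_path is a placeholder.
out_path = "/tmp/secondary_structure_word2vec.json"
data.coalesce(1).write.mode("overwrite").format("json").save(out_path)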