def overlappingNgramWord2VecEncode(self, n = None, windowSize = None, vectorSize = None, fileName = None, sc = None):
        '''
        Encodes a protein sequence by converting it into n-grams and
        then transforming it into a Word2Vec feature vector.

        If a word2Vec model file name is given, this function instead encodes
        the protein sequence by converting it into n-grams and transforming
        them with the pre-trained word2Vec model read from that file.

        Attribute:
            n (int): The number of words in an n-gram
            windowSize (int): width of the window used to slide across the \
                              sequence, context words from [-window, window]
            vectorSize (int): dimension of the feature vector
            fileName (str): filename of a pre-trained Word2Vec model
            sc (SparkContext): spark context, required when fileName is given

        Returns:
            dataset with the feature vector added to the original dataset
        '''

        # Create n-grams out of the sequence
        # E.g., 2-gram IDCGH, ... =>[ID, DC, CG, GH, ...]

        data = sequenceNgrammer.ngram(self.data, n, "ngram")

        if n is not None and windowSize is not None and vectorSize is not None:
            # Convert n-grams to a Word2Vec feature vector
            # [ID, DC, CG, GH, ...] => [0.1234, 0.2394, ...]
            word2Vec = Word2Vec()
            word2Vec.setInputCol("ngram") \
                    .setOutputCol(self.outputCol) \
                    .setNumPartitions(8) \
                    .setWindowSize(windowSize) \
                    .setVectorSize(vectorSize)

            self.model = word2Vec.fit(data)

        elif fileName is not None and sc is not None:
            reader = Word2VecModel()

            self.model = reader.load(sc, fileName)

            print(f"model file : {fileName} \n \
                    inputCol : {self.model.getInputCol()} \n \
                    windowSize : {self.model.getWindowSize()} \n \
                    vectorSize : {self.model.getVectorSize()}")

            self.model.setOutputCol(self.outputCol)

        else:
            raise Exception("Either provide a word2Vec model file (fileName) "
                            "and a SparkContext (sc), or the number of words "
                            "in an n-gram (n), window size (windowSize), and "
                            "vector size (vectorSize)")

        return self.model.transform(data)
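
# Usage sketch for the method above (illustrative, not part of the source).
# It assumes the method belongs to an encoder class constructed with a Spark
# DataFrame that has a 'sequence' column (e.g. mmtfPyspark's
# ProteinSequenceEncoder); the class name, column names, and parameter values
# below are assumptions.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("word2vecEncodeDemo").getOrCreate()
data = spark.createDataFrame(
    [("1ABC.A", "MTYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTE")],
    ["structureChainId", "sequence"])

# encoder = ProteinSequenceEncoder(data)  # hypothetical constructor, not shown here
# encoded = encoder.overlappingNgramWord2VecEncode(n=2, windowSize=25, vectorSize=50)
# encoded.printSchema()  # original columns plus 'ngram' and the Word2Vec feature column
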
# Example 2
    def shifted_3gram_word2vec_encode(self,
                                      data=None,
                                      inputCol=None,
                                      outputCol=None,
                                      windowSize=None,
                                      vectorSize=None,
                                      fileName=None,
                                      sc=None):
        '''Encodes a protein sequence as three non-overlapping 3-grams,
        trains a Word2Vec model on the 3-grams, and then averages the
        three resulting feature vectors.


        Attribute
        ---------
            data (DataFrame): input data to be encoded [None]
            inputCol (str): name of the input column [None]
            outputCol (str): name of the output column [None]
            windowSize (int): width of the window used to slide across the sequence
                              context words from -window to window
            vectorSize (int): dimension of the feature vector [None]
            fileName (string): filename of Word2VecModel [None]
            sc (SparkContext): spark context [None]

        Returns
        -------
            dataset with the feature vector added to the original dataset

        References
        ----------
           Asgari E, Mofrad MRK (2015) Continuous Distributed Representation
           of Biological Sequences for Deep Proteomics and Genomics.
           PLOS ONE 10(11): e0141287. doi:
           https://doi.org/10.1371/journal.pone.0141287
        '''

        if data is not None:
            self.data = data

        if inputCol is not None:
            self.inputCol = inputCol

        if outputCol is not None:
            self.outputCol = outputCol

        if self.data is None:
            raise ValueError("Class variable data is not defined, please pass\
                             in a dataframe into the data parameter")

        # Create three sets of non-overlapping 3-grams, shifted by 0, 1, and 2
        # e.g., IDCGHK... => [IDC, GHK, ...], [DCG, ...], [CGH, ...]

        data = sequenceNgrammer.shifted_ngram(self.data, 3, 0, "ngram0")
        data = sequenceNgrammer.shifted_ngram(data, 3, 1, "ngram1")
        data = sequenceNgrammer.shifted_ngram(data, 3, 2, "ngram2")
        if windowSize is not None and vectorSize is not None:

            ngram0 = data.select("ngram0").withColumnRenamed("ngram0", "ngram")
            ngram1 = data.select("ngram1").withColumnRenamed("ngram1", "ngram")
            ngram2 = data.select("ngram2").withColumnRenamed("ngram2", "ngram")

            ngrams = ngram0.union(ngram1).union(ngram2)

            # Convert n-grams to W2V feature vector
            # [I D, D C, C G, G H, ... ] => [0.1234, 0.2394, .. ]
            word2Vec = Word2Vec()

            word2Vec.setInputCol("ngram") \
                    .setOutputCol("feature") \
                    .setMinCount(10) \
                    .setNumPartitions(8) \
                    .setWindowSize(windowSize) \
                    .setVectorSize(vectorSize)

            self.model = word2Vec.fit(ngrams)

        elif fileName is not None and sc is not None:
            reader = Word2VecModel()

            self.model = reader.load(sc, fileName)

            print(f"model file : {fileName} \n \
                    inputCol : {self.model.getInputCol()} \n \
                    windowSize : {self.model.getWindowSize()} \n \
                    vectorSize : {self.model.getVectorSize()}")

        else:
            raise Exception("Either provide a word2Vec model file (fileName) "
                            "and a SparkContext (sc), or a window size "
                            "(windowSize) and vector size (vectorSize)")

        #data = data.withColumn("feature0",self.model.transform(data.select('ngram0').withColumnRenamed("ngram0","ngram")))
        for i in reversed(range(3)):
            feature = self.model.transform(
                data.select('ngram' + str(i)).withColumnRenamed(
                    "ngram" + str(i), "ngram"))
            data = data.join(
                feature.withColumnRenamed("ngram", "ngram" + str(i)),
                "ngram" + str(i))
            data = data.withColumnRenamed("feature", "feature" + str(i))

        data = self._average_feature_vectors(data, self.outputCol)
        data.printSchema()

        cols = ['structureChainId','sequence','labelQ8','labelQ3','ngram0','ngram1',\
                'ngram2','feature0','feature1','feature2', 'features']

        data = data.select(cols)

        return data
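
# Minimal sketch of the _average_feature_vectors helper called above; its real
# implementation is not shown in this section. The assumption here is that it
# element-wise averages the three Word2Vec vectors in 'feature0', 'feature1'
# and 'feature2' into a single vector stored in outputCol.
from pyspark.sql.functions import udf
from pyspark.ml.linalg import Vectors, VectorUDT


def _average_feature_vectors_sketch(data, outputCol):
    """Average the 'feature0'..'feature2' vector columns element-wise into outputCol."""

    def average(v0, v1, v2):
        # DenseVector.toArray() returns a numpy array, so this is an element-wise mean
        return Vectors.dense((v0.toArray() + v1.toArray() + v2.toArray()) / 3.0)

    avg_udf = udf(average, VectorUDT())
    return data.withColumn(outputCol, avg_udf("feature0", "feature1", "feature2"))
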
# Example 3
    def overlapping_ngram_word2vec_encode(self,
                                          data=None,
                                          inputCol=None,
                                          outputCol=None,
                                          n=None,
                                          windowSize=None,
                                          vectorSize=None,
                                          fileName=None,
                                          sc=None):
        '''Encodes a protein sequence by converting it into n-grams and
        then transforming it into a Word2Vec feature vector.

        If a word2Vec model file name is given, this function instead encodes
        the protein sequence by converting it into n-grams and transforming
        them with the pre-trained word2Vec model read from that file.

        Attribute
        ---------
            data (DataFrame): input data to be encoded [None]
            inputCol (str): name of the input column [None]
            outputCol (str): name of the output column [None]
            n (int): The number of words in an n-gram [None]
            windowSize (int): width of the window used to slide across the \
                              sequence, context words from -window to window \
                              [None]
            vectorSize (int): dimension of the feature vector [None]
            fileName (str): filename of Word2Vec model [None]
            sc (SparkContext): spark context [None]

        Returns
        -------
            dataset with the feature vector added to the original dataset
        '''

        if data is not None:
            self.data = data

        if inputCol is not None:
            self.inputCol = inputCol

        if outputCol is not None:
            self.outputCol = outputCol

        if self.data is None:
            raise ValueError("Class variable data is not defined, please pass\
                             in a dataframe into the data parameter")

        # Create n-grams out of the sequence
        # E.g., 2-gram IDCGH, ... =>[ID, DC, CG, GH, ...]

        data = sequenceNgrammer.ngram(self.data, n, "ngram")

        if n is not None and windowSize is not None and vectorSize is not None:
            # Convert n-grams to a Word2Vec feature vector
            # [ID, DC, CG, GH, ...] => [0.1234, 0.2394, ...]
            word2Vec = Word2Vec()
            word2Vec.setInputCol("ngram") \
                    .setOutputCol(self.outputCol) \
                    .setNumPartitions(8) \
                    .setWindowSize(windowSize) \
                    .setVectorSize(vectorSize)

            self.model = word2Vec.fit(data)

        elif fileName is not None and sc is not None:
            reader = Word2VecModel()

            self.model = reader.load(sc, fileName)

            print(f"model file : {fileName} \n \
                    inputCol : {self.model.getInputCol()} \n \
                    windowSize : {self.model.getWindowSize()} \n \
                    vectorSize : {self.model.getVectorSize()}")

            self.model.setOutputCol(self.outputCol)

        else:
            raise Exception("Either provide a word2Vec model file (fileName) "
                            "and a SparkContext (sc), or the number of words "
                            "in an n-gram (n), window size (windowSize), and "
                            "vector size (vectorSize)")

        return self.model.transform(data)
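
# Minimal sketch of the sequenceNgrammer.ngram helper that both overlapping
# n-gram methods rely on; its implementation is not shown in this section. The
# assumption is that it splits the 'sequence' string into single characters
# and applies Spark ML's NGram transformer (which joins the characters of each
# n-gram with spaces, e.g. 'I D' for n=2).
from pyspark.ml.feature import NGram
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType


def ngram_sketch(data, n, outputCol):
    """Build overlapping n-grams from the 'sequence' column."""
    to_chars = udf(lambda s: list(s), ArrayType(StringType()))
    data = data.withColumn("seqChars", to_chars("sequence"))
    return NGram(n=n, inputCol="seqChars", outputCol=outputCol).transform(data)
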
    def shifted3GramWord2VecEncode(self, windowSize = None, vectorSize = None, fileName = None, sc = None):
        '''
        Encodes a protein sequence as three non-overlapping 3-grams,
        trains a Word2Vec model on the 3-grams, and then averages the
        three resulting feature vectors.

        Reference:
            Asgari E, Mofrad MRK (2015) Continuous Distributed Representation
            of Biological Sequences for Deep Proteomics and Genomics.
            PLOS ONE 10(11): e0141287.
            https://doi.org/10.1371/journal.pone.0141287

        Attribute:
            windowSize (int): width of the window used to slide across the sequence
                              context words from [-window, window]
            vectorSize (int): dimension of the feature vector
            fileName (string): filename of Word2VecModel
            sc (SparkContext): spark context

        Return:
            dataset with the feature vector added to the original dataset
        '''

        # Create three sets of non-overlapping 3-grams, shifted by 0, 1, and 2
        # e.g., IDCGHK... => [IDC, GHK, ...], [DCG, ...], [CGH, ...]
        # TODO set input column
        #data = sequenceNgrammer.ngram(self.data, 2, "ngram")

        data = sequenceNgrammer.shiftedNgram(self.data, 3, 0, "ngram0")
        data = sequenceNgrammer.shiftedNgram(data, 3, 1, "ngram1")
        data = sequenceNgrammer.shiftedNgram(data, 3, 2, "ngram2")
        if windowSize is not None and vectorSize is not None:

            ngram0 = data.select("ngram0").withColumnRenamed("ngram0","ngram")
            ngram1 = data.select("ngram1").withColumnRenamed("ngram1","ngram")
            ngram2 = data.select("ngram2").withColumnRenamed("ngram2","ngram")

            ngrams = ngram0.union(ngram1).union(ngram2)


            # Convert n-grams to W2V feature vector
            # [I D, D C, C G, G H, ... ] => [0.1234, 0.2394, .. ]
            word2Vec = Word2Vec()

            word2Vec.setInputCol("ngram") \
                    .setOutputCol("feature") \
                    .setMinCount(10) \
                    .setNumPartitions(8) \
                    .setWindowSize(windowSize) \
                    .setVectorSize(vectorSize)

            self.model = word2Vec.fit(ngrams)

        elif fileName is not None and sc is not None:
            reader = Word2VecModel()

            self.model = reader.load(sc, fileName)

            print(f"model file : {fileName} \n \
                    inputCol : {self.model.getInputCol()} \n \
                    windowSize : {self.model.getWindowSize()} \n \
                    vectorSize : {self.model.getVectorSize()}")

        else:
            raise Exception("Either provide word2Vec file (filename) + SparkContext (sc), \
                            or window size(windowSize) + vector size (vetorSize), \
                            for function parameters")
            return


        #data = data.withColumn("feature0",self.model.transform(data.select('ngram0').withColumnRenamed("ngram0","ngram")))
        for i in reversed(range(3)):
            feature = self.model.transform(data.select('ngram' + str(i)).withColumnRenamed("ngram" + str(i),"ngram"))
            data = data.join(feature.withColumnRenamed("ngram","ngram" + str(i)), "ngram" + str(i))
            data = data.withColumnRenamed("feature", "feature" + str(i))


        data = self.averageFeatureVectors(data, self.outputCol)
        data.printSchema()

        cols = ['structureChainId','sequence','labelQ8','labelQ3','ngram0','ngram1',\
                'ngram2','feature0','feature1','feature2', 'features']

        data = data.select(cols)


        return data
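
# Minimal sketch of the sequenceNgrammer.shiftedNgram helper used above; its
# implementation is not shown in this section. The assumption is that it skips
# the first `shift` residues and then cuts the remaining sequence into
# non-overlapping n-grams, dropping any incomplete trailing gram,
# e.g. n=3, shift=1: "IDCGHKL" -> "DCGHKL" -> ["DCG", "HKL"].
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType


def shifted_ngram_sketch(data, n, shift, outputCol):
    """Cut the 'sequence' column into non-overlapping n-grams starting at `shift`."""

    def chop(seq):
        seq = seq[shift:]
        return [seq[i:i + n] for i in range(0, len(seq) - n + 1, n)]

    chop_udf = udf(chop, ArrayType(StringType()))
    return data.withColumn(outputCol, chop_udf("sequence"))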