Exemple #1
0
    def getDomains(self, sparkContext):

        # recover the species name for using in temp files
        self.species = Utils.getSpecies(self.source_path)
        domainFinder = DomainFinder.DomainFinder()

        # load source sequences into a single list
        if ("fasta" in self.source_type):
            list, file_content = Parsers.parseFastaToList(self.source_path, "")
        elif ("genbank" in self.source_type):
            list = Parsers.genBankToAminoacid(self.source_path)

        print('Processing domains...')

        # create RDD with source sequences
        sourceRDD = sparkContext.parallelize(file_content, numSlices=2000)

        if ("nucleotide" in self.source_type):
            # execute sixFrame translation for each sequence in RDD
            sourceRDD = sourceRDD.map(lambda x: SixFrameTranslator.main(x))

        # execute Pfam domain prediction for each sixFrame translation in RDD
        domainsRDD = sourceRDD.map(lambda x: domainFinder.main(x[0], x[1]))
        processedRDD = domainsRDD.map(
            lambda x: self.processDomainOutput(x[0], x[1]))

        # recover Pfam domain prediction results from RDD
        result = processedRDD.collectAsMap()

        print('Done!')

        return result
    def createNegShuffle(self, posPerc):
        files = Utils.listFilesExt(self.source_path, self.ext)
        negPerc = 100 - posPerc
        positives = len(files)
        negativeSize = int((negPerc * positives) / posPerc)
        print('Negative percentage: ' + str(negPerc) + '% \n' +
              'Negative instances: ' + str(negativeSize) + '\n' +
              'Positive percentage: ' + str(posPerc) + '% \n' +
              'Positive instances: ' + str(positives) + '\n' +
              'Total corpus size: ' + str(negativeSize + positives))

        thisDecRatio = 0.0
        count = 0
        ratio = (negativeSize / positives)
        decRatio = ratio - int(ratio)

        print('Generating...')
        for file in files:
            # add up the decimal ratio part
            thisDecRatio += round(decRatio, 2)
            # reset range
            ratioRange = int(negativeSize / positives)

            # check if decimal ratio added up to a duplicate
            if (thisDecRatio >= 1):
                ratioRange = int(ratio + thisDecRatio)
                thisDecRatio = 0

            for i in range(0, ratioRange):
                name = os.path.basename(file)
                result_file = name.split('.')[0] + '_' + str(
                    i) + '.shuffled.negative.fasta'

                if ('nuc' in self.seqType):
                    content = Parsers.genBankToNucleotide(file)
                if ('amino' in self.seqType):
                    list, content = Parsers.genBankToAminoacid(file)
                content = Utils.charGramShuffle(content, 2)
                content = '>' + name + '\n' + content

                count += 1

                Utils.writeFile(self.result_path + result_file, content)

        print('Total generated: ' + str(count) + '. Done!')