Example 1
    def extractRewardPerFeat(self, dataPath, outputPath, featType, sourceType,
                             rewardType):

        rewardperfeat = {}
        # list of tuples of shape ((file, content, id), featType), e.g. featType 'kmers'
        resultLabel = Parsers.parseDatasetContents(dataPath, featType,
                                                   sourceType)
        fileindex = list(set([i[0][0] for i in resultLabel]))

        for item in resultLabel:
            filename = item[0][0]
            label = Utils.getLabel(filename)
            content = item[0][1]
            idx = fileindex.index(filename)
            occ = 1 if label == 1 else -1

            if content not in rewardperfeat:
                rewardperfeat[content] = [0] * len(fileindex)
            if 'label' in rewardType:
                rewardperfeat[content][idx] = occ
            else:
                rewardperfeat[content][idx] += occ

        outputstr = ''
        for k, v in rewardperfeat.items():
            outputstr += k + '\t' + (',').join(map(str, v)) + '\n'
        Utils.writeFile(outputPath, outputstr[:-1])

        return rewardperfeat
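The loop above only depends on the shape of the parseDatasetContents result, which the comments in these examples describe as a list of ((file, content, id), featType) tuples. A minimal stand-alone sketch of the same reward accumulation over a hand-built, hypothetical result list (no Parsers or Utils required):

# hypothetical stand-in for Parsers.parseDatasetContents output:
# [((file, content, id), featType), ...]
resultLabel = [
    (('data/pos_a.fasta', 'ATGC', 'gene1'), 'kmers'),
    (('data/neg_b.fasta', 'ATGC', 'gene2'), 'kmers'),
    (('data/pos_a.fasta', 'GGTA', 'gene3'), 'kmers'),
]

def getLabel(filename):
    # hypothetical stand-in for Utils.getLabel: positive files contain 'pos'
    return 1 if 'pos' in filename else 0

fileindex = list(set(i[0][0] for i in resultLabel))
rewardperfeat = {}
for (filename, content, _id), _featType in resultLabel:
    idx = fileindex.index(filename)
    occ = 1 if getLabel(filename) == 1 else -1
    row = rewardperfeat.setdefault(content, [0] * len(fileindex))
    row[idx] += occ  # 'count'-style reward; a 'label' rewardType would assign instead

print(rewardperfeat)  # e.g. {'ATGC': [1, -1], 'GGTA': [1, 0]} (column order depends on set order)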
Example 2
    def prepareData(self, path, sparkContext):
        dataset = Parsers.parseDatasetContents(path, 'domains_pfam', 'domains')
        contentRDD = sparkContext.parallelize(dataset, numSlices=1000)
        perinstanceRDD = contentRDD.map(lambda x:
                                        (x[0][2], [x[0][1]])).reduceByKey(add)
        # resulting tuples: (fastaID(genes), [domains, ...])
        return perinstanceRDD.collect()
Example 3
    def loadGeneMap(self, sparkContext):
        content = Parsers.parseDatasetContents(self.geneMapPath, 'domains',
                                               'domains')
        contentRDD = sparkContext.parallelize(content, numSlices=1000)
        genemapRDD = contentRDD.map(
            lambda x: (x[0][2], x[0][1].split('\n'))).reduceByKey(add)
        genemap = genemapRDD.collectAsMap()

        return genemap
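Examples 2 and 3 follow the same pattern: key each parsed entry by its third field (the gene/fasta ID), wrap the content in a list, and reduceByKey(add) so that lists sharing a key are concatenated (add here being operator.add). A Spark-free sketch of that grouping on hypothetical parsed tuples:

from operator import add

# hypothetical parsed entries: ((file, content, id), featType)
content = [
    (('a.domains', 'PF00001', 'geneA'), 'domains'),
    (('b.domains', 'PF00002', 'geneA'), 'domains'),
    (('c.domains', 'PF00042', 'geneB'), 'domains'),
]

# local equivalent of map(lambda x: (x[0][2], [x[0][1]])).reduceByKey(add)
genemap = {}
for key, val in ((x[0][2], [x[0][1]]) for x in content):
    genemap[key] = add(genemap.get(key, []), val)  # list concatenation

print(genemap)  # {'geneA': ['PF00001', 'PF00002'], 'geneB': ['PF00042']}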
Example 4
    def loadFilterMap(self, sparkContext):
        filterList = Utils.readFileLines(self.filterList)
        # returns a list of tuples ((file, content, id), 'domains')
        content = Parsers.parseDatasetContents(self.filterMap, 'domains',
                                               'domains')

        domRDD = sparkContext.parallelize(content, numSlices=1000)
        domainsRDD = domRDD.map(lambda x: (Utils.getFileName(x[0][0]).replace(
            '.domains', ''), x[0][1]))

        # lists genes that have any domains in filterList
        # discards ".\d+" end of Pfam ID
        filteredRDD = domainsRDD.filter(lambda x: any(
            domain in filterList for domain in re.split("[\n.]", x[1])))

        genes = sorted(filteredRDD.collectAsMap().keys())

        print('Loaded filter:', len(genes), ' genes will be filtered from',
              len(filterList), 'domains.')
        return genes
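The filter above relies on re.split("[\n.]", ...) to break a .domains file into individual Pfam accessions while also splitting off the ".\d+" version suffix, so that versioned IDs still match the unversioned entries in filterList. A small self-contained check of that behaviour (the IDs are made up):

import re

filterList = ['PF00005', 'PF07690']

# hypothetical contents of one .domains file: one versioned Pfam ID per line
contents = 'PF00005.27\nPF00664.23\nPF07690.16'

tokens = re.split("[\n.]", contents)
print(tokens)  # ['PF00005', '27', 'PF00664', '23', 'PF07690', '16']

# the gene is kept if any token matches the filter list
print(any(domain in filterList for domain in tokens))  # True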
Example 5
    def countOccurrence(self, dataPath, sparkContext):
        feats = self.loadFeatures()
        contentIds = []

        listContents = Parsers.parseDatasetContents(dataPath, self.featType,
                                                    self.sourceType)
        parentDir = os.path.split(os.path.dirname(listContents[0][0][0]))[1]

        for info in listContents:
            filename = info[0][0]
            content = info[0][1]
            featType = info[1]
            firstLine = Utils.readFileLines(filename)[0]
            # fasta ID from the first line: keep it whole if it contains '|',
            # otherwise keep only the part before the first '.'
            seqId = (firstLine.replace('>', '') if '|' in firstLine
                     else firstLine.split('.')[0].replace('>', ''))
            label = Utils.getLabel(filename)

            # avoid cases in which test synthetic genes are long and,
            # in the split, different clusters share the same (gene) id
            for item in contentIds:
                if (seqId in item[0] and featType in item[1]):
                    seqId = seqId + '|'

            contentIds.append((seqId, featType, content, label))

        sourceRDD = sparkContext.parallelize(contentIds, numSlices=1000)
        occRDD = sourceRDD.map(lambda x: self.occurrence(x, feats))

        # combine features with same ID and filter out instances with not enough features
        reducedRDD = occRDD.reduceByKey(
            lambda x, y: self.mergeFeatsSameId(x, y))

        ids = reducedRDD.map(lambda x: x[0]).collect()
        occ = reducedRDD.map(lambda x: x[1][0]).collect()
        labels = reducedRDD.map(lambda x: x[1][1]).collect()

        print('Features loaded.')
        return np.array(ids), np.array(occ), np.array(labels), parentDir
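A subtle step above is the disambiguation of duplicate gene IDs: if an ID of the same feature type was already collected, a '|' is appended so that reduceByKey does not merge unrelated instances. A toy illustration of that loop with hypothetical IDs:

contentIds = []
for seqId, featType in [('gene1', 'kmers'), ('gene1', 'kmers'), ('gene1', 'domains')]:
    for item in contentIds:
        if seqId in item[0] and featType in item[1]:
            seqId = seqId + '|'
    contentIds.append((seqId, featType, 'content', 0))

print([c[0] for c in contentIds])  # ['gene1', 'gene1|', 'gene1']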
Example 6
    def buildDataset(self, path, sparkContext):
        result, ids, labels = [], [], []
        #files = Utils.listFilesExt(path, 'fasta')
        # define pickle protocol to bypass 4GiB pickling limit
        broadcast.Broadcast.dump = self.broadcast_dump

        dataset = Parsers.parseDatasetContents(path, self.featType, self.sourceType)
        parentDir = os.path.split(os.path.dirname(dataset[0][0][0]))[1]

        listRDD = sparkContext.parallelize(dataset, numSlices=5000)
        # X tuple in format:
        # ((fileName, content, sequenceID), featureType)
        featuresRDD = listRDD.map(lambda x: (x[0][2], self.extractor.getFeatures(x)))

        if "pfam" in self.featType:
            # concatenate by pfam ID: label positive if at least one file contains the domain
            concatRDD = featuresRDD.map(
                lambda x: (''.join(x[1]), [x[0]])).reduceByKey(add)
        else:
            # concatenate contents by file ID
            concatRDD = featuresRDD.reduceByKey(add)

        # add instance
        # X tuple in format: (file ID, [feature, feature, ...])
        datasetRDD = concatRDD.map(lambda x: self.addInstance(x))
        dataset = datasetRDD.collectAsMap() #if "pfam" not in self.featType else datasetRDD.collect()

        # get max length among all
        maxLen = 1 if "pfam" in self.featType else int(
            datasetRDD.sortBy(lambda x: x[0][1], False).first()[0][1])
        self.maxLength = max(maxLen, self.maxLength)

        # 0 = fasta.id, 1 = instance length, 2 = file name, 3 = label
        for k, v in dataset.items():
            id, label = k[0], int(k[2])
            ids.append(id)
            labels.append(label)
            result.append(v)
        print('Done building dataset.')

        return ids, result, labels, parentDir
Example 7
    def extractFeatures(self, dataPath, sparkContext, featPerInst):
        files, feats, kmerCounts, featPerInstance = [], [], [], []
        useKmer = 'kmers' in self.featType
        useProt = 'prot' in self.featType
        useDistinct = 'dist' in self.featType
        listContents = Parsers.parseDatasetContents(dataPath, self.featType,
                                                    self.sourceType)

        if ('dictionary' in self.featType):
            feats += sorted(self.loadDictionary(self.dictPath))
        else:
            #if('domains' in self.featType or 'kmers' in self.featType):
            featRDD = sparkContext.parallelize(listContents, numSlices=1000)
            featuresRDD = featRDD.map(lambda x: (x[1], self.getFeatures(x)))

            if (featPerInst):
                # get a list of features per instance for embeddings
                featPerInstance = featuresRDD.values().collect()
                print(len(featPerInstance), 'instances processed.')

            if (not os.path.isfile(self.featFile)):
                if (useKmer):
                    # filter RDD and return only kmers, "flatten" arrays to single list of kmers
                    kmerRDD = featuresRDD.filter(
                        lambda x: "kmer" in x[0]).flatMap(lambda x: x[1])

                    # change each element to (k, v), reduce list by keys to group
                    # + count features, filter features by minOcc
                    minOcc = int(self.minOcc)
                    countAndFilter = kmerRDD.map(lambda x: (x, 1)).reduceByKey(
                        add).filter(lambda x: x[1] >= minOcc)

                    # remove counts and collect only keys
                    kmerCounts = sorted(countAndFilter.collect())
                    feats += sorted(countAndFilter.keys().collect())

                    # filter out kmers already processed
                    featuresRDD = featuresRDD.filter(
                        lambda x: "kmer" not in x[0])

                if (useProt):
                    # filter RDD and return only prot properties
                    protRDD = featuresRDD.filter(
                        lambda x: "protanalys" in x[0])
                    # select (unique) feature names
                    feats += sorted(
                        protRDD.flatMap(lambda x: x[1]).distinct().collect())
                    featuresRDD = featuresRDD.filter(
                        lambda x: "protanalys" not in x[0])

                # get a flat list of all features
                #if(useDistinct):
                #   completeFeatures = featuresRDD.flatMap(lambda x: x[1]).distinct()
                #else:
                #completeFeatures = featuresRDD.flatMap(lambda x: x[1])
                completeFeatures = featuresRDD.flatMap(
                    lambda x: x[1]).distinct()
                feats += completeFeatures.collect()

                if (len(feats) > 1):
                    allFeatures = ''.join(str(i) + '\n' for i in feats)
                    Utils.writeFile(self.featFile, allFeatures)

                if (len(kmerCounts) > 1):
                    kmerCounts = ''.join(
                        str(i).replace('(\'', '').replace('\',', '\t').replace(
                            ')', '') + '\n' for i in kmerCounts)
                    Utils.writeFile(self.featFile + 'count', kmerCounts)

                print(len(feats), 'features extracted.')

            else:
                feats = self.loadFeatures()

        return feats, featPerInstance, kmerCounts
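The k-mer branch in Example 7 is a count-and-threshold step: each k-mer is mapped to (kmer, 1), reduceByKey(add) sums the counts, and only k-mers occurring at least minOcc times are kept. The same step without Spark, using collections.Counter on hypothetical k-mer lists:

from collections import Counter

# hypothetical per-instance k-mer features, as an extractor might return them
kmerInstances = [
    ['ATG', 'TGC', 'ATG'],
    ['ATG', 'GCA'],
    ['TGC', 'GCA', 'GCA'],
]
minOcc = 2

# flatten, count, keep k-mers occurring at least minOcc times
counts = Counter(k for inst in kmerInstances for k in inst)
feats = sorted(k for k, c in counts.items() if c >= minOcc)

print(counts)  # Counter({'ATG': 3, 'GCA': 3, 'TGC': 2})
print(feats)   # ['ATG', 'GCA', 'TGC']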