# NOTE: these methods assume module-level imports defined elsewhere in the file:
# `from operator import add`, `import os`, `import re`, `import numpy as np`,
# `from pyspark import broadcast`, plus the project's Parsers and Utils helpers.
def extractRewardPerFeat(self, dataPath, outputPath, featType, sourceType, rewardType):
    # parseDatasetContents yields tuples of shape ((file, content, id), 'kmers')
    resultLabel = Parsers.parseDatasetContents(dataPath, featType, sourceType)
    # sorted for a deterministic column order (set iteration order is not stable)
    fileindex = sorted(set(i[0][0] for i in resultLabel))
    rewardperfeat = {}
    for item in resultLabel:
        filename = item[0][0]
        label = Utils.getLabel(filename)
        content = item[0][1]
        idx = fileindex.index(filename)
        occ = 1 if label == 1 else -1
        # one reward slot per file, created lazily the first time a feature is seen
        if content not in rewardperfeat:
            rewardperfeat[content] = [0] * len(fileindex)
        if 'label' in rewardType:
            # 'label' mode: store the (signed) label itself
            rewardperfeat[content][idx] = occ
        else:
            # count mode: accumulate occurrences, signed by label
            rewardperfeat[content][idx] += occ
    outputstr = '\n'.join(k + '\t' + ','.join(map(str, v))
                          for k, v in rewardperfeat.items())
    Utils.writeFile(outputPath, outputstr)
    return rewardperfeat
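# Usage sketch (hypothetical paths and labels, not the project's real data):
# with two input files, one labeled positive and one negative, each feature
# row holds one slot per file:
#   rewards = self.extractRewardPerFeat('data/train', 'out/rewards.tsv',
#                                       'kmers', 'fasta', 'label')
#   rewards['ATGC']  # -> e.g. [1, -1]; written to disk as "ATGC\t1,-1"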
def prepareData(self, path, sparkContext):
    dataset = Parsers.parseDatasetContents(path, 'domains_pfam', 'domains')
    contentRDD = sparkContext.parallelize(dataset, numSlices=1000)
    # key by sequence ID and merge contents: (sequenceID, [domains, ...])
    perinstanceRDD = contentRDD.map(lambda x: (x[0][2], [x[0][1]])).reduceByKey(add)
    return perinstanceRDD.collect()
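# Self-contained illustration of the grouping idiom above, with hypothetical
# rows and an existing SparkContext `sc`:
#   rows = [(('a.fasta', 'PF00001', 'gene1'), 'domains'),
#           (('a.fasta', 'PF00002', 'gene1'), 'domains'),
#           (('b.fasta', 'PF00003', 'gene2'), 'domains')]
#   grouped = sc.parallelize(rows).map(lambda x: (x[0][2], [x[0][1]])).reduceByKey(add)
#   grouped.collect()  # -> [('gene1', ['PF00001', 'PF00002']), ('gene2', ['PF00003'])]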
def loadGeneMap(self, sparkContext):
    content = Parsers.parseDatasetContents(self.geneMapPath, 'domains', 'domains')
    contentRDD = sparkContext.parallelize(content, numSlices=1000)
    # split each file's content into one domain per line, keyed by sequence ID
    genemapRDD = contentRDD.map(
        lambda x: (x[0][2], x[0][1].split('\n'))).reduceByKey(add)
    return genemapRDD.collectAsMap()
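# Shape sketch (hypothetical IDs): collectAsMap() returns a plain dict whose
# duplicate keys were already merged by the preceding reduceByKey(add), e.g.
#   {'gene1': ['PF00067', 'PF00069'], 'gene2': ['PF00012']}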
def loadFilterMap(self, sparkContext):
    filterList = Utils.readFileLines(self.filterList)
    # parseDatasetContents returns tuples of shape ((file, content), 'domains')
    content = Parsers.parseDatasetContents(self.filterMap, 'domains', 'domains')
    domRDD = sparkContext.parallelize(content, numSlices=1000)
    domainsRDD = domRDD.map(
        lambda x: (Utils.getFileName(x[0][0]).replace('.domains', ''), x[0][1]))
    # keep genes that have any domain in filterList; splitting on "." also
    # discards the ".\d+" version suffix of Pfam IDs
    # (renamed from `filter` to avoid shadowing the builtin)
    filteredRDD = domainsRDD.filter(lambda x: any(
        domain in filterList for domain in re.split("[\n.]", x[1])))
    genes = sorted(filteredRDD.collectAsMap().keys())
    print('Loaded filter:', len(genes), 'genes will be filtered from',
          len(filterList), 'domains.')
    return genes
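# Worked example of the split (standalone, hypothetical accessions): the
# "[\n.]" pattern both separates one-domain-per-line content and strips the
# Pfam version suffix, so a version-less accession in filterList still matches:
#   re.split("[\n.]", "PF00067.21\nPF00069.24")
#   # -> ['PF00067', '21', 'PF00069', '24']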
def countOccurrence(self, dataPath, sparkContext):
    feats = self.loadFeatures()
    contentIds = []
    listContents = Parsers.parseDatasetContents(dataPath, self.featType, self.sourceType)
    parentDir = os.path.split(os.path.dirname(listContents[0][0][0]))[1]
    for info in listContents:
        filename = info[0][0]
        content = info[0][1]
        featType = info[1]  # renamed from `type` to avoid shadowing the builtin
        firstLine = Utils.readFileLines(filename)[0]
        seqId = firstLine.replace('>', '') if '|' in firstLine \
            else firstLine.split('.')[0].replace('>', '')
        label = Utils.getLabel(filename)
        # avoid cases in which test synthetic genes are long and, after the
        # split, different clusters share the same (gene) ID
        for item in contentIds:
            if seqId in item[0] and featType in item[1]:
                seqId = seqId + '|'
        contentIds.append((seqId, featType, content, label))
    sourceRDD = sparkContext.parallelize(contentIds, numSlices=1000)
    occRDD = sourceRDD.map(lambda x: self.occurrence(x, feats))
    # combine features sharing an ID and filter out instances with too few features
    reducedRDD = occRDD.reduceByKey(lambda x, y: self.mergeFeatsSameId(x, y))
    ids = reducedRDD.map(lambda x: x[0]).collect()
    occ = reducedRDD.map(lambda x: x[1][0]).collect()
    labels = reducedRDD.map(lambda x: x[1][1]).collect()
    print('Features loaded.')
    return np.array(ids), np.array(occ), np.array(labels), parentDir
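# Hypothetical sketch of the mapper countOccurrence relies on: the code above
# only assumes self.occurrence returns (seqId, (vector, label)) so reduceByKey
# can merge vectors sharing an ID. One plausible implementation:
#   def occurrence(self, instance, feats):
#       seqId, featType, content, label = instance
#       vector = [content.count(f) for f in feats]  # per-feature counts
#       return seqId, (vector, label)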
def buildDataset(self, path, sparkContext):
    result, ids, labels = [], [], []
    # override the pickle protocol to bypass the 4GiB pickling limit
    broadcast.Broadcast.dump = self.broadcast_dump
    dataset = Parsers.parseDatasetContents(path, self.featType, self.sourceType)
    parentDir = os.path.split(os.path.dirname(dataset[0][0][0]))[1]
    listRDD = sparkContext.parallelize(dataset, numSlices=5000)
    # x tuple in format: ((fileName, content, sequenceID), featureType)
    featuresRDD = listRDD.map(lambda x: (x[0][2], self.extractor.getFeatures(x)))
    if 'pfam' in self.featType:
        # concatenate by pfam ID: label positive if at least one file contains the domain
        concatRDD = featuresRDD.map(lambda x: (''.join(x[1]), [x[0]])).reduceByKey(add)
    else:
        # concatenate contents by file ID
        concatRDD = featuresRDD.reduceByKey(add)
    # add instance; x tuple in format: (file ID, [feature, feature, ...])
    datasetRDD = concatRDD.map(lambda x: self.addInstance(x))
    dataset = datasetRDD.collectAsMap()
    # max instance length across the dataset (pfam instances are length 1)
    maxLen = 1 if 'pfam' in self.featType else int(
        datasetRDD.sortBy(lambda x: x[0][1], False).first()[0][1])
    self.maxLength = max(maxLen, self.maxLength)
    # key from addInstance, as read below: 0 = fasta ID, 1 = instance length, 2 = label
    for k, v in dataset.items():
        seqId, label = k[0], int(k[2])
        ids.append(seqId)
        labels.append(label)
        result.append(v)
    print('Done building dataset.')
    return ids, result, labels, parentDir
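# Hypothetical sketch of the broadcast_dump override assigned above: PySpark's
# stock Broadcast.dump pickles with a protocol that cannot serialize objects
# over 4GiB, so pickle protocol 4 is forced (assumes `import pickle`):
#   def broadcast_dump(self, value, f):
#       pickle.dump(value, f, 4)  # protocol 4 lifts the 4GiB limit
#       f.close()
#       return f.name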
def extractFeatures(self, dataPath, sparkContext, featPerInst):
    feats, kmerCounts, featPerInstance = [], [], []
    useKmer = 'kmers' in self.featType
    useProt = 'prot' in self.featType
    listContents = Parsers.parseDatasetContents(dataPath, self.featType, self.sourceType)
    if 'dictionary' in self.featType:
        feats += sorted(self.loadDictionary(self.dictPath))
    else:
        featRDD = sparkContext.parallelize(listContents, numSlices=1000)
        featuresRDD = featRDD.map(lambda x: (x[1], self.getFeatures(x)))
        if featPerInst:
            # collect a list of features per instance, e.g. for embeddings
            featPerInstance = featuresRDD.values().collect()
            print(len(featPerInstance), 'instances processed.')
        if not os.path.isfile(self.featFile):
            if useKmer:
                # keep only kmer entries and flatten the arrays to a single list of kmers
                kmerRDD = featuresRDD.filter(
                    lambda x: 'kmer' in x[0]).flatMap(lambda x: x[1])
                # map to (kmer, 1), reduce by key to count, keep kmers seen >= minOcc times
                minOcc = int(self.minOcc)
                countAndFilter = kmerRDD.map(lambda x: (x, 1)).reduceByKey(
                    add).filter(lambda x: x[1] >= minOcc)
                kmerCounts = sorted(countAndFilter.collect())
                # drop the counts and keep only the keys
                feats += sorted(countAndFilter.keys().collect())
                # kmer entries are done; remove them from the remaining RDD
                featuresRDD = featuresRDD.filter(lambda x: 'kmer' not in x[0])
            if useProt:
                # keep only protein-property entries and collect their unique names
                protRDD = featuresRDD.filter(lambda x: 'protanalys' in x[0])
                feats += sorted(
                    protRDD.flatMap(lambda x: x[1]).distinct().collect())
                featuresRDD = featuresRDD.filter(
                    lambda x: 'protanalys' not in x[0])
            # flat list of the remaining distinct features
            feats += featuresRDD.flatMap(lambda x: x[1]).distinct().collect()
            if len(feats) > 1:
                Utils.writeFile(self.featFile, ''.join(str(i) + '\n' for i in feats))
            if len(kmerCounts) > 1:
                # serialize as "kmer\tcount" lines; keep the list itself intact
                # so the return value stays a list in every branch
                countStr = ''.join(str(i).replace('(\'', '').replace('\',', '\t')
                                   .replace(')', '') + '\n' for i in kmerCounts)
                Utils.writeFile(self.featFile + 'count', countStr)
            print(len(feats), 'features extracted.')
        else:
            feats = self.loadFeatures()
    return feats, featPerInstance, kmerCounts
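# Usage sketch (hypothetical extractor instance and paths): the feature
# vocabulary is built once and cached in self.featFile; later calls reload it
# from disk via loadFeatures() instead of recomputing.
#   feats, perInstance, counts = extractor.extractFeatures('data/train', sc,
#                                                          featPerInst=False)
#   len(feats)  # vocabulary size written to self.featFile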