def summarize(self):
    """Aggregate every per-run '.metrics' file under ``self.result`` into a
    single tab-separated ``results.summary`` file.

    Each metrics filename encodes its experiment parameters
    (``classifier_feat1_feat2..._len<N>_overlap<N>...IDs.test.<evaltype>``);
    those fields are parsed back out and written as one summary row per file.
    """
    metricFiles = sorted(Utils.listFilesExt(self.result, 'metrics'))
    outputFile = Utils.normalizePath(self.result) + "results.summary"
    # positive-class percentage is encoded in the result dir name, e.g. 'pos50'
    pos = self.result.split("pos")[1][0:2] if "pos" in self.result else ""
    rows = []
    for file in metricFiles:
        # third line of the metrics file carries the positive-class scores
        metrics = Utils.readFileLines(file)[2].replace("pos\t", "")
        filename = os.path.basename(file)
        parts = filename.split("_")  # hoisted: split once instead of three times
        classifier = parts[0]
        feats = parts[1] + "+" + parts[2]
        # BUG FIX: this local was named 'len', shadowing the builtin
        winLen = filename.split("len")[1].split("_")[0]
        overlap = filename.split("overlap")[1].split("_")[0][0:2]
        evaltype = filename.split("IDs.test.")[1].replace(
            "eval.metrics", "").replace(".", "")
        if not evaltype:
            evaltype = "succ0"
        if "similar" in evaltype:
            evaltype = evaltype.replace("similar", "sim")
        if "merge" in evaltype:
            evaltype = evaltype.replace("succ", "")
        rows.append(feats + "\t" + classifier + "\t" + pos + "\t" + winLen +
                    "\t" + overlap + "\t" + evaltype + "\t" + metrics + "\n")
    # single join instead of quadratic '+=' accumulation
    Utils.writeFile(outputFile, "".join(rows))
def __init__(self, config, outputPath):
    """Set up source/output paths and embedding parameters from the
    'prediction' configuration section.

    :param config: configparser-style object with a 'prediction' section
    :param outputPath: base path handed through to the Extractor
    """
    self.sourcePath = config.get('prediction', 'source.path')
    self.sourcePath = Utils.normalizePath(self.sourcePath)
    self.trainPath = self.sourcePath + 'train/'
    self.outputPath = self.sourcePath + 'metricsDL/'
    self.sourceType = config.get('prediction', 'source.type')
    # BUG FIX: bool(config.get(...)) was True for ANY non-empty string,
    # including "False"; use getboolean like the other components do.
    self.useEmbeddings = config.getboolean('prediction', 'use.embeddings')
    self.embedPath = config.get('prediction', 'embed.path')
    self.embedPath = Utils.normalizePath(self.embedPath)
    if self.useEmbeddings:
        self.featType = config.get('prediction', 'feat.type')
        self.featSize = config.get('prediction', 'feat.size')
        self.minOcc = config.get('prediction', 'feat.minOcc')
        self.embedSize = config.getint('prediction', 'embeddings.length')
        # file-name stem identifying this embedding configuration
        self.embeddingsName = self.featType + self.featSize + 'minOcc' + str(self.minOcc) \
            + str(self.embedSize) + 'd'
    self.dictionary = dict()
    self.extractor = Extractor.Extractor(config, outputPath)
    self.featType = config.get('prediction', 'feat.type')
    self.maxLength = 0
def __init__(self):
    """Load Q-learning configuration: paths, reward setup, hyperparameters,
    and the derived file names for the Q-table, reward table and ID map."""
    self.config = Utils.loadConfig()
    self.path = self.config.get('prediction', 'source.path')
    self.path = Utils.normalizePath(self.path)
    self.trainPath = self.path + 'train/'
    self.testPath = self.path + 'test/'
    self.outputPath = self.path + 'metricsQLearner/models/'
    self.geneMapPath = self.config.get('eval', 'filter.map')
    self.geneMap = {}
    self.extractor = Extractor.Extractor(self.config, self.outputPath)
    # reward scheme identifier; also used in output file names below
    self.rewardType = 'occ'
    self.rewardPath = self.outputPath + self.rewardType + 'PerDomains.feat'  # pfam domain list
    self.rewardList, self.rewardIDs, self.rewardLabels = '', '', ''
    # the two possible agent actions per domain
    self.actions = ['keep', 'skip']
    self.task = 'train'
    self.rewardTable, self.QTable = [], []
    self.episodes = int(self.config.get('prediction', 'episodes'))
    # hyperparams
    self.alpha = float(self.config.get('prediction', 'alpha'))  # learning rate
    self.gamma = float(self.config.get('prediction', 'gamma'))  # discount factor
    self.epsilon = float(self.config.get('prediction', 'epsilon'))  # exploration
    self.penaltyThreshold = float(
        self.config.get(
            'prediction',
            'penalty.threshold'))  # negative rewards mean penalty
    self.keepskipThreshold = float(
        self.config.get('prediction', 'keepskip.threshold')
    )  # keep reward ratio wrt skip reward for domain to be kept
    # weighting modes; the first two are hard-disabled here
    self.useSimilarityWeight = False
    self.useCompWeights = False
    self.useNeighborWeight = self.config.getboolean(
        'prediction', 'neighbor.weight')
    self.useDryIslands = self.config.getboolean('prediction', 'dry.islands')
    self.useAvAction = self.config.getboolean('prediction', 'average.action')
    self.weightsPath = self.config.get('eval', 'weights')
    # weights file only read when one of the weighting modes is enabled
    self.weights = Utils.readFileLines(
        self.weightsPath
    ) if self.useCompWeights or self.useNeighborWeight else ''
    # parameter string embedded in all model/output file names
    self.params = self.rewardType + '_keepgt' + str(
        self.keepskipThreshold) + 'skip' + '_ep' + str(
            self.episodes) + '_alpha' + str(self.alpha) + '_gamma' + str(
                self.gamma) + '_eps' + str(self.epsilon)
    # NOTE(review): the suffix says '_neighbor' but the flag tested is
    # useCompWeights (always False above), not useNeighborWeight — confirm
    # whether this condition was meant to be self.useNeighborWeight.
    self.params += '_neighbor' if self.useCompWeights else ''
    self.QTablePath = self.outputPath + 'Qtable_' + self.params + '.npy'
    self.rewardTablePath = self.outputPath + 'Rewards_' + self.params + '.npy'
    self.IDmapPath = self.outputPath + 'RewardIDsmap_' + self.params + '.map'
def __init__(self, blastTask):
    """Configure BLAST database paths and, for GO-term tasks, the mapping file.

    :param blastTask: task name; if it contains 'goterm' (case-insensitive),
        the fasta-to-GO mapping is loaded via ``self.loadMapping()``.
    """
    self.config = Utils.loadConfig()
    self.sourceType = self.config.get('dataPipeline', 'source.type')
    self.blastTask = blastTask
    self.blastdb = self.config.get('blaster', 'blastdb.path')
    self.blastdb = Utils.normalizePath(self.blastdb)
    self.blastdbName = self.config.get('blaster', 'blastdb.name')
    # make sure the db file name carries the fasta extension
    if not self.blastdbName.endswith('fasta'):
        self.blastdbName += '.fasta'
    # idiom fix: 'True if X else False' collapsed to the boolean expression
    self.goTerms = 'goterm' in blastTask.lower()
    # mapping file sits next to the db, same stem with '.tab' extension
    self.mappingFile = self.blastdb + self.blastdbName.replace('.fasta', '.tab')
    self.mapping = ''
    if self.goTerms:
        self.mapping = self.loadMapping()
def __init__(self):
    """Load prediction configuration, derive the metrics/model output file
    names, and set up the extractor, loader, dimensionality handler and
    classifier."""
    # read application configuration props
    self.config = Utils.loadConfig()
    self.path = self.config.get('prediction', 'source.path')
    self.path = Utils.normalizePath(self.path)
    self.trainPath = self.path + 'train/'
    self.validPath = self.path + 'validation/'
    self.gridCVPath = self.path + 'train_validation/'
    self.testPath = self.path + 'test/'
    self.outputPath = self.path + 'metrics/cv_gridsearchparams/'
    self.task = self.config.get('prediction', 'task')
    self.posPerc = int(self.config.get('prediction', 'pos.perc'))
    self.classif = self.config.get('prediction', 'classifier')
    os.makedirs(os.path.dirname(self.outputPath), exist_ok=True)
    self.extractor = Extractor.Extractor(self.config, self.outputPath)
    self.loader = Loader.Loader(self.config, self.outputPath)
    self.dimHandler = DimensionHandler.DimensionHandler(
        self.config, self.outputPath)
    self.outFile = ''
    self.useEmbeddings = self.config.getboolean('prediction',
                                                'use.embeddings')
    self.cv = self.config.getboolean('prediction', 'use.crossvalid')
    # a cross-validation task forces cv on regardless of the config flag
    if 'cross' in self.task:
        self.cv = True
    # idiom fix: 'not X in Y' -> 'X not in Y'
    if 'none' not in self.dimHandler.name.lower():
        # dimensionality reduction enabled: the handler names the output file
        self.outFile = self.dimHandler.getOutFile(self.classif)
        if self.useEmbeddings:
            self.outFile = self.outFile + '_embeddings'
    else:
        self.outFile = self.outputPath + self.classif + '_' + self.extractor.featType
        if 'kmers' in self.extractor.featType:
            # encode k-mer size and minimum occurrence in the file name
            kmerfeats = 'kmers' + str(
                self.extractor.size) + '_minOcc' + str(
                    self.extractor.minOcc)
            self.outFile = self.outFile.replace('kmers', kmerfeats)
    if 'cross' in self.task or 'grid' in self.task or self.cv:
        # grid search uses the merged (train+validation) feature file
        if 'grid' in self.task:
            self.extractor.featFile = self.extractor.featFile.replace(
                '.feat', '.complete.feat')
    if 'cross' in self.task or self.cv:
        self.outFile += '_cv05'
    self.modelFile = self.outFile + '.model.pkl'
    self.classifier = self.setUpClassifier()
def __init__(self, config, outputPath):
    """Derive feature-extraction settings and the feature-file name from the
    'prediction' configuration section.

    :param config: configparser-style object with a 'prediction' section
    :param outputPath: directory where the feature file will be written
    """
    self.config = config
    self.dictPath = config.get('prediction', 'dict.path')
    self.featType = config.get('prediction', 'feat.type')
    # number of combined feature types, e.g. 'kmers-domains' -> 2
    self.nbFeatType = self.featType.count('-') + 1
    self.sourceType = config.get('prediction', 'source.type')
    self.size = config.get('prediction', 'feat.size')
    self.minOcc = config.get('prediction', 'feat.minOcc')
    self.cv = self.config.getboolean('prediction', 'use.crossvalid')
    self.task = self.config.get('prediction', 'task')
    # a cross-validation task forces cv on regardless of the config flag
    if 'cross' in self.task:
        self.cv = True
    # assemble the feature file name piece by piece
    name = Utils.normalizePath(outputPath) + self.featType
    if self.cv:
        name += '.cv'
    if 'kmers' in self.featType:
        # encode k-mer size and minimum occurrence in the file name
        kmerfeats = 'kmers' + str(self.size) + '_minOcc' + str(self.minOcc)
        name = name.replace('kmers', kmerfeats)
    self.featFile = name + '.feat'
def __init__(self, source_type=None, source_path=None, result_path=None):
    """Initialize pipeline paths, falling back to the 'dataPipeline' config
    section for any argument that is not supplied.

    :param source_type: overrides config 'source.type' when given
    :param source_path: overrides config 'source.path' when given
    :param result_path: overrides config 'result.path' when given
    """
    self.config = Utils.loadConfig()
    self.task = self.config.get('dataPipeline', 'task')
    # explicit arguments win over the configuration file
    if source_path is None:
        source_path = self.config.get('dataPipeline', 'source.path')
    self.source_path = source_path
    if source_type is None:
        source_type = self.config.get('dataPipeline', 'source.type')
    self.source_type = source_type
    if result_path is None:
        result_path = self.config.get('dataPipeline', 'result.path')
    self.result_path = Utils.normalizePath(result_path)
    # create the result dir if it doesn't exist
    os.makedirs(os.path.dirname(self.result_path), exist_ok=True)
    # recover the species name for using in temp files
    self.species = Utils.getSpecies(self.source_path)
    # temp dir + file used by sub-pipelines, located next to this module
    self.path = os.path.dirname(os.path.realpath(__file__)) + '/temp/'
    os.makedirs(os.path.dirname(self.path), exist_ok=True)
def createGoDataset(self):
    """Run the BLAST 'goTerms' task for every fasta entry under
    ``self.source_path`` and write one '.go' result file per input fasta
    that returned GO terms; entries with an existing '.go' file are skipped.
    """
    source_type = self.config.get('dataPipeline', 'source.type')
    blastPath = self.config.get('blaster', 'blastdb.path')
    blastPath = Utils.normalizePath(blastPath)
    blastName = self.config.get('blaster', 'blastdb.name')
    blastMapping = blastPath + blastName + '.tab'
    datapipe = DataPipeline.DataPipeline(source_type=source_type,
                                         source_path=self.source_path,
                                         result_path=self.result_path)
    # BUG FIX: the first return value was bound to a variable named 'list',
    # shadowing the builtin; renamed (the value itself is unused here).
    fastaList, file_content = Parsers.parseFastaToList(self.source_path, "")
    # skip entries whose '.go' result file already exists
    file_content = [
        content for content in file_content
        if not os.path.isfile(self.result_path + os.path.basename(
            content[0]).replace('.fasta', '.go'))
    ]
    sparkContext = SparkContext(conf=datapipe.initSpark("goDataset"))
    goterms = datapipe.getBLAST(file_content, sparkContext,
                                blastTask="goTerms")
    count = 0
    notFound = 0
    for file, content in goterms.items():
        length = content.split('\n')
        # a two-line result with an empty second line means no GO terms found
        if len(length) == 2 and not str(length[1]):
            notFound += 1
        else:
            filename = os.path.basename(file)
            resultFile = self.result_path + filename
            resultFile = resultFile.replace('.fasta', '.go')
            Utils.writeFile(resultFile, content)
            count += 1
    print('Done generating', str(count),
          'GO term files. \nNo GO terms found for', str(notFound), 'files.')
def main(self):
    """Dispatch to the corpus-preparation sub-task(s) named in ``self.task``.

    Several task keywords may be combined in the task string; each matching
    sub-task runs in the order checked below.
    """
    if 'shuffle' not in self.task and 'selectvalid' not in self.task:
        self.result_path = Utils.normalizePath(self.result_path)
        os.makedirs(os.path.dirname(self.result_path), exist_ok=True)
    if 'split' in self.task:
        self.splitAsClusters()
    if 'shuffle' in self.task:
        posPercRaw = self.config.get('corpusPrep', 'pos.perc')
        # BUG FIX: int(posPercRaw) raised ValueError for float-formatted
        # whole numbers like "50.0"; convert via float first, then narrow
        # to int when the value is whole (keeps '50' not '50.0' in paths).
        posPercVal = float(posPercRaw)
        posPerc = int(posPercVal) if posPercVal.is_integer() else posPercVal
        # suffix the result dir with the positive-class percentage
        if self.result_path.endswith('/'):
            self.result_path = self.result_path[:-1]
        self.result_path = self.result_path + '_pos' + str(posPerc) + '/'
        if not os.path.isdir(self.result_path):
            os.makedirs(os.path.dirname(self.result_path), exist_ok=True)
            self.createNegShuffle(posPerc)
        else:
            print('Result path already exists.')
    if 'createdataset' in self.task:
        self.createDataset()
    if 'domain' in self.task:
        self.createDomainDataset()
    if 'goterms' in self.task:
        self.createGoDataset()
    if 'similarity' in self.task:
        self.createSimilarityMatrix()
    if 'pfamtsv' in self.task:
        self.createPfamTsv()
def splitAsClusters(self):
    """Split sorted fasta sequences into fixed-length windows with a
    percentage overlap, writing all windows to one output fasta file.

    Window length comes from ``self.length``; the overlap (in characters) is
    ``self.windowOverlap`` percent of that length. Each window's header lists
    the ids of the sequences it spans ('|'-joined, or 'first|to|last' when
    slim ids are enabled).
    """
    self.source_path = Utils.normalizePath(self.source_path)
    slimIDs = self.config.getboolean('corpusPrep', 'slim.id')
    files = Utils.listFilesExt(self.source_path, self.ext)
    result = []
    # overlap expressed in characters rather than percent
    overlap = int((self.windowOverlap / 100) * self.length)
    for file in files:
        fileName = os.path.basename(file).split('.')[0]
        # NOTE(review): result_path is appended to on EVERY iteration, so with
        # more than one input file the path accumulates previous file names —
        # looks intended only for single-file runs; confirm.
        self.result_path = self.result_path + fileName + '_len' + str(
            self.length) + '_overlap' + str(self.windowOverlap)
        if (slimIDs):
            self.result_path += '_slimIDs'
        #self.result_path += '/'
        self.result_path += '.fasta'
        if (os.path.isfile(self.result_path)):
            print('File already exists: ' + self.result_path + '.\nDone.')
        else:
            file = Parsers.sortFasta(file)
            sequences = Parsers.parseFasta(file)
            # content: concatenated sequence text; entry: window being built;
            # ids/overlapIds: headers for current window / carried-over overlap
            content, ids, entry, overlapIds = '', '', '', ''
            for fasta in sequences:
                content += str(fasta.seq.upper())
                ids += str(fasta.id) if not ids else '|' + str(fasta.id)
                if (slimIDs):
                    allIds = ids.split('|')
                    ids = allIds[0] + '|to|' + allIds[len(allIds) - 1]
                # consume the accumulated sequence text window by window
                while (len(content) > 0):
                    # how many chars are still needed to fill the window
                    varSize = self.length - (len(entry))
                    if (varSize <= overlap):
                        # this fasta id also belongs to the next (overlapping) window
                        overlapIds += str(
                            fasta.id) if not overlapIds else '|' + str(
                                fasta.id)
                    entry += content[0:varSize]
                    if (len(entry) == self.length):
                        # move cursor on real sequence according to variable length added
                        content = content[varSize:]
                        # add chunk to list
                        if (slimIDs):
                            allIds = ids.split("|")
                            ids = allIds[0] + '|to|' + allIds[len(allIds) - 1]
                        result.append('>' + ids + '\n' + entry)
                        # make sure that entry contains overlap
                        entry = entry[len(entry) - overlap:]
                        if (len(content) > 0):
                            ids = overlapIds
                            overlapIds = ''
                        else:
                            ids = ''
                    elif (len(content) > 0 and len(entry) < self.length):
                        # not enough text for a full window yet; drop consumed prefix
                        content = content[len(entry):]
    # NOTE(review): prev/pos appear to be dead leftovers — nothing below reads them
    prev = 0
    pos = self.length
    result = '\n'.join(result)
    Utils.writeFile(self.result_path, result)
    print('Done.')
def createDataset(self):
    """Build a train/validation dataset from positive and negative file
    pools, keeping ``self.posPerc`` percent positives overall and moving
    ``self.validPerc`` percent of each class into the validation split.

    Aborts when there are not enough negatives for the requested ratio, or
    when the train/validation destination directories already exist.
    """
    neg_path = Utils.normalizePath(self.negPath)
    pos_path = Utils.normalizePath(self.posPath)
    negatives = Utils.listFilesExt(neg_path, self.ext)
    positives = Utils.listFilesExt(pos_path, self.ext)
    subject = Utils.normalizePath(self.result_path)
    negLen = len(negatives)
    posLen = len(positives)
    # number of negatives needed to make positives exactly posPerc percent
    negPerc = 100 - self.posPerc
    negTotal = (posLen * negPerc) / self.posPerc
    if (negLen < negTotal):
        print("Not enough negative instances. Try another %")
        exit()
    else:
        # NOTE(review): this warning does not exit — an empty pool falls
        # through to the split below; confirm whether it should abort.
        if (not negatives or not positives):
            print(
                'List of files was empty. '
                'Please check \'neg.path\' and \'pos.path\' in the self.config file.'
            )
        subject += 'pos' + str(self.posPerc)
        if (len(subject) > 1):
            os.makedirs(subject, exist_ok=True)
        destTrain = subject + '/train/'
        destValid = subject + '/validation/'
        if (os.path.exists(destTrain) or os.path.exists(destValid)):
            print('Dataset already splitted for train and validation. '
                  '\nRename ' + destTrain + ' or ' + destValid +
                  ' and try again.')
        else:
            os.makedirs(destTrain, exist_ok=False)
            os.makedirs(destValid, exist_ok=False)
            # validation fraction, e.g. 20 -> 0.2
            perc = int(self.validPerc) / 100
            readme = 'Source negative: ' + neg_path + \
                '\nSource positive: ' + pos_path + \
                '\n# negative files: ' + str(negLen) + \
                '\n(final) # negative files: ' + str(negTotal) + \
                '\n# positive files: ' + str(posLen) + \
                '\nValidation data percentage (from total): ' + str(self.validPerc) + '%'
            Utils.writeFile(subject + '/README.md', readme)
            # select validation files
            validNegatives = random.sample(negatives, int(perc * negTotal))
            validPositives = random.sample(positives, int(perc * posLen))
            # remove validation files from list
            negatives = [
                f for f in negatives if f not in validNegatives
            ]
            positives = [
                f for f in positives if f not in validPositives
            ]
            # select randomly corresponding nb of negatives
            negatives = random.sample(
                negatives, int(negTotal - len(validNegatives)))
            train = negatives + positives
            validation = validPositives + validNegatives
            # copy the selected files into their split directories
            for f in validation:
                name = os.path.basename(f)
                copy(f, destValid + name)
            for f in train:
                name = os.path.basename(f)
                copy(f, destTrain + name)
            print('Done splitting randomly ' + str(len(train)) +
                  ' train and ' + str(len(validation)) + ' files.')