def __init__(self):
    self.config = Utils.loadConfig()
    self.task = self.config.get('eval', 'task')
    self.gold = self.config.get('eval', 'goldID.path')
    self.result = self.config.get('eval', 'result.path')
    self.threshold = float(self.config.get('eval', 'threshold'))
    self.sparkContext = SparkContext(conf=Utils.getSparkConf('filter'))
    self.Similarity = Similarity.Similarity(self.config)
    self.Filter = Filter.Filter(self.config, sparkContext=self.sparkContext)
    self.Merger = Merger.Merger(self.config)
    self.goldIDs = Utils.readFileLines(self.gold)[1:]
    self.resultFiles = Utils.listFilesExt(self.result, 'IDs.test')
    # total number of gold genes
    self.nbGoldGenes = len(self.goldIDs)
    # total number of gold clusters
    self.foldedGold = Utils.foldClusterData(self.goldIDs, 'gold', 0)
    self.goldGenes = [gene for genes in self.foldedGold.values() for gene in genes]
    self.nbGoldClusters = len(self.foldedGold)
    self.outputheader = 'goldClusterID\tgoldGeneID\tpredictedClusterLabel\tpredictedClusterID\n'
    self.scoreheader = 'goldClusterID\tpredictedClusterID\tclusterScore\n'
def summarize(self):
    metricFiles = Utils.listFilesExt(self.result, 'metrics')
    metricFiles = sorted(metricFiles)
    output, pos = "", ""
    outputFile = Utils.normalizePath(self.result) + "results.summary"
    if "pos" in self.result:
        pos = self.result.split("pos")[1][0:2]
    for file in metricFiles:
        metrics = Utils.readFileLines(file)[2].replace("pos\t", "")
        filename = os.path.basename(file)
        classifier = filename.split("_")[0]
        feats = filename.split("_")[1] + "+" + filename.split("_")[2]
        # renamed from "len" to avoid shadowing the builtin
        length = filename.split("len")[1].split("_")[0]
        overlap = filename.split("overlap")[1].split("_")[0][0:2]
        evaltype = filename.split("IDs.test.")[1].replace("eval.metrics", "").replace(".", "")
        if not evaltype:
            evaltype = "succ0"
        if "similar" in evaltype:
            evaltype = evaltype.replace("similar", "sim")
        if "merge" in evaltype:
            evaltype = evaltype.replace("succ", "")
        line = feats + "\t" + classifier + "\t" + pos + "\t" + length + "\t" + overlap + "\t" + evaltype + "\t" + metrics + "\n"
        output += line
    Utils.writeFile(outputFile, output)
def createSimilarityMatrix(self):
    source_type = self.config.get('dataPipeline', 'source.type')
    fastaFiles = Utils.listFilesExt(self.source_path, "fasta")
    outputFile = self.result_path + '/similarity.blast'
    outputRedFile = self.result_path + '/similarity.blast.similarity'
    columns = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen',
               'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore', 'qcovs']
    if not os.path.isfile(outputFile):
        # generate all gene pairs within a genome
        allpairs = {(i, j) for i in fastaFiles for j in fastaFiles}
        # filter out duplicate pairs, e.g. (2,8) and (8,2)
        file_content = set(tuple(sorted(p)) for p in allpairs)
        datapipe = DataPipeline.DataPipeline(source_type=source_type,
                                             source_path=self.source_path,
                                             result_path=self.result_path)
        sparkContext = SparkContext(conf=datapipe.initSpark("blastSimilarity"))
        similarity = datapipe.getBLAST(file_content, sparkContext, blastTask="similarity")
        result = ""
        for entry in similarity:
            if entry[1]:
                result += entry[1] + "\n"
        Utils.writeFile(outputFile, result)
        df = pandas.read_csv(StringIO(result), sep='\t', names=columns, index_col=False)
    else:
        df = pandas.read_csv(outputFile, sep='\t', names=columns, index_col=False)
    # generate leaner matrix with only selected columns, output to new file
    if not os.path.isfile(outputRedFile):
        df = df[['qseqid', 'sseqid', 'pident', 'bitscore', 'qcovs']]
        df['id'] = df[['qseqid', 'sseqid']].agg('|'.join, axis=1)
        df = df.drop(columns=['qseqid', 'sseqid'])
        df = df[['id', 'pident', 'bitscore', 'qcovs']]
        df = df.sort_values('id')
        df.to_csv(path_or_buf=outputRedFile, sep='\t', header=True, index=False)
    print('done!')
def runCrossValid(self, x_occ, y_labels, IDs):
    seed = 5
    np.random.seed(seed)
    i = 1
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    for train, valid in kfold.split(x_occ, y_labels):
        # find features absent from the train split and drop them from both splits:
        # keep only column indices whose sum over the train rows is greater than 0
        keepCols = np.where(np.sum(x_occ[train], axis=0) > 0)[0]
        x_occT = np.take(x_occ[train], keepCols, axis=1)
        x_occV = np.take(x_occ[valid], keepCols, axis=1)
        self.classifier.fit(x_occT, y_labels[train])
        output, IDoutput = self.getPredictions(IDs[valid], x_occV, y_labels[valid])
        Utils.writeFile(self.outFile + 'f' + str(i) + '.valid', output)
        Utils.writeFile(self.outFile + 'f' + str(i) + '.IDs.valid', IDoutput)
        i += 1
        # reset the classifier between folds
        self.classifier = self.setUpClassifier()
    # fit the final model on the full data and persist it
    self.classifier.fit(x_occ, y_labels)
    joblib.dump(self.classifier, self.modelFile)
    print('Model saved.')
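# Hedged, standalone sketch of the fold scheme used in runCrossValid above: per
# fold, feature columns that never occur in the train split are dropped before
# fitting. The toy data and the LogisticRegression stand-in are illustrative
# assumptions, not part of this pipeline.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

X = np.random.randint(0, 3, size=(20, 10))        # toy occurrence matrix
y = np.array([0, 1] * 10)                          # toy binary labels
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=5)
for train, valid in kfold.split(X, y):
    keep = np.where(X[train].sum(axis=0) > 0)[0]   # features seen in train
    clf = LogisticRegression(max_iter=1000).fit(X[train][:, keep], y[train])
    print(clf.score(X[valid][:, keep], y[valid]))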
def parseFastaToList(path, filter):
    thislist, files, filterIDs, filename_content = [], [], [], []
    if os.path.isfile(path):
        files.append(path)
    else:
        files = Utils.listFilesExt(path, 'fasta')
    # the filter can be a file of IDs or a newline-separated string of IDs
    if os.path.isfile(filter):
        filterIDs = Utils.readFileLines(filter)
    else:
        filterIDs = filter.split('\n')
    for file in files:
        sequences = parseFasta(file)
        for fasta_record in sequences:
            output = '>' + str(fasta_record.id) + '\n' + str(fasta_record.seq)
            if len(filterIDs) > 0:
                # keep only records whose ID is not in the filter list
                if str(fasta_record.id) not in str(filterIDs):
                    thislist.append(output)
                    filename_content.append((file, output))
            else:
                thislist.append(output)
                filename_content.append((file, output))
    return thislist, filename_content
def runGridSearch(self, x_occ, y_labels):
    output = 'Running grid search for ' + self.classif + ' in ' + str(len(x_occ)) + ' instances ...\n'
    print('Running grid search for', self.classif, 'in', str(len(x_occ)), 'instances ...\n')
    scores = ['f1', 'precision', 'recall']
    for score in scores:
        output += 'Grid search for score: ---> ' + score + ' <---\n'
        classif = GridSearchCV(estimator=self.setUpClassifier(),
                               param_grid=self.getGridParams(),
                               scoring=score, cv=5, n_jobs=60)
        classif.fit(x_occ, y_labels)
        output += 'Best parameters in train set:\n'
        output += str(classif.best_params_) + '\n'
        output += 'Grid scores in train set:\n'
        means = classif.cv_results_['mean_test_score']
        stds = classif.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, classif.cv_results_['params']):
            params = str(params).replace('{', '').replace('}', '')
            output += ("%0.3f (+/-%0.03f) |\t params %r" % (mean, std * 2, params)) + '\n'
        output += "\n--------------------------------------------------\n"
        print('Done with', score, '.')
    Utils.writeFile(self.outputPath + self.classif + '.gridSearch', output)
    print(output)
def extractRewardPerFeat(self, dataPath, outputPath, featType, sourceType, rewardType):
    rewardperfeat = {}
    # tuples of shape ((file, content, id), 'kmers')
    resultLabel = Parsers.parseDatasetContents(dataPath, featType, sourceType)
    fileindex = list(set([i[0][0] for i in resultLabel]))
    for item in resultLabel:
        filename = item[0][0]
        label = Utils.getLabel(filename)
        content = item[0][1]
        idx = fileindex.index(filename)
        occ = 1 if label == 1 else -1
        if content not in rewardperfeat:
            rewardperfeat[content] = [0] * len(fileindex)
        if 'label' in rewardType:
            rewardperfeat[content][idx] = occ
        else:
            rewardperfeat[content][idx] += occ
    outputstr = ''
    for k, v in rewardperfeat.items():
        outputstr += k + '\t' + ','.join(map(str, v)) + '\n'
    Utils.writeFile(outputPath, outputstr[:-1])
    return rewardperfeat
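# Hedged, standalone sketch (synthetic data) of the reward-per-feature idea in
# extractRewardPerFeat above: each feature keeps one slot per input file,
# incremented for positive files and decremented for negative ones.
from collections import defaultdict

toyFiles = [('a.fasta', ['PF001', 'PF002'], 1), ('b.fasta', ['PF001'], 0)]
fileindex = [name for name, _, _ in toyFiles]
reward = defaultdict(lambda: [0] * len(fileindex))
for name, feats, label in toyFiles:
    occ = 1 if label == 1 else -1
    for feat in feats:
        reward[feat][fileindex.index(name)] += occ
print(dict(reward))  # {'PF001': [1, -1], 'PF002': [1, 0]}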
def parseDatasetContents(dataPath, featType, sourceType):
    files, result = [], []
    if 'domain' in featType or 'dictionary' in featType:
        domainFiles = Utils.listFilesExt(dataPath, 'domains')
        files += domainFiles
        if len(domainFiles) < 1:
            print('No domains / dictionary files found in', dataPath)
            exit()
    if 'kmers' in featType or 'prot' in featType:
        fastaFiles = Utils.listFilesExt(dataPath, 'fasta')
        files += fastaFiles
        if len(fastaFiles) < 1:
            print('No fasta files found in', dataPath)
            exit()
    if 'go' in featType:
        goTermFiles = Utils.listFilesExt(dataPath, 'go')
        files += goTermFiles
        if len(goTermFiles) < 1:
            print('No GO term files found in', dataPath)
            exit()
    for file in files:
        ext = os.path.splitext(file)[1]
        lines = Utils.readFileLines(file)
        # handle genes with an added version number, e.g. NRRL3_00129.1
        id = lines[0].replace('>', '').replace('a', '').split('.')[0]
        if 'fasta' in ext:
            content = lines[1].upper()
            content = normalizeSequence(content, sourceType)
            if 'kmers' in featType:
                result.append(((file, content, id), 'kmers'))
            if 'prot' in featType:
                result.append(((file, content, id), 'protanalys'))
        elif 'domain' in ext:
            content = lines[1:]
            # comment out the next line to keep the domain name
            content = [line.split('.')[0] for line in content]
            content = "\n".join(content)
            if 'pfam' in featType:
                for entry in content.split('\n'):
                    result.append(((file, entry, id), 'domains'))
            elif content:
                result.append(((file, content, id), 'domains'))
        elif 'go' in ext:
            content = "\n".join(lines[1:])
            result.append(((file, content, id), 'go'))
    return result
def outputStats(self):
    stat = ['id\trewardKeep\trewardSkip\tqvalueKeep\tqvalueSkip']
    for i, id in enumerate(self.rewardIDs):
        stat.append(id + '\t' + str(self.rewardTable[i][0]) + '\t' + str(self.rewardTable[i][1]) +
                    '\t' + str(self.QTable[i][0]) + '\t' + str(self.QTable[i][1]))
    Utils.writeFile(self.outputPath + self.params + '.stat', '\n'.join(stat))
    print('Stats saved.')
def train(self, sparkContext, outputStats):
    dataset = self.prepareData(self.trainPath, sparkContext)
    penalties = []
    logging = ['Episode\tPenalty']
    for ep in range(0, self.episodes):
        totalStates = 0
        penalty = 0
        for i, entry in enumerate(dataset):
            # check reward per cluster according to the table
            states = entry[1]  # domains only
            actionType = ''
            totalStates += len(states)
            for j, state in enumerate(states):
                state = state.split('.')[0]
                stateIdx = self.rewardIDs.index(state)
                if random.uniform(0, 1) < self.epsilon:
                    # explore: choose a random action
                    actionType = 'explore'
                    action = random.choice(self.actions)
                    action = self.actions.index(action)
                else:
                    # exploit: choose the action with the highest Q-value
                    action = np.argmax(self.QTable[stateIdx])
                    actionType = 'exploit'
                reward = self.rewardTable[stateIdx, action]
                # next state is the following domain, or the current one if it is the last in the cluster
                if j + 1 < len(states):
                    nextState = states[j + 1].split('.')[0]
                else:
                    nextState = states[j].split('.')[0]
                nextStateIdx = self.rewardIDs.index(nextState)
                oldQValue = self.QTable[stateIdx, action]
                nextMax = np.max(self.QTable[nextStateIdx])
                newQValue = oldQValue + self.alpha * (reward + self.gamma * nextMax - oldQValue)
                self.QTable[stateIdx, action] = newQValue
                # TODO: better define penalties
                if reward < self.penaltyThreshold:
                    penalty += 1
        penalties.append(penalty)
    np.save(self.QTablePath, self.QTable)
    np.save(self.rewardTablePath, self.rewardTable)
    Utils.writeFile(self.IDmapPath, '\n'.join(self.rewardIDs))
    if outputStats:
        self.outputStats()
    print('Done training!')
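# Hedged, standalone sketch (synthetic numbers) of the tabular Q-learning update
# applied in train() above:
#   Q[s, a] <- Q[s, a] + alpha * (reward + gamma * max_a' Q[s', a'] - Q[s, a])
import numpy as np

alpha, gamma = 0.1, 0.9
Q = np.zeros((3, 2))            # 3 states (domains) x 2 actions (keep, skip)
s, a, r, s_next = 0, 0, 1.0, 1  # one observed transition
Q[s, a] += alpha * (r + gamma * np.max(Q[s_next]) - Q[s, a])
print(Q[s, a])                  # 0.1 after one update of an all-zero table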
def genBankToAminoacid(path):
    entries = []  # aminoacid sequence entries only
    translations = ''
    files = []
    if os.path.isfile(path):
        files.append(path)
    else:
        files = Utils.listFilesExt(path, 'gbk')
    for file in files:
        species = Utils.getSpecies(file)
        records = parseGenBank(file)
        for record in records:
            locus = record.id
            for feature in record.features:
                if feature.type == "CDS":
                    id, locus_tag, gene, protein_id, translation, \
                        product, function, description = ('',) * 8
                    for key, value in feature.qualifiers.items():
                        if key == "translation":
                            translation = value[0]
                        elif key == "gene":
                            gene = value[0]
                        elif key == "locus_tag":
                            locus_tag = value[0]
                        elif key == "protein_id":
                            protein_id = value[0].replace('/', '')
                        elif key == "product":
                            product = value[0]
                        elif key == "function":
                            function = value[0]
                    # gene ID priority: locus_tag > gene > protein_id
                    id = locus_tag if not id and len(locus_tag) > 1 else id
                    id = gene if not id and len(gene) > 1 else id
                    id = protein_id if not id and len(protein_id) > 1 else id
                    description = product if product.strip() else description
                    if function.strip():
                        description += '|' + function
                    entry = '>' + locus + '|' + species + '|' + id + '|' + description + '\n' + translation
                    if entry not in entries:
                        entries.append(entry)
                        translations += translation
    return entries, translations
def getRFImportance(self):
    pd.options.display.float_format = '{:,.8f}'.format
    importance = self.classifier.feature_importances_
    features = self.extractor.loadFeatures()
    feature_importances = pd.DataFrame(importance, index=features,
                                       columns=['importance']).sort_values('importance', ascending=False)
    Utils.writeFile(self.extractor.featFile + 'importance', feature_importances.to_string())
def __init__(self):
    self.config = Utils.loadConfig()
    self.path = self.config.get('prediction', 'source.path')
    self.path = Utils.normalizePath(self.path)
    self.trainPath = self.path + 'train/'
    self.testPath = self.path + 'test/'
    self.outputPath = self.path + 'metricsQLearner/models/'
    self.geneMapPath = self.config.get('eval', 'filter.map')
    self.geneMap = {}
    self.extractor = Extractor.Extractor(self.config, self.outputPath)
    self.rewardType = 'occ'
    self.rewardPath = self.outputPath + self.rewardType + 'PerDomains.feat'
    # pfam domain list
    self.rewardList, self.rewardIDs, self.rewardLabels = '', '', ''
    self.actions = ['keep', 'skip']
    self.task = 'train'
    self.rewardTable, self.QTable = [], []
    self.episodes = int(self.config.get('prediction', 'episodes'))
    # hyperparameters
    self.alpha = float(self.config.get('prediction', 'alpha'))      # learning rate
    self.gamma = float(self.config.get('prediction', 'gamma'))      # discount factor
    self.epsilon = float(self.config.get('prediction', 'epsilon'))  # exploration rate
    # rewards below this value count as a penalty
    self.penaltyThreshold = float(self.config.get('prediction', 'penalty.threshold'))
    # keep/skip reward ratio a domain must reach to be kept
    self.keepskipThreshold = float(self.config.get('prediction', 'keepskip.threshold'))
    self.useSimilarityWeight = False
    self.useCompWeights = False
    self.useNeighborWeight = self.config.getboolean('prediction', 'neighbor.weight')
    self.useDryIslands = self.config.getboolean('prediction', 'dry.islands')
    self.useAvAction = self.config.getboolean('prediction', 'average.action')
    self.weightsPath = self.config.get('eval', 'weights')
    self.weights = Utils.readFileLines(self.weightsPath) \
        if self.useCompWeights or self.useNeighborWeight else ''
    self.params = self.rewardType + '_keepgt' + str(self.keepskipThreshold) + 'skip' + \
        '_ep' + str(self.episodes) + '_alpha' + str(self.alpha) + \
        '_gamma' + str(self.gamma) + '_eps' + str(self.epsilon)
    self.params += '_neighbor' if self.useCompWeights else ''
    self.QTablePath = self.outputPath + 'Qtable_' + self.params + '.npy'
    self.rewardTablePath = self.outputPath + 'Rewards_' + self.params + '.npy'
    self.IDmapPath = self.outputPath + 'RewardIDsmap_' + self.params + '.map'
def __init__(self, blastTask):
    self.config = Utils.loadConfig()
    self.sourceType = self.config.get('dataPipeline', 'source.type')
    self.blastTask = blastTask
    self.blastdb = self.config.get('blaster', 'blastdb.path')
    self.blastdb = Utils.normalizePath(self.blastdb)
    self.blastdbName = self.config.get('blaster', 'blastdb.name')
    if not self.blastdbName.endswith('fasta'):
        self.blastdbName += '.fasta'
    self.goTerms = 'goterm' in blastTask.lower()
    self.mappingFile = self.blastdb + self.blastdbName.replace('.fasta', '.tab')
    self.mapping = ''
    if self.goTerms:
        self.mapping = self.loadMapping()
def getDomains(self, sparkContext):
    # recover the species name for use in temp files
    self.species = Utils.getSpecies(self.source_path)
    domainFinder = DomainFinder.DomainFinder()
    # load source sequences into a single list
    if "fasta" in self.source_type:
        entries, file_content = Parsers.parseFastaToList(self.source_path, "")
    elif "genbank" in self.source_type:
        entries, file_content = Parsers.genBankToAminoacid(self.source_path)
    print('Processing domains...')
    # create RDD with source sequences
    sourceRDD = sparkContext.parallelize(file_content, numSlices=2000)
    if "nucleotide" in self.source_type:
        # run six-frame translation for each sequence in the RDD
        sourceRDD = sourceRDD.map(lambda x: SixFrameTranslator.main(x))
    # run Pfam domain prediction for each translation in the RDD
    domainsRDD = sourceRDD.map(lambda x: domainFinder.main(x[0], x[1]))
    processedRDD = domainsRDD.map(lambda x: self.processDomainOutput(x[0], x[1]))
    # collect Pfam domain prediction results from the RDD
    result = processedRDD.collectAsMap()
    print('Done!')
    return result
def computeMajorityVote(self, predictedClusters, thresholdPerc, thresholdLen):
    predictPerGene, majority, geneLengths = {}, {}, {}
    majorityClusters = []
    threshold = thresholdPerc / 100
    lengths = Utils.readFileLines(self.geneLengthPath)
    for i in lengths:
        temp = i.split('\t')
        geneLengths[temp[0].split('.')[0]] = int(temp[1])
    # count how many times each gene appears, and with which label
    for cluster in predictedClusters:
        clustergenes = cluster.replace('||', '|').replace('|\t', '\t').split('\t')
        label = 1 if float(clustergenes[1]) >= self.threshold else 0
        genes = clustergenes[0].split('|')
        for gene in genes:
            gene = gene.split('.')[0]
            if gene in predictPerGene.keys():
                predictPerGene[gene].append(label)
            elif gene:
                predictPerGene[gene] = [label]
    # score each gene by the fraction of positive labels over its appearances
    for gene, label in predictPerGene.items():
        majority[gene] = sum(label) / len(label)
    # concatenate genes scoring above the threshold
    tempcluster, tempscore, templength = '', 0, 0
    for gene, score in majority.items():
        if score >= threshold:
            if tempscore >= threshold:
                tempcluster += gene + '|'
                templength += geneLengths.get(gene)
            else:
                majorityClusters.append(tempcluster[:-1] + '\t' + '0')
                tempcluster = gene + '|'
                templength = geneLengths.get(gene)
        else:
            if tempscore < threshold:
                # for negatives, concatenate until roughly the threshold length
                if templength < thresholdLen:
                    tempcluster += gene + '|'
                    templength += geneLengths.get(gene)
                else:
                    majorityClusters.append(tempcluster[:-1] + '\t' + '0')
                    tempcluster = gene + '|'
                    templength = geneLengths.get(gene)
            else:
                majorityClusters.append(tempcluster[:-1] + '\t' + '1')
                tempcluster = gene + '|'
                templength = geneLengths.get(gene)
        tempscore = score
    print('Done computing majority vote.')
    return majorityClusters
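# Hedged, standalone sketch (synthetic votes) of the per-gene majority score
# computed in computeMajorityVote above: a gene's score is the fraction of
# predicted clusters in which it received a positive label.
votes = {'geneA': [1, 1, 0], 'geneB': [0, 0, 1]}
majority = {gene: sum(labels) / len(labels) for gene, labels in votes.items()}
print(majority)  # {'geneA': 0.666..., 'geneB': 0.333...}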
def createNegShuffle(self, posPerc):
    files = Utils.listFilesExt(self.source_path, self.ext)
    negPerc = 100 - posPerc
    positives = len(files)
    negativeSize = int((negPerc * positives) / posPerc)
    print('Negative percentage: ' + str(negPerc) + '% \n' +
          'Negative instances: ' + str(negativeSize) + '\n' +
          'Positive percentage: ' + str(posPerc) + '% \n' +
          'Positive instances: ' + str(positives) + '\n' +
          'Total corpus size: ' + str(negativeSize + positives))
    thisDecRatio = 0.0
    count = 0
    ratio = negativeSize / positives
    decRatio = ratio - int(ratio)
    print('Generating...')
    for file in files:
        # accumulate the decimal part of the ratio
        thisDecRatio += round(decRatio, 2)
        # reset the number of shuffles for this file
        ratioRange = int(negativeSize / positives)
        # once the decimal part adds up to a whole, emit one extra shuffle
        if thisDecRatio >= 1:
            ratioRange = int(ratio + thisDecRatio)
            thisDecRatio = 0
        for i in range(0, ratioRange):
            name = os.path.basename(file)
            result_file = name.split('.')[0] + '_' + str(i) + '.shuffled.negative.fasta'
            if 'nuc' in self.seqType:
                content = Parsers.genBankToNucleotide(file)
            if 'amino' in self.seqType:
                entries, content = Parsers.genBankToAminoacid(file)
            content = Utils.charGramShuffle(content, 2)
            content = '>' + name + '\n' + content
            count += 1
            Utils.writeFile(self.result_path + result_file, content)
    print('Total generated: ' + str(count) + '. Done!')
def addInstance(self, info):
    # receives a tuple in the format:
    #   (fileID, [list of extracted features])
    # for pfam only (inverted):
    #   (pfamID, [list of fileIDs])
    instanceValues = list()
    label = Utils.getLabel(''.join(info[1])) if 'pfam' in self.featType else Utils.getLabel(info[0])
    fileID = info[0]
    features = [info[0]] if 'pfam' in self.featType else info[1]
    for feat in features:
        # only keep features present in the dictionary
        if feat in self.dictionary:
            instanceValues.append(self.dictionary.get(feat))
    size = len(instanceValues)
    instanceID = (fileID, int(size), str(label))
    return instanceID, instanceValues
def __init__(self):
    # read application configuration props
    self.config = Utils.loadConfig()
    self.path = self.config.get('prediction', 'source.path')
    self.path = Utils.normalizePath(self.path)
    self.trainPath = self.path + 'train/'
    self.validPath = self.path + 'validation/'
    self.gridCVPath = self.path + 'train_validation/'
    self.testPath = self.path + 'test/'
    self.outputPath = self.path + 'metrics/cv_gridsearchparams/'
    self.task = self.config.get('prediction', 'task')
    self.posPerc = int(self.config.get('prediction', 'pos.perc'))
    self.classif = self.config.get('prediction', 'classifier')
    os.makedirs(os.path.dirname(self.outputPath), exist_ok=True)
    self.extractor = Extractor.Extractor(self.config, self.outputPath)
    self.loader = Loader.Loader(self.config, self.outputPath)
    self.dimHandler = DimensionHandler.DimensionHandler(self.config, self.outputPath)
    self.outFile = ''
    self.useEmbeddings = self.config.getboolean('prediction', 'use.embeddings')
    self.cv = self.config.getboolean('prediction', 'use.crossvalid')
    if 'cross' in self.task:
        self.cv = True
    if 'none' not in self.dimHandler.name.lower():
        self.outFile = self.dimHandler.getOutFile(self.classif)
        self.outFile = self.outFile + '_embeddings' if self.useEmbeddings else self.outFile
    else:
        self.outFile = self.outputPath + self.classif + '_' + self.extractor.featType
        if 'kmers' in self.extractor.featType:
            kmerfeats = 'kmers' + str(self.extractor.size) + '_minOcc' + str(self.extractor.minOcc)
            self.outFile = self.outFile.replace('kmers', kmerfeats)
    if 'cross' in self.task or 'grid' in self.task or self.cv:
        if 'grid' in self.task:
            self.extractor.featFile = self.extractor.featFile.replace('.feat', '.complete.feat')
        if 'cross' in self.task or self.cv:
            self.outFile += '_cv05'
    self.modelFile = self.outFile + '.model.pkl'
    self.classifier = self.setUpClassifier()
def getEmbeddings(self):
    matrix = np.zeros((self.dictLength(), self.embedSize))
    embfiles = Utils.listFilesExt(self.embedPath, 'w2v')
    for i in embfiles:
        if 'kmer' in i.lower() and 'kmer' in self.featType.lower():
            matrix = self.mapEmbedWeights(i, 'kmer', matrix)
        elif 'domain' in i.lower() and 'domain' in self.featType.lower():
            matrix = self.mapEmbedWeights(i, 'domain', matrix)
        elif 'go' in i.lower() and 'go' in self.featType.lower():
            matrix = self.mapEmbedWeights(i, 'go', matrix)
    return matrix
def createDomainDataset(self):
    useID = True
    files = Utils.listFilesExt(self.source_path, self.ext)
    # skip files whose .domains output already exists
    files = [fileName for fileName in files
             if not os.path.isfile(self.result_path + os.path.basename(fileName).replace('.fasta', '.domains'))]
    source_type = self.config.get('dataPipeline', 'source.type')
    count = 0
    countNone = 0
    datapipe = DataPipeline.DataPipeline(source_type=source_type,
                                         source_path=self.source_path,
                                         result_path=self.result_path)
    sparkContext = SparkContext(conf=datapipe.initSpark("domainDataset"))
    pfamDomains = datapipe.getDomains(sparkContext)
    for file in files:
        fileName = os.path.basename(file)
        IDs = open(file, 'r').readline()
        resultFile = self.result_path + fileName.replace('.fasta', '.domains')
        result = pfamDomains.get(file)
        if result is not None:
            result = result.split('\n')
            outF = open(resultFile, 'w')
            outF.write(IDs)
            output = ""
            for line in result:
                if len(line.strip()) > 1:
                    items = line.split('\t')
                    domainID = items[5]
                    domain = items[6]
                    bitscore = items[11]
                    if useID:
                        domain = domainID + '|' + domain
                    outF.write(domain + '\n')
                    output += domain + '\n'
            outF.close()
            count += 1
        else:
            print('None for file: ', file)
            countNone += 1
    print('Done generating', str(count), 'domain files. \nNo domain found for', str(countNone), 'files.')
def loadFilterMap(self, sparkContext):
    filterList = Utils.readFileLines(self.filterList)
    # returns tuples of shape ((file, content, id), 'domains')
    content = Parsers.parseDatasetContents(self.filterMap, 'domains', 'domains')
    domRDD = sparkContext.parallelize(content, numSlices=1000)
    domainsRDD = domRDD.map(lambda x: (Utils.getFileName(x[0][0]).replace('.domains', ''), x[0][1]))
    # keep genes that have any domain in filterList,
    # discarding the ".\d+" suffix of Pfam IDs
    filteredRDD = domainsRDD.filter(lambda x: any(
        domain in filterList for domain in re.split("[\n.]", x[1])))
    result = filteredRDD.collectAsMap().keys()
    genes = sorted([i for i in result])
    print('Loaded filter:', len(genes), 'genes will be filtered from', len(filterList), 'domains.')
    return genes
def __init__(self, source_type=None, source_path=None, result_path=None):
    self.config = Utils.loadConfig()
    self.task = self.config.get('dataPipeline', 'task')
    self.source_path = self.config.get('dataPipeline', 'source.path') if source_path is None else source_path
    self.source_type = self.config.get('dataPipeline', 'source.type') if source_type is None else source_type
    self.result_path = self.config.get('dataPipeline', 'result.path') if result_path is None else result_path
    self.result_path = Utils.normalizePath(self.result_path)
    # create the result path if it doesn't exist
    os.makedirs(os.path.dirname(self.result_path), exist_ok=True)
    # recover the species name for use in temp files
    self.species = Utils.getSpecies(self.source_path)
    # temp dir + file used by sub-pipelines
    self.path = os.path.dirname(os.path.realpath(__file__)) + '/temp/'
    os.makedirs(os.path.dirname(self.path), exist_ok=True)
def genBankToFasta():
    config = Utils.loadConfig()
    source = config.get('parsers', 'source.path')
    # strip any trailing slash so the output dir sits next to the source dir
    if str(source).endswith('/'):
        source = source[:-1]
    output = source + '_fasta/'
    source += '/'
    os.makedirs(os.path.dirname(output), exist_ok=True)
    entries, translations = genBankToAminoacid(source)
    content = ''
    for item in entries:
        content += item + '\n'
    Utils.writeFile(output + 'fungi_complete' + '.fasta', content)
def loadW2V(self, embfile):
    ext = Utils.getFileExt(embfile)
    word2vec = []
    if 'compact' in embfile:
        word2vec = Word2Vec.load(embfile, mmap='r')
    elif '.bin' in embfile:
        word2vec = KeyedVectors.load_word2vec_format(embfile, binary=True, limit=2000000)
    else:
        word2vec = KeyedVectors.load_word2vec_format(embfile, binary=False, limit=2000000)
    return word2vec
def createPfamTsv(self):
    listFiles = Utils.listFilesExt(self.source_path, 'domains')
    head = 'sequence_id\tprotein_id\tgene_start\tgene_end\tgene_strand\tpfam_id\tin_cluster\n'
    contentPos = ''
    contentNeg = ''
    for file in listFiles:
        fileContent = Utils.readFileLines(file)
        id = fileContent[0].replace('>', '')
        fileContent = fileContent[1:]
        inCluster = 1 if 'bgc' in os.path.basename(file).lower() else 0
        for line in fileContent:
            pfamId = line.split('|')[0]
            product = line.split('|')[1]
            currentLine = id + '\t' + product + '\t0\t0\t0\t' + pfamId + '\t' + str(inCluster) + '\n'
            if inCluster == 1:
                contentPos += currentLine
            else:
                contentNeg += currentLine
    contentPos = head + contentPos[:-1]
    contentNeg = head + contentNeg[:-1]
    # note: uses the directory of the last file processed for naming the outputs
    folder = os.path.basename(os.path.dirname(file))
    resultPos = self.result_path + folder + '.positives.pfam.tsv'
    resultNeg = self.result_path + folder + '.negatives.pfam.tsv'
    Utils.writeFile(resultPos, contentPos)
    Utils.writeFile(resultNeg, contentNeg)
def countOccurrence(self, dataPath, sparkContext):
    feats = self.loadFeatures()
    contentIds = []
    listContents = Parsers.parseDatasetContents(dataPath, self.featType, self.sourceType)
    parentDir = os.path.split(os.path.dirname(listContents[0][0][0]))[1]
    for info in listContents:
        filename = info[0][0]
        content = info[0][1]
        type = info[1]
        firstLine = Utils.readFileLines(filename)[0]
        id = firstLine.replace('>', '') if '|' in firstLine else firstLine.split('.')[0].replace('>', '')
        label = Utils.getLabel(filename)
        # avoid cases in which test synthetic genes are long and,
        # in the split, different clusters share the same (gene) id
        for item in contentIds:
            if id in item[0] and type in item[1]:
                id = id + '|'
        contentIds.append((id, type, content, label))
    sourceRDD = sparkContext.parallelize(contentIds, numSlices=1000)
    occRDD = sourceRDD.map(lambda x: self.occurrence(x, feats))
    # combine features with the same ID and filter out instances without enough features
    reducedRDD = occRDD.reduceByKey(lambda x, y: self.mergeFeatsSameId(x, y))
    ids = reducedRDD.map(lambda x: x[0]).collect()
    occ = reducedRDD.map(lambda x: x[1][0]).collect()
    labels = reducedRDD.map(lambda x: x[1][1]).collect()
    print('Features loaded.')
    return np.array(ids), np.array(occ), np.array(labels), parentDir
def createGoDataset(self):
    source_type = self.config.get('dataPipeline', 'source.type')
    blastPath = self.config.get('blaster', 'blastdb.path')
    blastPath = Utils.normalizePath(blastPath)
    blastName = self.config.get('blaster', 'blastdb.name')
    blastMapping = blastPath + blastName + '.tab'
    datapipe = DataPipeline.DataPipeline(source_type=source_type,
                                         source_path=self.source_path,
                                         result_path=self.result_path)
    entries, file_content = Parsers.parseFastaToList(self.source_path, "")
    # skip files whose .go output already exists
    file_content = [content for content in file_content
                    if not os.path.isfile(self.result_path + os.path.basename(content[0]).replace('.fasta', '.go'))]
    sparkContext = SparkContext(conf=datapipe.initSpark("goDataset"))
    goterms = datapipe.getBLAST(file_content, sparkContext, blastTask="goTerms")
    count = 0
    notFound = 0
    for file, content in goterms.items():
        lines = content.split('\n')
        if len(lines) == 2 and not str(lines[1]):
            notFound += 1
        else:
            filename = os.path.basename(file)
            resultFile = self.result_path + filename.replace('.fasta', '.go')
            Utils.writeFile(resultFile, content)
            count += 1
    print('Done generating', str(count), 'GO term files. \nNo GO terms found for', str(notFound), 'files.')
def __init__(self, config, outputPath):
    self.sourcePath = config.get('prediction', 'source.path')
    self.sourcePath = Utils.normalizePath(self.sourcePath)
    self.trainPath = self.sourcePath + 'train/'
    self.outputPath = self.sourcePath + 'metricsDL/'
    self.sourceType = config.get('prediction', 'source.type')
    # getboolean instead of bool(...): bool() of a non-empty string is always True
    self.useEmbeddings = config.getboolean('prediction', 'use.embeddings')
    self.embedPath = config.get('prediction', 'embed.path')
    self.embedPath = Utils.normalizePath(self.embedPath)
    if self.useEmbeddings:
        self.featType = config.get('prediction', 'feat.type')
        self.featSize = config.get('prediction', 'feat.size')
        self.minOcc = config.get('prediction', 'feat.minOcc')
        self.embedSize = config.getint('prediction', 'embeddings.length')
        self.embeddingsName = self.featType + self.featSize + 'minOcc' + str(self.minOcc) \
            + str(self.embedSize) + 'd'
    self.dictionary = dict()
    self.extractor = Extractor.Extractor(config, outputPath)
    self.featType = config.get('prediction', 'feat.type')
    self.maxLength = 0
def createActors(self, actors_num):
    actors = []
    r = random.randint(0, len(Const.ActorNameListA) - actors_num)
    # wrap the ranges in list() so random.shuffle works in Python 3
    name_nums_list = list(range(0 + r, actors_num + r))
    role_nums_list = list(range(0, actors_num))
    random.shuffle(name_nums_list)
    random.shuffle(role_nums_list)
    for i in range(0, actors_num):
        role_id = Const.RolesList[role_nums_list[i]]
        name = UtilMethods.makeDefaultName(name_nums_list[i])
        village = self
        actors.append(Actor(i, name, role_id, village))
    return actors
def orthogroupSeqs(orthofile, seqpath, limit):
    orthodir = os.path.dirname(seqpath)
    ortholines = Utils.readFileLines(orthofile)[1:]
    seqfiles = Utils.listFilesExt(seqpath, "fasta")
    threshold = limit if limit else len(seqfiles)
    orthogroups = [re.split('\t|;|,', item)[1:] for item in ortholines][1:threshold + 1]
    sequences, output = dict(), dict()
    print('Loading files and seqIDs...')
    for seqfile in seqfiles:
        sequences.update(SeqIO.index(seqfile, "fasta"))
    orthodir = orthodir + '/orthologs_threshold' + str(threshold) + '/'
    if os.path.isdir(orthodir):
        print('Orthogroup path', orthodir, 'already exists.')
        exit()
    else:
        os.makedirs(orthodir)
    print('Loading sequences per IDs...')
    for group in orthogroups:
        for id in group:
            id = id.strip(' ')
            tempseq = sequences.get(id)
            if tempseq is not None and len(tempseq) > 1:
                thisseqfile = orthodir + tempseq.id + '.fasta'
                content = '>' + tempseq.id + '\n' + str(tempseq.seq)
                Utils.writeFile(thisseqfile, content)
            # else:
            #     print('ID not found', str(id))
    print('Done writing seqs for orthogroups.')
    return output