import os
import random
import re
from io import StringIO
from shutil import copy

import numpy as np
import pandas
from Bio import SeqIO
from pyspark import SparkContext

# project-local helpers referenced below; parseFasta, parseGenBank, sortFasta
# and normalizeSequence are assumed to be defined in the surrounding modules
import Utils
import Parsers
import DataPipeline
import Similarity
import Filter
import Merger


def parseDatasetContents(dataPath, featType, sourceType):
    files, result = [], []
    if ('domain' in featType or 'dictionary' in featType):
        domainFiles = Utils.listFilesExt(dataPath, 'domains')
        files += domainFiles
        if (len(domainFiles) < 1):
            print('No domains / dictionary files found in', dataPath)
            exit()
    if ('kmers' in featType or 'prot' in featType):
        fastaFiles = Utils.listFilesExt(dataPath, 'fasta')
        files += fastaFiles
        if (len(fastaFiles) < 1):
            print('No fasta files found in', dataPath)
            exit()
    if ('go' in featType):
        goTermFiles = Utils.listFilesExt(dataPath, 'go')
        files += goTermFiles
        if (len(goTermFiles) < 1):
            print('No GO term files found in', dataPath)
            exit()
    for file in files:
        ext = os.path.splitext(file)[1]
        lines = Utils.readFileLines(file)
        # handle genes with an added version number, e.g. NRRL3_00129.1
        id = lines[0].replace('>', '').split('.')[0]
        if ('fasta' in ext):
            content = lines[1].upper()
            content = normalizeSequence(content, sourceType)
            if ('kmers' in featType):
                result.append(((file, content, id), 'kmers'))
            if ('prot' in featType):
                result.append(((file, content, id), 'protanalys'))
        elif ('domain' in ext):
            content = lines[1:]
            # comment out the next line to keep the domain name:
            content = [line.split('.')[0] for line in content]
            content = "\n".join(content)
            if ('pfam' in featType):
                for entry in content.split('\n'):
                    result.append(((file, entry, id), 'domains'))
            elif (content):
                result.append(((file, content, id), 'domains'))
        elif ('go' in ext):
            content = "\n".join(lines[1:])
            result.append(((file, content, id), 'go'))
    return result
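
# Usage sketch (hypothetical path and values; 'data/clusters/' and 'fungi' are
# illustrative only): collect k-mer feature tuples from single-record fasta files.
# records = parseDatasetContents('data/clusters/', featType='kmers', sourceType='fungi')
# for (filePath, content, geneId), kind in records:
#     print(filePath, kind, geneId)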


def parseFastaToList(path, filter):
    thislist, files, filterIDs, filename_content = [], [], [], []
    if (os.path.isfile(path)):
        files.append(path)
    else:
        files = Utils.listFilesExt(path, 'fasta')
    if (os.path.isfile(filter)):
        filterIDs = Utils.readFileLines(filter)
    else:
        filterIDs = filter.split('\n')
    for file in files:
        sequences = parseFasta(file)
        for fasta_record in sequences:
            output = '>' + str(fasta_record.id) + '\n' + str(fasta_record.seq)
            if (len(filterIDs) > 0):
                # keep only records whose ID does not appear in the filter list
                if (str(fasta_record.id) not in filterIDs):
                    thislist.append(output)
                    filename_content.append((file, output))
            else:
                thislist.append(output)
                filename_content.append((file, output))
    return thislist, filename_content
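
# Usage sketch (hypothetical paths): keep only records whose ID is absent from
# the filter file.
# entries, fileEntries = parseFastaToList('data/proteins/', 'data/filterIDs.txt')
# print(len(entries), 'records kept')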


def summarize(self):
    metricFiles = sorted(Utils.listFilesExt(self.result, 'metrics'))
    output, pos = "", ""
    outputFile = Utils.normalizePath(self.result) + "results.summary"
    if ("pos" in self.result):
        pos = self.result.split("pos")[1][0:2]
    for file in metricFiles:
        metrics = Utils.readFileLines(file)[2].replace("pos\t", "")
        filename = os.path.basename(file)
        classifier = filename.split("_")[0]
        feats = filename.split("_")[1] + "+" + filename.split("_")[2]
        length = filename.split("len")[1].split("_")[0]
        overlap = filename.split("overlap")[1].split("_")[0][0:2]
        evaltype = filename.split("IDs.test.")[1].replace("eval.metrics", "").replace(".", "")
        if (not evaltype):
            evaltype = "succ0"
        if ("similar" in evaltype):
            evaltype = evaltype.replace("similar", "sim")
        if ("merge" in evaltype):
            evaltype = evaltype.replace("succ", "")
        output += feats + "\t" + classifier + "\t" + pos + "\t" + length + "\t" \
                  + overlap + "\t" + evaltype + "\t" + metrics + "\n"
    Utils.writeFile(outputFile, output)
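
# Filename decomposition sketch for the splits above (hypothetical file name):
#   svm_kmers_domains_len500_overlap10_posIDs.test.similar.eval.metrics
#   -> classifier 'svm', feats 'kmers+domains', length '500', overlap '10',
#      evaltype 'similar', shortened to 'sim'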


def createSimilarityMatrix(self):
    source_type = self.config.get('dataPipeline', 'source.type')
    fastaFiles = Utils.listFilesExt(self.source_path, "fasta")
    outputFile = self.result_path + '/similarity.blast'
    outputRedFile = self.result_path + '/similarity.blast.similarity'
    columns = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen',
               'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore', 'qcovs']
    if (not os.path.isfile(outputFile)):
        # generate all gene pairs within a genome
        allpairs = {(i, j) for i in fastaFiles for j in fastaFiles}
        # filter out duplicate pairs, e.g. (2,8) and (8,2)
        file_content = set(tuple(sorted(p)) for p in allpairs)
        datapipe = DataPipeline.DataPipeline(source_type=source_type,
                                             source_path=self.source_path,
                                             result_path=self.result_path)
        sparkContext = SparkContext(conf=datapipe.initSpark("blastSimilarity"))
        similarity = datapipe.getBLAST(file_content, sparkContext, blastTask="similarity")
        result = ""
        for entry in similarity:
            if (entry[1]):
                result += entry[1] + "\n"
        Utils.writeFile(outputFile, result)
        df = pandas.read_csv(StringIO(result), sep='\t', names=columns, index_col=False)
    else:
        df = pandas.read_csv(outputFile, sep='\t', names=columns, index_col=False)
    # generate leaner matrix with only selected columns, output to new file
    if (not os.path.isfile(outputRedFile)):
        df = df[['qseqid', 'sseqid', 'pident', 'bitscore', 'qcovs']]
        df['id'] = df[['qseqid', 'sseqid']].agg('|'.join, axis=1)
        df = df[['id', 'pident', 'bitscore', 'qcovs']]
        df = df.sort_values('id')
        df.to_csv(sep='\t', header=True, path_or_buf=outputRedFile, index=False)
    print('done!')
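
# Minimal sketch of the pair de-duplication used above: sorting each pair
# before hashing collapses (a, b) and (b, a) into one key.
# pairs = {tuple(sorted(p)) for p in {('g2', 'g8'), ('g8', 'g2'), ('g1', 'g1')}}
# -> {('g1', 'g1'), ('g2', 'g8')}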


def __init__(self):
    self.config = Utils.loadConfig()
    self.task = self.config.get('eval', 'task')
    self.gold = self.config.get('eval', 'goldID.path')
    self.result = self.config.get('eval', 'result.path')
    self.threshold = float(self.config.get('eval', 'threshold'))
    self.sparkContext = SparkContext(conf=Utils.getSparkConf('filter'))
    self.Similarity = Similarity.Similarity(self.config)
    self.Filter = Filter.Filter(self.config, sparkContext=self.sparkContext)
    self.Merger = Merger.Merger(self.config)
    self.goldIDs = Utils.readFileLines(self.gold)[1:]
    self.resultFiles = Utils.listFilesExt(self.result, 'IDs.test')
    # total nb of gold genes
    self.nbGoldGenes = len(self.goldIDs)
    self.foldedGold = Utils.foldClusterData(self.goldIDs, 'gold', 0)
    self.goldGenes = [gene for genes in self.foldedGold.values() for gene in genes]
    # total nb of gold clusters
    self.nbGoldClusters = len(self.foldedGold)
    self.outputheader = 'goldClusterID\tgoldGeneID\tpredictedClusterLabel\tpredictedClusterID\n'
    self.scoreheader = 'goldClusterID\tpredictedClusterID\tclusterScore\n'


def createPfamTsv(self):
    listFiles = Utils.listFilesExt(self.source_path, 'domains')
    head = 'sequence_id\tprotein_id\tgene_start\tgene_end\tgene_strand\tpfam_id\tin_cluster\n'
    contentPos = ''
    contentNeg = ''
    for file in listFiles:
        fileContent = Utils.readFileLines(file)
        id = fileContent[0].replace('>', '')
        fileContent = fileContent[1:]
        inCluster = 1 if 'bgc' in os.path.basename(file).lower() else 0
        for line in fileContent:
            pfamId = line.split('|')[0]
            product = line.split('|')[1]
            currentLine = id + '\t' + product + '\t0\t0\t0\t' + pfamId + '\t' + str(inCluster) + '\n'
            if (inCluster == 1):
                contentPos += currentLine
            else:
                contentNeg += currentLine
    contentPos = head + contentPos[:-1]
    contentNeg = head + contentNeg[:-1]
    # output files are named after the folder of the last processed file
    folder = os.path.basename(os.path.dirname(file))
    resultPos = self.result_path + folder + '.positives.pfam.tsv'
    resultNeg = self.result_path + folder + '.negatives.pfam.tsv'
    Utils.writeFile(resultPos, contentPos)
    Utils.writeFile(resultNeg, contentNeg)
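
# Example row of the generated .pfam.tsv, assuming a hypothetical domains file
# named 'x.bgc1.domains' starting with '>geneA' and a line 'PF00067|cytochrome_P450':
#   sequence_id  protein_id       gene_start  gene_end  gene_strand  pfam_id  in_cluster
#   geneA        cytochrome_P450  0           0         0            PF00067  1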


def getEmbeddings(self):
    matrix = np.zeros((self.dictLength(), self.embedSize))
    embfiles = Utils.listFilesExt(self.embedPath, 'w2v')
    for embFile in embfiles:
        if ('kmer' in embFile.lower() and 'kmer' in self.featType.lower()):
            matrix = self.mapEmbedWeights(embFile, 'kmer', matrix)
        elif ('domain' in embFile.lower() and 'domain' in self.featType.lower()):
            matrix = self.mapEmbedWeights(embFile, 'domain', matrix)
        elif ('go' in embFile.lower() and 'go' in self.featType.lower()):
            matrix = self.mapEmbedWeights(embFile, 'go', matrix)
    return matrix


def genBankToAminoacid(path):
    entries = []  # fasta-style entries, aminoacid sequence only
    translations = ''
    files = []
    if (os.path.isfile(path)):
        files.append(path)
    else:
        files = Utils.listFilesExt(path, 'gbk')
    for file in files:
        species = Utils.getSpecies(file)
        records = parseGenBank(file)
        for record in records:
            locus = record.id
            for feature in record.features:
                if feature.type == "CDS":
                    id, locus_tag, gene, protein_id, translation, \
                        product, function, description = '', '', '', '', '', '', '', ''
                    for key, value in feature.qualifiers.items():
                        if key == "translation":
                            translation = value[0]
                        elif key == "gene":
                            gene = value[0]
                        elif key == "locus_tag":
                            locus_tag = value[0]
                        elif key == "protein_id":
                            protein_id = value[0].replace('/', '')
                        elif key == "product":
                            product = value[0]
                        elif key == "function":
                            function = value[0]
                    # priority for gene ID: locus_tag, then gene, then protein_id
                    id = locus_tag if not id and len(locus_tag) > 1 else id
                    id = gene if not id and len(gene) > 1 else id
                    id = protein_id if not id and len(protein_id) > 1 else id
                    description = product if product.strip() else description
                    if function.strip():
                        description += '|' + function
                    entry = '>' + locus + '|' + species + '|' + id + '|' + description + '\n' + translation
                    if (entry not in entries):
                        entries.append(entry)
                        translations += translation
    return entries, translations
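
# Usage sketch (hypothetical path): build fasta-style aminoacid entries from a
# directory of GenBank files.
# entries, translations = genBankToAminoacid('data/genbank/')
# print(entries[0].split('\n')[0])  # '>locus|species|geneID|description'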


def createDomainDataset(self):
    useID = True
    files = Utils.listFilesExt(self.source_path, self.ext)
    # skip files whose .domains output already exists
    files = [fileName for fileName in files
             if not os.path.isfile(self.result_path +
                                   os.path.basename(fileName).replace('.fasta', '.domains'))]
    source_type = self.config.get('dataPipeline', 'source.type')
    count = 0
    countNone = 0
    datapipe = DataPipeline.DataPipeline(source_type=source_type,
                                         source_path=self.source_path,
                                         result_path=self.result_path)
    sparkContext = SparkContext(conf=datapipe.initSpark("domainDataset"))
    pfamDomains = datapipe.getDomains(sparkContext)
    for file in files:
        fileName = os.path.basename(file)
        with open(file, 'r') as inF:
            IDs = inF.readline()
        resultFile = self.result_path + fileName.replace('.fasta', '.domains')
        result = pfamDomains.get(file)
        if (result is not None):
            with open(resultFile, 'w') as outF:
                outF.write(IDs)
                for line in result.split('\n'):
                    if (len(line.strip()) > 1):
                        items = line.split('\t')
                        domainID = items[5]
                        domain = items[6]
                        # items[11] holds the bitscore, if ever needed
                        if (useID):
                            domain = domainID + '|' + domain
                        outF.write(domain + '\n')
            count += 1
        else:
            print('None for file: ', file)
            countNone += 1
    print('Done generating', str(count), 'domain files.',
          '\nNo domain found for', str(countNone), 'files.')


def createNegShuffle(self, posPerc):
    files = Utils.listFilesExt(self.source_path, self.ext)
    negPerc = 100 - posPerc
    positives = len(files)
    negativeSize = int((negPerc * positives) / posPerc)
    print('Negative percentage: ' + str(negPerc) + '% \n' +
          'Negative instances: ' + str(negativeSize) + '\n' +
          'Positive percentage: ' + str(posPerc) + '% \n' +
          'Positive instances: ' + str(positives) + '\n' +
          'Total corpus size: ' + str(negativeSize + positives))
    thisDecRatio = 0.0
    count = 0
    ratio = (negativeSize / positives)
    decRatio = ratio - int(ratio)
    print('Generating...')
    for file in files:
        # add up the decimal ratio part
        thisDecRatio += round(decRatio, 2)
        # reset range
        ratioRange = int(negativeSize / positives)
        # check if the decimal ratio added up to a whole extra shuffle
        if (thisDecRatio >= 1):
            ratioRange = int(ratio + thisDecRatio)
            thisDecRatio = 0
        for i in range(0, ratioRange):
            name = os.path.basename(file)
            result_file = name.split('.')[0] + '_' + str(i) + '.shuffled.negative.fasta'
            if ('nuc' in self.seqType):
                content = Parsers.genBankToNucleotide(file)
            if ('amino' in self.seqType):
                entries, content = Parsers.genBankToAminoacid(file)
            content = Utils.charGramShuffle(content, 2)
            content = '>' + name + '\n' + content
            count += 1
            Utils.writeFile(self.result_path + result_file, content)
    print('Total generated: ' + str(count) + '. Done!')
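
# Worked example of the ratio bookkeeping above: posPerc = 40 and 10 positive
# files give negativeSize = 15, ratio = 1.5, decRatio = 0.5. The accumulator
# makes the per-file shuffle count alternate 1, 2, 1, 2, ..., totalling ~15
# negatives instead of the int(1.5) * 10 = 10 a plain floor would produce.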


def orthogroupSeqs(orthofile, seqpath, limit):
    orthodir = os.path.dirname(seqpath)
    ortholines = Utils.readFileLines(orthofile)[1:]
    seqfiles = Utils.listFilesExt(seqpath, "fasta")
    threshold = limit if (limit) else len(seqfiles)
    orthogroups = [re.split('\t|;|,', item)[1:] for item in ortholines][1:threshold + 1]
    sequences, output = dict(), dict()
    print('Loading files and seqIDs...')
    for seqfile in seqfiles:
        sequences.update(SeqIO.index(seqfile, "fasta"))
    orthodir = orthodir + '/orthologs_threshold' + str(threshold) + '/'
    if os.path.isdir(orthodir):
        print('Orthogroup path', orthodir, 'already exists.')
        exit()
    else:
        os.makedirs(orthodir)
    print('Loading sequences per IDs...')
    for group in orthogroups:
        for id in group:
            id = id.strip(' ')
            tempseq = sequences.get(id)
            if (tempseq is not None and len(tempseq) > 1):
                thisseqfile = orthodir + tempseq.id + '.fasta'
                content = '>' + tempseq.id + '\n' + str(tempseq.seq)
                Utils.writeFile(thisseqfile, content)
            # else:
            #     print('ID not found', str(id))
    print('Done writing seqs for orthogroups.')
    return output
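
# Usage sketch (hypothetical paths; assumes an OrthoFinder-style table whose
# columns are separated by tabs, semicolons or commas): write one fasta file
# per sequence found in the first 50 orthogroups.
# orthogroupSeqs('results/Orthogroups.tsv', 'data/proteomes/', limit=50)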


def splitAsClusters(self):
    self.source_path = Utils.normalizePath(self.source_path)
    slimIDs = self.config.getboolean('corpusPrep', 'slim.id')
    files = Utils.listFilesExt(self.source_path, self.ext)
    result = []
    overlap = int((self.windowOverlap / 100) * self.length)
    resultPath = self.result_path
    for file in files:
        fileName = os.path.basename(file).split('.')[0]
        resultPath = self.result_path + fileName + '_len' + str(self.length) \
                     + '_overlap' + str(self.windowOverlap)
        if (slimIDs):
            resultPath += '_slimIDs'
        resultPath += '.fasta'
        if (os.path.isfile(resultPath)):
            print('File already exists: ' + resultPath + '.\nDone.')
        else:
            file = Parsers.sortFasta(file)
            sequences = Parsers.parseFasta(file)
            content, ids, entry, overlapIds = '', '', '', ''
            for fasta in sequences:
                content += str(fasta.seq.upper())
                ids += str(fasta.id) if not ids else '|' + str(fasta.id)
                if (slimIDs):
                    allIds = ids.split('|')
                    ids = allIds[0] + '|to|' + allIds[len(allIds) - 1]
                while (len(content) > 0):
                    varSize = self.length - len(entry)
                    if (varSize <= overlap):
                        overlapIds += str(fasta.id) if not overlapIds else '|' + str(fasta.id)
                    entry += content[0:varSize]
                    if (len(entry) == self.length):
                        # move cursor on the real sequence according to the variable length added
                        content = content[varSize:]
                        # add chunk to the result list
                        if (slimIDs):
                            allIds = ids.split("|")
                            ids = allIds[0] + '|to|' + allIds[len(allIds) - 1]
                        result.append('>' + ids + '\n' + entry)
                        # make sure the next entry starts with the overlap
                        entry = entry[len(entry) - overlap:]
                        if (len(content) > 0):
                            ids = overlapIds
                            overlapIds = ''
                        else:
                            ids = ''
                    elif (len(content) > 0 and len(entry) < self.length):
                        content = content[len(entry):]
    result = '\n'.join(result)
    Utils.writeFile(resultPath, result)
    print('Done.')
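
# Windowing example for the loop above: with length = 100 and windowOverlap = 10,
# overlap = 10 characters, so consecutive entries share 10 characters and the
# cursor advances 90 characters per emitted chunk.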


def createDataset(self):
    neg_path = Utils.normalizePath(self.negPath)
    pos_path = Utils.normalizePath(self.posPath)
    negatives = Utils.listFilesExt(neg_path, self.ext)
    positives = Utils.listFilesExt(pos_path, self.ext)
    subject = Utils.normalizePath(self.result_path)
    negLen = len(negatives)
    posLen = len(positives)
    negPerc = 100 - self.posPerc
    negTotal = (posLen * negPerc) / self.posPerc
    if (negLen < negTotal):
        print("Not enough negative instances. Try another %")
        exit()
    else:
        if (not negatives or not positives):
            print('List of files was empty. '
                  'Please check \'neg.path\' and \'pos.path\' in the config file.')
        subject += 'pos' + str(self.posPerc)
        if (len(subject) > 1):
            os.makedirs(subject, exist_ok=True)
        destTrain = subject + '/train/'
        destValid = subject + '/validation/'
        if (os.path.exists(destTrain) or os.path.exists(destValid)):
            print('Dataset already split into train and validation. '
                  '\nRename ' + destTrain + ' or ' + destValid + ' and try again.')
        else:
            os.makedirs(destTrain, exist_ok=False)
            os.makedirs(destValid, exist_ok=False)
            perc = int(self.validPerc) / 100
            readme = 'Source negative: ' + neg_path + \
                     '\nSource positive: ' + pos_path + \
                     '\n# negative files: ' + str(negLen) + \
                     '\n(final) # negative files: ' + str(negTotal) + \
                     '\n# positive files: ' + str(posLen) + \
                     '\nValidation data percentage (from total): ' + str(self.validPerc) + '%'
            Utils.writeFile(subject + '/README.md', readme)
            # select validation files
            validNegatives = random.sample(negatives, int(perc * negTotal))
            validPositives = random.sample(positives, int(perc * posLen))
            # remove validation files from the training pool
            negatives = [f for f in negatives if f not in validNegatives]
            positives = [f for f in positives if f not in validPositives]
            # randomly select the corresponding nb of negatives
            negatives = random.sample(negatives, int(negTotal - len(validNegatives)))
            train = negatives + positives
            validation = validPositives + validNegatives
            for f in validation:
                name = os.path.basename(f)
                copy(f, destValid + name)
            for f in train:
                name = os.path.basename(f)
                copy(f, destTrain + name)
            print('Done splitting randomly ' + str(len(train)) + ' train and ' +
                  str(len(validation)) + ' files.')
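
# Worked example of the class-balance arithmetic above: posPerc = 10 with 500
# positive files gives negPerc = 90 and negTotal = (500 * 90) / 10 = 4500, so at
# least 4500 negatives must exist; with validPerc = 20, int(0.2 * 4500) = 900
# negatives and int(0.2 * 500) = 100 positives land in validation/.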