def _load_bwts(self, bwt_dirs): msbwt = {} for ff in bwt_dirs: if not io.readable_dir(ff): continue name = os.path.basename(ff.rstrip("/")) msbwt.update({name: ms.loadBWT(ff)}) if len(msbwt): return msbwt else: return None
def _load_bwts(self, bwt_dirs): msbwt = {} for ff in bwt_dirs: if not io.readable_dir(ff): continue name = os.path.basename(ff.rstrip("/")) msbwt.update( { name: ms.loadBWT(ff) } ) if len(msbwt): return msbwt else: return None
def get_zygosity(sample, bwtfile, threshold, expected_length, ref_te):
    # Classify each TE insertion of one sample as heterozygous/homozygous by
    # querying the sample's BWT for the insertion's flanking context.
    #
    # sample          - sample name; used by get_kmers() and in the output filename
    # bwtfile         - path of the BWT to load
    # threshold       - NOTE(review): unused in this body -- confirm before removing
    # expected_length - NOTE(review): unused in this body -- confirm before removing
    # ref_te          - reference-TE flag forwarded to get_kmers()
    #
    # Side effect: writes IndividualAnalysis/<sample>_<ref_te>.csv and prints a
    # summary line.  NOTE: Python 2 code ('wb' csv handle, bare print statement).
    msbwt = MultiStringBWT.loadBWT(bwtfile, useMemmap=False)
    oc_length = 25          # length of observed context compared against the TE
    ed_th = .2 * oc_length  # edit-distance threshold: >20% mismatch => non-TE context
    kmer_list = get_kmers(sample, ref_te)
    zygosity_data = []
    zygo_dict = {}       # TEi_id -> set of states seen: 1 = TE context, 0 = other
    other_context = {}   # TEi_id -> a context that did NOT match the TE
    for TEi_id, my_id, side, chromo, pos, strand, ref_te, context, TE, TSD in kmer_list:
        #if (TEi_id, ref_te) != (47,0):
        #    continue
        #print TEi_id,my_id,side,chromo,pos,strand,ref_te,context,TE,TSD
        # 'start'-side contexts are searched in reverse-complement orientation.
        new_context = MultiStringBWT.reverseComplement(
            context) if side == 'start' else context
        lo, hi = msbwt.findIndicesOfStr(new_context)
        context_List = set()
        context_List = growContext(msbwt, lo, hi, '', oc_length, context_List)
        # Every insertion starts in the "TE present" state (1).
        zygo_dict[TEi_id] = zygo_dict.get(TEi_id, set([1]))
        for oc in context_List:
            oc = MultiStringBWT.reverseComplement(
                oc) if side == 'start' else oc
            ed = lv.distance(
                oc, TE[:oc_length]) if side == 'start' else lv.distance(
                    oc, TE[-oc_length:])
            # NOTE(review): TE is truncated inside this loop, so later
            # iterations and the row appended below only see the 25-base
            # end of the TE -- confirm this is intentional.
            TE = TE[:oc_length] if side == 'start' else TE[-oc_length:]
            if ed > ed_th:
                # Observed context diverges from the TE: record state 0.
                zygo_dict[TEi_id] = zygo_dict[TEi_id] | set([0])
                other_context[TEi_id] = oc
        zygosity_data.append([
            TEi_id, my_id, side, chromo, pos, strand, ref_te, context, TE, TSD
        ])
    # NOTE(review): ref_te here is the loop variable from the last k-mer --
    # the function parameter of the same name is shadowed above.
    individual_file = "IndividualAnalysis/%s_%s.csv" % (sample, ref_te)
    zygosity_data.sort(key=lambda x: (x[3], x[4]))  # sort by (chromo, pos)
    header = [
        'TEi_id', 'my_id', 'side', 'chromo', 'pos', 'strand', 'ref_te',
        'context', 'TE', 'TSD', 'other_context', 'zygosity'
    ]
    with open(individual_file, 'wb') as fp:
        a = csv.writer(fp, delimiter=',')
        a.writerows([header])
        for d in zygosity_data:
            # Both states observed (0 and 1) => heterozygous; only 1 => homozygous.
            zygosity = 'heterozygous' if len(
                zygo_dict[d[0]]) == 2 else 'homozygous'
            d.append(other_context.get(d[0], ''))
            d.append(zygosity)
            a.writerows([d])
    print "Wrote file: %s [%d lines]" % (individual_file, len(zygosity_data))
def find_Kmer(Kmer):
    # Starting from the seed k-mer, recursively extend it to the LEFT one
    # nucleotide at a time, writing out every extension that is frequent in
    # the tumor BWT (> tumor_threshold) and rare in the normal BWT
    # (< normal_threshold).
    #
    # Relies on module-level globals: outdir, args, tumor_threshold,
    # normal_threshold, read_length, nucleotide_list.
    outf = open(outdir + 'Tumor_kmers_{}.txt'.format(Kmer), 'w')
    msbwt_tumor = MSBWT.loadBWT(args.tumor_bwt)
    msbwt_normal = MSBWT.loadBWT(args.normal_bwt)
    logging.info("finished loading BWTs")
    # Index ranges for the seed itself; the recursion narrows these ranges
    # instead of re-searching the full k-mer each time.
    tLow, tHigh = msbwt_tumor.findIndicesOfStr(Kmer)
    nLow, nHigh = msbwt_normal.findIndicesOfStr(Kmer)

    def Kmer_count(tLow, tHigh, nLow, nHigh, Kmer, outf):
        # Refine both ranges by the newly prepended base (Kmer[0]).
        tLow, tHigh = msbwt_tumor.findIndicesOfStr(Kmer[0], (tLow, tHigh))
        nLow, nHigh = msbwt_normal.findIndicesOfStr(Kmer[0], (nLow, nHigh))
        tumor_count = tHigh - tLow
        normal_count = nHigh - nLow
        if tumor_count > tumor_threshold and normal_count < normal_threshold:
            # Tumor-specific k-mer: emit and stop extending this branch.
            # NOTE(review): Kmer appears twice in the output line --
            # confirm the intended file format.
            outf.write(Kmer + '\t' + str(tumor_count) + '\t' +
                       str(normal_count) + '\t' + Kmer + '\n')
            return
        elif tumor_count <= tumor_threshold or len(Kmer) == read_length:
            # Too rare in tumor, or already read-length long: dead end.
            return
        # Otherwise recurse into every one-base left extension.
        for nucleotide in nucleotide_list:
            Kmer = nucleotide + Kmer
            Kmer_count(tLow, tHigh, nLow, nHigh, Kmer, outf)
            Kmer = Kmer[1:]

    # Kick off the recursion with each one-base left extension of the seed.
    for nucleotide in nucleotide_list:
        Kmer = nucleotide + Kmer
        Kmer_count(tLow, tHigh, nLow, nHigh, Kmer, outf)
        Kmer = Kmer[1:]
    outf.close()
def checkAlive():
    """Probe every BWT under BWT_ROOT and report the ones that respond.

    A BWT counts as alive when it returns a positive count for the
    single-base query 'T'.  Entries that fail to load or query are
    skipped after printing the error.
    """
    root = app.config['BWT_ROOT']
    alive = []
    for entry in os.listdir(root):
        try:
            bwt = MSBWT.loadBWT(root + entry)
            occurrences = bwt.countOccurrencesOfSeq('T'.encode('utf-8', 'ignore'))
            if occurrences > 0:
                alive.append(entry.decode('utf-8'))
        except Exception as exc:
            print(exc)
    return Response(json.dumps({"names": alive}), status=200)
def load_bwts(bwt_dirs):
    """Load a BWT from each readable directory in *bwt_dirs*.

    Load failures are reported on stderr and skipped.  Returns the list
    of loaded BWTs, or None when the list would be empty.
    """
    loaded = []
    for path in bwt_dirs:
        if not io.readable_dir(path):
            continue
        try:
            loaded.append(ms.loadBWT(path))
        except Exception as err:
            sys.stderr.write("Couldn't load BWT at <{}>\n".format(path))
            sys.stderr.write(str(err))
    return loaded if loaded else None
def load_bwts(bwt_dirs):
    """Collect the BWTs found in the readable directories of *bwt_dirs*.

    Directories that cannot be read are ignored; directories whose BWT
    fails to load are reported on stderr.  An empty result is returned
    as None.
    """
    collected = []
    for d in bwt_dirs:
        if io.readable_dir(d):
            try:
                collected.append(ms.loadBWT(d))
            except Exception as e:
                sys.stderr.write("Couldn't load BWT at <{}>\n".format(d))
                sys.stderr.write(str(e))
    if collected:
        return collected
    return None
def loadBWT(name, forceLocal=False):
    """Load the BWT named *name*, preferring a remote source.

    Tries to resolve a remote source first (unless *forceLocal* is set)
    and falls back to a local copy.  Returns a CloudBwt, a locally
    loaded BWT, or None when neither source works.
    """
    logIt("Loading %s...\n" % name)
    if not forceLocal:
        try:
            logIt("Trying remote source...\n")
            remoteSource = findRemote(name)
            return CloudBwt(name, remoteSource)
        except Exception as e:
            # BUG FIX: Exception.message does not exist on Python 3 (and is
            # deprecated since 2.6); using it here raised AttributeError
            # inside the handler and skipped the local fallback entirely.
            logIt(" Failed\n" + str(e))
    try:
        localSource = findLocal(name)
        return MSBWT.loadBWT(localSource)
    except Exception:
        # No local copy either; callers treat None as "not available".
        return None
def functionCaller(name, func_call):
    """Dispatch *func_call* with request-supplied args against the BWT *name*.

    Reads 'args', 'kwargs' and 'async' from the query string.  When async
    is absent or 'false', runs the call synchronously and returns its
    result; otherwise schedules it on the executor and returns a token the
    client can poll.
    """
    if DEBUG:
        print("Serving {}".format(name))
    bwt = MSBWT.loadBWT(app.config['BWT_ROOT'] +
                        name.encode('utf-8', 'ignore') +
                        '/'.encode('utf-8', 'ignore'))
    # SECURITY NOTE: literal_eval only builds Python literals, so this is
    # safe against code execution, but malformed input will raise here.
    args = ast.literal_eval(request.args.get('args', None))
    kwargs = request.args.get('kwargs', None)
    async_flag = request.args.get('async', None)
    if args is None:
        return Response(status=400)
    if kwargs is not None:
        kwargs = ast.literal_eval(kwargs)
    else:
        kwargs = {}
    #Legacy Compatibility, disable non-blocking functionality
    if async_flag is None or async_flag.lower() == 'false':
        ar = [func_call, args, kwargs, bwt]
        r = executor.submit(_runLegacy, *ar)
        # r.result() blocks until the call finishes.
        return Response(json.dumps({'result': r.result()}), status=200)
    tok = getToken()
    st = 405
    try:
        results_lst[tok] = {}
        ar = [func_call, args, kwargs, bwt, tok]
        executor.submit(_run, *ar)
        results_lst[tok]['func'] = func_call
        results_lst[tok]['args'] = args
        results_lst[tok]['kwargs'] = kwargs
        st = 200
    # BUG FIX: was a bare "except:", which also swallowed SystemExit and
    # KeyboardInterrupt; narrowed to Exception.
    except Exception:
        st = 405
    summary = {
        'data': app.config['DATA'],
        'name': name,
        'token': tok,
        'function': func_call,
        'args': args,
        'kwargs': kwargs
    }
    return Response(json.dumps(summary), status=st)
def runQuery(**kwargs):
    """Count k-mer occurrences in one dataset's BWT.

    Expects 'dataset' ("<dirIndex>-<name>"), 'kmerQueries', and the string
    flags 'forward' / 'revComp' (the literal "true" enables each pass).
    Returns [forwardResults, rcResults].
    """
    pieces = kwargs["dataset"].split('-')
    # First piece indexes MSBWTdirs; the rest (rejoined) is the BWT name.
    directory = MSBWTdirs[int(pieces[0])] + '/' + '-'.join(pieces[1:])
    # load the MSBWT
    msbwt = MSBWT.loadBWT(directory)
    forwardResults = []
    if kwargs['forward'] == "true":
        for kmer in kwargs['kmerQueries']:
            forwardResults.append(msbwt.countOccurrencesOfSeq(str(kmer)))
    rcResults = []
    if kwargs['revComp'] == "true":
        for kmer in kwargs['kmerQueries']:
            rcResults.append(
                msbwt.countOccurrencesOfSeq(MSBWT.reverseComplement(str(kmer))))
    return [forwardResults, rcResults]
def api_search():
    """Count occurrences of a sequence in one sample's BWT.

    Reads 'sequence' and 'sample' from the request; the BWT lives under
    the BWT_DIR environment directory.  Returns a JSON payload with the
    echoed inputs and the count.
    """
    sequence = str(request.values['sequence'])
    sample = str(request.values['sample'])
    print('submit called with: {} ... {}'.format(sequence, sample))
    bwt_dir = os.environ.get('BWT_DIR', None)
    sample_dir = os.path.join(bwt_dir, sample)
    print('sample_dir=', sample_dir)
    msbwt = MultiStringBWT.loadBWT(sample_dir)
    print('msbwt=', msbwt)
    count = msbwt.countOccurrencesOfSeq(sequence)
    payload = {
        'sequence': str(sequence),
        'sample': str(sample),
        'count': count,
    }
    return jsonify(payload)
def extendSeed(self, seedKmer, endSeeds):
    '''
    Interactively build out a path graph from a seed k-mer, probably to be
    released in a future version of msbwt.  Explores nodes nearest the seed
    first until the node budget ('numNodes') or distance cap ('maxDistance')
    is hit, then optionally annotates nodes with read-source counts and
    read-pairing counts.
    @param seedKmer - the seed sequence to start construction from; must be
                      exactly 'kmerSize' long
    @param endSeeds - k-mers that terminate exploration; each becomes a
                      pre-terminated node (termCondition 'END_SEED_<i>')
    Reads its configuration from self.settingsDict and mutates self.nodes,
    self.edges, self.foundKmers and self.abtDict in place.
    NOTE: Python 2 code (dict.has_key).
    '''
    if self.foundKmers.has_key(seedKmer):
        return
    pathK = self.settingsDict.get('kmerSize', len(seedKmer))
    countK = self.settingsDict.get('countK', pathK)
    isMerged = self.settingsDict.get('isMerged', False)
    trackPairs = self.settingsDict.get('trackPairs', False)
    trackReads = self.settingsDict.get('trackReads', False)
    useMemmap = self.settingsDict.get('useMemmap', True)
    maxDistance = self.settingsDict.get('maxDistance', 0xFFFFFFFF)
    if len(seedKmer) != pathK:
        raise Exception('Seed k-mer incorrect length')
    numNodes = self.settingsDict['numNodes']
    validChars = ['$', 'A', 'C', 'G', 'N', 'T']
    if self.logger != None:
        self.logger.info('Loading ' + self.bwtDir + '...')
    msbwt = MultiStringBWT.loadBWT(self.bwtDir, useMemmap, self.logger)
    if os.path.exists(self.bwtDir + '/origins.npy'):
        # Origin-file support was removed; fail loudly rather than silently
        # ignoring the file (the load below is intentionally unreachable).
        raise Exception(
            "You haven\'t reimplemented the handling of origin files")
        origins = np.load(self.bwtDir + '/origins.npy', 'r')
    else:
        origins = None
    self.settingsDict['interleaveFN'] = self.bwtDir + '/inter0.npy'
    kmer = seedKmer
    # The seed becomes the first new node.
    firstID = len(self.nodes)
    self.nodes.append(PathNode(firstID, kmer, msbwt, 0, self.settingsDict))
    self.foundKmers[kmer] = firstID
    # Pre-register every end seed as an already-terminated node.
    for i, endSeed in enumerate(endSeeds):
        if len(endSeed) != pathK:
            raise Exception(endSeed + ': NOT CORRECT LENGTH')
        else:
            endID = len(self.nodes)
            self.nodes.append(
                PathNode(endID, endSeed, msbwt, 0, self.settingsDict))
            self.nodes[endID].termCondition = 'END_SEED_' + str(i)
            self.foundKmers[endSeed] = endID
    if self.logger != None:
        self.logger.info('Beginning with seed \'' + seedKmer + '\', pathK=' +
                         str(pathK) + ', countK=' + str(countK))
    unexploredPaths = [self.nodes[firstID]]
    #init the kmer dictionary
    execID = firstID
    # Explore closest-to-seed first until the frontier is exhausted.
    while len(unexploredPaths) > 0:
        #uncomment to make this smallest first
        unexploredPaths.sort(key=lambda node: node.minDistToSeed)
        #print 'UP: '+'['+','.join([str((node.minDistToSeed, node.nodeID)) for node in unexploredPaths])+']'
        nextNode = unexploredPaths.pop(0)
        if nextNode.nodeID >= numNodes:
            # Past the node budget: mark, but do not extend.
            nextNode.termCondition = 'UNEXPLORED_NODE'
        elif nextNode.minDistToSeed >= maxDistance:
            nextNode.termCondition = 'UNEXPLORED_DIST'
        else:
            nextNode.execOrder = execID
            execID += 1
            if self.logger != None:
                self.logger.info('Exploring new node')
            nextNode.firstTimeExtension(self.foundKmers, unexploredPaths,
                                        self.nodes, self.edges)
    if isMerged and trackReads:
        # Attribute each read to its originating input BWT via the interleave.
        interleaveFN = self.bwtDir + '/inter0.npy'
        interleave = np.load(interleaveFN, 'r')
        #we only need to do this for newly processed nodes
        for node in self.nodes[firstID:]:
            dIDs = node.readSet
            for dID in dIDs:
                sourceID = interleave[dID[0]]
                node.sourceCounts[sourceID] = node.sourceCounts.get(
                    sourceID, 0) + 1
    if trackPairs:
        # Count read-pair links between nodes using the abt table; a read's
        # mate has the opposite-parity fID and opposite direction.
        abtFN = self.bwtDir + '/abt.npy'
        abt = np.load(abtFN, 'r')
        #abtDict = {}
        #only need to process new nodes
        for node in self.nodes[firstID:]:
            dIDs = node.readSet
            for dID, direction in dIDs:
                (fID, rID) = abt[dID]
                if fID % 2 == 0:
                    oFID = fID + 1
                else:
                    oFID = fID - 1
                if self.abtDict.has_key((oFID, rID, 1 - direction)):
                    otherNIDs = self.abtDict[(oFID, rID, 1 - direction)][1]
                    for n in otherNIDs:
                        self.nodes[n].pairedNodes[
                            node.nodeID] = self.nodes[n].pairedNodes.get(
                                node.nodeID, 0) + 1
                        node.pairedNodes[n] = node.pairedNodes.get(n, 0) + 1
                if not self.abtDict.has_key((fID, rID, direction)):
                    self.abtDict[(fID, rID, direction)] = (dID, set([]))
                self.abtDict[(fID, rID, direction)][1].add(node.nodeID)
def SearchResponse(form):
    # Render the HTML results page for a k-mer search over one or more BWT
    # datasets (CGI-style, built with the markup.page builder).
    #
    # form - a cgi.FieldStorage-like object; reads the 'dataset' and
    #        'pattern' fields.  Returns the populated markup.page panel.
    # NOTE: Python 2 code (xrange in the commented block, list.sort(cmp=...)).
    panel = markup.page()
    # Inline JS: copies the user's text selection into the SearchSelected form
    # so a sub-sequence can be re-searched with one click.
    panel.script(type="text/javascript")
    panel.add("""
    function getSelectedText() {
        var hidden, submit;
        var selectedText=(window.getSelection ? window.getSelection() : document.getSelection ? document.getSelection() : document.selection.createRange().text);
        if (selectedText == "") {
            alert("You must select a subsequence");
            return false;
        } else {
            document.forms["SearchSelected"]["pattern"].value = selectedText;
        }
    }
    """)
    panel.script.close()
    panel.div(style="padding:50px 50px;")
    datasets = form.getvalue("dataset")
    # Error page when no dataset was selected.
    if (datasets == None):
        panel.h3("ERROR: No datasets selected.")
        panel.div(align="center", style="padding: 30px 30px;")
        panel.input(type="button", value="New Search",
                    onClick='self.location="./?run=msAllele"')
        panel.div.close()
        panel.div.close()
        return panel
    if isinstance(datasets, str):
        datasets = [datasets]
    pattern = form.getvalue("pattern")
    # Error page when no pattern was given.
    if (pattern == None):
        panel.h3("ERROR: No search pattern specified")
        panel.div(align="center", style="padding: 30px 30px;")
        panel.input(type="button", value="New Search",
                    onClick='self.location="./?run=msAllele"')
        panel.div.close()
        panel.div.close()
        return panel
    pattern = pattern.upper()
    for dataset in datasets:
        panel.h3(dataset)
        # Per-dataset stats: string/base counts and index size on disk.
        bwtDirName = "%s/%s" % (MSBWTdir, dataset)
        filestat = os.stat(bwtDirName+"/comp_msbwt.npy")
        filesize = locale.format("%d", filestat.st_size, grouping=True)
        bwt = MultiStringBWT.loadBWT(bwtDirName)
        stringCount = locale.format("%d", bwt.getSymbolCount(0), grouping=True)
        baseCount = locale.format("%d", bwt.getTotalSize(), grouping=True)
        bitsPerBase = (8.0*filestat.st_size)/bwt.getTotalSize()
        panel.strong("%s: %s strings with %s bases and index size of %s bytes (%3.2f bits per base)<br />" % (dataset, stringCount, baseCount, filesize, bitsPerBase))
        panel.strong("Target: %s<br />" % (pattern))
        # Count hits in both orientations.
        lo1, hi1 = bwt.findIndicesOfStr(pattern)
        lo2, hi2 = bwt.findIndicesOfStr(revComp(pattern))
        count = hi1 - lo1 + hi2 - lo2
        if (count > 10000):
            panel.add("Found %d times (%d forward, %d reverse-complemented)<br /><br />" % (count, hi1-lo1, hi2-lo2))
            panel.span("Too much data!", style="font-size: 180%;")
        elif count > 0:
            '''
            l = len(pattern)
            bufferLen = 101
            fixedSize = 2*bufferLen-l
            readlist = []
            for i in xrange(lo1, hi1):
                #pass
                suffix = bwt.recoverString(i)
                suffLen = len(suffix)
                end = suffix.find('$')
                beforePattern = suffLen-end-1
                read = ('.'*(bufferLen-l-beforePattern)+
                        suffix[end+1:].lower()+
                        suffix[:l]+
                        suffix[l:end+1].lower())
                read += '.'*(fixedSize-len(read))
                readlist.append(read)
            for i in xrange(lo2, hi2):
                suffix = revComp(bwt.recoverString(i))
                suffLen = len(suffix)
                end = suffix.find('$')
                beforePattern = suffLen-end-l
                read = ('.'*(bufferLen-l-beforePattern)+
                        suffix[end:-l].lower()+
                        suffix[-l:]+
                        suffix[:end].lower())
                read += '.'*(fixedSize-len(read))
                readlist.append(read)
            '''
            panel.add("Found %d times (%d forward, %d reverse-complemented)<br /><br />" % (count, hi1-lo1, hi2-lo2))
            panel.div(style="font-size:10px; font-family: monospace;")
            #margin = len(suffix)-l
            l = len(pattern)
            margin = 101-l
            haps = extractHaplotypes(bwt, pattern)
            if len(haps) > 0:
                # Most-supported haplotype first; its consensus is the baseline
                # used to highlight mismatching bases everywhere else.
                consensusMain = (sorted(haps, key=lambda x: x[2][0][1], reverse=True))[0][0]
                panel.table(border='1')
                panel.tr()
                panel.th('Consensus')
                panel.th('Exact matches')
                panel.tr.close()
                extrasList = []
                for consensus, readlist, counts in sorted(haps, key=lambda x: x[2][0][1], reverse=True):
                    #panel.strong('%s<span style="color: green;">%s</span>%s<br />' % (consensus[:margin].upper(), consensus[margin:margin+l].upper(), consensus[margin+l:].upper()))
                    if counts[0][1] > 0:
                        # One table row per haplotype with exact-match support;
                        # the pattern span is green, mismatches vs the main
                        # consensus are highlighted yellow.
                        panel.tr()
                        panel.td()
                        panel.strong()
                        output = ""
                        for i, base in enumerate(consensus):
                            if i == margin:
                                output += '<span style="color: green;">'
                            elif i == margin+l:
                                output += '</span>'
                            if(base != '$') and (base != '.') and (consensus[i] != '.') and (base.upper() != consensusMain[i].upper()):
                                output += '<span style="background-color:yellow;">%s</span>' % base.upper()
                            else:
                                output += base.upper()
                        panel.add(output)
                        panel.strong.close()
                        panel.td.close()
                        panel.td(str(counts[0][1]))
                        panel.tr.close()
                    # Reads beyond the exact-match count are pooled as extras.
                    for read in readlist[counts[0][1]:]:
                        extrasList.append(read)
                if len(extrasList) > 0:
                    # Extra (non-exact) reads get their own consensus row with
                    # a zero exact-match count.
                    consensus, dummyVar = conSeq(extrasList)
                    panel.tr()
                    panel.td()
                    panel.strong()
                    output = ""
                    for i, base in enumerate(consensus):
                        if i == margin:
                            output += '<span style="color: green;">'
                        elif i == margin+l:
                            output += '</span>'
                        if(base != '$') and (base != '.') and (consensus[i] != '.') and (base.upper() != consensusMain[i].upper()):
                            output += '<span style="background-color:yellow;">%s</span>' % base.upper()
                        else:
                            output += base.upper()
                    panel.add(output)
                    panel.strong.close()
                    panel.td.close()
                    panel.td('0')
                    panel.tr.close()
                panel.table.close()
                # Second pass: dump the supporting reads for each haplotype.
                for consensus, readlist, counts in sorted(haps, key=lambda x: x[2][0][1], reverse=True):
                    #consensus = conSeq(readlist)
                    #panel.add(consensus)
                    #readlist.sort(cmp=readCmp)
                    if counts[0][1] == 0:
                        continue
                    # Ruler line marking the pattern position.
                    read = "."*margin + "*"*l + '.'*margin
                    panel.add(read)
                    panel.br()
                    for read in readlist[0:counts[0][1]]:
                        # red = pattern precedes the '$' terminator, blue otherwise.
                        color = "red" if (read.find('$') > read.find(pattern)) else "blue"
                        output = ""
                        for i, base in enumerate(read):
                            if (i == margin):
                                output += '<span style="color: %s;">' % color
                            elif (i == margin+l):
                                output += '</span>'
                            if (base != '$') and (base != '.') and (consensus[i] != '.') and (base.upper() != consensus[i].upper()):
                                output += '<span style="background-color:yellow;">%s</span>' % base
                            else:
                                output += base
                        output += '<br />'
                        panel.add(output)
                    panel.strong('%s<span style="color: green;">%s</span>%s<br />' % (consensus[:margin], consensus[margin:margin+l], consensus[margin+l:]))
                    panel.br()
                    panel.br()
                if len(extrasList) > 0:
                    # And the pooled extras, sorted for display.
                    consensus, dummyVar = conSeq(extrasList)
                    panel.add(consensus)
                    extrasList.sort(cmp=readCmp)
                    read = "."*margin + "*"*l + '.'*margin
                    panel.add(read)
                    panel.br()
                    for read in extrasList:
                        color = "red" if (read.find('$') > read.find(pattern)) else "blue"
                        output = ""
                        for i, base in enumerate(read):
                            if (i == margin):
                                output += '<span style="color: %s;">' % color
                            elif (i == margin+l):
                                output += '</span>'
                            if (base != '$') and (base != '.') and (consensus[i] != '.') and (base.upper() != consensus[i].upper()):
                                output += '<span style="background-color:yellow;">%s</span>' % base
                            else:
                                output += base
                        output += '<br />'
                        panel.add(output)
                    panel.strong('%s<span style="color: green;">%s</span>%s<br />' % (consensus[:margin], consensus[margin:margin+l], consensus[margin+l:]))
                    panel.br()
            panel.div.close()
        else:
            panel.add("Pattern not found<br /><br />")
    # Footer form: re-search a selected subsequence or start a new search.
    panel.form(action="", name="SearchSelected", method="POST",
               enctype="multipart/form-data", onsubmit='return getSelectedText()')
    panel.div(align="center", style="padding: 30px 30px;")
    panel.input(type="submit", name="submit", value="Search Selected")
    panel.input(type="button", value="New Search",
                onClick='self.location="./?run=msAllele"')
    for dataset in datasets:
        panel.input(type="hidden", name="dataset", value=dataset)
    panel.input(type="hidden", name="pattern", value=pattern)
    panel.input(type="hidden", name="target", value="msAllele.Search")
    panel.div.close()
    panel.form.close()
    panel.div.close()
    return panel
def mainRun():
    '''
    This is the primary function for external typical users to run when the
    Command Line Interface is used.  Builds the argparse CLI (cffq, pp, cfpp,
    merge, query, massquery, compress, decompress) and dispatches on the
    chosen subcommand.
    NOTE: Python 2 code (bare print statements, xrange).
    '''
    #start up the logger
    initLogger()
    #attempt to parse the arguments
    p = ap.ArgumentParser(description=util.DESC, formatter_class=ap.RawTextHelpFormatter)
    #version data
    p.add_argument('-V', '--version', action='version', version='%(prog)s' + \
                   ' %s in MSBWT %s' % (util.VERSION, util.PKG_VERSION))
    #TODO: do we want subparsers groups by type or sorted by name? it's type currently
    sp = p.add_subparsers(dest='subparserID')
    p2 = sp.add_parser('cffq', help='create a MSBWT from FASTQ files (pp + cfpp)')
    p2.add_argument('-p', metavar='numProcesses', dest='numProcesses', type=int, default=1, help='number of processes to run (default: 1)')
    p2.add_argument('-u', '--uniform', dest='areUniform', action='store_true', help='the input sequences have uniform length', default=False)
    p2.add_argument('-c', '--compressed', dest='buildCompressed', action='store_true', help='build the RLE BWT (faster, less disk I/O)', default=False)
    p2.add_argument('outBwtDir', type=util.newDirectory, help='the output MSBWT directory')
    p2.add_argument('inputFastqs', nargs='+', type=util.readableFastqFile, help='the input FASTQ files')
    p7 = sp.add_parser('pp', help='pre-process FASTQ files before BWT creation')
    p7.add_argument('-u', '--uniform', dest='areUniform', action='store_true', help='the input sequences have uniform length', default=False)
    p7.add_argument('outBwtDir', type=util.newDirectory, help='the output MSBWT directory')
    p7.add_argument('inputFastqs', nargs='+', type=util.readableFastqFile, help='the input FASTQ files')
    p3 = sp.add_parser('cfpp', help='create a MSBWT from pre-processed sequences and offsets')
    p3.add_argument('-p', metavar='numProcesses', dest='numProcesses', type=int, default=1, help='number of processes to run (default: 1)')
    p3.add_argument('-u', '--uniform', dest='areUniform', action='store_true', help='the input sequences have uniform length', default=False)
    p3.add_argument('-c', '--compressed', dest='buildCompressed', action='store_true', help='build the RLE BWT (faster, less disk I/O)', default=False)
    p3.add_argument('bwtDir', type=util.existingDirectory, help='the MSBWT directory to process')
    p4 = sp.add_parser('merge', help='merge many MSBWTs into a single MSBWT')
    p4.add_argument('-p', metavar='numProcesses', dest='numProcesses', type=int, default=1, help='number of processes to run (default: 1)')
    p4.add_argument('outBwtDir', type=util.newDirectory, help='the output MSBWT directory')
    p4.add_argument('inputBwtDirs', nargs='+', type=util.existingDirectory, help='input BWT directories to merge')
    p5 = sp.add_parser('query', help='search for a sequence in an MSBWT, prints sequence and seqID')
    p5.add_argument('inputBwtDir', type=util.existingDirectory, help='the BWT to query')
    p5.add_argument('kmer', type=util.validKmer, help='the input k-mer to search for')
    p5.add_argument('-d', '--dump-seqs', dest='dumpSeqs', action='store_true', help='print all sequences with the given kmer (default=False)', default=False)
    p6 = sp.add_parser('massquery', help='search for many sequences in an MSBWT')
    p6.add_argument('inputBwtDir', type=util.existingDirectory, help='the BWT to query')
    p6.add_argument('kmerFile', help='a file with one k-mer per line')
    p6.add_argument('outputFile', help='output file with counts per line')
    p6.add_argument('-r', '--rev-comp', dest='reverseComplement', action='store_true', help='also search for each kmer\'s reverse complement', default=False)
    p8 = sp.add_parser('compress', help='compress a MSBWT from byte/base to RLE')
    p8.add_argument('-p', metavar='numProcesses', dest='numProcesses', type=int, default=1, help='number of processes to run (default: 1)')
    p8.add_argument('srcDir', type=util.existingDirectory, help='the source directory for the BWT to compress')
    p8.add_argument('dstDir', type=util.newDirectory, help='the destination directory')
    p9 = sp.add_parser('decompress', help='decompress a MSBWT from RLE to byte/base')
    p9.add_argument('-p', metavar='numProcesses', dest='numProcesses', type=int, default=1, help='number of processes to run (default: 1)')
    p9.add_argument('srcDir', type=util.existingDirectory, help='the source directory for the BWT to compress')
    p9.add_argument('dstDir', type=util.newDirectory, help='the destination directory')
    args = p.parse_args()
    # Dispatch on the chosen subcommand.
    if args.subparserID == 'cffq':
        logger.info('Inputs:\t'+str(args.inputFastqs))
        logger.info('Uniform:\t'+str(args.areUniform))
        logger.info('Output:\t'+args.outBwtDir)
        logger.info('Output Compressed:\t'+str(args.buildCompressed))
        logger.info('Processes:\t'+str(args.numProcesses))
        if args.numProcesses > 1:
            logger.warning('Using multi-processing with slow disk accesses can lead to slower build times.')
        print
        if args.areUniform:
            #if they are uniform, use the method developed by Bauer et al., it's likely short Illumina seq
            if args.buildCompressed:
                MultiStringBWT.createMSBWTCompFromFastq(args.inputFastqs, args.outBwtDir, args.numProcesses, args.areUniform, logger)
            else:
                MultiStringBWT.createMSBWTFromFastq(args.inputFastqs, args.outBwtDir, args.numProcesses, args.areUniform, logger)
        else:
            #if they aren't uniform, use the merge method by Holt et al., it's likely longer PacBio seq
            if args.buildCompressed:
                logger.error('No compressed builder for non-uniform datasets, compress after creation.')
            else:
                Multimerge.createMSBWTFromFastq(args.inputFastqs, args.outBwtDir, args.numProcesses, args.areUniform, logger)
    elif args.subparserID == 'pp':
        logger.info('Inputs:\t'+str(args.inputFastqs))
        logger.info('Uniform:\t'+str(args.areUniform))
        logger.info('Output:\t'+args.outBwtDir)
        if args.areUniform:
            #preprocess for Bauer et al. method
            MultiStringBWT.preprocessFastqs(args.inputFastqs, args.outBwtDir, args.areUniform, logger)
        else:
            #preprocess for Holt et al. method
            numProcs = 1
            Multimerge.preprocessFastqs(args.inputFastqs, args.outBwtDir, numProcs, args.areUniform, logger)
    elif args.subparserID == 'cfpp':
        logger.info('BWT dir:\t'+args.bwtDir)
        logger.info('Uniform:\t'+str(args.areUniform))
        logger.info('Output Compressed:\t'+str(args.buildCompressed))
        logger.info('Processes:\t'+str(args.numProcesses))
        if args.numProcesses > 1:
            logger.warning('Using multi-processing with slow disk accesses can lead to slower build times.')
        print
        seqFN = args.bwtDir+'/seqs.npy'
        offsetFN = args.bwtDir+'/offsets.npy'
        bwtFN = args.bwtDir+'/msbwt.npy'
        if args.areUniform:
            #process it using the column-wise Bauer et al. method
            if args.buildCompressed:
                MSBWTCompGenCython.createMsbwtFromSeqs(args.bwtDir, args.numProcesses, logger)
            else:
                MSBWTGenCython.createMsbwtFromSeqs(args.bwtDir, args.numProcesses, logger)
        else:
            #process it using the Holt et al. merge method
            if args.buildCompressed:
                logger.error('No compressed builder for non-uniform datasets, compress after creation.')
            else:
                Multimerge.interleaveLevelMerge(args.bwtDir, args.numProcesses, args.areUniform, logger)
    elif args.subparserID == 'compress':
        logger.info('Source Directory:'+args.srcDir)
        logger.info('Dest Directory:'+args.dstDir)
        logger.info('Processes:'+str(args.numProcesses))
        if args.srcDir == args.dstDir:
            raise Exception('Source and destination directories cannot be the same directory.')
        print
        MSBWTGen.compressBWT(args.srcDir+'/msbwt.npy', args.dstDir+'/comp_msbwt.npy', args.numProcesses, logger)
    elif args.subparserID == 'decompress':
        logger.info('Source Directory: '+args.srcDir)
        logger.info('Dest Directory: '+args.dstDir)
        logger.info('Processes: '+str(args.numProcesses))
        print
        MSBWTGen.decompressBWT(args.srcDir, args.dstDir, args.numProcesses, logger)
        #TODO: remove if srcdir and dstdir are the same?
    elif args.subparserID == 'merge':
        logger.info('Inputs:\t'+str(args.inputBwtDirs))
        logger.info('Output:\t'+args.outBwtDir)
        logger.info('Processes:\t'+str(args.numProcesses))
        if args.numProcesses > 1:
            logger.warning('Multi-processing is not supported at this time, but will be included in a future release.')
        numProcs = 1
        #logger.warning('Using multi-processing with slow disk accesses can lead to slower build times.')
        print
        #MSBWTGen.mergeNewMSBWT(args.outBwtDir, args.inputBwtDirs, args.numProcesses, logger)
        if len(args.inputBwtDirs) > 2:
            #this is a deprecated method, it may still work if you feel daring
            #MSBWTGenCython.mergeMsbwts(args.inputBwtDirs, args.outBwtDir, 1, logger)
            logger.error('Merging more than two MSBWTs at once is not currently supported.')
        else:
            GenericMerge.mergeTwoMSBWTs(args.inputBwtDirs[0], args.inputBwtDirs[1], args.outBwtDir, numProcs, logger)
    elif args.subparserID == 'query':
        #this is the easiest thing we can do, don't dump the standard info, just do it
        msbwt = MultiStringBWT.loadBWT(args.inputBwtDir, logger=logger)
        #always print how many are found, users can parse it out if they want
        r = msbwt.findIndicesOfStr(args.kmer)
        print r[1]-r[0]
        #dump the seqs if request
        if args.dumpSeqs:
            for x in xrange(r[0], r[1]):
                dInd = msbwt.getSequenceDollarID(x)
                print msbwt.recoverString(dInd)[1:]+','+str(dInd)
    elif args.subparserID == 'massquery':
        logger.info('Input:\t'+str(args.inputBwtDir))
        logger.info('Queries:\t'+str(args.kmerFile))
        logger.info('Output:\t'+args.outputFile)
        logger.info('Rev-comp:\t'+str(args.reverseComplement))
        print
        msbwt = MultiStringBWT.loadBWT(args.inputBwtDir, logger=logger)
        output = open(args.outputFile, 'w+')
        output.write('k-mer,counts')
        if args.reverseComplement:
            output.write(',revCompCounts\n')
        else:
            output.write('\n')
        logger.info('Beginning queries...')
        for line in open(args.kmerFile, 'r'):
            kmer = line.strip('\n')
            c = msbwt.countOccurrencesOfSeq(kmer)
            if args.reverseComplement:
                rc = msbwt.countOccurrencesOfSeq(MultiStringBWT.reverseComplement(kmer))
                output.write(kmer+','+str(c)+','+str(rc)+'\n')
            else:
                output.write(kmer+','+str(c)+'\n')
        logger.info('Queries complete.')
    else:
        print args.subparserID+" is currently not implemented, please wait for a future release."
def __init__(self):
    """Load the BWT whose directory is given as the first CLI argument."""
    bwt_path = sys.argv[1]
    self.msbwt = MSBWT.loadBWT(bwt_path)
def extendSeed(self, seedKmer, endSeeds):
    '''
    Interactively build out a path graph from a seed k-mer, probably to be
    released in a future version of msbwt.  Nodes nearest the seed are
    explored first until the node budget ('numNodes') or the distance cap
    ('maxDistance') is reached; afterwards nodes may be annotated with
    read-source counts and read-pairing counts.
    @param seedKmer - the seed sequence to start construction from; must be
                      exactly 'kmerSize' long
    @param endSeeds - k-mers that terminate exploration; each becomes a
                      pre-terminated node (termCondition 'END_SEED_<i>')
    Configuration comes from self.settingsDict; self.nodes, self.edges,
    self.foundKmers and self.abtDict are mutated in place.
    NOTE: Python 2 code (dict.has_key).
    '''
    if self.foundKmers.has_key(seedKmer):
        return
    pathK = self.settingsDict.get('kmerSize', len(seedKmer))
    countK = self.settingsDict.get('countK', pathK)
    isMerged = self.settingsDict.get('isMerged', False)
    trackPairs = self.settingsDict.get('trackPairs', False)
    trackReads = self.settingsDict.get('trackReads', False)
    useMemmap = self.settingsDict.get('useMemmap', True)
    maxDistance = self.settingsDict.get('maxDistance', 0xFFFFFFFF)
    if len(seedKmer) != pathK:
        raise Exception('Seed k-mer incorrect length')
    numNodes = self.settingsDict['numNodes']
    validChars = ['$', 'A', 'C', 'G', 'N', 'T']
    if self.logger != None:
        self.logger.info('Loading '+self.bwtDir+'...')
    msbwt = MultiStringBWT.loadBWT(self.bwtDir, useMemmap, self.logger)
    if os.path.exists(self.bwtDir+'/origins.npy'):
        # Origin-file support was removed; fail loudly rather than silently
        # ignoring the file (the load below is intentionally unreachable).
        raise Exception("You haven\'t reimplemented the handling of origin files")
        origins = np.load(self.bwtDir+'/origins.npy', 'r')
    else:
        origins = None
    self.settingsDict['interleaveFN'] = self.bwtDir+'/inter0.npy'
    kmer = seedKmer
    # The seed becomes the first new node.
    firstID = len(self.nodes)
    self.nodes.append(PathNode(firstID, kmer, msbwt, 0, self.settingsDict))
    self.foundKmers[kmer] = firstID
    # Pre-register every end seed as an already-terminated node.
    for i, endSeed in enumerate(endSeeds):
        if len(endSeed) != pathK:
            raise Exception(endSeed+': NOT CORRECT LENGTH')
        else:
            endID = len(self.nodes)
            self.nodes.append(PathNode(endID, endSeed, msbwt, 0, self.settingsDict))
            self.nodes[endID].termCondition = 'END_SEED_'+str(i)
            self.foundKmers[endSeed] = endID
    if self.logger != None:
        self.logger.info('Beginning with seed \''+seedKmer+'\', pathK='+str(pathK)+', countK='+str(countK))
    unexploredPaths = [self.nodes[firstID]]
    #init the kmer dictionary
    execID = firstID
    # Explore closest-to-seed first until the frontier is exhausted.
    while len(unexploredPaths) > 0:
        #uncomment to make this smallest first
        unexploredPaths.sort(key = lambda node: node.minDistToSeed)
        #print 'UP: '+'['+','.join([str((node.minDistToSeed, node.nodeID)) for node in unexploredPaths])+']'
        nextNode = unexploredPaths.pop(0)
        if nextNode.nodeID >= numNodes:
            # Past the node budget: mark, but do not extend.
            nextNode.termCondition = 'UNEXPLORED_NODE'
        elif nextNode.minDistToSeed >= maxDistance:
            nextNode.termCondition = 'UNEXPLORED_DIST'
        else:
            nextNode.execOrder = execID
            execID += 1
            if self.logger != None:
                self.logger.info('Exploring new node')
            nextNode.firstTimeExtension(self.foundKmers, unexploredPaths, self.nodes, self.edges)
    if isMerged and trackReads:
        # Attribute each read to its originating input BWT via the interleave.
        interleaveFN = self.bwtDir+'/inter0.npy'
        interleave = np.load(interleaveFN, 'r')
        #we only need to do this for newly processed nodes
        for node in self.nodes[firstID:]:
            dIDs = node.readSet
            for dID in dIDs:
                sourceID = interleave[dID[0]]
                node.sourceCounts[sourceID] = node.sourceCounts.get(sourceID, 0)+1
    if trackPairs:
        # Count read-pair links between nodes via the abt table; a read's mate
        # has the opposite-parity fID and opposite direction.
        abtFN = self.bwtDir+'/abt.npy'
        abt = np.load(abtFN, 'r')
        #abtDict = {}
        #only need to process new nodes
        for node in self.nodes[firstID:]:
            dIDs = node.readSet
            for dID, direction in dIDs:
                (fID, rID) = abt[dID]
                if fID % 2 == 0:
                    oFID = fID+1
                else:
                    oFID = fID-1
                if self.abtDict.has_key((oFID, rID, 1-direction)):
                    otherNIDs = self.abtDict[(oFID, rID, 1-direction)][1]
                    for n in otherNIDs:
                        self.nodes[n].pairedNodes[node.nodeID] = self.nodes[n].pairedNodes.get(node.nodeID, 0)+1
                        node.pairedNodes[n] = node.pairedNodes.get(n, 0)+1
                if not self.abtDict.has_key((fID, rID, direction)):
                    self.abtDict[(fID, rID, direction)] = (dID, set([]))
                self.abtDict[(fID, rID, direction)][1].add(node.nodeID)
def mainRun(): ''' This is the primary function for external typical users to run when the Command Line Interface is used ''' #start up the logger initLogger() #attempt to parse the arguments p = ap.ArgumentParser(description=util.DESC, formatter_class=ap.RawTextHelpFormatter) #version data p.add_argument('-V', '--version', action='version', version='%(prog)s' + \ ' %s in MSBWT %s' % (util.VERSION, util.PKG_VERSION)) #TODO: do we want subparsers groups by type or sorted by name? it's type currently sp = p.add_subparsers(dest='subparserID') p2 = sp.add_parser('cffq', help='create a MSBWT from FASTQ files (pp + cfpp)') p2.add_argument('-p', metavar='numProcesses', dest='numProcesses', type=int, default=1, help='number of processes to run (default: 1)') p2.add_argument('-u', '--uniform', dest='areUniform', action='store_true', help='the input sequences have uniform length', default=False) p2.add_argument('-c', '--compressed', dest='buildCompressed', action='store_true', help='build the RLE BWT (faster, less disk I/O)', default=False) p2.add_argument('outBwtDir', type=util.newDirectory, help='the output MSBWT directory') p2.add_argument('inputFastqs', nargs='+', type=util.readableFastqFile, help='the input FASTQ files') p7 = sp.add_parser('pp', help='pre-process FASTQ files before BWT creation') p7.add_argument('-u', '--uniform', dest='areUniform', action='store_true', help='the input sequences have uniform length', default=False) p7.add_argument('outBwtDir', type=util.newDirectory, help='the output MSBWT directory') p7.add_argument('inputFastqs', nargs='+', type=util.readableFastqFile, help='the input FASTQ files') p3 = sp.add_parser( 'cfpp', help='create a MSBWT from pre-processed sequences and offsets') p3.add_argument('-p', metavar='numProcesses', dest='numProcesses', type=int, default=1, help='number of processes to run (default: 1)') p3.add_argument('-u', '--uniform', dest='areUniform', action='store_true', help='the input sequences have uniform length', 
default=False) p3.add_argument('-c', '--compressed', dest='buildCompressed', action='store_true', help='build the RLE BWT (faster, less disk I/O)', default=False) p3.add_argument('bwtDir', type=util.existingDirectory, help='the MSBWT directory to process') p4 = sp.add_parser('merge', help='merge many MSBWTs into a single MSBWT') p4.add_argument('-p', metavar='numProcesses', dest='numProcesses', type=int, default=1, help='number of processes to run (default: 1)') p4.add_argument('outBwtDir', type=util.newDirectory, help='the output MSBWT directory') p4.add_argument('inputBwtDirs', nargs='+', type=util.existingDirectory, help='input BWT directories to merge') p5 = sp.add_parser( 'query', help='search for a sequence in an MSBWT, prints sequence and seqID') p5.add_argument('inputBwtDir', type=util.existingDirectory, help='the BWT to query') p5.add_argument('kmer', type=util.validKmer, help='the input k-mer to search for') p5.add_argument( '-d', '--dump-seqs', dest='dumpSeqs', action='store_true', help='print all sequences with the given kmer (default=False)', default=False) p6 = sp.add_parser('massquery', help='search for many sequences in an MSBWT') p6.add_argument('inputBwtDir', type=util.existingDirectory, help='the BWT to query') p6.add_argument('kmerFile', help='a file with one k-mer per line') p6.add_argument('outputFile', help='output file with counts per line') p6.add_argument('-r', '--rev-comp', dest='reverseComplement', action='store_true', help='also search for each kmer\'s reverse complement', default=False) p8 = sp.add_parser('compress', help='compress a MSBWT from byte/base to RLE') p8.add_argument('-p', metavar='numProcesses', dest='numProcesses', type=int, default=1, help='number of processes to run (default: 1)') p8.add_argument('srcDir', type=util.existingDirectory, help='the source directory for the BWT to compress') p8.add_argument('dstDir', type=util.newDirectory, help='the destination directory') p9 = sp.add_parser('decompress', help='decompress a 
MSBWT from RLE to byte/base') p9.add_argument('-p', metavar='numProcesses', dest='numProcesses', type=int, default=1, help='number of processes to run (default: 1)') p9.add_argument('srcDir', type=util.existingDirectory, help='the source directory for the BWT to compress') p9.add_argument('dstDir', type=util.newDirectory, help='the destination directory') p10 = sp.add_parser('convert', help='convert from a raw text input to RLE') p10.add_argument('-i', metavar='inputTextFN', dest='inputTextFN', default=None, help='input text filename (default: stdin)') p10.add_argument('dstDir', type=util.newDirectory, help='the destination directory') args = p.parse_args() if args.subparserID == 'cffq': logger.info('Inputs:\t' + str(args.inputFastqs)) logger.info('Uniform:\t' + str(args.areUniform)) logger.info('Output:\t' + args.outBwtDir) logger.info('Output Compressed:\t' + str(args.buildCompressed)) logger.info('Processes:\t' + str(args.numProcesses)) if args.numProcesses > 1: logger.warning( 'Using multi-processing with slow disk accesses can lead to slower build times.' ) print if args.areUniform: #if they are uniform, use the method developed by Bauer et al., it's likely short Illumina seq if args.buildCompressed: MultiStringBWT.createMSBWTCompFromFastq( args.inputFastqs, args.outBwtDir, args.numProcesses, args.areUniform, logger) else: MultiStringBWT.createMSBWTFromFastq(args.inputFastqs, args.outBwtDir, args.numProcesses, args.areUniform, logger) else: #if they aren't uniform, use the merge method by Holt et al., it's likely longer PacBio seq if args.buildCompressed: logger.error( 'No compressed builder for non-uniform datasets, compress after creation.' 
) else: Multimerge.createMSBWTFromFastq(args.inputFastqs, args.outBwtDir, args.numProcesses, args.areUniform, logger) elif args.subparserID == 'pp': logger.info('Inputs:\t' + str(args.inputFastqs)) logger.info('Uniform:\t' + str(args.areUniform)) logger.info('Output:\t' + args.outBwtDir) if args.areUniform: #preprocess for Bauer et al. method MultiStringBWT.preprocessFastqs(args.inputFastqs, args.outBwtDir, args.areUniform, logger) else: #preprocess for Holt et al. method numProcs = 1 Multimerge.preprocessFastqs(args.inputFastqs, args.outBwtDir, numProcs, args.areUniform, logger) elif args.subparserID == 'cfpp': logger.info('BWT dir:\t' + args.bwtDir) logger.info('Uniform:\t' + str(args.areUniform)) logger.info('Output Compressed:\t' + str(args.buildCompressed)) logger.info('Processes:\t' + str(args.numProcesses)) if args.numProcesses > 1: logger.warning( 'Using multi-processing with slow disk accesses can lead to slower build times.' ) print seqFN = args.bwtDir + '/seqs.npy' offsetFN = args.bwtDir + '/offsets.npy' bwtFN = args.bwtDir + '/msbwt.npy' if args.areUniform: #process it using the column-wise Bauer et al. method if args.buildCompressed: MSBWTCompGenCython.createMsbwtFromSeqs(args.bwtDir, args.numProcesses, logger) else: MSBWTGenCython.createMsbwtFromSeqs(args.bwtDir, args.numProcesses, logger) else: #process it using the Holt et al. merge method if args.buildCompressed: logger.error( 'No compressed builder for non-uniform datasets, compress after creation.' ) else: Multimerge.interleaveLevelMerge(args.bwtDir, args.numProcesses, args.areUniform, logger) elif args.subparserID == 'compress': logger.info('Source Directory:' + args.srcDir) logger.info('Dest Directory:' + args.dstDir) logger.info('Processes:' + str(args.numProcesses)) if args.srcDir == args.dstDir: raise Exception( 'Source and destination directories cannot be the same directory.' 
) print MSBWTGen.compressBWT(args.srcDir + '/msbwt.npy', args.dstDir + '/comp_msbwt.npy', args.numProcesses, logger) elif args.subparserID == 'decompress': logger.info('Source Directory: ' + args.srcDir) logger.info('Dest Directory: ' + args.dstDir) logger.info('Processes: ' + str(args.numProcesses)) print MSBWTGen.decompressBWT(args.srcDir, args.dstDir, args.numProcesses, logger) #TODO: remove if srcdir and dstdir are the same? elif args.subparserID == 'merge': logger.info('Inputs:\t' + str(args.inputBwtDirs)) logger.info('Output:\t' + args.outBwtDir) logger.info('Processes:\t' + str(args.numProcesses)) if args.numProcesses > 1: logger.warning( 'Multi-processing is not supported at this time, but will be included in a future release.' ) numProcs = 1 #logger.warning('Using multi-processing with slow disk accesses can lead to slower build times.') print #MSBWTGen.mergeNewMSBWT(args.outBwtDir, args.inputBwtDirs, args.numProcesses, logger) if len(args.inputBwtDirs) > 2: #this is a deprecated method, it may still work if you feel daring #MSBWTGenCython.mergeMsbwts(args.inputBwtDirs, args.outBwtDir, 1, logger) logger.error( 'Merging more than two MSBWTs at once is not currently supported.' 
) else: GenericMerge.mergeTwoMSBWTs(args.inputBwtDirs[0], args.inputBwtDirs[1], args.outBwtDir, numProcs, logger) elif args.subparserID == 'query': #this is the easiest thing we can do, don't dump the standard info, just do it msbwt = MultiStringBWT.loadBWT(args.inputBwtDir, logger=logger) #always print how many are found, users can parse it out if they want r = msbwt.findIndicesOfStr(args.kmer) print r[1] - r[0] #dump the seqs if request if args.dumpSeqs: for x in xrange(r[0], r[1]): dInd = msbwt.getSequenceDollarID(x) print msbwt.recoverString(dInd)[1:] + ',' + str(dInd) elif args.subparserID == 'massquery': logger.info('Input:\t' + str(args.inputBwtDir)) logger.info('Queries:\t' + str(args.kmerFile)) logger.info('Output:\t' + args.outputFile) logger.info('Rev-comp:\t' + str(args.reverseComplement)) print msbwt = MultiStringBWT.loadBWT(args.inputBwtDir, logger=logger) output = open(args.outputFile, 'w+') output.write('k-mer,counts') if args.reverseComplement: output.write(',revCompCounts\n') else: output.write('\n') logger.info('Beginning queries...') for line in open(args.kmerFile, 'r'): kmer = line.strip('\n') c = msbwt.countOccurrencesOfSeq(kmer) if args.reverseComplement: rc = msbwt.countOccurrencesOfSeq( MultiStringBWT.reverseComplement(kmer)) output.write(kmer + ',' + str(c) + ',' + str(rc) + '\n') else: output.write(kmer + ',' + str(c) + '\n') logger.info('Queries complete.') elif args.subparserID == 'convert': if args.inputTextFN == None: logger.info('Input: stdin') else: logger.info('Input: ' + args.inputTextFN) logger.info('Output: ' + args.dstDir) logger.info('Beginning conversion...') CompressToRLE.compressInput(args.inputTextFN, args.dstDir) logger.info('Finished conversion.') else: print args.subparserID + " is currently not implemented, please wait for a future release."