Code example #1
def get_zygosity(sample, bwtfile, threshold, expected_length, ref_te):
    msbwt = MultiStringBWT.loadBWT(bwtfile, useMemmap=False)
    oc_length = 25
    ed_th = .2 * oc_length
    kmer_list = get_kmers(sample, ref_te)
    zygosity_data = []
    zygo_dict = {}
    other_context = {}
    for TEi_id, my_id, side, chromo, pos, strand, ref_te, context, TE, TSD in kmer_list:
        #if (TEi_id, ref_te) != (47,0):
        #    continue
        #print TEi_id,my_id,side,chromo,pos,strand,ref_te,context,TE,TSD
        new_context = MultiStringBWT.reverseComplement(
            context) if side == 'start' else context
        lo, hi = msbwt.findIndicesOfStr(new_context)
        context_List = set()
        context_List = growContext(msbwt, lo, hi, '', oc_length, context_List)
        zygo_dict[TEi_id] = zygo_dict.get(TEi_id, set([1]))

        # compare only the flanking oc_length bases of the TE sequence
        TE = TE[:oc_length] if side == 'start' else TE[-oc_length:]
        for oc in context_List:
            oc = MultiStringBWT.reverseComplement(
                oc) if side == 'start' else oc
            ed = lv.distance(oc, TE)

            if ed > ed_th:
                zygo_dict[TEi_id] = zygo_dict[TEi_id] | set([0])
                other_context[TEi_id] = oc
        zygosity_data.append([
            TEi_id, my_id, side, chromo, pos, strand, ref_te, context, TE, TSD
        ])

    individual_file = "IndividualAnalysis/%s_%s.csv" % (sample, ref_te)
    zygosity_data.sort(key=lambda x: (x[3], x[4]))
    header = [
        'TEi_id', 'my_id', 'side', 'chromo', 'pos', 'strand', 'ref_te',
        'context', 'TE', 'TSD', 'other_context', 'zygosity'
    ]
    with open(individual_file, 'wb') as fp:
        a = csv.writer(fp, delimiter=',')
        a.writerow(header)
        for d in zygosity_data:
            zygosity = 'heterozygous' if len(
                zygo_dict[d[0]]) == 2 else 'homozygous'
            d.append(other_context.get(d[0], ''))
            d.append(zygosity)
            a.writerow(d)
    print "Wrote file: %s [%d lines]" % (individual_file, len(zygosity_data))
Code example #2
def runQuery(**kwargs):
    pieces = kwargs["dataset"].split('-')
    directory = MSBWTdirs[int(pieces[0])] + '/' + '-'.join(pieces[1:])
    # load the MSBWT
    msbwt = MSBWT.loadBWT(directory)
    if kwargs['forward'] == "true":
        forwardResults = [
            msbwt.countOccurrencesOfSeq(str(kmer))
            for kmer in kwargs['kmerQueries']
        ]
    else:
        forwardResults = []
    if kwargs['revComp'] == "true":
        rcResults = [
            msbwt.countOccurrencesOfSeq(MSBWT.reverseComplement(str(kmer)))
            for kmer in kwargs['kmerQueries']
        ]
    else:
        rcResults = []
    return [forwardResults, rcResults]
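A minimal invocation sketch for runQuery, assuming MSBWTdirs is a module-level list of BWT root directories; the dataset string encodes an index into that list plus the BWT directory name, and the flag strings mirror the comparisons in the function.

# Hedged sketch -- dataset '0-sampleA' selects MSBWTdirs[0] + '/sampleA'.
forwardCounts, rcCounts = runQuery(dataset='0-sampleA',
                                   forward='true',
                                   revComp='true',
                                   kmerQueries=['ACGTACGTACGT'])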
Code example #3
File: util.py Project: andrewparkermorgan/snoop
    def _load_bwts(self, bwt_dirs):

        msbwt = {}
        for ff in bwt_dirs:
            if not io.readable_dir(ff):
                continue
            name = os.path.basename(ff.rstrip("/"))
            msbwt.update({name: ms.loadBWT(ff)})

        if len(msbwt):
            return msbwt
        else:
            return None
Code example #4
def checkAlive():
    names = []
    for filename in os.listdir(app.config['BWT_ROOT']):
        try:
            bwt = MSBWT.loadBWT(app.config['BWT_ROOT'] + filename)
            if bwt.countOccurrencesOfSeq('T'.encode('utf-8', 'ignore')) > 0:
                names.append(filename.decode('utf-8'))
            else:
                continue
        except Exception as e:
            print(e)
            continue
    data = {"names": names}
    return Response(json.dumps(data), status=200)
Code example #5
def find_Kmer(Kmer):
    outf = open(outdir + 'Tumor_kmers_{}.txt'.format(Kmer), 'w')

    msbwt_tumor = MSBWT.loadBWT(args.tumor_bwt)
    msbwt_normal = MSBWT.loadBWT(args.normal_bwt)

    logging.info("finished loading BWTs")

    tLow, tHigh = msbwt_tumor.findIndicesOfStr(Kmer)
    nLow, nHigh = msbwt_normal.findIndicesOfStr(Kmer)

    def Kmer_count(tLow, tHigh, nLow, nHigh, Kmer, outf):
        tLow, tHigh = msbwt_tumor.findIndicesOfStr(Kmer[0],(tLow, tHigh))
        nLow, nHigh = msbwt_normal.findIndicesOfStr(Kmer[0],(nLow, nHigh))

        tumor_count = tHigh - tLow 
        normal_count = nHigh - nLow 

        if tumor_count > tumor_threshold and normal_count < normal_threshold:
            outf.write(Kmer + '\t' + str(tumor_count) + '\t' + str(normal_count) + '\t' + Kmer + '\n')
            return

        elif tumor_count <= tumor_threshold or len(Kmer) == read_length:
            return

        for nucleotide in nucleotide_list:
            Kmer = nucleotide + Kmer
            Kmer_count(tLow, tHigh, nLow, nHigh, Kmer, outf)
            Kmer = Kmer[1:]

    for nucleotide in nucleotide_list:
        Kmer = nucleotide + Kmer
        Kmer_count(tLow, tHigh, nLow, nHigh, Kmer, outf)
        Kmer = Kmer[1:]

    outf.close()
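find_Kmer leans on module-level state (args, outdir, nucleotide_list, read_length, and the two count thresholds). A hedged sketch of that setup follows; the names come from the snippet above, but every value is a placeholder.

# Hedged setup sketch -- names are from the snippet, values are placeholders.
nucleotide_list = ['A', 'C', 'G', 'T']
read_length = 100        # extension stops once a k-mer reaches read length
tumor_threshold = 5      # minimum tumor count needed to keep extending
normal_threshold = 3     # maximum normal count allowed for a reported k-mer
outdir = './kmer_output/'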
Code example #6
File: util.py Project: andrewparkermorgan/snoop
def load_bwts(bwt_dirs):

    msbwt = []
    for ff in bwt_dirs:
        if not io.readable_dir(ff):
            continue
        try:
            msbwt.append(ms.loadBWT(ff))
        except Exception as e:
            sys.stderr.write("Couldn't load BWT at <{}>\n".format(ff))
            sys.stderr.write(str(e))

    if len(msbwt):
        return msbwt
    else:
        return None
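A hedged usage sketch for load_bwts: count one k-mer in every BWT that loaded successfully. The directory paths and the k-mer are placeholders; countOccurrencesOfSeq is the same query method used throughout these examples.

# Hedged sketch -- paths and k-mer are placeholders.
bwts = load_bwts(['/data/bwt/sampleA', '/data/bwt/sampleB'])
if bwts is not None:
    for bwt in bwts:
        print(bwt.countOccurrencesOfSeq('ACGTACGTACGT'))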
Code example #7
def loadBWT(name, forceLocal=False):
    logIt("Loading %s...\n" % name)

    if not forceLocal:
        try:
            logIt("Trying remote source...\n")
            remoteSource = findRemote(name)
            return CloudBwt(name, remoteSource)
        except Exception as e:
            logIt(" Failed\n" + e.message)
            pass
    try:
        localSource = findLocal(name)
        return MSBWT.loadBWT(localSource)
    except Exception as e:
        return None
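A hedged sketch of calling the remote-first loader above; the dataset name is a placeholder. A None return means neither a remote nor a local copy could be loaded.

# Hedged sketch -- 'sampleA' is a placeholder dataset name.
bwt = loadBWT('sampleA')
if bwt is None:
    raise RuntimeError('no remote or local BWT found for sampleA')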
Code example #8
    def functionCaller(name, func_call):

        if DEBUG:
            print("Serving {}".format(name))

        bwt = MSBWT.loadBWT(app.config['BWT_ROOT'] +
                            name.encode('utf-8', 'ignore') +
                            '/'.encode('utf-8', 'ignore'))

        args = ast.literal_eval(request.args.get('args', None))
        kwargs = request.args.get('kwargs', None)
        async_flag = request.args.get('async', None)

        if args is None:
            return Response(status=400)
        if kwargs is not None:
            kwargs = ast.literal_eval(kwargs)
        else:
            kwargs = {}

        #Legacy Compatibility, disable non-blocking functionality
        if async_flag is None or async_flag.lower() == 'false':
            ar = [func_call, args, kwargs, bwt]
            r = executor.submit(_runLegacy, *ar)
            return Response(json.dumps({'result': r.result()}), status=200)

        tok = getToken()
        st = 405
        try:
            results_lst[tok] = {}
            ar = [func_call, args, kwargs, bwt, tok]
            executor.submit(_run, *ar)
            results_lst[tok]['func'] = func_call
            results_lst[tok]['args'] = args
            results_lst[tok]['kwargs'] = kwargs
            st = 200
        except Exception:
            st = 405
        summary = {
            'data': app.config['DATA'],
            'name': name,
            'token': tok,
            'function': func_call,
            'args': args,
            'kwargs': kwargs
        }
        return Response(json.dumps(summary), status=st)
Code example #9
def api_search():
    sequence = str(request.values['sequence'])
    sample = str(request.values['sample'])
    count = 0

    print('submit called with: {} ... {}'.format(sequence, sample))

    bwt_dir = os.environ.get('BWT_DIR', None)
    sample_dir = os.path.join(bwt_dir, sample)
    print('sample_dir=', sample_dir)

    msbwt = MultiStringBWT.loadBWT(sample_dir)

    print('msbwt=', msbwt)
    count = msbwt.countOccurrencesOfSeq(sequence)

    return jsonify({
        'sequence': str(sequence),
        'sample': str(sample),
        'count': count,
    })
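A hedged client-side sketch for the endpoint above, assuming it is routed at /api/search on a local Flask server; the host, port, and route are assumptions, since the route decorator is not shown.

# Hedged client sketch -- host, port, and route are assumptions.
import requests

r = requests.post('http://localhost:5000/api/search',
                  data={'sequence': 'ACGTACGTACGT', 'sample': 'sampleA'})
print(r.json()['count'])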
Code example #10
File: TranscriptBuilder.py Project: txje/msbwt
    def firstTimeExtension(self, foundKmers, unexploredPaths, nodes, edges):
        '''
        @param foundKmers - each k-mer we find is checked against this dict and added if absent
        @param unexploredPaths - newly discovered branches are queued here; merge targets are added as well
        @param nodes - the list of nodes, appended to when a new node is created
        @param edges - the list of edges, appended to when two nodes are linked
        '''
        pc = ''
        kmer = self.seq
        terminate = False
        while not terminate:
            if len(kmer) != self.pathK:
                print('ERROR: DIFFERENT SIZED K-MER ' + str(len(kmer)))
                raise Exception('ERROR')

            #First, perform all the counts of paths going both forwards and backwards
            counts = {}
            revCounts = {}

            #maxV - the count of the (k+1)-mer with maxC on it, total is the total counts of valid chars
            maxV = 0
            maxC = ''
            total = 0

            #count the number of forward and reversed paths
            numPaths = 0
            numRevPaths = 0

            for c in self.validChars:
                counts[c] = self.msbwt.countOccurrencesOfSeq(
                    kmer + c) + self.msbwt.countOccurrencesOfSeq(
                        MultiStringBWT.reverseComplement(kmer + c))
                revCounts[c] = self.msbwt.countOccurrencesOfSeq(
                    c + kmer) + self.msbwt.countOccurrencesOfSeq(
                        MultiStringBWT.reverseComplement(c + kmer))

                if self.drawDollarTerminals or c != '$':
                    total += counts[c]
                    if counts[c] > maxV:
                        maxV = counts[c]
                        maxC = c

                    if counts[c] >= self.pathThreshold:
                        numPaths += 1

                    #if we have evidence from the counts OR if the previous character was known to be that character
                    if revCounts[c] >= self.pathThreshold or c == pc:
                        numRevPaths += 1

            #check if we have incoming edges, in which case we need to end this block
            if numRevPaths > 1 and kmer != self.seq:

                #remove the last kmer, because it's actually in the new node we merge into
                self.seq = self.seq[0:-1]

                #this will lead to repeating the same counts later, but that's okay
                newID = len(nodes)
                newHistMers = set([])
                nodes.append(
                    PathNode(newID, kmer, self.msbwt,
                             self.minDistToSeed + len(self.pileups),
                             self.settingsDict))
                edges.append(
                    PathEdge(len(edges), self.nodeID, newID, revCounts[pc],
                             pc + ', ' + str(revCounts)))
                self.termCondition = 'MERGE_' + str(newID)
                foundKmers[kmer] = newID

                unexploredPaths.append(nodes[newID])

                #print 'Ending block for merge'
                terminate = True

            elif total == 0:
                #print 'No strings found.'
                self.termCondition = 'TERMINAL'
                terminate = True
            else:
                #the kmer was found in this block and it may have multiple extensions
                foundKmers[kmer] = self.nodeID
                revMer = MultiStringBWT.reverseComplement(kmer)
                if foundKmers.has_key(revMer):
                    otherID = foundKmers[revMer]
                    self.inversionSet.add(otherID)
                    nodes[otherID].inversionSet.add(self.nodeID)

                r1 = self.msbwt.findIndicesOfStr(kmer[-self.countK:])
                r2 = self.msbwt.findIndicesOfStr(
                    MultiStringBWT.reverseComplement(kmer[-self.countK:]))
                kmerCount = (r1[1] - r1[0]) + (r2[1] - r2[0])
                self.pileups.append(kmerCount)
                perc = float(maxV) / total

                if self.trackReads == True:
                    for i in range(r1[0], r1[1]):
                        self.readSet.add(
                            (int(self.msbwt.getSequenceDollarID(i)), 0))
                    for i in range(r2[0], r2[1]):
                        self.readSet.add(
                            (int(self.msbwt.getSequenceDollarID(i)), 1))

                #if kmerCount > self.overloadThreshold:
                if self.pileups[0] > self.overloadThreshold:
                    #this path is too heavy, we probably won't figure out what's going on downstream
                    self.termCondition = 'OVERLOAD'
                    terminate = True

                elif numPaths > 1:
                    self.termCondition = 'SPLIT'
                    for c in self.validChars:
                        if counts[c] >= self.pathThreshold:
                            newKmer = kmer[1:] + c
                            if foundKmers.has_key(newKmer):
                                otherNID = foundKmers[newKmer]
                                nodes[otherNID].minDistToSeed = min(
                                    nodes[otherNID].minDistToSeed,
                                    self.minDistToSeed + len(self.pileups))
                                edges.append(
                                    PathEdge(len(edges), self.nodeID, otherNID,
                                             counts[c],
                                             c + ': ' + str(counts[c])))

                            else:
                                if self.drawDollarTerminals or c != '$':
                                    newID = len(nodes)
                                    newHistMers = set([])
                                    nodes.append(
                                        PathNode(
                                            newID, newKmer, self.msbwt,
                                            self.minDistToSeed +
                                            len(self.pileups),
                                            self.settingsDict))
                                    edges.append(
                                        PathEdge(len(edges), self.nodeID,
                                                 newID, counts[c],
                                                 c + ': ' + str(counts[c])))
                                    foundKmers[newKmer] = newID

                                    if c != '$':
                                        unexploredPaths.append(nodes[newID])
                                    else:
                                        nodes[newID].termCondition = '$ Ext'

                    terminate = True
                else:
                    #this is data pertaining to this k-mer
                    #print ':\t'+kmer+maxC+'\t'+str(perc)+'\t'+str(maxV)+'/'+str(total)+'\t'+str(total-maxV)+'\t'
                    pc = kmer[0]
                    kmer = kmer[1:] + maxC
                    #check if we've found the new k-mer before
                    if foundKmers.has_key(kmer):
                        otherNID = foundKmers[kmer]
                        nodes[otherNID].minDistToSeed = min(
                            nodes[otherNID].minDistToSeed,
                            self.minDistToSeed + len(self.pileups))
                        if counts[maxC] >= self.pathThreshold:
                            edges.append(
                                PathEdge(len(edges), self.nodeID, otherNID,
                                         counts[maxC],
                                         pc + ': ' + str(counts[maxC])))
                            self.termCondition = 'MERGE_' + str(otherNID)
                        else:
                            edges.append(
                                PathEdge(len(edges), self.nodeID, otherNID,
                                         counts[maxC],
                                         pc + ': ' + str(counts[maxC]),
                                         'dashed'))
                            self.termCondition = 'MERGE_' + str(
                                otherNID) + ', THRESHOLD'

                        terminate = True
                    else:
                        self.seq += maxC
                        if maxC == '$':
                            self.termCondition = '$ Max'
                            terminate = True
Code example #11
def build_bridge(msbwt,
                 seedKmer,
                 targetKmer,
                 tMin=1,
                 branchLim=10,
                 maxBranchLen=250):
    """
	Assemble the short "bridge" between two sequences expected to occur nearby on the template.
	@param msbwt - the MSBWT to use for searches
	@param seedKmer - a k-mer to seed our bridging
	@param targetKmer - the target we are trying to bridge to
	@param tMin - the minimum k-count needed to consider the path
	@param branchLim - the maximum number of branches we will test
	@param maxBranchLen - the maximum length of a branch before giving up
	@return (ret, numBranched)
		ret - a list of bridges discovered; for most cases this is a list of length one
		numBranched - the number of branches we explored; if numBranched >= branchLim, this function was not 100% exhaustive
	"""

    #initialize to our input kmer
    ret = []
    possBridges = [dna.unmask(dna.ungap(seedKmer))]
    targetKmer = dna.unmask(dna.ungap(targetKmer))
    kmerLen = len(seedKmer)

    #set up some easy values
    validChars = "ACGT"
    counts = np.zeros(dtype='<u8', shape=(len(validChars), ))
    numBranched = 0

    #print (seedKmer, targetKmer)

    #while we have things to explore, and we haven't explored too many, and we don't have a ridiculous number of possibilities
    while len(possBridges) > 0 and numBranched < branchLim:
        #get the bridge, the kmer, and the reverse kmer
        currBridge = possBridges.pop()
        numBranched += 1

        currKmer = currBridge[len(currBridge) - kmerLen:]
        revKmer = MultiStringBWT.reverseComplement(currKmer)

        #try to extend it on out
        while len(currBridge) < maxBranchLen:
            #get the counts for each possible extension
            for i, c in enumerate(validChars):
                counts[i] = msbwt.countOccurrencesOfSeq(
                    currKmer +
                    c) + msbwt.countOccurrencesOfSeq(dna.revcomp(c) + revKmer)

            #get the highest one
            maxPos = np.argmax(counts)
            maxSym = validChars[maxPos]

            #make sure the highest is high enough for us to consider it
            if counts[maxPos] >= tMin:
                if len(possBridges) < branchLim:
                    #go through all the other possible extensions
                    for i, c in enumerate(validChars):
                        if i != maxPos and counts[i] >= tMin:
                            #add the ones we aren't exploring right now if they're high enough
                            possBridges.append(currBridge + c)

                #make sure the highest isn't too high
                #this extension meets our requirement so shift over to loop back around
                currBridge += maxSym
                currKmer = currKmer[1:] + maxSym
                revKmer = dna.revcomp(maxSym) + revKmer[0:len(revKmer) - 1]
            else:
                #our BEST doesn't pass the threshold on this path, stop following
                break

            if currKmer.startswith(targetKmer):
                ret.append(currBridge)

    #return all our possibilities
    return (ret, numBranched)
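A hedged usage sketch for build_bridge; the BWT directory and the two anchor k-mers are placeholders. Comparing numBranched against branchLim tells you whether the search was exhaustive, per the docstring.

# Hedged sketch -- BWT path and k-mers are placeholders.
msbwt = MultiStringBWT.loadBWT('/data/bwt/sampleA')
bridges, numBranched = build_bridge(msbwt,
                                    seedKmer='ACGTACGTAC',
                                    targetKmer='TTGCATTGCA',
                                    tMin=2,
                                    branchLim=10,
                                    maxBranchLen=250)
if numBranched >= 10:
    print('branch limit hit: the search was not exhaustive')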
Code example #12
File: TranscriptBuilder.py Project: txje/msbwt
    def extendSeed(self, seedKmer, endSeeds):
        '''
        This function is intended to be an interactive technique for constructing transcripts, probably to be released
        in a future version of msbwt
        @param seedKmer - the seed sequence to use for construction
        @param endSeeds - a list of k-mers at which exploration should terminate (each becomes an END_SEED node)
        '''

        if self.foundKmers.has_key(seedKmer):
            return

        pathK = self.settingsDict.get('kmerSize', len(seedKmer))
        countK = self.settingsDict.get('countK', pathK)
        isMerged = self.settingsDict.get('isMerged', False)
        trackPairs = self.settingsDict.get('trackPairs', False)
        trackReads = self.settingsDict.get('trackReads', False)
        useMemmap = self.settingsDict.get('useMemmap', True)
        maxDistance = self.settingsDict.get('maxDistance', 0xFFFFFFFF)

        if len(seedKmer) != pathK:
            raise Exception('Seed k-mer incorrect length')

        numNodes = self.settingsDict['numNodes']
        validChars = ['$', 'A', 'C', 'G', 'N', 'T']

        if self.logger != None:
            self.logger.info('Loading ' + self.bwtDir + '...')
        msbwt = MultiStringBWT.loadBWT(self.bwtDir, useMemmap, self.logger)
        if os.path.exists(self.bwtDir + '/origins.npy'):
            raise Exception(
                "You haven't reimplemented the handling of origin files")
            # unreachable until origin-file handling is reimplemented:
            origins = np.load(self.bwtDir + '/origins.npy', 'r')
        else:
            origins = None

        self.settingsDict['interleaveFN'] = self.bwtDir + '/inter0.npy'

        kmer = seedKmer

        firstID = len(self.nodes)
        self.nodes.append(PathNode(firstID, kmer, msbwt, 0, self.settingsDict))
        self.foundKmers[kmer] = firstID

        for i, endSeed in enumerate(endSeeds):
            if len(endSeed) != pathK:
                raise Exception(endSeed + ': NOT CORRECT LENGTH')
            else:
                endID = len(self.nodes)
                self.nodes.append(
                    PathNode(endID, endSeed, msbwt, 0, self.settingsDict))
                self.nodes[endID].termCondition = 'END_SEED_' + str(i)
                self.foundKmers[endSeed] = endID

        if self.logger != None:
            self.logger.info('Beginning with seed \'' + seedKmer +
                             '\', pathK=' + str(pathK) + ', countK=' +
                             str(countK))

        unexploredPaths = [self.nodes[firstID]]

        #init the kmer dictionary
        execID = firstID

        while len(unexploredPaths) > 0:
            #uncomment to make this smallest first
            unexploredPaths.sort(key=lambda node: node.minDistToSeed)
            #print 'UP: '+'['+','.join([str((node.minDistToSeed, node.nodeID)) for node in unexploredPaths])+']'
            nextNode = unexploredPaths.pop(0)

            if nextNode.nodeID >= numNodes:
                nextNode.termCondition = 'UNEXPLORED_NODE'
            elif nextNode.minDistToSeed >= maxDistance:
                nextNode.termCondition = 'UNEXPLORED_DIST'
            else:
                nextNode.execOrder = execID
                execID += 1

                if self.logger != None:
                    self.logger.info('Exploring new node')
                nextNode.firstTimeExtension(self.foundKmers, unexploredPaths,
                                            self.nodes, self.edges)

        if isMerged and trackReads:
            interleaveFN = self.bwtDir + '/inter0.npy'
            interleave = np.load(interleaveFN, 'r')

            #we only need to do this for newly processed nodes
            for node in self.nodes[firstID:]:
                dIDs = node.readSet
                for dID in dIDs:
                    sourceID = interleave[dID[0]]
                    node.sourceCounts[sourceID] = node.sourceCounts.get(
                        sourceID, 0) + 1

        if trackPairs:
            abtFN = self.bwtDir + '/abt.npy'
            abt = np.load(abtFN, 'r')

            #abtDict = {}

            #only need to process new nodes
            for node in self.nodes[firstID:]:
                dIDs = node.readSet

                for dID, direction in dIDs:
                    (fID, rID) = abt[dID]
                    if fID % 2 == 0:
                        oFID = fID + 1
                    else:
                        oFID = fID - 1

                    if self.abtDict.has_key((oFID, rID, 1 - direction)):
                        otherNIDs = self.abtDict[(oFID, rID, 1 - direction)][1]
                        for n in otherNIDs:
                            self.nodes[n].pairedNodes[
                                node.nodeID] = self.nodes[n].pairedNodes.get(
                                    node.nodeID, 0) + 1
                            node.pairedNodes[n] = node.pairedNodes.get(n,
                                                                       0) + 1

                    if not self.abtDict.has_key((fID, rID, direction)):
                        self.abtDict[(fID, rID, direction)] = (dID, set([]))
                    self.abtDict[(fID, rID, direction)][1].add(node.nodeID)
Code example #13
def SearchResponse(form):
	panel = markup.page()

	panel.script(type="text/javascript")
	panel.add("""
		function getSelectedText() {
			var hidden, submit;
			var selectedText=(window.getSelection ? window.getSelection() : document.getSelection ? document.getSelection() : document.selection.createRange().text);
			if (selectedText == "") {
				alert("You must select a subsequence");
				return false;
			} else {
				document.forms["SearchSelected"]["pattern"].value = selectedText;
			}
		}
	""")
	panel.script.close()

	panel.div(style="padding:50px 50px;")
	datasets = form.getvalue("dataset")
	if (datasets == None):
		panel.h3("ERROR: No datasets selected.")
		panel.div(align="center", style="padding: 30px 30px;")
		panel.input(type="button", value="New Search", onClick='self.location="./?run=msAllele"')
		panel.div.close()
		panel.div.close()
		return panel
	
	if isinstance(datasets, str):
		datasets = [datasets]

	pattern = form.getvalue("pattern")
	if (pattern == None):
		panel.h3("ERROR: No search pattern specified")
		panel.div(align="center", style="padding: 30px 30px;")
		panel.input(type="button", value="New Search", onClick='self.location="./?run=msAllele"')
		panel.div.close()
		panel.div.close()
		return panel
	pattern = pattern.upper()

	for dataset in datasets:
		panel.h3(dataset)
		bwtDirName = "%s/%s" % (MSBWTdir, dataset)
		filestat = os.stat(bwtDirName+"/comp_msbwt.npy")
		filesize = locale.format("%d", filestat.st_size, grouping=True)
		bwt = MultiStringBWT.loadBWT(bwtDirName)
		stringCount = locale.format("%d", bwt.getSymbolCount(0), grouping=True)
		baseCount = locale.format("%d", bwt.getTotalSize(), grouping=True)
		bitsPerBase = (8.0*filestat.st_size)/bwt.getTotalSize()
		panel.strong("%s: %s strings with %s bases and index size of %s bytes (%3.2f bits per base)<br />" % (dataset, stringCount, baseCount, filesize, bitsPerBase))
		panel.strong("Target: %s<br />" % (pattern))
		
		lo1, hi1 = bwt.findIndicesOfStr(pattern)
		lo2, hi2 = bwt.findIndicesOfStr(revComp(pattern))
		count = hi1 - lo1 + hi2 - lo2
		if (count > 10000):
			panel.add("Found %d times (%d forward, %d reverse-complemented)<br /><br />" % (count, hi1-lo1, hi2-lo2))
			panel.span("Too much data!", style="font-size: 180%;")
		elif count > 0:
			
			'''
			l = len(pattern)
			bufferLen = 101
			fixedSize = 2*bufferLen-l
			readlist = []
			
			for i in xrange(lo1, hi1):
				#pass
				suffix = bwt.recoverString(i)
				suffLen = len(suffix)
				end = suffix.find('$')
				beforePattern = suffLen-end-1
				read = ('.'*(bufferLen-l-beforePattern)+
						suffix[end+1:].lower()+
						suffix[:l]+
						suffix[l:end+1].lower())
				read += '.'*(fixedSize-len(read))
				readlist.append(read)
			
			for i in xrange(lo2, hi2):
				suffix = revComp(bwt.recoverString(i))
				suffLen = len(suffix)
				end = suffix.find('$')
				beforePattern = suffLen-end-l
				read = ('.'*(bufferLen-l-beforePattern)+
						suffix[end:-l].lower()+
						suffix[-l:]+
						suffix[:end].lower())
				read += '.'*(fixedSize-len(read))
				readlist.append(read)
			'''
			panel.add("Found %d times (%d forward, %d reverse-complemented)<br /><br />" % (count, hi1-lo1, hi2-lo2))
			panel.div(style="font-size:10px; font-family: monospace;")
			#margin = len(suffix)-l
			l = len(pattern)
			margin = 101-l

			haps = extractHaplotypes(bwt, pattern)
			if len(haps) > 0:
				consensusMain = (sorted(haps, key=lambda x: x[2][0][1], reverse=True))[0][0]

			panel.table(border='1')
			panel.tr()
			panel.th('Consensus')
			panel.th('Exact matches')
			panel.tr.close()
			extrasList = []
			for consensus, readlist, counts in sorted(haps, key=lambda x: x[2][0][1], reverse=True):
				#panel.strong('%s<span style="color: green;">%s</span>%s<br />' % (consensus[:margin].upper(), consensus[margin:margin+l].upper(), consensus[margin+l:].upper()))
				if counts[0][1] > 0:
					panel.tr()
					panel.td()
					panel.strong()
					output = ""
					for i, base in enumerate(consensus):
						if i == margin:
							output += '<span style="color: green;">'
						elif i == margin+l:
							output += '</span>'

						if(base != '$') and (base != '.') and (consensus[i] != '.') and (base.upper() != consensusMain[i].upper()):
							output += '<span style="background-color:yellow;">%s</span>' % base.upper()
						else:
							output += base.upper()
					panel.add(output)
					panel.strong.close()
					panel.td.close()
					panel.td(str(counts[0][1]))
					panel.tr.close()
				
				for read in readlist[counts[0][1]:]:
					extrasList.append(read)
			
			if len(extrasList) > 0:
				consensus, dummyVar = conSeq(extrasList)
				panel.tr()
				panel.td()
				panel.strong()
				output = ""
				for i, base in enumerate(consensus):
					if i == margin:
						output += '<span style="color: green;">'
					elif i == margin+l:
						output += '</span>'
					
					if(base != '$') and (base != '.') and (consensus[i] != '.') and (base.upper() != consensusMain[i].upper()):
						output += '<span style="background-color:yellow;">%s</span>' % base.upper()
					else:
						output += base.upper()
				panel.add(output)
				panel.strong.close()
				panel.td.close()
				panel.td('0')
				panel.tr.close()
			panel.table.close()
			
			for consensus, readlist, counts in sorted(haps, key=lambda x: x[2][0][1], reverse=True):
				#consensus = conSeq(readlist)
				#panel.add(consensus)
				#readlist.sort(cmp=readCmp)
				if counts[0][1] == 0:
					continue
				
				read = "."*margin + "*"*l + '.'*margin
				panel.add(read)
				panel.br()
				for read in readlist[0:counts[0][1]]:
					color = "red" if (read.find('$') > read.find(pattern)) else "blue"
					output = ""
					for i, base in enumerate(read):
						if (i == margin):
							output += '<span style="color: %s;">' % color
						elif (i == margin+l):
							output += '</span>'
						if (base != '$') and (base != '.') and (consensus[i] != '.') and (base.upper() != consensus[i].upper()):
							output += '<span style="background-color:yellow;">%s</span>' % base
						else:
							output += base
					output += '<br />'
					panel.add(output)
				panel.strong('%s<span style="color: green;">%s</span>%s<br />' % (consensus[:margin], consensus[margin:margin+l], consensus[margin+l:]))
				panel.br()
				panel.br()
		
			if len(extrasList) > 0:
				consensus, dummyVar = conSeq(extrasList)
				panel.add(consensus)
				extrasList.sort(cmp=readCmp)
				read = "."*margin + "*"*l + '.'*margin
				panel.add(read)
				panel.br()
				for read in extrasList:
					color = "red" if (read.find('$') > read.find(pattern)) else "blue"
					output = ""
					for i, base in enumerate(read):
						if (i == margin):
							output += '<span style="color: %s;">' % color
						elif (i == margin+l):
							output += '</span>'
						if (base != '$') and (base != '.') and (consensus[i] != '.') and (base.upper() != consensus[i].upper()):
							output += '<span style="background-color:yellow;">%s</span>' % base
						else:
							output += base
					output += '<br />'
					panel.add(output)
				panel.strong('%s<span style="color: green;">%s</span>%s<br />' % (consensus[:margin], consensus[margin:margin+l], consensus[margin+l:]))
				panel.br()
		
			panel.div.close()
		else:
			panel.add("Pattern not found<br /><br />")
	panel.form(action="", name="SearchSelected", method="POST", enctype="multipart/form-data", onsubmit='return getSelectedText()')
	panel.div(align="center", style="padding: 30px 30px;")
	panel.input(type="submit", name="submit", value="Search Selected")
	panel.input(type="button", value="New Search", onClick='self.location="./?run=msAllele"')
	for dataset in datasets:
		panel.input(type="hidden", name="dataset", value=dataset)
	panel.input(type="hidden", name="pattern", value=pattern)
	panel.input(type="hidden", name="target", value="msAllele.Search")
	panel.div.close()
	panel.form.close()
	panel.div.close()
	return panel
Code example #14
def extractHaplotypes(bwt, kmer):
	forwardIndices = bwt.findIndicesOfStr(kmer)
	revComp = MultiStringBWT.reverseComplement(kmer)
	reverseIndices = bwt.findIndicesOfStr(revComp)
	
	readLen = 101
	patternLen = len(kmer)
	totalBuffLen = 2*readLen-patternLen
	
	modifiedSeqs = []
	for i in xrange(forwardIndices[0], forwardIndices[1]):
		readSeq = bwt.recoverString(i)
		dollarPos = readSeq.find('$')
		
		#calculate how many trailing '.' we need first, then construct the string from that info
		afterPattern = readLen-dollarPos-1
		modSeq = ('.'*(readLen-patternLen-afterPattern)+
				  readSeq[dollarPos+1:].lower()+
				  readSeq[0:patternLen]+
				  readSeq[patternLen:dollarPos+1].lower()+
				  '.'*(afterPattern))
		modifiedSeqs.append(modSeq)
	
	for i in xrange(reverseIndices[0], reverseIndices[1]):
		revCompSeq = bwt.recoverString(i)
		readSeq = MultiStringBWT.reverseComplement(revCompSeq)
		dollarPos = readSeq.find('$')
		
		#beforePattern = readLen-dollarPos
		afterPattern = readLen-dollarPos-patternLen
		modSeq = ('.'*(readLen-patternLen-afterPattern)+
				  readSeq[dollarPos:-patternLen].lower()+
				  readSeq[-patternLen:]+
				  readSeq[0:dollarPos].lower()+
				  '.'*(afterPattern))
		modifiedSeqs.append(modSeq)

	finishedHaps = []

	previousConsensus = 'A'*totalBuffLen
	currentConsensus, currentScorer = conSeq(modifiedSeqs)
	currSeqs = modifiedSeqs

	while len(currSeqs) > 0 and compareShiftedSeqs(previousConsensus, currentConsensus) > 0:
		nextSeqs = []
		consensusSeqs = []
		
		#we will fill in consensus Seqs downstream
		finishedHaps.append((currentConsensus, consensusSeqs, []))
		
		#first get all exact matching reads
		for seq in currSeqs:
			if compareShiftedSeqs(seq, currentConsensus) == 0:
				consensusSeqs.append(seq)
			else:
				nextSeqs.append(seq)
		
		finishedHaps[-1][2].append((0, len(consensusSeqs)))
		
		#update these things
		previousConsensus = currentConsensus
		currSeqs = nextSeqs
		currentConsensus, currentScorer = conSeq(currSeqs)
		
		#check if the next consensus is identical
		acceptedScore = 1
		while len(currSeqs) > 0 and compareShiftedSeqs(currentConsensus, previousConsensus) == 0:
			#print 'triggered', acceptedScore
			nextNextSeqs = []
			minScore = 0xFFFFFFFFFFFFFFFF
			for seq in nextSeqs:
				calcScore = scoreShiftedSeqs(seq, currentConsensus, currentScorer)
				if calcScore < minScore and calcScore > acceptedScore:
					minScore = calcScore
				
				if calcScore <= acceptedScore:
					consensusSeqs.append(seq)
				else:
					nextNextSeqs.append(seq)
			finishedHaps[-1][2].append((acceptedScore, len(nextSeqs)-len(nextNextSeqs)))
			
			nextSeqs = nextNextSeqs
			currSeqs = nextSeqs
			currentConsensus, currentScorer = conSeq(currSeqs)
			
			#acceptedScore += 1
			acceptedScore = minScore

	for seq in currSeqs:
		consensusSeqs.append(seq)

	return finishedHaps
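A hedged sketch of consuming extractHaplotypes output. The SearchResponse example above sorts the returned triples by the exact-match count counts[0][1]; this does the same and prints one line per haplotype.

# Hedged sketch -- bwt comes from MultiStringBWT.loadBWT(...), the k-mer is a placeholder.
haps = extractHaplotypes(bwt, 'ACGTACGTACGT')
for consensus, readlist, counts in sorted(haps, key=lambda h: h[2][0][1], reverse=True):
    print('%s\texact matches: %d' % (consensus.strip('.'), counts[0][1]))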
Code example #15
def TestForUnique(sample, side, bowtie_dir, species, expected_length):
    global genome
    bt2cmd = "bowtie2 -x %s/%s --no-head -r --end-to-end -k 4 %s.seq > %s.sam"
    designfile = "tmp/bowtie_data/%s_%s.csv" % (sample, side)

    t = designfile.rfind('.')
    root = designfile[:t]
    outfile = root + ".seq"

    fp = open(outfile, 'wb')
    design = pd.read_csv(filepath_or_buffer=designfile, sep=',')
    N = design.shape[0]
    probes = {}
    distinct_context = set()
    for index, row in enumerate(design.values):
        seq = row[1]
        fp.write(seq + "\n")
        distinct_context.add(seq)
    print "TOtal distinct contexts: %d" % (len(distinct_context))
    fp.close()
    print "Wrote %s (%d lines)" % (outfile, N)
    sys.stdout.flush()
    code = subprocess.call(bt2cmd % (bowtie_dir, species, root, root),
                           shell=True)
    if (code == 0):
        print "Alignment completed"
    else:
        print "Alignment failed:" + (bt2cmd %
                                     (bowtie_dir, species, root, root))
        return

    samfile = outfile.replace('.seq', '.sam')
    columns = [str(i) for i in range(20)]
    df = pd.read_csv(filepath_or_buffer=samfile,
                     names=columns,
                     sep='\t',
                     header=None)
    df = df.drop_duplicates(subset=['0'], keep=False)
    #df.to_csv("tmp/bowtie_data/test.csv")
    data = df.iloc[:, ].values
    unique_locations = {}
    unmapped = 0
    pos_set = set()
    new_data = []
    unique = set()
    for fields in data:
        index = fields[0]
        chromo = fields[2]
        pos = fields[3]
        flags = int(fields[1])
        if chromo == '*':
            unmapped += 1
            continue
        alignment_score = -100
        if fields[11].find("AS:i:") == 0:
            alignment_score = int(fields[11].split(":")[-1])
        if alignment_score < 0:
            continue

        strand = '-' if flags & 16 else '+'
        new_seq = revcomp(fields[9]) if flags & 16 else fields[9]
        new_data.append([new_seq, chromo, pos, strand])
        unique.add(new_seq)

    locationfile = outfile.replace('.seq', '_location.csv')
    with open(locationfile, 'wb') as fp:
        a = csv.writer(fp, delimiter=',')
        header = ['context', 'chromo', 'pos', 'strand']
        a.writerow(header)
        for d in new_data:
            a.writerow(d)

    print "Wrote file: %s [%d lines]" % (locationfile, len(new_data))
    print "unmapped: %d" % unmapped
    print "unique: %d" % len(unique)
    ########

    df1 = pd.read_csv(filepath_or_buffer=designfile, sep=',')
    df2 = pd.read_csv(filepath_or_buffer=locationfile, sep=',')

    result = pd.merge(df1, df2, how='right', on=['context'])
    data = result.iloc[:, ].values
    new_data = []

    # ['id', 'side', 'context', 'chromo', 'pos', 'strand']
    for d in data:
        [my_id, context, te, chromo, pos, strand] = d[0:6]
        #chromo = chromo[3:]
        if chromo not in genome:
            continue
        plen = len(context)

        if strand == '+' and side == 'start':
            ref_prefix = genome[chromo][pos:pos + plen]
            ref_suffix = genome[chromo][pos + plen:pos + plen + 25]
            other_context = genome[chromo][pos + plen:pos + plen + 25]
            pos = pos + plen
        if strand == '+' and side == 'end':
            ref_prefix = genome[chromo][pos - 25:pos]
            ref_suffix = genome[chromo][pos:pos + plen]
            other_context = genome[chromo][pos - 25:pos]
        if strand == '-' and side == 'start':
            ref_prefix = genome[chromo][pos:pos + 25]
            ref_suffix = genome[chromo][pos - plen:pos]
            other_context = genome[chromo][pos - 25:pos]
        if strand == '-' and side == 'end':
            ref_prefix = genome[chromo][pos + plen:pos + plen + 25]
            ref_suffix = genome[chromo][pos:pos + plen]
            other_context = genome[chromo][pos + plen:pos + plen + 25]
            pos = pos + plen
        other_context = MultiStringBWT.reverseComplement(
            other_context) if strand == '-' else other_context
        if strand == '-':
            ref_prefix = MultiStringBWT.reverseComplement(ref_prefix)
            ref_suffix = MultiStringBWT.reverseComplement(ref_suffix)

        ed_th = .2 * 25
        if side == 'start':
            ed = lv.distance(ref_suffix[:25], te[:25])
            ref_te = 1 if ed <= ed_th else 0
        if side == 'end':
            ed = lv.distance(ref_prefix[-25:], te[-25:])
            ref_te = 1 if ed <= ed_th else 0
        new_data.append([
            my_id, context, te, ref_te, ref_prefix, ref_suffix, chromo, pos,
            strand,
            len(context)
        ])
        #new_data.append([my_id,context,te,ref_prefix,ref_suffix,chromo,pos,strand])

    finalfile = locationfile.replace('location', 'UNIQUE')
    with open(finalfile, 'wb') as fp:
        a = csv.writer(fp, delimiter=',')
        header = [
            'my_id', 'context', 'TE', 'ref_te', 'ref_prefix', 'ref_suffix',
            'chromo', 'pos', 'strand', 'clen'
        ]
        a.writerow(header)
        for d in new_data:
            a.writerow(d)
    remove_duplicates(finalfile)
    return  # NOTE: the cleanup commands below are unreachable because of this early return
    command = "rm ./tmp/bowtie_data/*.seq"
    rval = os.system(command)
    command = "rm ./tmp/bowtie_data/*.sam"
    rval = os.system(command)
    command = "rm ./tmp/bowtie_data/*_location.csv"
    rval = os.system(command)
Code example #16
File: CommandLineInterface.py Project: Rinoahu/msbwt
def mainRun():
    '''
    This is the primary function for typical external users to run when the Command Line Interface is used
    '''
    #start up the logger
    initLogger()
    
    #attempt to parse the arguments
    p = ap.ArgumentParser(description=util.DESC, formatter_class=ap.RawTextHelpFormatter)
    
    #version data
    p.add_argument('-V', '--version', action='version', version='%(prog)s' + \
                   ' %s in MSBWT %s' % (util.VERSION, util.PKG_VERSION))
    
    #TODO: do we want subparsers grouped by type or sorted by name? it's by type currently
    
    sp = p.add_subparsers(dest='subparserID')
    p2 = sp.add_parser('cffq', help='create a MSBWT from FASTQ files (pp + cfpp)')
    p2.add_argument('-p', metavar='numProcesses', dest='numProcesses', type=int, default=1, help='number of processes to run (default: 1)')
    p2.add_argument('-u', '--uniform', dest='areUniform', action='store_true', help='the input sequences have uniform length', default=False)
    p2.add_argument('-c', '--compressed', dest='buildCompressed', action='store_true', help='build the RLE BWT (faster, less disk I/O)', default=False)
    p2.add_argument('outBwtDir', type=util.newDirectory, help='the output MSBWT directory')
    p2.add_argument('inputFastqs', nargs='+', type=util.readableFastqFile, help='the input FASTQ files')
    
    p7 = sp.add_parser('pp', help='pre-process FASTQ files before BWT creation')
    p7.add_argument('-u', '--uniform', dest='areUniform', action='store_true', help='the input sequences have uniform length', default=False)
    p7.add_argument('outBwtDir', type=util.newDirectory, help='the output MSBWT directory')
    p7.add_argument('inputFastqs', nargs='+', type=util.readableFastqFile, help='the input FASTQ files')
    
    p3 = sp.add_parser('cfpp', help='create a MSBWT from pre-processed sequences and offsets')
    p3.add_argument('-p', metavar='numProcesses', dest='numProcesses', type=int, default=1, help='number of processes to run (default: 1)')
    p3.add_argument('-u', '--uniform', dest='areUniform', action='store_true', help='the input sequences have uniform length', default=False)
    p3.add_argument('-c', '--compressed', dest='buildCompressed', action='store_true', help='build the RLE BWT (faster, less disk I/O)', default=False)
    p3.add_argument('bwtDir', type=util.existingDirectory, help='the MSBWT directory to process')
    
    p4 = sp.add_parser('merge', help='merge many MSBWTs into a single MSBWT')
    p4.add_argument('-p', metavar='numProcesses', dest='numProcesses', type=int, default=1, help='number of processes to run (default: 1)')
    p4.add_argument('outBwtDir', type=util.newDirectory, help='the output MSBWT directory')
    p4.add_argument('inputBwtDirs', nargs='+', type=util.existingDirectory, help='input BWT directories to merge')
    
    p5 = sp.add_parser('query', help='search for a sequence in an MSBWT, prints sequence and seqID')
    p5.add_argument('inputBwtDir', type=util.existingDirectory, help='the BWT to query')
    p5.add_argument('kmer', type=util.validKmer, help='the input k-mer to search for')
    p5.add_argument('-d', '--dump-seqs', dest='dumpSeqs', action='store_true', help='print all sequences with the given kmer (default=False)', default=False)
    
    p6 = sp.add_parser('massquery', help='search for many sequences in an MSBWT')
    p6.add_argument('inputBwtDir', type=util.existingDirectory, help='the BWT to query')
    p6.add_argument('kmerFile', help='a file with one k-mer per line')
    p6.add_argument('outputFile', help='output file with counts per line')
    p6.add_argument('-r', '--rev-comp', dest='reverseComplement', action='store_true', help='also search for each kmer\'s reverse complement', default=False)
    
    p8 = sp.add_parser('compress', help='compress a MSBWT from byte/base to RLE')
    p8.add_argument('-p', metavar='numProcesses', dest='numProcesses', type=int, default=1, help='number of processes to run (default: 1)')
    p8.add_argument('srcDir', type=util.existingDirectory, help='the source directory for the BWT to compress')
    p8.add_argument('dstDir', type=util.newDirectory, help='the destination directory')
    
    p9 = sp.add_parser('decompress', help='decompress a MSBWT from RLE to byte/base')
    p9.add_argument('-p', metavar='numProcesses', dest='numProcesses', type=int, default=1, help='number of processes to run (default: 1)')
    p9.add_argument('srcDir', type=util.existingDirectory, help='the source directory for the BWT to compress')
    p9.add_argument('dstDir', type=util.newDirectory, help='the destination directory')
    
    args = p.parse_args()
    
    if args.subparserID == 'cffq':
        logger.info('Inputs:\t'+str(args.inputFastqs))
        logger.info('Uniform:\t'+str(args.areUniform))
        logger.info('Output:\t'+args.outBwtDir)
        logger.info('Output Compressed:\t'+str(args.buildCompressed))
        logger.info('Processes:\t'+str(args.numProcesses))
        if args.numProcesses > 1:
            logger.warning('Using multi-processing with slow disk accesses can lead to slower build times.')
        print
        if args.areUniform:
            #if they are uniform, use the method developed by Bauer et al., it's likely short Illumina seq
            if args.buildCompressed:
                MultiStringBWT.createMSBWTCompFromFastq(args.inputFastqs, args.outBwtDir, args.numProcesses, args.areUniform, logger)
            else:
                MultiStringBWT.createMSBWTFromFastq(args.inputFastqs, args.outBwtDir, args.numProcesses, args.areUniform, logger)
        else:
            #if they aren't uniform, use the merge method by Holt et al., it's likely longer PacBio seq
            if args.buildCompressed:
                logger.error('No compressed builder for non-uniform datasets, compress after creation.')
            else:
                Multimerge.createMSBWTFromFastq(args.inputFastqs, args.outBwtDir, args.numProcesses, args.areUniform, logger)
        
    elif args.subparserID == 'pp':
        logger.info('Inputs:\t'+str(args.inputFastqs))
        logger.info('Uniform:\t'+str(args.areUniform))
        logger.info('Output:\t'+args.outBwtDir)
        if args.areUniform:
            #preprocess for Bauer et al. method
            MultiStringBWT.preprocessFastqs(args.inputFastqs, args.outBwtDir, args.areUniform, logger)
        else:
            #preprocess for Holt et al. method
            numProcs = 1
            Multimerge.preprocessFastqs(args.inputFastqs, args.outBwtDir, numProcs, args.areUniform, logger)
        
    elif args.subparserID == 'cfpp':
        logger.info('BWT dir:\t'+args.bwtDir)
        logger.info('Uniform:\t'+str(args.areUniform))
        logger.info('Output Compressed:\t'+str(args.buildCompressed))
        logger.info('Processes:\t'+str(args.numProcesses))
        if args.numProcesses > 1:
            logger.warning('Using multi-processing with slow disk accesses can lead to slower build times.')
        print
        seqFN = args.bwtDir+'/seqs.npy'
        offsetFN = args.bwtDir+'/offsets.npy'
        bwtFN = args.bwtDir+'/msbwt.npy'
        
        if args.areUniform:
            #process it using the column-wise Bauer et al. method
            if args.buildCompressed:
                MSBWTCompGenCython.createMsbwtFromSeqs(args.bwtDir, args.numProcesses, logger)
            else:
                MSBWTGenCython.createMsbwtFromSeqs(args.bwtDir, args.numProcesses, logger)
        else:
            #process it using the Holt et al. merge method
            if args.buildCompressed:
                logger.error('No compressed builder for non-uniform datasets, compress after creation.')
            else:
                Multimerge.interleaveLevelMerge(args.bwtDir, args.numProcesses, args.areUniform, logger)
        
    elif args.subparserID == 'compress':
        logger.info('Source Directory:'+args.srcDir)
        logger.info('Dest Directory:'+args.dstDir)
        logger.info('Processes:'+str(args.numProcesses))
        if args.srcDir == args.dstDir:
            raise Exception('Source and destination directories cannot be the same directory.')
        print
        MSBWTGen.compressBWT(args.srcDir+'/msbwt.npy', args.dstDir+'/comp_msbwt.npy', args.numProcesses, logger)
        
    elif args.subparserID == 'decompress':
        logger.info('Source Directory: '+args.srcDir)
        logger.info('Dest Directory: '+args.dstDir)
        logger.info('Processes: '+str(args.numProcesses))
        print
        MSBWTGen.decompressBWT(args.srcDir, args.dstDir, args.numProcesses, logger)
        #TODO: remove if srcdir and dstdir are the same?
        
    elif args.subparserID == 'merge':
        logger.info('Inputs:\t'+str(args.inputBwtDirs))
        logger.info('Output:\t'+args.outBwtDir)
        logger.info('Processes:\t'+str(args.numProcesses))
        numProcs = 1  # merging currently always runs single-process
        if args.numProcesses > 1:
            logger.warning('Multi-processing is not supported at this time, but will be included in a future release.')
            #logger.warning('Using multi-processing with slow disk accesses can lead to slower build times.')
        print
        #MSBWTGen.mergeNewMSBWT(args.outBwtDir, args.inputBwtDirs, args.numProcesses, logger)
        if len(args.inputBwtDirs) > 2:
            #this is a deprecated method, it may still work if you feel daring
            #MSBWTGenCython.mergeMsbwts(args.inputBwtDirs, args.outBwtDir, 1, logger)
            logger.error('Merging more than two MSBWTs at once is not currently supported.')
        else:
            GenericMerge.mergeTwoMSBWTs(args.inputBwtDirs[0], args.inputBwtDirs[1], args.outBwtDir, numProcs, logger)
        
    elif args.subparserID == 'query':
        #this is the easiest thing we can do, don't dump the standard info, just do it
        msbwt = MultiStringBWT.loadBWT(args.inputBwtDir, logger=logger)
        
        #always print how many are found, users can parse it out if they want
        r = msbwt.findIndicesOfStr(args.kmer)
        print r[1]-r[0]
        
        #dump the seqs if request
        if args.dumpSeqs:
            for x in xrange(r[0], r[1]):
                dInd = msbwt.getSequenceDollarID(x)
                print msbwt.recoverString(dInd)[1:]+','+str(dInd)
    
    elif args.subparserID == 'massquery':
        logger.info('Input:\t'+str(args.inputBwtDir))
        logger.info('Queries:\t'+str(args.kmerFile))
        logger.info('Output:\t'+args.outputFile)
        logger.info('Rev-comp:\t'+str(args.reverseComplement))
        print
        msbwt = MultiStringBWT.loadBWT(args.inputBwtDir, logger=logger)
        
        output = open(args.outputFile, 'w+')
        output.write('k-mer,counts')
        if args.reverseComplement:
            output.write(',revCompCounts\n')
        else:
            output.write('\n')
        
        logger.info('Beginning queries...')
        for line in open(args.kmerFile, 'r'):
            kmer = line.strip('\n')
            c = msbwt.countOccurrencesOfSeq(kmer)
            if args.reverseComplement:
                rc = msbwt.countOccurrencesOfSeq(MultiStringBWT.reverseComplement(kmer))
                output.write(kmer+','+str(c)+','+str(rc)+'\n')
            else:
                output.write(kmer+','+str(c)+'\n')
        logger.info('Queries complete.')
        
    else:
        print args.subparserID+" is currently not implemented, please wait for a future release."
Code example #17
def mainRun():
    '''
    This is the primary function for typical external users to run when the Command Line Interface is used
    '''
    #start up the logger
    initLogger()

    #attempt to parse the arguments
    p = ap.ArgumentParser(description=util.DESC,
                          formatter_class=ap.RawTextHelpFormatter)

    #version data
    p.add_argument('-V', '--version', action='version', version='%(prog)s' + \
                   ' %s in MSBWT %s' % (util.VERSION, util.PKG_VERSION))

    #TODO: do we want subparsers grouped by type or sorted by name? it's by type currently

    sp = p.add_subparsers(dest='subparserID')
    p2 = sp.add_parser('cffq',
                       help='create a MSBWT from FASTQ files (pp + cfpp)')
    p2.add_argument('-p',
                    metavar='numProcesses',
                    dest='numProcesses',
                    type=int,
                    default=1,
                    help='number of processes to run (default: 1)')
    p2.add_argument('-u',
                    '--uniform',
                    dest='areUniform',
                    action='store_true',
                    help='the input sequences have uniform length',
                    default=False)
    p2.add_argument('-c',
                    '--compressed',
                    dest='buildCompressed',
                    action='store_true',
                    help='build the RLE BWT (faster, less disk I/O)',
                    default=False)
    p2.add_argument('outBwtDir',
                    type=util.newDirectory,
                    help='the output MSBWT directory')
    p2.add_argument('inputFastqs',
                    nargs='+',
                    type=util.readableFastqFile,
                    help='the input FASTQ files')

    p7 = sp.add_parser('pp',
                       help='pre-process FASTQ files before BWT creation')
    p7.add_argument('-u',
                    '--uniform',
                    dest='areUniform',
                    action='store_true',
                    help='the input sequences have uniform length',
                    default=False)
    p7.add_argument('outBwtDir',
                    type=util.newDirectory,
                    help='the output MSBWT directory')
    p7.add_argument('inputFastqs',
                    nargs='+',
                    type=util.readableFastqFile,
                    help='the input FASTQ files')

    p3 = sp.add_parser(
        'cfpp', help='create a MSBWT from pre-processed sequences and offsets')
    p3.add_argument('-p',
                    metavar='numProcesses',
                    dest='numProcesses',
                    type=int,
                    default=1,
                    help='number of processes to run (default: 1)')
    p3.add_argument('-u',
                    '--uniform',
                    dest='areUniform',
                    action='store_true',
                    help='the input sequences have uniform length',
                    default=False)
    p3.add_argument('-c',
                    '--compressed',
                    dest='buildCompressed',
                    action='store_true',
                    help='build the RLE BWT (faster, less disk I/O)',
                    default=False)
    p3.add_argument('bwtDir',
                    type=util.existingDirectory,
                    help='the MSBWT directory to process')

    p4 = sp.add_parser('merge', help='merge many MSBWTs into a single MSBWT')
    p4.add_argument('-p',
                    metavar='numProcesses',
                    dest='numProcesses',
                    type=int,
                    default=1,
                    help='number of processes to run (default: 1)')
    p4.add_argument('outBwtDir',
                    type=util.newDirectory,
                    help='the output MSBWT directory')
    p4.add_argument('inputBwtDirs',
                    nargs='+',
                    type=util.existingDirectory,
                    help='input BWT directories to merge')

    p5 = sp.add_parser(
        'query',
        help='search for a sequence in an MSBWT, prints sequence and seqID')
    p5.add_argument('inputBwtDir',
                    type=util.existingDirectory,
                    help='the BWT to query')
    p5.add_argument('kmer',
                    type=util.validKmer,
                    help='the input k-mer to search for')
    p5.add_argument(
        '-d',
        '--dump-seqs',
        dest='dumpSeqs',
        action='store_true',
        help='print all sequences with the given kmer (default=False)',
        default=False)

    p6 = sp.add_parser('massquery',
                       help='search for many sequences in an MSBWT')
    p6.add_argument('inputBwtDir',
                    type=util.existingDirectory,
                    help='the BWT to query')
    p6.add_argument('kmerFile', help='a file with one k-mer per line')
    p6.add_argument('outputFile', help='output file with counts per line')
    p6.add_argument('-r',
                    '--rev-comp',
                    dest='reverseComplement',
                    action='store_true',
                    help='also search for each kmer\'s reverse complement',
                    default=False)

    p8 = sp.add_parser('compress',
                       help='compress a MSBWT from byte/base to RLE')
    p8.add_argument('-p',
                    metavar='numProcesses',
                    dest='numProcesses',
                    type=int,
                    default=1,
                    help='number of processes to run (default: 1)')
    p8.add_argument('srcDir',
                    type=util.existingDirectory,
                    help='the source directory for the BWT to compress')
    p8.add_argument('dstDir',
                    type=util.newDirectory,
                    help='the destination directory')

    p9 = sp.add_parser('decompress',
                       help='decompress a MSBWT from RLE to byte/base')
    p9.add_argument('-p',
                    metavar='numProcesses',
                    dest='numProcesses',
                    type=int,
                    default=1,
                    help='number of processes to run (default: 1)')
    p9.add_argument('srcDir',
                    type=util.existingDirectory,
                    help='the source directory for the BWT to decompress')
    p9.add_argument('dstDir',
                    type=util.newDirectory,
                    help='the destination directory')

    p10 = sp.add_parser('convert', help='convert from a raw text input to RLE')
    p10.add_argument('-i',
                     metavar='inputTextFN',
                     dest='inputTextFN',
                     default=None,
                     help='input text filename (default: stdin)')
    p10.add_argument('dstDir',
                     type=util.newDirectory,
                     help='the destination directory')

    args = p.parse_args()

    if args.subparserID == 'cffq':
        logger.info('Inputs:\t' + str(args.inputFastqs))
        logger.info('Uniform:\t' + str(args.areUniform))
        logger.info('Output:\t' + args.outBwtDir)
        logger.info('Output Compressed:\t' + str(args.buildCompressed))
        logger.info('Processes:\t' + str(args.numProcesses))
        if args.numProcesses > 1:
            logger.warning(
                'Using multi-processing with slow disk accesses can lead to slower build times.'
            )
        print
        if args.areUniform:
            #if they are uniform, use the method developed by Bauer et al., it's likely short Illumina seq
            if args.buildCompressed:
                MultiStringBWT.createMSBWTCompFromFastq(
                    args.inputFastqs, args.outBwtDir, args.numProcesses,
                    args.areUniform, logger)
            else:
                MultiStringBWT.createMSBWTFromFastq(args.inputFastqs,
                                                    args.outBwtDir,
                                                    args.numProcesses,
                                                    args.areUniform, logger)
        else:
            #if they aren't uniform, use the merge method by Holt et al., it's likely longer PacBio seq
            if args.buildCompressed:
                logger.error(
                    'No compressed builder for non-uniform datasets, compress after creation.'
                )
            else:
                Multimerge.createMSBWTFromFastq(args.inputFastqs,
                                                args.outBwtDir,
                                                args.numProcesses,
                                                args.areUniform, logger)

    elif args.subparserID == 'pp':
        logger.info('Inputs:\t' + str(args.inputFastqs))
        logger.info('Uniform:\t' + str(args.areUniform))
        logger.info('Output:\t' + args.outBwtDir)
        if args.areUniform:
            #preprocess for Bauer et al. method
            MultiStringBWT.preprocessFastqs(args.inputFastqs, args.outBwtDir,
                                            args.areUniform, logger)
        else:
            #preprocess for Holt et al. method
            numProcs = 1
            Multimerge.preprocessFastqs(args.inputFastqs, args.outBwtDir,
                                        numProcs, args.areUniform, logger)

    elif args.subparserID == 'cfpp':
        logger.info('BWT dir:\t' + args.bwtDir)
        logger.info('Uniform:\t' + str(args.areUniform))
        logger.info('Output Compressed:\t' + str(args.buildCompressed))
        logger.info('Processes:\t' + str(args.numProcesses))
        if args.numProcesses > 1:
            logger.warning(
                'Using multi-processing with slow disk accesses can lead to slower build times.'
            )
        print
        seqFN = args.bwtDir + '/seqs.npy'
        offsetFN = args.bwtDir + '/offsets.npy'
        bwtFN = args.bwtDir + '/msbwt.npy'

        if args.areUniform:
            #process it using the column-wise Bauer et al. method
            if args.buildCompressed:
                MSBWTCompGenCython.createMsbwtFromSeqs(args.bwtDir,
                                                       args.numProcesses,
                                                       logger)
            else:
                MSBWTGenCython.createMsbwtFromSeqs(args.bwtDir,
                                                   args.numProcesses, logger)
        else:
            #process it using the Holt et al. merge method
            if args.buildCompressed:
                logger.error(
                    'No compressed builder for non-uniform datasets, compress after creation.'
                )
            else:
                Multimerge.interleaveLevelMerge(args.bwtDir, args.numProcesses,
                                                args.areUniform, logger)

    elif args.subparserID == 'compress':
        logger.info('Source Directory:' + args.srcDir)
        logger.info('Dest Directory:' + args.dstDir)
        logger.info('Processes:' + str(args.numProcesses))
        if args.srcDir == args.dstDir:
            raise Exception(
                'Source and destination directories cannot be the same directory.'
            )
        print
        MSBWTGen.compressBWT(args.srcDir + '/msbwt.npy',
                             args.dstDir + '/comp_msbwt.npy',
                             args.numProcesses, logger)

    elif args.subparserID == 'decompress':
        logger.info('Source Directory: ' + args.srcDir)
        logger.info('Dest Directory: ' + args.dstDir)
        logger.info('Processes: ' + str(args.numProcesses))
        print
        MSBWTGen.decompressBWT(args.srcDir, args.dstDir, args.numProcesses,
                               logger)
        #TODO: remove if srcdir and dstdir are the same?

    elif args.subparserID == 'merge':
        logger.info('Inputs:\t' + str(args.inputBwtDirs))
        logger.info('Output:\t' + args.outBwtDir)
        logger.info('Processes:\t' + str(args.numProcesses))
        #note: merging currently always runs single-process, regardless of -p
        numProcs = 1
        if args.numProcesses > 1:
            logger.warning(
                'Multi-processing is not supported at this time, but will be included in a future release.'
            )
            #logger.warning('Using multi-processing with slow disk accesses can lead to slower build times.')
        print
        #MSBWTGen.mergeNewMSBWT(args.outBwtDir, args.inputBwtDirs, args.numProcesses, logger)
        if len(args.inputBwtDirs) > 2:
            #this is a deprecated method, it may still work if you feel daring
            #MSBWTGenCython.mergeMsbwts(args.inputBwtDirs, args.outBwtDir, 1, logger)
            logger.error(
                'Merging more than two MSBWTs at once is not currently supported.'
            )
        else:
            GenericMerge.mergeTwoMSBWTs(args.inputBwtDirs[0],
                                        args.inputBwtDirs[1], args.outBwtDir,
                                        numProcs, logger)

    elif args.subparserID == 'query':
        #this is the easiest thing we can do, don't dump the standard info, just do it
        msbwt = MultiStringBWT.loadBWT(args.inputBwtDir, logger=logger)

        #always print how many are found, users can parse it out if they want
        r = msbwt.findIndicesOfStr(args.kmer)
        print r[1] - r[0]

        #dump the seqs if request
        if args.dumpSeqs:
            for x in xrange(r[0], r[1]):
                dInd = msbwt.getSequenceDollarID(x)
                print msbwt.recoverString(dInd)[1:] + ',' + str(dInd)

    elif args.subparserID == 'massquery':
        logger.info('Input:\t' + str(args.inputBwtDir))
        logger.info('Queries:\t' + str(args.kmerFile))
        logger.info('Output:\t' + args.outputFile)
        logger.info('Rev-comp:\t' + str(args.reverseComplement))
        print
        msbwt = MultiStringBWT.loadBWT(args.inputBwtDir, logger=logger)

        output = open(args.outputFile, 'w+')
        output.write('k-mer,counts')
        if args.reverseComplement:
            output.write(',revCompCounts\n')
        else:
            output.write('\n')

        logger.info('Beginning queries...')
        for line in open(args.kmerFile, 'r'):
            kmer = line.strip('\n')
            c = msbwt.countOccurrencesOfSeq(kmer)
            if args.reverseComplement:
                rc = msbwt.countOccurrencesOfSeq(
                    MultiStringBWT.reverseComplement(kmer))
                output.write(kmer + ',' + str(c) + ',' + str(rc) + '\n')
            else:
                output.write(kmer + ',' + str(c) + '\n')
        logger.info('Queries complete.')

    elif args.subparserID == 'convert':
        if args.inputTextFN is None:
            logger.info('Input: stdin')
        else:
            logger.info('Input: ' + args.inputTextFN)
        logger.info('Output: ' + args.dstDir)
        logger.info('Beginning conversion...')
        CompressToRLE.compressInput(args.inputTextFN, args.dstDir)
        logger.info('Finished conversion.')

    else:
        print args.subparserID + " is currently not implemented, please wait for a future release."
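The subcommands registered above compose into a build-then-query workflow: pp pre-processes FASTQ files, cfpp builds a BWT from the pre-processed output, cffq does both in one step, and query/massquery search the result. As a rough sketch (the paths and k-mer below are placeholders, and this assumes mainRun is importable), the CLI can also be driven programmatically by overriding sys.argv:

import sys

#equivalent to the shell command: msbwt cffq -p 4 -u -c /tmp/myBwtDir reads.fastq
sys.argv = ['msbwt', 'cffq', '-p', '4', '-u', '-c', '/tmp/myBwtDir', 'reads.fastq']
mainRun()

#equivalent to: msbwt query /tmp/myBwtDir ACGTACGTACGTACGT (prints the occurrence count)
sys.argv = ['msbwt', 'query', '/tmp/myBwtDir', 'ACGTACGTACGTACGT']
mainRun()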
Code Example #20
File: TranscriptBuilder.py Project: Rinoahu/msbwt
 def firstTimeExtension(self, foundKmers, unexploredPaths, nodes, edges):
     '''
     @param foundKmers - each k-mer we find is checked against this dict and added if not present
     @param unexploredPaths - newly discovered path splits (and merge targets) are appended here for later exploration
     @param nodes - the list of graph nodes; newly created nodes are appended here
     @param edges - the list of graph edges; newly created edges are appended here
     '''
     pc = ''
     kmer = self.seq
     terminate = False
     while not terminate:
         if len(kmer) != self.pathK:
             raise Exception('ERROR: different sized k-mer '+str(len(kmer))+', expected '+str(self.pathK))
     
         #First, perform all the counts of paths going both forwards and backwards
         counts = {}
         revCounts = {}
         
         #maxV - the count of the (k+1)-mer with maxC on it, total is the total counts of valid chars
         maxV = 0
         maxC = ''
         total = 0
         
         #count the number of forward and reversed paths
         numPaths = 0
         numRevPaths = 0
         
         for c in self.validChars:
             counts[c] = self.msbwt.countOccurrencesOfSeq(kmer+c)+self.msbwt.countOccurrencesOfSeq(MultiStringBWT.reverseComplement(kmer+c))
             revCounts[c] = self.msbwt.countOccurrencesOfSeq(c+kmer)+self.msbwt.countOccurrencesOfSeq(MultiStringBWT.reverseComplement(c+kmer))
             
             if self.drawDollarTerminals or c != '$':
                 total += counts[c]
                 if counts[c] > maxV:
                     maxV = counts[c]
                     maxC = c
                 
                 if counts[c] >= self.pathThreshold:
                     numPaths += 1
                     
                 #if we have evidence from the counts OR if the previous character was known to be that character
                 if revCounts[c] >= self.pathThreshold or c == pc:
                     numRevPaths += 1
             
         #check if we have incoming edges, in which case we need to end this block
         if numRevPaths > 1 and kmer != self.seq:
             
             #remove the last kmer, because it's actually in the new node we merge into
             self.seq = self.seq[0:-1]
             
             #this will lead to repeating the same counts later, but that's okay
             newID = len(nodes)
             nodes.append(PathNode(newID, kmer, self.msbwt, self.minDistToSeed+len(self.pileups), self.settingsDict))
             edges.append(PathEdge(len(edges), self.nodeID, newID, revCounts[pc], pc+', '+str(revCounts)))
             self.termCondition = 'MERGE_'+str(newID)
             foundKmers[kmer] = newID
             
             unexploredPaths.append(nodes[newID])
             
             #print 'Ending block for merge'
             terminate = True
             
         elif total == 0:
             #print 'No strings found.'
             self.termCondition = 'TERMINAL'
             terminate = True
         else:
             #the kmer was found in this block and it may have multiple extensions
             foundKmers[kmer] = self.nodeID
             revMer = MultiStringBWT.reverseComplement(kmer)
             if foundKmers.has_key(revMer):
                 otherID = foundKmers[revMer]
                 self.inversionSet.add(otherID)
                 nodes[otherID].inversionSet.add(self.nodeID)
             
             r1 = self.msbwt.findIndicesOfStr(kmer[-self.countK:])
             r2 = self.msbwt.findIndicesOfStr(MultiStringBWT.reverseComplement(kmer[-self.countK:]))
             kmerCount = (r1[1]-r1[0])+(r2[1]-r2[0])
             self.pileups.append(kmerCount)
             perc = float(maxV)/total
             
             if self.trackReads:
                 for i in xrange(r1[0], r1[1]):
                     self.readSet.add((int(self.msbwt.getSequenceDollarID(i)), 0))
                 for i in xrange(r2[0], r2[1]):
                     self.readSet.add((int(self.msbwt.getSequenceDollarID(i)), 1))
             
             #if kmerCount > self.overloadThreshold:
             if self.pileups[0] > self.overloadThreshold:
                 #this path is too heavy, we probably won't figure out what's going on downstream
                 self.termCondition = 'OVERLOAD'
                 terminate = True
                 
             elif numPaths > 1:
                 self.termCondition = 'SPLIT'
                 for c in self.validChars:
                     if counts[c] >= self.pathThreshold:
                         newKmer = kmer[1:]+c
                         if foundKmers.has_key(newKmer):
                             otherNID = foundKmers[newKmer]
                             nodes[otherNID].minDistToSeed = min(nodes[otherNID].minDistToSeed, self.minDistToSeed+len(self.pileups))
                             edges.append(PathEdge(len(edges), self.nodeID, otherNID, counts[c], c+': '+str(counts[c])))
                         
                         else:
                             if self.drawDollarTerminals or c != '$':
                                 newID = len(nodes)
                                 nodes.append(PathNode(newID, newKmer, self.msbwt, self.minDistToSeed+len(self.pileups), self.settingsDict))
                                 edges.append(PathEdge(len(edges), self.nodeID, newID, counts[c], c+': '+str(counts[c])))
                                 foundKmers[newKmer] = newID
                                 
                                 if c != '$':
                                     unexploredPaths.append(nodes[newID])
                                 else:
                                     nodes[newID].termCondition = '$ Ext'
                             
                 terminate = True
             else:
                 #this is data pertaining to this k-mer
                 #print ':\t'+kmer+maxC+'\t'+str(perc)+'\t'+str(maxV)+'/'+str(total)+'\t'+str(total-maxV)+'\t'
                 pc = kmer[0]
                 kmer = kmer[1:]+maxC
                 #check if we've found the new k-mer before
                 if foundKmers.has_key(kmer):
                     otherNID = foundKmers[kmer]
                     nodes[otherNID].minDistToSeed = min(nodes[otherNID].minDistToSeed, self.minDistToSeed+len(self.pileups))
                     if counts[maxC] >= self.pathThreshold:
                         edges.append(PathEdge(len(edges), self.nodeID, otherNID, counts[maxC], pc+': '+str(counts[maxC])))
                         self.termCondition = 'MERGE_'+str(otherNID)
                     else:
                         edges.append(PathEdge(len(edges), self.nodeID, otherNID, counts[maxC], pc+': '+str(counts[maxC]), 'dashed'))
                         self.termCondition = 'MERGE_'+str(otherNID)+', THRESHOLD'
                         
                     terminate = True
                 else:
                     self.seq += maxC
                     if maxC == '$':
                         self.termCondition = '$ Max'
                         terminate = True
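The loop above implements a greedy walk: at each step it counts, for every candidate base c, the (k+1)-mer kmer+c together with its reverse complement, then either extends with the dominant base, spawns new nodes on a split, or terminates on a merge, overload, or dead end. A condensed sketch of just the extension rule follows (greedy_extend is not part of msbwt; msbwt here is an object from MultiStringBWT.loadBWT as in the other examples, and the threshold is arbitrary):

def greedy_extend(msbwt, seed, threshold=3, maxSteps=1000):
    '''Follow the single strongest path from seed, mirroring the core loop of firstTimeExtension.'''
    seq, kmer = seed, seed
    for _ in xrange(maxSteps):
        #count each candidate extension on both strands, as the method above does
        counts = {}
        for c in 'ACGNT':
            counts[c] = msbwt.countOccurrencesOfSeq(kmer+c) + \
                        msbwt.countOccurrencesOfSeq(MultiStringBWT.reverseComplement(kmer+c))
        viable = [c for c in 'ACGNT' if counts[c] >= threshold]
        if len(viable) != 1:
            break #a split (>1 paths) or a dead end (0); the full method creates new nodes here
        seq += viable[0]
        kmer = kmer[1:] + viable[0]
    return seq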
Code Example #21
File: TranscriptBuilder.py Project: Rinoahu/msbwt
 def extendSeed(self, seedKmer, endSeeds):
     '''
     This function is intended to be an interactive technique for constructing transcripts, probably to be released
     in a future version of msbwt
     @param seedKmer - the seed sequence to use for construction; must match the configured path k-mer size
     @param endSeeds - a list of k-mers that terminate extension; each becomes an END_SEED node
     '''
     
     if self.foundKmers.has_key(seedKmer):
         return
     
     pathK = self.settingsDict.get('kmerSize', len(seedKmer))
     countK = self.settingsDict.get('countK', pathK)
     isMerged = self.settingsDict.get('isMerged', False)
     trackPairs = self.settingsDict.get('trackPairs', False)
     trackReads = self.settingsDict.get('trackReads', False)
     useMemmap = self.settingsDict.get('useMemmap', True)
     maxDistance = self.settingsDict.get('maxDistance', 0xFFFFFFFF)
     
     if len(seedKmer) != pathK:
         raise Exception('Seed k-mer incorrect length')
     
     numNodes = self.settingsDict['numNodes']
     validChars = ['$', 'A', 'C', 'G', 'N', 'T']
     
     if self.logger != None:
         self.logger.info('Loading '+self.bwtDir+'...')
     msbwt = MultiStringBWT.loadBWT(self.bwtDir, useMemmap, self.logger)
     if os.path.exists(self.bwtDir+'/origins.npy'):
         #origin-file handling has not been reimplemented, so the load below stays disabled
         raise Exception("You haven't reimplemented the handling of origin files")
         #origins = np.load(self.bwtDir+'/origins.npy', 'r')
     else:
         origins = None
     
     self.settingsDict['interleaveFN'] = self.bwtDir+'/inter0.npy'
     
     kmer = seedKmer
     
     firstID = len(self.nodes)
     self.nodes.append(PathNode(firstID, kmer, msbwt, 0, self.settingsDict))
     self.foundKmers[kmer] = firstID
     
     for i, endSeed in enumerate(endSeeds):
         if len(endSeed) != pathK:
             raise Exception(endSeed+': NOT CORRECT LENGTH')
         else:
             endID = len(self.nodes)
             self.nodes.append(PathNode(endID, endSeed, msbwt, 0, self.settingsDict))
             self.nodes[endID].termCondition = 'END_SEED_'+str(i)
             self.foundKmers[endSeed] = endID
     
     if self.logger != None:
         self.logger.info('Beginning with seed \''+seedKmer+'\', pathK='+str(pathK)+', countK='+str(countK))
     
     unexploredPaths = [self.nodes[firstID]]
     
     #assign execution-order IDs, starting from the seed node
     execID = firstID
     
     while len(unexploredPaths) > 0:
         #explore the node closest to the seed first
         unexploredPaths.sort(key = lambda node: node.minDistToSeed)
         #print 'UP: '+'['+','.join([str((node.minDistToSeed, node.nodeID)) for node in unexploredPaths])+']'
         nextNode = unexploredPaths.pop(0)
         
         if nextNode.nodeID >= numNodes:
             nextNode.termCondition = 'UNEXPLORED_NODE'
         elif nextNode.minDistToSeed >= maxDistance:
             nextNode.termCondition = 'UNEXPLORED_DIST'
         else:
             nextNode.execOrder = execID
             execID += 1
             
             if self.logger != None:
                 self.logger.info('Exploring new node')
             nextNode.firstTimeExtension(self.foundKmers, unexploredPaths, self.nodes, self.edges)
         
     if isMerged and trackReads:
         interleaveFN = self.bwtDir+'/inter0.npy'
         interleave = np.load(interleaveFN, 'r')
         
         #we only need to do this for newly processed nodes
         for node in self.nodes[firstID:]:
             dIDs = node.readSet
             for dID in dIDs:
                 sourceID = interleave[dID[0]]
                 node.sourceCounts[sourceID] = node.sourceCounts.get(sourceID, 0)+1
             
     if trackPairs:
         abtFN = self.bwtDir+'/abt.npy'
         abt = np.load(abtFN, 'r')
                 
         #abtDict = {}
         
         #only need to process new nodes
         for node in self.nodes[firstID:]:
             dIDs = node.readSet
             
             for dID, direction in dIDs:
                 (fID, rID) = abt[dID]
                 if fID % 2 == 0:
                     oFID = fID+1
                 else:
                     oFID = fID-1
                 
                 if self.abtDict.has_key((oFID, rID, 1-direction)):
                     otherNIDs = self.abtDict[(oFID, rID, 1-direction)][1]
                     for n in otherNIDs:
                         self.nodes[n].pairedNodes[node.nodeID] = self.nodes[n].pairedNodes.get(node.nodeID, 0)+1
                         node.pairedNodes[n] = node.pairedNodes.get(n, 0)+1
                 
                 if not self.abtDict.has_key((fID, rID, direction)):
                     self.abtDict[(fID, rID, direction)] = (dID, set([]))
                 self.abtDict[(fID, rID, direction)][1].add(node.nodeID)
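For reference, the settingsDict consulted above supports the following keys; numNodes is required, the others fall back to the defaults visible in the code, and the values below are illustrative only:

#hypothetical configuration; keys mirror the settingsDict lookups in extendSeed
settings = {
    'numNodes': 1000,        #required: nodes with IDs past this limit are marked UNEXPLORED_NODE
    'kmerSize': 25,          #pathK; defaults to len(seedKmer)
    'countK': 25,            #k used for pileup counting; defaults to kmerSize
    'maxDistance': 5000,     #nodes farther than this from the seed are marked UNEXPLORED_DIST
    'isMerged': False,       #with trackReads, maps reads to their source BWT via inter0.npy
    'trackReads': False,     #record the dollar IDs of reads supporting each node
    'trackPairs': False,     #link nodes that share read pairs through abt.npy
    'useMemmap': True,       #memory-map the BWT instead of loading it fully
}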
Code Example #22
File: core.py Project: mnajarian/msbwtCloud
 def __init__(self):
     self.msbwt = MSBWT.loadBWT(sys.argv[1])
Code Example #23
def get_kmer_count(msbwt, kmer):
    '''
    Return the total number of occurrences of a k-mer in the BWT, counting both strands.
    '''
    c1 = msbwt.countOccurrencesOfSeq(kmer)
    c2 = msbwt.countOccurrencesOfSeq(MultiStringBWT.reverseComplement(kmer))
    return c1 + c2
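A usage sketch, assuming MultiStringBWT is imported as in the examples above (the BWT directory path is a placeholder): load a BWT once, then query k-mers strand-insensitively.

msbwt = MultiStringBWT.loadBWT('/path/to/bwtDir', useMemmap=False)
print get_kmer_count(msbwt, 'TTAGGGTTAGGGTTAGGGTTAGGG')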